Skip to content
Snippets Groups Projects
Commit 325e7245 authored by Peukert's avatar Peukert
Browse files

writing to result file finalized

parent 83b0a69a
Branches
No related tags found
No related merge requests found
......@@ -4,16 +4,24 @@ import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.stream.Stream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;
public class IO {
......@@ -67,4 +75,107 @@ public class IO {
bw.close();
} catch(IOException e){}
}
/**
 * Reads a morpheme-to-word-list mapping from a semicolon-separated CSV file.
 *
 * Expected line format: {@code morpheme;word1;word2;...} — the first field
 * becomes the map key, the remaining fields the list of words that contain
 * that morpheme (matches the format produced by writeMorphemeWordListToCSVFile).
 *
 * @param filepath path of the CSV file to read
 * @return map of morpheme to the list of words containing it; empty if the
 *         file is missing or contains no usable lines
 * @throws IOException if reading fails for a reason other than a missing file
 */
static Map<String, ArrayList> readMorphemeWordListFromCSVFile(String filepath) throws IOException
{
	Map<String, ArrayList> morphemeWordList = new HashMap<String, ArrayList>();
	try(BufferedReader br = new BufferedReader(new FileReader(filepath))) {
		String line;
		while ((line = br.readLine()) != null) {
			// Skip blank lines: splitting "" would yield [""] and create a
			// spurious entry keyed by the empty string.
			if (line.isEmpty()) {
				continue;
			}
			String[] data = line.split(";");
			ArrayList<String> content = new ArrayList<String>();
			for (int i = 1; i < data.length; i++)
			{
				content.add(data[i]);
			}
			morphemeWordList.put(data[0], content);
		}
	} catch (FileNotFoundException e) {
		// A missing file is treated as "no data", but must not fail silently:
		// report it so a mistyped path is noticed.
		System.out.println("Morpheme word list file not found: " + filepath);
	}
	return morphemeWordList;
}
/**
 * Writes a morpheme-to-word-list mapping to a semicolon-separated CSV file.
 *
 * Each map entry becomes one line: {@code morpheme;word1;word2;...}
 * (a trailing semicolon follows the last word, matching the format
 * accepted by readMorphemeWordListFromCSVFile). An existing file at
 * {@code filepath} is overwritten.
 *
 * @param filepath         path of the CSV file to (over)write
 * @param morphemeWordList map of morpheme to the list of words containing it
 */
static void writeMorphemeWordListToCSVFile(String filepath, Map<String, ArrayList> morphemeWordList)
{
	// try-with-resources guarantees the writer is closed (and buffered output
	// flushed) even if an exception is thrown mid-loop; the previous version
	// leaked the FileWriter in that case.
	try (PrintWriter write = new PrintWriter(new FileWriter(filepath)))
	{
		// Iterate entries directly instead of keySet() + get() per key.
		for (Map.Entry<String, ArrayList> entry : morphemeWordList.entrySet())
		{
			write.print(entry.getKey() + ";");
			// The map uses a raw ArrayList; iterate as Object to avoid an
			// unchecked conversion — String.valueOf semantics are preserved
			// by string concatenation.
			for (Object v : entry.getValue())
			{
				write.print(v + ";");
			}
			write.println();
		}
	}
	catch (IOException e)
	{
		System.out.println(e.getMessage());
	}
}
/**
 * Appends the productivity results for each morpheme to the result CSV file.
 *
 * Reads the manually postprocessed morpheme-to-word-list mapping from
 * {@code postprocessingfilepath}, then for each morpheme counts its token
 * frequency in the corpus, collects its hapax legomena (words occurring
 * exactly once), computes the productivity measure P = hapax types / tokens,
 * and appends one CSV row per morpheme after a descriptive header.
 *
 * @param filepath               result file to append to
 * @param postprocessingfilepath CSV file with the postprocessed morpheme word lists
 * @param wordClassTypes         the set of word types of the examined word class (e.g. noun types)
 * @param allWordClassOfCorpus   all corpus tokens of the examined word class
 * @param allWordsOfCorpus       all word tokens of the corpus
 * @throws IOException if reading the postprocessing file fails
 */
static void appendResultsToCSVFile(String filepath, String postprocessingfilepath, Set<String> wordClassTypes, ArrayList<String> allWordClassOfCorpus, ArrayList<String> allWordsOfCorpus) throws IOException
{
	//read from file the postprocessed data
	Map<String, ArrayList> morphemeWordList = readMorphemeWordListFromCSVFile(postprocessingfilepath);
	// header of the result file
	String header = "Suffixes: Nouns in PPCMBE\nTotal noun types: " + wordClassTypes.size() + " of "
			+ allWordClassOfCorpus.size() + " nouns and of " + allWordsOfCorpus.size()
			+ " word in total"
			+ "\n\nMorpheme;Contained in Words;Hapaxes;Types (V);Tokens;No Hapaxes;P";
	// try-with-resources closes the writer even on exception; the previous
	// version leaked the FileWriter if printing threw mid-loop.
	try (PrintWriter write = new PrintWriter(new FileWriter(filepath, true))) //for overwriting set boolean to false
	{
		write.println(header);
		for (Map.Entry<String, ArrayList> entry : morphemeWordList.entrySet())
		{
			String key = entry.getKey();
			@SuppressWarnings("unchecked") // map uses a raw ArrayList; values are filled with Strings
			ArrayList<String> allWordsContainingAffix = (ArrayList<String>) entry.getValue();
			ArrayList<String> hapaxLegomena = new ArrayList<String>();
			int numberOfAffixInCorpus = 0;
			for (String wordContainingAffix : allWordsContainingAffix)
			{
				// Count the token frequency once per word — the previous
				// version scanned the whole corpus list twice per word.
				int frequency = Collections.frequency(allWordClassOfCorpus, wordContainingAffix);
				numberOfAffixInCorpus += frequency;
				if (frequency == 1)
				{
					hapaxLegomena.add(wordContainingAffix);
				}
			}
			//calculate the p-value as a productivity measure
			int hapaxTypes = hapaxLegomena.size();
			double pValue = 0.0;
			if (numberOfAffixInCorpus != 0)
			{
				pValue = (double) hapaxTypes / (double) numberOfAffixInCorpus;
			}
			write.println(key + ";" + allWordsContainingAffix + ";" + hapaxLegomena + ";"
					+ allWordsContainingAffix.size() + ";" + numberOfAffixInCorpus
					+ ";" + hapaxTypes + ";" + pValue);
		}
	}
	catch (IOException e)
	{
		System.out.println(e.getMessage());
	}
}
}
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
......@@ -9,11 +10,25 @@ import java.util.Set;
import java.util.Collections;
public class Init {
public static void main(String[] args)
/**
* Short description of the algorithm given a PENN tagged text corpus
* loop through list: for each word do:
* 0. reduce to verbs, nouns, and adjectives in three different lists
* 1. Hashmap with number of each word
* 2. reduce to word types
* 3. instantiate AffixStripper
* 4. delete if no lexical affix is present
* 5. write all words that contain affix to list
 * 6. Check with Token-Hashmap if word in 5 is a hapax legomenon
*/
public static void main(String[] args) throws IOException
{
// read all texts of the corpus file in list
String directory = "C:\\Users\\Peukert\\Corpora\\PENN Corpus\\PENN-CORPORA\\PPCMBE-RELEASE-1\\corpus\\pos";
// write file to postprocessing file
String postprocessingfile = "C:\\Users\\Peukert\\Documents\\postprocessingfile.csv";
// write file to final result file
String finalresultsfile = "C:\\Users\\Peukert\\Documents\\resultsMorphochron.csv";
IO io = new IO();
ArrayList<String> allWordsOfCorpus = new ArrayList<String>();
allWordsOfCorpus = io.readFilesFromDirectory(directory);
......@@ -140,9 +155,9 @@ public class Init {
}
*/
Map<String, Integer> suffixMorpheme = new HashMap<String,Integer>();
Map<String, ArrayList> morphemeWordList = new HashMap<String,ArrayList>();
for (String noun : nounTypes)
{
AffixStripper as = new AffixStripper(noun);
......@@ -167,50 +182,11 @@ public class Init {
//System.out.println(noun + ": " + suffixMorpheme.keySet());
}
}
//write csv file to manually postprocess the data
//io.writeMorphemeWordListToCSVFile(postprocessingfile, morphemeWordList);
for (String s : morphemeWordList.keySet())
{
String key = s.toString();
String value = morphemeWordList.get(s).toString();
System.out.println(key + " " + value);
System.out.print("From which Hapax: ");
ArrayList<String> HapaxLegonoma = new ArrayList<String>();
ArrayList<String> allWordsContainingAffix = new ArrayList<String>(morphemeWordList.get(s));
int numberOfAffixInCorpus = 0;
for (String wordContainingAffix : allWordsContainingAffix)
{
numberOfAffixInCorpus += Collections.frequency(allNounsOfCorpus, wordContainingAffix);
if (Collections.frequency(allNounsOfCorpus, wordContainingAffix) == 1)
{
HapaxLegonoma.add(wordContainingAffix);
System.out.print(wordContainingAffix + " ");
}
}
System.out.println();
System.out.println("Number of Hapaxes: " + HapaxLegonoma.size());
System.out.println("Total number of word types containing the " + key + "-morpheme: " + morphemeWordList.get(s).size());
System.out.println("Total number of word tokens containing the " + key + "-morpheme: " + numberOfAffixInCorpus);
}
System.out.println("size noun types: " + nounTypes.size());
System.out.println("size morphemes: " + morphemeWordList.size());
//remove inflected forms
/**
* loop through list: for each word do:
* 0. reduce to verbs, nouns, and adjectives in three different lists
* 1. Hashmap with number of each word
* 2. reduce to word types
* 3. instantiate AffixStripper
* 4. delete if no lexical affix is present
* 5. write all words that contain affix to list
 * 6. Check with Token-Hashmap if word in 5 is a hapax legomenon
*/
// write to CSV file
// write all results to CSV file
io.appendResultsToCSVFile(finalresultsfile, postprocessingfile, nounTypes, allNounsOfCorpus, allWordsOfCorpus);
}
......
0% Loading
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment