From 325e7245d9dc19b3550aba3891e2807ff5766c82 Mon Sep 17 00:00:00 2001 From: Peukert <Peukert@172.21.60.86> Date: Wed, 25 Aug 2021 13:02:38 +0200 Subject: [PATCH] writing to result file finalized --- Morphochron/src/IO.java | 111 ++++++++++++++++++++++++++++++++ Morphochron/src/Init.java | 70 +++++++------------- Morphochron/src/PrefixEnum.java | 2 +- 3 files changed, 135 insertions(+), 48 deletions(-) diff --git a/Morphochron/src/IO.java b/Morphochron/src/IO.java index ecbc61f..a1a2cf0 100644 --- a/Morphochron/src/IO.java +++ b/Morphochron/src/IO.java @@ -4,16 +4,24 @@ import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileOutputStream; +import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStreamWriter; +import java.io.PrintWriter; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.util.stream.Stream; import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; import java.util.Scanner; +import java.util.Set; + public class IO { @@ -67,4 +75,107 @@ public class IO { bw.close(); } catch(IOException e){} } + + static Map<String, ArrayList> readMorphemeWordListFromCSVFile(String filepath) throws IOException + { + + Map<String, ArrayList> morphemeWordList = new HashMap<String, ArrayList>(); + try(BufferedReader br = new BufferedReader(new FileReader(filepath))) { + String line = ""; + while ((line = br.readLine()) != null) { + ArrayList<String> content = new ArrayList<String>(); + String [] data = line.split(";"); + for (int i=1;i<data.length;i++) + { + content.add(data[i]); + } + morphemeWordList.put(data[0],content); + } + } catch (FileNotFoundException e) { + //Some error logging + } + + return morphemeWordList; + } + + static void writeMorphemeWordListToCSVFile(String filepath, Map<String, ArrayList> 
morphemeWordList) + { + try + { + FileWriter file = new FileWriter(filepath); + PrintWriter write = new PrintWriter(file); + + for (String s : morphemeWordList.keySet()) + { + String key = s.toString(); + ArrayList<String> value = morphemeWordList.get(s); + + write.print(key + ";"); + for (String v : value) + { + write.print(v + ";"); + } + write.println(); + } + write.close(); + } + catch (IOException e) + { + System.out.println(e.getMessage()); + } + } + + static void appendResultsToCSVFile(String filepath, String postprocessingfilepath, Set<String> wordClassTypes, ArrayList<String> allWordClassOfCorpus, ArrayList<String> allWordsOfCorpus) throws IOException + { + Map<String, ArrayList> morphemeWordList = new HashMap<String,ArrayList>(); + //read from file the postprocessed data + morphemeWordList = readMorphemeWordListFromCSVFile(postprocessingfilepath); + // header of the result file + String header = "Suffixes: Nouns in PPCMBE\nTotal noun types: " + wordClassTypes.size() + " of " + + allWordClassOfCorpus.size() + " nouns and of " + allWordsOfCorpus.size() + + " word in total" + + "\n\nMorpheme;Contained in Words;Hapaxes;Types (V);Tokens;No Hapaxes;P"; + try + { + FileWriter file = new FileWriter(filepath, true); //for overwriting set boolean to false + PrintWriter write = new PrintWriter(file); + write.println(header); + for (String s : morphemeWordList.keySet()) + { + String key = s.toString(); + ArrayList<String> allWordsContainingAffix = morphemeWordList.get(s); + + //System.out.println(key + " " + value); + //System.out.print("From which Hapax: "); + ArrayList<String> HapaxLegonema = new ArrayList<String>(); + //ArrayList<String> allWordsContainingAffix = new ArrayList<String>(value); + int numberOfAffixInCorpus = 0; + for (String wordContainingAffix : allWordsContainingAffix) + { + numberOfAffixInCorpus += Collections.frequency(allWordClassOfCorpus, wordContainingAffix); + if (Collections.frequency(allWordClassOfCorpus, wordContainingAffix) == 1) + { + 
HapaxLegonema.add(wordContainingAffix); + //System.out.print(wordContainingAffix + " "); + } + } + //calculate the p-value as a productivity measure + int hapaxtypes = HapaxLegonema.size(); + double p_value = 0.0; + if (numberOfAffixInCorpus != 0) + { + p_value = (double)hapaxtypes / (double)numberOfAffixInCorpus; + } + + write.println(key + ";" + allWordsContainingAffix + ";" + HapaxLegonema + ";" + + allWordsContainingAffix.size() + ";" + numberOfAffixInCorpus + + ";" + hapaxtypes + ";" + p_value); + } + write.close(); + } + catch (IOException e) + { + System.out.println(e.getMessage()); + } + } } diff --git a/Morphochron/src/Init.java b/Morphochron/src/Init.java index ace72be..f51480c 100644 --- a/Morphochron/src/Init.java +++ b/Morphochron/src/Init.java @@ -1,4 +1,5 @@ +import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; @@ -9,11 +10,25 @@ import java.util.Set; import java.util.Collections; public class Init { - - public static void main(String[] args) + /** + * Short description of the algorithm given a PENN tagged text corpus + * loop through list: for each word do: + * 0. reduce to verbs, nouns, and adjectives in three different lists + * 1. Hashmap with number of each word + * 2. reduce to word types + * 3. instantiate AffixStripper + * 4. delete if no lexical affix is present + * 5. write all words that contain affix to list + * 6. 
Check with Token-Hashmap if word in 5 is hapax legonoma + */ + public static void main(String[] args) throws IOException { // read all texts of the corpus file in list String directory = "C:\\Users\\Peukert\\Corpora\\PENN Corpus\\PENN-CORPORA\\PPCMBE-RELEASE-1\\corpus\\pos"; + // write file to postprocessing file + String postprocessingfile = "C:\\Users\\Peukert\\Documents\\postprocessingfile.csv"; + // write file to final result file + String finalresultsfile = "C:\\Users\\Peukert\\Documents\\resultsMorphochron.csv"; IO io = new IO(); ArrayList<String> allWordsOfCorpus = new ArrayList<String>(); allWordsOfCorpus = io.readFilesFromDirectory(directory); @@ -139,10 +154,10 @@ public class Init { System.out.println(key + " " + value); } */ - - + Map<String, Integer> suffixMorpheme = new HashMap<String,Integer>(); Map<String, ArrayList> morphemeWordList = new HashMap<String,ArrayList>(); + for (String noun : nounTypes) { AffixStripper as = new AffixStripper(noun); @@ -167,50 +182,11 @@ public class Init { //System.out.println(noun + ": " + suffixMorpheme.keySet()); } } + //write csv file to manually postprocess the data + //io.writeMorphemeWordListToCSVFile(postprocessingfile, morphemeWordList); - for (String s : morphemeWordList.keySet()) - { - String key = s.toString(); - String value = morphemeWordList.get(s).toString(); - - System.out.println(key + " " + value); - System.out.print("From which Hapax: "); - ArrayList<String> HapaxLegonoma = new ArrayList<String>(); - ArrayList<String> allWordsContainingAffix = new ArrayList<String>(morphemeWordList.get(s)); - int numberOfAffixInCorpus = 0; - for (String wordContainingAffix : allWordsContainingAffix) - { - numberOfAffixInCorpus += Collections.frequency(allNounsOfCorpus, wordContainingAffix); - if (Collections.frequency(allNounsOfCorpus, wordContainingAffix) == 1) - { - HapaxLegonoma.add(wordContainingAffix); - System.out.print(wordContainingAffix + " "); - } - } - - System.out.println(); - System.out.println("Number 
of Hapaxes: " + HapaxLegonoma.size()); - System.out.println("Total number of word types containing the " + key + "-morpheme: " + morphemeWordList.get(s).size()); - System.out.println("Total number of word tokens containing the " + key + "-morpheme: " + numberOfAffixInCorpus); - } - - System.out.println("size noun types: " + nounTypes.size()); - System.out.println("size morphemes: " + morphemeWordList.size()); - - //remove inflected forms - - /** - * loop through list: for each word do: - * 0. reduce to verbs, nouns, and adjectives in three different lists - * 1. Hashmap with number of each word - * 2. reduce to word types - * 3. instantiate AffixStripper - * 4. delete if no lexical affix is present - * 5. write all words that contain affix to list - * 6. Check with Token-Hashmap if word in 5 is hapax legonoma - */ - - // write to CSV file + // write all results to CSV file + io.appendResultsToCSVFile(finalresultsfile, postprocessingfile, nounTypes, allNounsOfCorpus, allWordsOfCorpus); } diff --git a/Morphochron/src/PrefixEnum.java b/Morphochron/src/PrefixEnum.java index 337960d..3aa7088 100644 --- a/Morphochron/src/PrefixEnum.java +++ b/Morphochron/src/PrefixEnum.java @@ -65,7 +65,7 @@ public enum PrefixEnum { this.morpheme = morpheme; } //getter Method - + public String getMorpheme() { return this.morpheme; } -- GitLab