diff --git a/Morphochron/src/AffixStripper.java b/Morphochron/src/AffixStripper.java index ca16cf4052cb378ec719185a0df7070b97a50135..c2488b1710f78e28f17ce895026c630662b15a25 100644 --- a/Morphochron/src/AffixStripper.java +++ b/Morphochron/src/AffixStripper.java @@ -215,7 +215,7 @@ public class AffixStripper { private void analyzeWord() { //analyze inflection first because it always occurs at the end of a word - inflection = analyzeInflection(wordtoken); + inflection = "";//analyzeInflection(wordtoken); lemma = analyzeLemma(wordtoken, inflection); analyzePrefix(lemma); analyzeSuffix(lemma); @@ -390,9 +390,11 @@ public class AffixStripper { for (SuffixEnum sufEnum : SuffixEnum.values()) { String s = sufEnum.toString(); + //System.out.println("morpheme: " + sufEnum.name() + " allomorph: " + sufEnum.getMorpheme()); if (restword.endsWith(s)) { - suffixMorpheme.put(s, suffixMorpheme.size() + 1); + //if the allomorphs are supposed be given to the map, use s instead of sufEnum.getMorpheme() + suffixMorpheme.put(sufEnum.getMorpheme(), suffixMorpheme.size() + 1); //suffixAllomorph.add(0, restword.substring(sufEnum.toString().length())); //cut off the suffix that is added to the list analyzeSuffix(restword.substring(0, restword.length() - s.length())); diff --git a/Morphochron/src/Init.java b/Morphochron/src/Init.java index 16deed83c8a3da7d6c30306d649163871d06ebca..ace72be64e13de5f9ad68d2b6da9919d8caa970b 100644 --- a/Morphochron/src/Init.java +++ b/Morphochron/src/Init.java @@ -1,22 +1,35 @@ import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.Collections; public class Init { public static void main(String[] args) { - // read corpus file as list + // read all texts of the corpus file in list String directory = "C:\\Users\\Peukert\\Corpora\\PENN Corpus\\PENN-CORPORA\\PPCMBE-RELEASE-1\\corpus\\pos"; IO io = new IO(); ArrayList<String> allWordsOfCorpus = new ArrayList<String>(); + allWordsOfCorpus = io.readFilesFromDirectory(directory); + + // create 6 lists for each word class (A,V,N) and compounds each ArrayList<String> allNounsOfCorpus = new ArrayList<String>(); ArrayList<String> allVerbsOfCorpus = new ArrayList<String>(); ArrayList<String> allAdjectivesOfCorpus = new ArrayList<String>(); - allWordsOfCorpus = io.readFilesFromDirectory(directory); - // make 3 lists for A,V,N - //String nl = System.getProperty("line.separator"); + ArrayList<String> allCompoundNounsOfCorpus = new ArrayList<String>(); + ArrayList<String> allComoundVerbsOfCorpus = new ArrayList<String>(); + ArrayList<String> allCompoundAdjectivesOfCorpus = new ArrayList<String>(); + ArrayList<String> allPluralnounsOfCorpus = new ArrayList<String>(); + //remove inflected forms for (String word : allWordsOfCorpus) { + //handling for verbs if (word.endsWith("MD") || word.endsWith("MD0") || word.endsWith("VAG") || word.endsWith("VAN") || word.endsWith("VB") || word.endsWith("VBI") || @@ -25,14 +38,78 @@ public class Init { { allVerbsOfCorpus.add(word); } - else if (word.endsWith("N") || word.endsWith("N$") || - word.endsWith("NPR") || word.endsWith("NPR$") || - word.endsWith("NPRS") || word.endsWith("NPRS$") || - word.endsWith("NS") || word.endsWith("NS$") || - word.endsWith("OTHER") || word.endsWith("OTHER$") || - word.endsWith("OTHERS$") || word.endsWith("OTHERS$")) + // handling for compound nouns + else if (word.endsWith("+N") || word.endsWith("+N$") + || word.endsWith("+NS") || word.endsWith("+NS$") + ) { - allNounsOfCorpus.add(word); + allCompoundNounsOfCorpus.add(word); + //System.out.println(word); + } + + else if (word.endsWith("/N") + //Proper Nouns of all kinds are excluded + // || word.endsWith("NPR") || word.endsWith("NPR$") + // || word.endsWith("NPRS") || word.endsWith("NPRS$") + // all forms of nominalized other, e.g. the other are excluded + // || word.endsWith("OTHER") || word.endsWith("OTHER$") + // || word.endsWith("OTHERS") || word.endsWith("OTHERS$") + ) + { + word = word.replace("/N", ""); + allNounsOfCorpus.add(word.toLowerCase()); + } + /*get rid of Possessives and Plural + * (Plural nouns cannot be sorted out, + * possible with a second loop but not + * worthwhile since not containing lexical morphemes)*/ + + else if (word.endsWith("/NS$")) + { + word = word.replace("ies/NS$", "y"); + word = word.replace("ies'/NS$", "y"); + word = word.replace("ches/NS$", "ch"); + word = word.replace("ches'/NS$", "ch"); + word = word.replace("ses/NS$", "s"); + word = word.replace("ses'/NS$", "s"); + word = word.replace("shes/NS$", "sh"); + word = word.replace("shes'/NS$", "sh"); + word = word.replace("./NS$", ""); + word = word.replace("s'/NS$", ""); + word = word.replace("'/NS$", ""); + word = word.replace("'s/NS$", ""); + word = word.replace("s/NS$", ""); + allNounsOfCorpus.add(word.toLowerCase()); + } + //get rid of Possessives + else if (word.endsWith("/N$")) + { + word = word.replace("'s./N$", ""); + word = word.replace("./N$", ""); + word = word.replace("'s/N$", ""); + word = word.replace("s/N$", ""); + word = word.replace("'/N$", ""); + word = word.replace("/N$", ""); + allNounsOfCorpus.add(word.toLowerCase()); + } + //get rid of Plural + else if (word.endsWith("/NS")) + { + //System.out.println(word); + word = word.replace("ies/NS", "y"); + word = word.replace("ches/NS", "ch"); + word = word.replace("ses/NS", "s"); + word = word.replace("shes/NS", "sh"); + word = word.replace("./NS", ""); + word = word.replace("s/NS", ""); + word = word.replace("s'/NS", ""); + word = word.replace("'/NS", ""); + if (word.endsWith("/NS")) + { + word = word.replace("/NS", ""); + allPluralnounsOfCorpus.add(word); + } + allNounsOfCorpus.add(word.toLowerCase()); } else if (word.endsWith("ADJ") || word.endsWith("ADJR") || word.endsWith("ADJS") || word.endsWith("ADV") || @@ -40,16 +117,88 @@ public class Init { { allAdjectivesOfCorpus.add(word); } - //System.out.println(word); } + /* System.out.println("Gesamt Wortanzahl: " + allWordsOfCorpus.size()); System.out.println("Anzahl Verben: " + allVerbsOfCorpus.size()); System.out.println("Anzahl Adjektive: " + allAdjectivesOfCorpus.size()); System.out.println("Anzahl Substantive: " + allNounsOfCorpus.size()); - for (String noun : allVerbsOfCorpus) + */ + // create word frequency list of the nouns + Map<String, Integer> frequencyNouns = new HashMap<String,Integer>(); + Set<String> nounTypes = new HashSet<String>(allNounsOfCorpus); + for (String key : nounTypes) + { + frequencyNouns.put(key, Collections.frequency(allNounsOfCorpus, key)); + //System.out.println(key + ": " + Collections.frequency(allNounsOfCorpus, key)); + } + /* + for (String noun: frequencyNouns.keySet()) { + String key = noun.toString(); + String value = frequencyNouns.get(noun).toString(); + System.out.println(key + " " + value); + } + */ + + + Map<String, Integer> suffixMorpheme = new HashMap<String,Integer>(); + Map<String, ArrayList> morphemeWordList = new HashMap<String,ArrayList>(); + for (String noun : nounTypes) { - System.out.println(noun); + AffixStripper as = new AffixStripper(noun); + suffixMorpheme = as.getSuffixMorphem(); + if (!suffixMorpheme.isEmpty()) + { + + for (String morpheme : suffixMorpheme.keySet()) + { + ArrayList<String> WordListOfNounsWithSuffix = new ArrayList<String>(); + + if (morphemeWordList.get(morpheme)!=null)//only for the first iteration when the morphemeWordList does not contain any data + { + // keep the values of morphemeWordList that were written to it previously + WordListOfNounsWithSuffix = morphemeWordList.get(morpheme); + } + + WordListOfNounsWithSuffix.add(noun); + + morphemeWordList.put(morpheme, WordListOfNounsWithSuffix); + } + //System.out.println(noun + ": " + suffixMorpheme.keySet()); + } } + + for (String s : morphemeWordList.keySet()) + { + String key = s.toString(); + String value = morphemeWordList.get(s).toString(); + + System.out.println(key + " " + value); + System.out.print("From which Hapax: "); + ArrayList<String> HapaxLegonoma = new ArrayList<String>(); + ArrayList<String> allWordsContainingAffix = new ArrayList<String>(morphemeWordList.get(s)); + int numberOfAffixInCorpus = 0; + for (String wordContainingAffix : allWordsContainingAffix) + { + numberOfAffixInCorpus += Collections.frequency(allNounsOfCorpus, wordContainingAffix); + if (Collections.frequency(allNounsOfCorpus, wordContainingAffix) == 1) + { + HapaxLegonoma.add(wordContainingAffix); + System.out.print(wordContainingAffix + " "); + } + } + + System.out.println(); + System.out.println("Number of Hapaxes: " + HapaxLegonoma.size()); + System.out.println("Total number of word types containing the " + key + "-morpheme: " + morphemeWordList.get(s).size()); + System.out.println("Total number of word tokens containing the " + key + "-morpheme: " + numberOfAffixInCorpus); + } + + System.out.println("size noun types: " + nounTypes.size()); + System.out.println("size morphemes: " + morphemeWordList.size()); + + //remove inflected forms + /** * loop through list: for each word do: * 0. reduce to verbs, nouns, and adjectives in three different lists diff --git a/Morphochron/src/SuffixEnum.java b/Morphochron/src/SuffixEnum.java index e32385c1207e873dc429d3ce2e9a7e8cc0324b65..209358873d741396f1240abada686c21a9c67a3f 100644 --- a/Morphochron/src/SuffixEnum.java +++ b/Morphochron/src/SuffixEnum.java @@ -88,7 +88,7 @@ public enum SuffixEnum { ward("ward"), wards("wards"), ware("ware"), uaeras("ware"), uaras("ware"), uaro("ware"), waeras("ware"), wara("ware"), waran("ware"), waras("ware"), waru("ware"), wearan("ware"), waeren("ware"), warae("ware"), wick("wick"), y("y"), ig("y"), ye("y"), igan("y"), izen("y"), ezen("y"), yen("y"), ey("y"), yl("yl"), yne("yne"); - private String morpheme; + private final String morpheme; //constructor SuffixEnum(String morpheme) {