data extraction for nouns complete

83b0a69a · Peukert · 48436ba6 · 83b0a69a · 83b0a69a · 83b0a69a
Commit 83b0a69a authored 3 years ago by Peukert
--- a/Morphochron/src/AffixStripper.java
+++ b/Morphochron/src/AffixStripper.java
@@ -215,7 +215,7 @@ public class AffixStripper {
 	private void analyzeWord()
 	{
 		//analyze inflection first because it always occurs at the end of a word
-		inflection = analyzeInflection(wordtoken);
+		inflection = "";//analyzeInflection(wordtoken);
 		lemma = analyzeLemma(wordtoken, inflection);
 		analyzePrefix(lemma);
 		analyzeSuffix(lemma);
@@ -390,9 +390,11 @@ public class AffixStripper {
 			for (SuffixEnum sufEnum : SuffixEnum.values())
 			{
 				String s = sufEnum.toString();
+				//System.out.println("morpheme: " + sufEnum.name() + " allomorph: " + sufEnum.getMorpheme());
 				if (restword.endsWith(s))
 				{
-					suffixMorpheme.put(s, suffixMorpheme.size() + 1);
+					//if the allomorphs are supposed to be put into the map, use s instead of sufEnum.getMorpheme()
+					suffixMorpheme.put(sufEnum.getMorpheme(), suffixMorpheme.size() + 1);
 					//suffixAllomorph.add(0, restword.substring(sufEnum.toString().length()));
 					//cut off the suffix that is added to the list
 					analyzeSuffix(restword.substring(0, restword.length() - s.length()));

--- a/Morphochron/src/Init.java
+++ b/Morphochron/src/Init.java

 import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.Collections;

 public class Init {

 	public static void main(String[] args)
 	{
-		// read corpus file as list
+		// read all texts of the corpus directory into one list
 		String directory = "C:\\Users\\Peukert\\Corpora\\PENN Corpus\\PENN-CORPORA\\PPCMBE-RELEASE-1\\corpus\\pos";
 		IO io = new IO();
 		ArrayList<String> allWordsOfCorpus = new ArrayList<String>();
+		allWordsOfCorpus = io.readFilesFromDirectory(directory);
+		
+		// create 6 lists: one per word class (A,V,N) and one for the compounds of each
 		ArrayList<String> allNounsOfCorpus = new ArrayList<String>();
 		ArrayList<String> allVerbsOfCorpus = new ArrayList<String>();
 		ArrayList<String> allAdjectivesOfCorpus = new ArrayList<String>();
-		allWordsOfCorpus = io.readFilesFromDirectory(directory);
-		// make 3 lists for A,V,N
-		//String nl = System.getProperty("line.separator");
+		ArrayList<String> allCompoundNounsOfCorpus = new ArrayList<String>();
+		ArrayList<String> allComoundVerbsOfCorpus = new ArrayList<String>();
+		ArrayList<String> allCompoundAdjectivesOfCorpus = new ArrayList<String>();
+		ArrayList<String> allPluralnounsOfCorpus = new ArrayList<String>();
+		//remove inflected forms		
 		for (String word : allWordsOfCorpus)
 		{
+			//handling for verbs
 			if (word.endsWith("MD") || word.endsWith("MD0") ||
 					word.endsWith("VAG") || word.endsWith("VAN") || 
 					word.endsWith("VB") || word.endsWith("VBI") ||
@@ -25,14 +38,78 @@ public class Init {
 			{
 				allVerbsOfCorpus.add(word);
 			}
-			else if (word.endsWith("N") || word.endsWith("N$") || 
-					word.endsWith("NPR") || word.endsWith("NPR$") ||
-					word.endsWith("NPRS") || word.endsWith("NPRS$") ||
-					word.endsWith("NS") || word.endsWith("NS$") || 
-					word.endsWith("OTHER") || word.endsWith("OTHER$") ||
-					word.endsWith("OTHERS$") || word.endsWith("OTHERS$"))
+			// handling for compound nouns
+			else if (word.endsWith("+N") || word.endsWith("+N$")
+					|| word.endsWith("+NS") || word.endsWith("+NS$")
+					)
+			{
+				allCompoundNounsOfCorpus.add(word);
+				//System.out.println(word);
+			}
+			
+			else if (word.endsWith("/N") 
+					//Proper Nouns of all kinds are excluded
+					// || word.endsWith("NPR") || word.endsWith("NPR$")
+					// || word.endsWith("NPRS") || word.endsWith("NPRS$")
+					// all forms of nominalized other, e.g. the other are excluded
+					// || word.endsWith("OTHER") || word.endsWith("OTHER$") 
+					// || word.endsWith("OTHERS") || word.endsWith("OTHERS$")
+					)
+			{	
+				word = word.replace("/N", "");
+				allNounsOfCorpus.add(word.toLowerCase());	
+			}
+			/*get rid of Possessives and Plural 
+			 * (Plural nouns cannot be sorted out, 
+			 * possible with a second loop but not 
+			 * worthwhile since not containing lexical morphemes)*/
+			
+			else if (word.endsWith("/NS$"))
+			{
+				word = word.replace("ies/NS$", "y");
+				word = word.replace("ies'/NS$", "y");
+				word = word.replace("ches/NS$", "ch");
+				word = word.replace("ches'/NS$", "ch");
+				word = word.replace("ses/NS$", "s");
+				word = word.replace("ses'/NS$", "s");
+				word = word.replace("shes/NS$", "sh");
+				word = word.replace("shes'/NS$", "sh");
+				word = word.replace("./NS$", "");
+				word = word.replace("s'/NS$", "");
+				word = word.replace("'/NS$", "");
+				word = word.replace("'s/NS$", "");
+				word = word.replace("s/NS$", "");
+				allNounsOfCorpus.add(word.toLowerCase());
+			}
+			//get rid of Possessives
+			else if (word.endsWith("/N$"))
 			{
-				allNounsOfCorpus.add(word);
+				word = word.replace("'s./N$", "");
+				word = word.replace("./N$", "");
+				word = word.replace("'s/N$", "");
+				word = word.replace("s/N$", "");
+				word = word.replace("'/N$", "");
+				word = word.replace("/N$", "");
+				allNounsOfCorpus.add(word.toLowerCase());
+			}
+			//get rid of Plural
+			else if (word.endsWith("/NS"))		
+			{
+				//System.out.println(word);
+				word = word.replace("ies/NS", "y");
+				word = word.replace("ches/NS", "ch");
+				word = word.replace("ses/NS", "s");
+				word = word.replace("shes/NS", "sh");
+				word = word.replace("./NS", "");
+				word = word.replace("s/NS", "");
+				word = word.replace("s'/NS", "");
+				word = word.replace("'/NS", "");
+				if (word.endsWith("/NS"))
+				{
+					word = word.replace("/NS", "");
+					allPluralnounsOfCorpus.add(word);
+				}
+				allNounsOfCorpus.add(word.toLowerCase());
 			}
 			else if (word.endsWith("ADJ") || word.endsWith("ADJR") || 
 					word.endsWith("ADJS") || word.endsWith("ADV") ||
@@ -40,16 +117,88 @@ public class Init {
 			{
 				allAdjectivesOfCorpus.add(word);
 			}
-			//System.out.println(word);
 		}
+		/*
 		System.out.println("Gesamt Wortanzahl: " + allWordsOfCorpus.size());
 		System.out.println("Anzahl Verben: " + allVerbsOfCorpus.size());
 		System.out.println("Anzahl Adjektive: " + allAdjectivesOfCorpus.size());
 		System.out.println("Anzahl Substantive: " + allNounsOfCorpus.size());
-		for (String noun : allVerbsOfCorpus)
+		*/
+		// create word frequency list of the nouns
+		Map<String, Integer> frequencyNouns = new HashMap<String,Integer>();
+		Set<String> nounTypes = new HashSet<String>(allNounsOfCorpus);
+		for (String key : nounTypes) 
 		{
-			System.out.println(noun);
+			frequencyNouns.put(key, Collections.frequency(allNounsOfCorpus, key));
+		    //System.out.println(key + ": " + Collections.frequency(allNounsOfCorpus, key));
+		}
+		/*
+		for (String noun: frequencyNouns.keySet()) {
+		    String key = noun.toString();
+		    String value = frequencyNouns.get(noun).toString();
+		    System.out.println(key + " " + value);
 		}
+		*/
+		
+		
+		Map<String, Integer> suffixMorpheme = new HashMap<String,Integer>();
+		Map<String, ArrayList> morphemeWordList = new HashMap<String,ArrayList>();
+		for (String noun : nounTypes)
+		{
+			AffixStripper as = new AffixStripper(noun);
+			suffixMorpheme = as.getSuffixMorphem();
+			if (!suffixMorpheme.isEmpty())
+			{
+				
+				for (String morpheme : suffixMorpheme.keySet())
+				{
+					ArrayList<String> WordListOfNounsWithSuffix = new ArrayList<String>();
+
+					if (morphemeWordList.get(morpheme)!=null)//only for the first iteration when the morphemeWordList does not contain any data
+					{
+						// keep the values of morphemeWordList that were written to it previously
+						WordListOfNounsWithSuffix = morphemeWordList.get(morpheme);
+					}
+					
+					WordListOfNounsWithSuffix.add(noun);
+					
+					morphemeWordList.put(morpheme, WordListOfNounsWithSuffix);
+				}
+				//System.out.println(noun + ": " + suffixMorpheme.keySet());
+			}
+		}
+		
+		for (String s : morphemeWordList.keySet())
+		{
+			String key = s.toString();
+		    String value = morphemeWordList.get(s).toString();
+		   
+		    System.out.println(key + " " + value);
+		    System.out.print("From which Hapax: ");
+			ArrayList<String> HapaxLegonoma = new ArrayList<String>();
+			ArrayList<String> allWordsContainingAffix = new ArrayList<String>(morphemeWordList.get(s));
+			int numberOfAffixInCorpus = 0;
+			for (String wordContainingAffix  : allWordsContainingAffix) 
+			{
+				numberOfAffixInCorpus += Collections.frequency(allNounsOfCorpus, wordContainingAffix);
+				if (Collections.frequency(allNounsOfCorpus, wordContainingAffix)  == 1)
+				{
+					    HapaxLegonoma.add(wordContainingAffix);
+					    System.out.print(wordContainingAffix + " ");
+				}
+			}
+				    	
+			System.out.println();
+			System.out.println("Number of Hapaxes: " + HapaxLegonoma.size());
+		    System.out.println("Total number of word types containing the " + key + "-morpheme: " + morphemeWordList.get(s).size());
+		    System.out.println("Total number of word tokens containing the " + key + "-morpheme: " + numberOfAffixInCorpus);
+		}
+		
+		System.out.println("size noun types: " + nounTypes.size());
+		System.out.println("size morphemes: " + morphemeWordList.size());
+		
+		//remove inflected forms
+		
 		/**
 		 *  loop through list: for each word do:
 		 *  0. reduce to verbs, nouns, and adjectives in three different lists

--- a/Morphochron/src/SuffixEnum.java
+++ b/Morphochron/src/SuffixEnum.java
@@ -88,7 +88,7 @@ public enum SuffixEnum {
    ward("ward"), wards("wards"), ware("ware"), uaeras("ware"), uaras("ware"), uaro("ware"), waeras("ware"), wara("ware"), waran("ware"),
    waras("ware"), waru("ware"), wearan("ware"), waeren("ware"), warae("ware"), wick("wick"), y("y"), ig("y"), ye("y"), igan("y"), izen("y"),
    ezen("y"), yen("y"), ey("y"), yl("yl"), yne("yne");
-    private String morpheme;
+    private final String morpheme;

    //constructor
    SuffixEnum(String morpheme) {