Skip to content
Snippets Groups Projects
Commit 325e7245 authored by Peukert's avatar Peukert
Browse files

writing to result file finalized

parent 83b0a69a
Branches
No related tags found
No related merge requests found
......@@ -4,16 +4,24 @@ import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.stream.Stream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;
public class IO {
......@@ -67,4 +75,107 @@ public class IO {
bw.close();
} catch(IOException e){}
}
/**
 * Reads a morpheme-to-word-list mapping from a semicolon-separated CSV file.
 *
 * Expected line format: {@code morpheme;word1;word2;...} — the first field
 * becomes the map key, the remaining fields the list of words that contain
 * that morpheme (matches the format produced by writeMorphemeWordListToCSVFile).
 *
 * @param filepath path of the CSV file to read
 * @return map of morpheme to the list of words containing it; empty if the
 *         file is missing or contains no usable lines
 * @throws IOException if reading fails for a reason other than a missing file
 */
static Map<String, ArrayList> readMorphemeWordListFromCSVFile(String filepath) throws IOException
{
	Map<String, ArrayList> morphemeWordList = new HashMap<String, ArrayList>();
	try(BufferedReader br = new BufferedReader(new FileReader(filepath))) {
		String line;
		while ((line = br.readLine()) != null) {
			// Skip blank lines: splitting "" would yield [""] and create a
			// spurious entry keyed by the empty string.
			if (line.isEmpty()) {
				continue;
			}
			String[] data = line.split(";");
			ArrayList<String> content = new ArrayList<String>();
			for (int i = 1; i < data.length; i++)
			{
				content.add(data[i]);
			}
			morphemeWordList.put(data[0], content);
		}
	} catch (FileNotFoundException e) {
		// A missing file is treated as "no data", but must not fail silently:
		// report it so a mistyped path is noticed.
		System.out.println("Morpheme word list file not found: " + filepath);
	}
	return morphemeWordList;
}
/**
 * Writes a morpheme-to-word-list mapping to a semicolon-separated CSV file.
 *
 * Each map entry becomes one line: {@code morpheme;word1;word2;...}
 * (a trailing semicolon follows the last word, matching the format
 * accepted by readMorphemeWordListFromCSVFile). An existing file at
 * {@code filepath} is overwritten.
 *
 * @param filepath         path of the CSV file to (over)write
 * @param morphemeWordList map of morpheme to the list of words containing it
 */
static void writeMorphemeWordListToCSVFile(String filepath, Map<String, ArrayList> morphemeWordList)
{
	// try-with-resources guarantees the writer is closed (and buffered output
	// flushed) even if an exception is thrown mid-loop; the previous version
	// leaked the FileWriter in that case.
	try (PrintWriter write = new PrintWriter(new FileWriter(filepath)))
	{
		// Iterate entries directly instead of keySet() + get() per key.
		for (Map.Entry<String, ArrayList> entry : morphemeWordList.entrySet())
		{
			write.print(entry.getKey() + ";");
			// The map uses a raw ArrayList; iterate as Object to avoid an
			// unchecked conversion — String.valueOf semantics are preserved
			// by string concatenation.
			for (Object v : entry.getValue())
			{
				write.print(v + ";");
			}
			write.println();
		}
	}
	catch (IOException e)
	{
		System.out.println(e.getMessage());
	}
}
/**
 * Appends the productivity results for each morpheme to the result CSV file.
 *
 * Reads the manually postprocessed morpheme-to-word-list mapping from
 * {@code postprocessingfilepath}, then for each morpheme counts its token
 * frequency in the corpus, collects its hapax legomena (words occurring
 * exactly once), computes the productivity measure P = hapax types / tokens,
 * and appends one CSV row per morpheme after a descriptive header.
 *
 * @param filepath               result file to append to
 * @param postprocessingfilepath CSV file with the postprocessed morpheme word lists
 * @param wordClassTypes         the set of word types of the examined word class (e.g. noun types)
 * @param allWordClassOfCorpus   all corpus tokens of the examined word class
 * @param allWordsOfCorpus       all word tokens of the corpus
 * @throws IOException if reading the postprocessing file fails
 */
static void appendResultsToCSVFile(String filepath, String postprocessingfilepath, Set<String> wordClassTypes, ArrayList<String> allWordClassOfCorpus, ArrayList<String> allWordsOfCorpus) throws IOException
{
	//read from file the postprocessed data
	Map<String, ArrayList> morphemeWordList = readMorphemeWordListFromCSVFile(postprocessingfilepath);
	// header of the result file
	String header = "Suffixes: Nouns in PPCMBE\nTotal noun types: " + wordClassTypes.size() + " of "
			+ allWordClassOfCorpus.size() + " nouns and of " + allWordsOfCorpus.size()
			+ " word in total"
			+ "\n\nMorpheme;Contained in Words;Hapaxes;Types (V);Tokens;No Hapaxes;P";
	// try-with-resources closes the writer even on exception; the previous
	// version leaked the FileWriter if printing threw mid-loop.
	try (PrintWriter write = new PrintWriter(new FileWriter(filepath, true))) //for overwriting set boolean to false
	{
		write.println(header);
		for (Map.Entry<String, ArrayList> entry : morphemeWordList.entrySet())
		{
			String key = entry.getKey();
			@SuppressWarnings("unchecked") // map uses a raw ArrayList; values are filled with Strings
			ArrayList<String> allWordsContainingAffix = (ArrayList<String>) entry.getValue();
			ArrayList<String> hapaxLegomena = new ArrayList<String>();
			int numberOfAffixInCorpus = 0;
			for (String wordContainingAffix : allWordsContainingAffix)
			{
				// Count the token frequency once per word — the previous
				// version scanned the whole corpus list twice per word.
				int frequency = Collections.frequency(allWordClassOfCorpus, wordContainingAffix);
				numberOfAffixInCorpus += frequency;
				if (frequency == 1)
				{
					hapaxLegomena.add(wordContainingAffix);
				}
			}
			//calculate the p-value as a productivity measure
			int hapaxTypes = hapaxLegomena.size();
			double pValue = 0.0;
			if (numberOfAffixInCorpus != 0)
			{
				pValue = (double) hapaxTypes / (double) numberOfAffixInCorpus;
			}
			write.println(key + ";" + allWordsContainingAffix + ";" + hapaxLegomena + ";"
					+ allWordsContainingAffix.size() + ";" + numberOfAffixInCorpus
					+ ";" + hapaxTypes + ";" + pValue);
		}
	}
	catch (IOException e)
	{
		System.out.println(e.getMessage());
	}
}
}
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
......@@ -9,11 +10,25 @@ import java.util.Set;
import java.util.Collections;
public class Init {
public static void main(String[] args)
/**
* Short description of the algorithm given a PENN tagged text corpus
* loop through list: for each word do:
* 0. reduce to verbs, nouns, and adjectives in three different lists
* 1. Hashmap with number of each word
* 2. reduce to word types
* 3. instantiate AffixStripper
* 4. delete if no lexical affix is present
* 5. write all words that contain affix to list
 * 6. Check with Token-Hashmap if word in 5 is a hapax legomenon
*/
public static void main(String[] args) throws IOException
{
// read all texts of the corpus file in list
String directory = "C:\\Users\\Peukert\\Corpora\\PENN Corpus\\PENN-CORPORA\\PPCMBE-RELEASE-1\\corpus\\pos";
// write file to postprocessing file
String postprocessingfile = "C:\\Users\\Peukert\\Documents\\postprocessingfile.csv";
// write file to final result file
String finalresultsfile = "C:\\Users\\Peukert\\Documents\\resultsMorphochron.csv";
IO io = new IO();
ArrayList<String> allWordsOfCorpus = new ArrayList<String>();
allWordsOfCorpus = io.readFilesFromDirectory(directory);
......@@ -140,9 +155,9 @@ public class Init {
}
*/
Map<String, Integer> suffixMorpheme = new HashMap<String,Integer>();
Map<String, ArrayList> morphemeWordList = new HashMap<String,ArrayList>();
for (String noun : nounTypes)
{
AffixStripper as = new AffixStripper(noun);
......@@ -167,50 +182,11 @@ public class Init {
//System.out.println(noun + ": " + suffixMorpheme.keySet());
}
}
//write csv file to manually postprocess the data
//io.writeMorphemeWordListToCSVFile(postprocessingfile, morphemeWordList);
for (String s : morphemeWordList.keySet())
{
String key = s.toString();
String value = morphemeWordList.get(s).toString();
System.out.println(key + " " + value);
System.out.print("From which Hapax: ");
ArrayList<String> HapaxLegonoma = new ArrayList<String>();
ArrayList<String> allWordsContainingAffix = new ArrayList<String>(morphemeWordList.get(s));
int numberOfAffixInCorpus = 0;
for (String wordContainingAffix : allWordsContainingAffix)
{
numberOfAffixInCorpus += Collections.frequency(allNounsOfCorpus, wordContainingAffix);
if (Collections.frequency(allNounsOfCorpus, wordContainingAffix) == 1)
{
HapaxLegonoma.add(wordContainingAffix);
System.out.print(wordContainingAffix + " ");
}
}
System.out.println();
System.out.println("Number of Hapaxes: " + HapaxLegonoma.size());
System.out.println("Total number of word types containing the " + key + "-morpheme: " + morphemeWordList.get(s).size());
System.out.println("Total number of word tokens containing the " + key + "-morpheme: " + numberOfAffixInCorpus);
}
System.out.println("size noun types: " + nounTypes.size());
System.out.println("size morphemes: " + morphemeWordList.size());
//remove inflected forms
/**
* loop through list: for each word do:
* 0. reduce to verbs, nouns, and adjectives in three different lists
* 1. Hashmap with number of each word
* 2. reduce to word types
* 3. instantiate AffixStripper
* 4. delete if no lexical affix is present
* 5. write all words that contain affix to list
 * 6. Check with Token-Hashmap if word in 5 is a hapax legomenon
*/
// write to CSV file
// write all results to CSV file
io.appendResultsToCSVFile(finalresultsfile, postprocessingfile, nounTypes, allNounsOfCorpus, allWordsOfCorpus);
}
......
0% Loading
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment