// Result.java — computes morphological productivity measures per morpheme and writes them to CSV.
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
/**
 * Computes morphological productivity measures for a set of morphemes found
 * in a corpus — word types, token counts, hapax legomena, and the
 * productivity value P = hapax types / affix tokens — and writes the results
 * to two CSV files: an aggregated measures file and a post-processing file
 * for the words not found in the OED.
 */
public class Result
{
    private Corpus cp;
    private String affixtype = "";
    private String wordclass = "";
    private String postprocessingfile = "";
    private String finalresultsfile = "";
    private String resultPath = "";
    private ArrayList<String> filteredWords = new ArrayList<String>();
    private ArrayList<String> allWordsOfCorpus = new ArrayList<String>();
    private Map<String, ArrayList<String>> morphemeWordList = new HashMap<String, ArrayList<String>>();
    private Map<String, ArrayList<String>> notInOEDWordList = new HashMap<String, ArrayList<String>>();
    // Token frequency of each word type in filteredWords, computed once in the
    // constructor via countTokens(). Replaces repeated Collections.frequency()
    // scans that made findHapaxes/calculateNumberOfAffixes O(n^2) per morpheme.
    private Map<String, Integer> filteredWordFrequency = new HashMap<String, Integer>();

    /**
     * Builds the result and immediately writes both CSV output files.
     *
     * @param morphemeWordList mapping from each morpheme to all word types it is contained in
     * @param notInOEDWordList mapping for words not found in the OED (written to the post-processing file)
     * @param cp               the corpus the words were taken from
     * @param filteredWords    all word tokens of the respective word class (without tags)
     * @param wordtype         word-class code, e.g. "_nn01", "_vb01", "_jj01"
     * @param affixtype        affix-type code, e.g. "_su01" or "_pr01"
     * @param resultPath       directory the CSV files are written to
     * @throws IOException if appending to the measures file fails
     */
    public Result(Map<String, ArrayList<String>> morphemeWordList, Map<String, ArrayList<String>> notInOEDWordList, Corpus cp, ArrayList<String> filteredWords, String wordtype, String affixtype, String resultPath) throws IOException
    {
        this.morphemeWordList = morphemeWordList; // the mapping between the morpheme and all word types it is contained in
        this.notInOEDWordList = notInOEDWordList;
        this.filteredWords = filteredWords; // all the words of the respective word class (without tags)
        this.affixtype = affixtype;
        this.wordclass = wordtype;
        this.cp = cp;
        this.allWordsOfCorpus = cp.getCorpus(); // all words of the corpus irrespective of the word class (words contain tags!)
        this.resultPath = resultPath;
        // Precompute token frequencies once; reused by findHapaxes and
        // calculateNumberOfAffixes for every morpheme.
        this.filteredWordFrequency = countTokens(filteredWords);
        setFileNames();
        writeAllMeasuresFile();
        writePostProcessingFile();
    }

    /**
     * Returns the word types among the given ones that occur exactly once
     * (hapax legomena) in the filtered word list.
     */
    private ArrayList<String> findHapaxes(ArrayList<String> allWordsContainingAffix)
    {
        ArrayList<String> hapaxes = new ArrayList<String>();
        for (String wordContainingAffix : allWordsContainingAffix)
        {
            Integer frequency = filteredWordFrequency.get(wordContainingAffix);
            if (frequency != null && frequency.intValue() == 1)
            {
                hapaxes.add(wordContainingAffix);
            }
        }
        return hapaxes;
    }

    /**
     * Sums the token frequencies of all word types containing the affix,
     * i.e. how often the affix occurs among the word-class tokens overall.
     */
    private int calculateNumberOfAffixes(ArrayList<String> allWordsContainingAffix)
    {
        int numberOfAffixInCorpus = 0;
        for (String wordContainingAffix : allWordsContainingAffix)
        {
            Integer frequency = filteredWordFrequency.get(wordContainingAffix);
            if (frequency != null) // word absent from filteredWords contributes 0, as before
            {
                numberOfAffixInCorpus += frequency.intValue();
            }
        }
        return numberOfAffixInCorpus;
    }

    /**
     * Productivity measure P = hapax types / affix tokens.
     * Returns 0.0 when the affix does not occur at all (avoids division by zero).
     */
    private double calculateP_Value(int hapaxtypes, int numberOfAffixes)
    {
        if (numberOfAffixes == 0)
        {
            return 0.0;
        }
        return (double) hapaxtypes / (double) numberOfAffixes;
    }

    /**
     * Counts how often each distinct word type occurs in the given token list.
     */
    private Map<String, Integer> countTokens(ArrayList<String> words)
    {
        Map<String, Integer> frequencyWords = new HashMap<String, Integer>();
        for (String key : new HashSet<String>(words))
        {
            frequencyWords.put(key, Collections.frequency(words, key));
        }
        return frequencyWords;
    }

    /**
     * Returns the distinct word types of the given token list.
     */
    private Set<String> setWordTypes(ArrayList<String> words)
    {
        return new HashSet<String>(words);
    }

    /*
     * Defines the header as it appears as a heading in the CSV file.
     * Uses local copies of the affix/word-class codes for the readable names,
     * so the fields keep their raw code values (the previous version mutated
     * the fields here, which would have broken a second call and any later
     * use of the codes).
     */
    private String createHeader()
    {
        String affixName = affixtype;
        if (affixName.equals("_su01")) affixName = "Suffixes";
        if (affixName.equals("_pr01")) affixName = "Prefixes";
        String className = wordclass;
        if (className.equals("_nn01")) className = "Nouns";
        if (className.equals("_vb01")) className = "Verbs";
        if (className.equals("_jj01")) className = "Adjectives";
        return affixName + ": " + className + " in " + cp.getCorpusName() + "/" + cp.getPeriod() + " (" + cp.getStartDate() + "-" + cp.getEndDate() + ")\n" +
            "Total Types (" + className + "): " + setWordTypes(filteredWords).size() + " of " +
            filteredWords.size() + " " + className + " and of " + allWordsOfCorpus.size() +
            " words in total\n\n" + "Morpheme;Contained in Words;Hapaxes;Types (V);Tokens;No Hapaxes;P\n";
    }

    /*
     * Generates the data string written to the CSV result file: one line per
     * morpheme with its word list, hapaxes, type/token counts and P value.
     * Built with a StringBuilder instead of repeated String concatenation.
     */
    private String generateDataSet(Map<String, ArrayList<String>> list)
    {
        StringBuilder data = new StringBuilder(createHeader());
        for (String key : list.keySet())
        {
            // all word types containing the morpheme 'key'
            ArrayList<String> allWordsContainingAffix = list.get(key);
            ArrayList<String> hapaxes = findHapaxes(allWordsContainingAffix);
            int affixfrequencyForAllWordTokens = calculateNumberOfAffixes(allWordsContainingAffix);
            int affixfrequencyForAllWordTypes = allWordsContainingAffix.size();
            data.append(key).append(";").append(allWordsContainingAffix).append(";").append(hapaxes).append(";")
                .append(affixfrequencyForAllWordTypes).append(";").append(affixfrequencyForAllWordTokens).append(";")
                .append(hapaxes.size()).append(";").append(calculateP_Value(hapaxes.size(), affixfrequencyForAllWordTokens)).append("\n");
        }
        return data.append("\n\n").toString();
    }

    /**
     * Derives both output file locations from the result path, word-class and
     * affix-type codes, and the corpus metadata.
     */
    private void setFileNames()
    {
        // location of the post-processing file
        postprocessingfile = resultPath + File.separator + "postprocessingfile" + wordclass + affixtype + "_" + cp.getPeriod() + "-" + cp.getCorpusName() + ".csv";
        // location of the final result file
        finalresultsfile = resultPath + File.separator + "resultsMorphochron.csv";
    }

    /**
     * Writes the CSV file used to manually post-process the words not found
     * in the OED.
     */
    private void writePostProcessingFile()
    {
        IO io = new IO();
        io.writeMorphemeWordListToCSVFile(postprocessingfile, notInOEDWordList);
    }

    /**
     * Appends all computed measures for this word class / affix type to the
     * shared results CSV file.
     */
    private void writeAllMeasuresFile() throws IOException
    {
        IO io = new IO();
        io.appendResultsToCSVFile(finalresultsfile, generateDataSet(morphemeWordList));
    }
}