Skip to content
Snippets Groups Projects
Commit 83b0a69a authored by Peukert's avatar Peukert
Browse files

data extraction for nouns complete

parent 48436ba6
No related branches found
No related tags found
No related merge requests found
......@@ -215,7 +215,7 @@ public class AffixStripper {
private void analyzeWord()
{
//analyze inflection first because it always occurs at the end of a word
inflection = analyzeInflection(wordtoken);
inflection = "";//analyzeInflection(wordtoken);
lemma = analyzeLemma(wordtoken, inflection);
analyzePrefix(lemma);
analyzeSuffix(lemma);
......@@ -390,9 +390,11 @@ public class AffixStripper {
for (SuffixEnum sufEnum : SuffixEnum.values())
{
String s = sufEnum.toString();
//System.out.println("morpheme: " + sufEnum.name() + " allomorph: " + sufEnum.getMorpheme());
if (restword.endsWith(s))
{
suffixMorpheme.put(s, suffixMorpheme.size() + 1);
//if the allomorphs are supposed to be given to the map, use s instead of sufEnum.getMorpheme()
suffixMorpheme.put(sufEnum.getMorpheme(), suffixMorpheme.size() + 1);
//suffixAllomorph.add(0, restword.substring(sufEnum.toString().length()));
//cut off the suffix that is added to the list
analyzeSuffix(restword.substring(0, restword.length() - s.length()));
......
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Collections;
public class Init {
public static void main(String[] args)
{
// read corpus file as list
// read all texts of the corpus file in list
String directory = "C:\\Users\\Peukert\\Corpora\\PENN Corpus\\PENN-CORPORA\\PPCMBE-RELEASE-1\\corpus\\pos";
IO io = new IO();
ArrayList<String> allWordsOfCorpus = new ArrayList<String>();
allWordsOfCorpus = io.readFilesFromDirectory(directory);
// create 6 lists: one for each word class (A,V,N) and one for the compounds of each
ArrayList<String> allNounsOfCorpus = new ArrayList<String>();
ArrayList<String> allVerbsOfCorpus = new ArrayList<String>();
ArrayList<String> allAdjectivesOfCorpus = new ArrayList<String>();
allWordsOfCorpus = io.readFilesFromDirectory(directory);
// make 3 lists for A,V,N
//String nl = System.getProperty("line.separator");
ArrayList<String> allCompoundNounsOfCorpus = new ArrayList<String>();
ArrayList<String> allComoundVerbsOfCorpus = new ArrayList<String>();
ArrayList<String> allCompoundAdjectivesOfCorpus = new ArrayList<String>();
ArrayList<String> allPluralnounsOfCorpus = new ArrayList<String>();
//remove inflected forms
for (String word : allWordsOfCorpus)
{
//handling for verbs
if (word.endsWith("MD") || word.endsWith("MD0") ||
word.endsWith("VAG") || word.endsWith("VAN") ||
word.endsWith("VB") || word.endsWith("VBI") ||
......@@ -25,14 +38,78 @@ public class Init {
{
allVerbsOfCorpus.add(word);
}
else if (word.endsWith("N") || word.endsWith("N$") ||
word.endsWith("NPR") || word.endsWith("NPR$") ||
word.endsWith("NPRS") || word.endsWith("NPRS$") ||
word.endsWith("NS") || word.endsWith("NS$") ||
word.endsWith("OTHER") || word.endsWith("OTHER$") ||
word.endsWith("OTHERS$") || word.endsWith("OTHERS$"))
// handling for compound nouns
else if (word.endsWith("+N") || word.endsWith("+N$")
|| word.endsWith("+NS") || word.endsWith("+NS$")
)
{
allCompoundNounsOfCorpus.add(word);
//System.out.println(word);
}
else if (word.endsWith("/N")
//Proper Nouns of all kinds are excluded
// || word.endsWith("NPR") || word.endsWith("NPR$")
// || word.endsWith("NPRS") || word.endsWith("NPRS$")
// all forms of nominalized "other" (e.g. "the other") are excluded
// || word.endsWith("OTHER") || word.endsWith("OTHER$")
// || word.endsWith("OTHERS") || word.endsWith("OTHERS$")
)
{
word = word.replace("/N", "");
allNounsOfCorpus.add(word.toLowerCase());
}
/* Get rid of possessive and plural endings.
* (Plural nouns cannot be sorted out here;
* this would be possible with a second loop, but it is not
* worthwhile since they do not contain lexical morphemes.)*/
else if (word.endsWith("/NS$"))
{
word = word.replace("ies/NS$", "y");
word = word.replace("ies'/NS$", "y");
word = word.replace("ches/NS$", "ch");
word = word.replace("ches'/NS$", "ch");
word = word.replace("ses/NS$", "s");
word = word.replace("ses'/NS$", "s");
word = word.replace("shes/NS$", "sh");
word = word.replace("shes'/NS$", "sh");
word = word.replace("./NS$", "");
word = word.replace("s'/NS$", "");
word = word.replace("'/NS$", "");
word = word.replace("'s/NS$", "");
word = word.replace("s/NS$", "");
allNounsOfCorpus.add(word.toLowerCase());
}
//get rid of Possessives
else if (word.endsWith("/N$"))
{
allNounsOfCorpus.add(word);
word = word.replace("'s./N$", "");
word = word.replace("./N$", "");
word = word.replace("'s/N$", "");
word = word.replace("s/N$", "");
word = word.replace("'/N$", "");
word = word.replace("/N$", "");
allNounsOfCorpus.add(word.toLowerCase());
}
//get rid of Plural
else if (word.endsWith("/NS"))
{
//System.out.println(word);
word = word.replace("ies/NS", "y");
word = word.replace("ches/NS", "ch");
word = word.replace("ses/NS", "s");
word = word.replace("shes/NS", "sh");
word = word.replace("./NS", "");
word = word.replace("s/NS", "");
word = word.replace("s'/NS", "");
word = word.replace("'/NS", "");
if (word.endsWith("/NS"))
{
word = word.replace("/NS", "");
allPluralnounsOfCorpus.add(word);
}
allNounsOfCorpus.add(word.toLowerCase());
}
else if (word.endsWith("ADJ") || word.endsWith("ADJR") ||
word.endsWith("ADJS") || word.endsWith("ADV") ||
......@@ -40,16 +117,88 @@ public class Init {
{
allAdjectivesOfCorpus.add(word);
}
//System.out.println(word);
}
/*
System.out.println("Gesamt Wortanzahl: " + allWordsOfCorpus.size());
System.out.println("Anzahl Verben: " + allVerbsOfCorpus.size());
System.out.println("Anzahl Adjektive: " + allAdjectivesOfCorpus.size());
System.out.println("Anzahl Substantive: " + allNounsOfCorpus.size());
for (String noun : allVerbsOfCorpus)
*/
// create word frequency list of the nouns
Map<String, Integer> frequencyNouns = new HashMap<String,Integer>();
Set<String> nounTypes = new HashSet<String>(allNounsOfCorpus);
for (String key : nounTypes)
{
System.out.println(noun);
frequencyNouns.put(key, Collections.frequency(allNounsOfCorpus, key));
//System.out.println(key + ": " + Collections.frequency(allNounsOfCorpus, key));
}
/*
for (String noun: frequencyNouns.keySet()) {
String key = noun.toString();
String value = frequencyNouns.get(noun).toString();
System.out.println(key + " " + value);
}
*/
Map<String, Integer> suffixMorpheme = new HashMap<String,Integer>();
Map<String, ArrayList> morphemeWordList = new HashMap<String,ArrayList>();
for (String noun : nounTypes)
{
AffixStripper as = new AffixStripper(noun);
suffixMorpheme = as.getSuffixMorphem();
if (!suffixMorpheme.isEmpty())
{
for (String morpheme : suffixMorpheme.keySet())
{
ArrayList<String> WordListOfNounsWithSuffix = new ArrayList<String>();
if (morphemeWordList.get(morpheme)!=null)//only for the first iteration when the morphemeWordList does not contain any data
{
// keep the values of morphemeWordList that were written to it previously
WordListOfNounsWithSuffix = morphemeWordList.get(morpheme);
}
WordListOfNounsWithSuffix.add(noun);
morphemeWordList.put(morpheme, WordListOfNounsWithSuffix);
}
//System.out.println(noun + ": " + suffixMorpheme.keySet());
}
}
for (String s : morphemeWordList.keySet())
{
String key = s.toString();
String value = morphemeWordList.get(s).toString();
System.out.println(key + " " + value);
System.out.print("From which Hapax: ");
ArrayList<String> HapaxLegonoma = new ArrayList<String>();
ArrayList<String> allWordsContainingAffix = new ArrayList<String>(morphemeWordList.get(s));
int numberOfAffixInCorpus = 0;
for (String wordContainingAffix : allWordsContainingAffix)
{
numberOfAffixInCorpus += Collections.frequency(allNounsOfCorpus, wordContainingAffix);
if (Collections.frequency(allNounsOfCorpus, wordContainingAffix) == 1)
{
HapaxLegonoma.add(wordContainingAffix);
System.out.print(wordContainingAffix + " ");
}
}
System.out.println();
System.out.println("Number of Hapaxes: " + HapaxLegonoma.size());
System.out.println("Total number of word types containing the " + key + "-morpheme: " + morphemeWordList.get(s).size());
System.out.println("Total number of word tokens containing the " + key + "-morpheme: " + numberOfAffixInCorpus);
}
System.out.println("size noun types: " + nounTypes.size());
System.out.println("size morphemes: " + morphemeWordList.size());
//remove inflected forms
/**
* loop through list: for each word do:
* 0. reduce to verbs, nouns, and adjectives in three different lists
......
......@@ -88,7 +88,7 @@ public enum SuffixEnum {
ward("ward"), wards("wards"), ware("ware"), uaeras("ware"), uaras("ware"), uaro("ware"), waeras("ware"), wara("ware"), waran("ware"),
waras("ware"), waru("ware"), wearan("ware"), waeren("ware"), warae("ware"), wick("wick"), y("y"), ig("y"), ye("y"), igan("y"), izen("y"),
ezen("y"), yen("y"), ey("y"), yl("yl"), yne("yne");
private String morpheme;
private final String morpheme;
//constructor
SuffixEnum(String morpheme) {
......
0% Loading Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment