Skip to content
Snippets Groups Projects
Commit dac27b04 authored by Peukert, Dr. Hagen's avatar Peukert, Dr. Hagen
Browse files

runnable prototype v1.0

Verbs, adjectives, and composita are not ready yet.
The GUI and m1–m4 variance handling are not done yet.
parent aed4a714
Branches
No related tags found
No related merge requests found
Showing
with 1007 additions and 231 deletions
<?xml version="1.0" encoding="UTF-8"?>
<classpath>
<classpathentry including="**/*.java" kind="src" output="target/classes" path="src">
<attributes>
<attribute name="optional" value="true"/>
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-16">
<attributes>
<attribute name="module" value="true"/>
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<classpathentry kind="con" path="org.eclipse.m2e.MAVEN2_CLASSPATH_CONTAINER">
<attributes>
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<classpathentry kind="src" path="src"/>
<classpathentry kind="output" path="bin"/>
<classpathentry kind="output" path="target/classes"/>
</classpath>
/bin/
/target/
......@@ -10,8 +10,14 @@
<arguments>
</arguments>
</buildCommand>
<buildCommand>
<name>org.eclipse.m2e.core.maven2Builder</name>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<natures>
<nature>org.eclipse.m2e.core.maven2Nature</nature>
<nature>org.eclipse.jdt.core.javanature</nature>
</natures>
</projectDescription>
eclipse.preferences.version=1
encoding//src/Corpus.java=UTF-8
......@@ -9,6 +9,7 @@ org.eclipse.jdt.core.compiler.debug.sourceFile=generate
org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
org.eclipse.jdt.core.compiler.problem.enablePreviewFeatures=disabled
org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
org.eclipse.jdt.core.compiler.problem.reportPreviewFeatures=warning
org.eclipse.jdt.core.compiler.release=enabled
org.eclipse.jdt.core.compiler.source=16
activeProfiles=
eclipse.preferences.version=1
resolveWorkspaceProjects=true
version=1
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>Morphochron</groupId>
<artifactId>Morphochron</artifactId>
<version>0.0.1-SNAPSHOT</version>
<build>
<sourceDirectory>src</sourceDirectory>
<resources>
<resource>
<directory>src</directory>
<excludes>
<exclude>**/*.java</exclude>
</excludes>
</resource>
</resources>
<plugins>
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.8.1</version>
<configuration>
<release>16</release>
</configuration>
</plugin>
</plugins>
</build>
<dependencies>
<dependency>
<groupId>org.json</groupId>
<artifactId>json</artifactId>
<version>20160810</version>
</dependency>
</dependencies>
</project>
\ No newline at end of file
import java.util.ArrayList;
/**
 * Word-class handler for adjectives and adverbs: collects every corpus
 * token that carries one of the adjectival/adverbial POS tags.
 */
public class Adjective implements WordClass {

    // tokens whose POS tag marks them as adjective or adverb
    private ArrayList<String> taggedAdjectives = new ArrayList<String>();
    // raw (tagged) word list of the corpus
    private ArrayList<String> corpusWords = new ArrayList<String>();

    /**
     * Selects all tokens tagged ADJ/ADJR/ADJS/ADV/ADVR/ADVS from the
     * corpus word list. Despite the name, no inflections are stripped here;
     * matching tokens are copied as-is.
     */
    public void deleteInflections() {
        final String[] tags = {"ADJ", "ADJR", "ADJS", "ADV", "ADVR", "ADVS"};
        for (String word : corpusWords) {
            for (String tag : tags) {
                if (word.endsWith(tag)) {
                    taggedAdjectives.add(word);
                    break; // each token is added at most once
                }
            }
        }
    }

    /** Stores the raw (tagged) word list of the corpus. */
    public void setWords(ArrayList<String> al) {
        this.corpusWords = al;
    }

    /** Returns the tokens collected by {@link #deleteInflections()}. */
    public ArrayList<String> getNormalizedWords() {
        return taggedAdjectives;
    }
}
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
/**
 * Builds, for one corpus period and word class, the mapping from each affix
 * morpheme to the word types it occurs in, validating every candidate
 * morpheme against the OED REST API.
 */
public class Affix
{
    // morpheme -> word types in which the OED confirmed that morpheme
    private Map<String, ArrayList<String>> morphemeWordList = new HashMap<String, ArrayList<String>>();
    // all (tag-free) words of the selected word class
    private ArrayList<String> filteredWords = new ArrayList<String>();
    private String wordclass = "";
    private String affixtype = "";
    private int startdate = 0;
    private int enddate = 0;

    /**
     * Immediately processes the given word list (the constructor triggers
     * OED lookups via {@link #processMorphemes()}).
     *
     * @param filteredWords normalized words of one word class
     * @param startdate     start year of the corpus period
     * @param enddate       end year of the corpus period
     * @param wordclass     OED word-class code, e.g. "_nn01"
     * @param affixtype     "_su01" (suffix) or "_pr01" (prefix)
     */
    public Affix(ArrayList<String> filteredWords, int startdate, int enddate, String wordclass, String affixtype)
    {
        this.filteredWords = filteredWords;
        this.affixtype = affixtype;
        this.wordclass = wordclass;
        this.startdate = startdate;
        this.enddate = enddate;
        processMorphemes();
    }

    /** Returns the morpheme-to-words mapping built during construction. */
    public Map<String, ArrayList<String>> getMorphemeWordList()
    {
        return morphemeWordList;
    }

    /**
     * Strips affixes from every word type and keeps only those morphemes the
     * OED confirms for the configured period and word class.
     */
    private void processMorphemes()
    {
        // work on types, not tokens
        Set<String> wordTypes = new HashSet<String>(filteredWords);
        for (String word : wordTypes)
        {
            AffixStripper as = new AffixStripper(word);
            // Fresh map per word: the previous version declared this outside
            // the loop, so an unknown affixtype silently reprocessed the
            // previous word's morphemes.
            Map<String, Integer> affixMorpheme;
            switch (affixtype)
            {
                case "_su01":
                    affixMorpheme = as.getSuffixMorphem(); // all suffix morphemes found in the word
                    break;
                case "_pr01":
                    affixMorpheme = as.getPrefixMorphem(); // all prefix morphemes found in the word
                    break;
                default:
                    System.out.println("Affixtype not known");
                    affixMorpheme = new HashMap<String, Integer>();
            }
            for (String morpheme : affixMorpheme.keySet())
            {
                // ask the OED whether this morpheme is attested in this word
                // during the corpus period
                OED ox = new OED(word, morpheme, wordclass, affixtype, startdate, enddate);
                if (ox.processOEDRequest())
                {
                    morphemeWordList.computeIfAbsent(morpheme, k -> new ArrayList<String>()).add(word);
                }
            }
        }
    }
}
import java.util.ArrayList;
/**
 * Word-class handler for compounds. Compound extraction is not implemented
 * in this prototype, so {@link #getNormalizedWords()} returns an empty list.
 */
public class Compositum implements WordClass {

    // raw (tagged) word list of the corpus
    private ArrayList<String> corpusWords = new ArrayList<String>();
    // result lists — currently never filled (prototype state)
    private ArrayList<String> compoundNouns = new ArrayList<String>();
    private ArrayList<String> compoundVerbs = new ArrayList<String>();
    private ArrayList<String> compoundAdjectives = new ArrayList<String>();

    /** Not implemented yet: compounds are not extracted or de-inflected. */
    public void deleteInflections() {
    }

    /** Stores the raw (tagged) word list of the corpus. */
    public void setWords(ArrayList<String> al) {
        this.corpusWords = al;
    }

    /** Returns the (currently empty) list of compound nouns. */
    public ArrayList<String> getNormalizedWords() {
        return compoundNouns;
    }
}
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
/**
 * Loads one of the PENN parsed corpora (PPCME2, PPCEME, PPCMBE) from disk,
 * optionally restricted to a sub-period, and normalizes the raw tokens.
 */
public class Corpus
{
    private String corpusname = "";
    private String directory = "";
    private String period = "";
    private String filter = "";
    private int startdate = 0;
    private int enddate = 0;
    private ArrayList<String> allWordsOfCorpus = new ArrayList<String>();

    /**
     * Reads the corpus immediately on construction.
     *
     * @param corpusname "ppcmbe", "ppceme" or "ppcme2"
     * @param period     optional sub-period ("m1".."m4", "e1".."e3",
     *                   "Emod1".."Emod3"); the empty string loads all files
     */
    public Corpus(String corpusname, String period)
    {
        this.corpusname = corpusname;
        this.period = period;
        setCorpusDirectory();
        readCorpus();
    }

    public String getCorpusName()
    {
        return corpusname;
    }

    public String getPeriod()
    {
        return period;
    }

    /** Returns all (tagged) tokens read from the corpus files. */
    public ArrayList<String> getCorpus()
    {
        return allWordsOfCorpus;
    }

    public int getStartDate()
    {
        return startdate;
    }

    public int getEndDate()
    {
        return enddate;
    }

    /**
     * Sets the on-disk location and the overall date range of the selected
     * corpus. An unknown corpus name leaves the directory empty.
     */
    private void setCorpusDirectory()
    {
        switch (corpusname)
        {
            case "ppcmbe":
                startdate = 1700;
                enddate = 1914;
                directory = "C:\\Users\\Peukert\\Corpora\\PENN Corpus\\PENN-CORPORA\\PPCMBE-RELEASE-1\\corpus\\pos";
                break;
            case "ppceme":
                startdate = 1500;
                enddate = 1710;
                directory = "C:\\Users\\Peukert\\Corpora\\PENN Corpus\\PENN-CORPORA\\PPCEME-RELEASE-2\\corpus\\pos\\penn2";
                break;
            case "ppcme2":
                startdate = 1150;
                enddate = 1500;
                directory = "C:\\Users\\Peukert\\Corpora\\PENN Corpus\\PENN-CORPORA\\PPCME2-RELEASE-3\\corpus\\pos";
                break;
        }
    }

    /**
     * Reads the corpus files matching the selected period. Each period
     * narrows the date range and selects files via a filename regex.
     */
    private void readCorpus()
    {
        IO io = new IO();
        if (period.isEmpty())
        {
            // no sub-period: read every .pos file in the corpus directory
            filter = ".*\\.pos";
            allWordsOfCorpus = io.readFilesFromDirectory(directory, filter);
        }
        else
        {
            switch (period)
            {
                case "m1":
                    startdate = 1150;
                    enddate = 1250;
                    filter = "[a-z0-9]+\\.[m][x]?[1]\\.pos";
                    allWordsOfCorpus = io.readFilesFromDirectory(directory, filter);
                    break;
                case "m2":
                    startdate = 1250;
                    enddate = 1350;
                    filter = "[a-z0-9]+\\.[m][2]\\d?\\.pos";
                    allWordsOfCorpus = io.readFilesFromDirectory(directory, filter);
                    break;
                case "m3":
                    startdate = 1350;
                    enddate = 1420;
                    filter = "[a-z0-9]+\\.[m][3]\\d?\\.pos";
                    allWordsOfCorpus = io.readFilesFromDirectory(directory, filter);
                    break;
                case "m4":
                    startdate = 1420;
                    enddate = 1500;
                    filter = "[a-z0-9]+\\.[m][x]?[4]\\.pos";
                    allWordsOfCorpus = io.readFilesFromDirectory(directory, filter);
                    break;
                case "e1":
                    startdate = 1500;
                    enddate = 1569;
                    filter = "[a-z0-9]+-e1-p2\\.pos";
                    allWordsOfCorpus = io.readFilesFromDirectory(directory, filter);
                    break;
                case "e2":
                    startdate = 1570;
                    enddate = 1639;
                    filter = "[a-z0-9]+-e2-p2\\.pos";
                    allWordsOfCorpus = io.readFilesFromDirectory(directory, filter);
                    break;
                case "e3":
                    startdate = 1640;
                    enddate = 1710;
                    filter = "[a-z0-9]+-e3-p2\\.pos";
                    allWordsOfCorpus = io.readFilesFromDirectory(directory, filter);
                    break;
                case "Emod1":
                    startdate = 1700;
                    enddate = 1769;
                    filter = "[a-z0-9]+-[1][7][^789](\\d|[x]?)\\.pos";
                    allWordsOfCorpus = io.readFilesFromDirectory(directory, filter);
                    break;
                case "Emod2":
                    startdate = 1770;
                    enddate = 1839;
                    filter = "[a-z0-9]+-[1]([7][789]|[8][0123])(\\d|[x]?)\\.pos";
                    allWordsOfCorpus = io.readFilesFromDirectory(directory, filter);
                    break;
                case "Emod3":
                    startdate = 1840;
                    enddate = 1914;
                    filter = "[a-z0-9]+-[1]([8][456789]|[9][01])(\\d|[x]?)\\.pos"; // stray ';;' removed
                    allWordsOfCorpus = io.readFilesFromDirectory(directory, filter);
                    break;
            }
            normalizeText();
        }
    }

    /**
     * Normalizes the corpus tokens in place:
     *  - strips the leading '$' of manually corrected words (see corpus docs)
     *  - for ppcme2 only, transliterates the PENN escape codes
     *    +a for ae (ash), +t for th (thorn), +d for th (eth), +g for z (yogh)
     *
     * Bug fixes vs. the previous version: a '$'-stripped word was only
     * written back when the token ALSO matched the transliteration pattern,
     * and the add-to-end/remove(i) pattern both reordered the list and made
     * the loop skip the element shifted into position i.
     */
    private void normalizeText()
    {
        for (int i = 0; i < allWordsOfCorpus.size(); i++)
        {
            String word = allWordsOfCorpus.get(i);
            boolean changed = false;
            // delete $-sign of manually corrected words (see corpus docu)
            if (word.startsWith("$"))
            {
                word = word.replaceFirst("\\$", "");
                changed = true;
            }
            // transliterate escape codes (only applicable for m1-m4 / ppcme2)
            // NOTE(review): the character class [_|/] also matches a literal
            // '|' — presumably only '_' and '/' were intended; kept as-is.
            if (corpusname.equals("ppcme2")
                    && word.matches(".*\\+[agdtAGDT][a-zA-Z0-9]*[_|/][A-Z]+"))
            {
                word = word.replaceAll("\\+[tT]", "th"); // thorn
                word = word.replaceAll("\\+[gG]", "z");  // yogh
                word = word.replaceAll("\\+[dD]", "th"); // eth
                word = word.replaceAll("\\+[aA]", "ae"); // ash
                changed = true;
            }
            if (changed)
            {
                // replace in place so order is preserved and nothing is skipped
                allWordsOfCorpus.set(i, word);
            }
        }
    }
}
......@@ -10,27 +10,33 @@ import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.stream.Stream;
import javax.net.ssl.HttpsURLConnection;
import org.json.JSONObject;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;
public class IO {
static ArrayList<String> readFilesFromDirectory(String path)
// read the files of entire directory
public ArrayList<String> readFilesFromDirectory(String path, String filter)
{
ArrayList<String> allWordsOfTexts = new ArrayList<String>();
try (Stream<Path> paths = Files.walk(Paths.get(path))) {
paths.filter(Files::isRegularFile).forEach(item ->
paths.filter(Files::isRegularFile).
filter(p -> p.getFileName().toString().matches(filter)).
forEach(item ->
{
//System.out.println(item.getFileName().toString() );
try (Scanner s = new Scanner(new File(item.toString())).useDelimiter("\\s+"))
{
while (s.hasNext())
......@@ -40,13 +46,15 @@ public class IO {
}
catch (FileNotFoundException e) {}
});
}
catch (IOException e) {}
return allWordsOfTexts;
}
static String readFile(String file, boolean unicode)
public String readFile(String file, boolean unicode)
{
String data = "";
......@@ -76,10 +84,10 @@ public class IO {
} catch(IOException e){}
}
static Map<String, ArrayList> readMorphemeWordListFromCSVFile(String filepath) throws IOException
static Map<String, ArrayList<String>> readMorphemeWordListFromCSVFile(String filepath) throws IOException
{
Map<String, ArrayList> morphemeWordList = new HashMap<String, ArrayList>();
Map<String, ArrayList<String>> morphemeWordList = new HashMap<String, ArrayList<String>>();
try(BufferedReader br = new BufferedReader(new FileReader(filepath))) {
String line = "";
while ((line = br.readLine()) != null) {
......@@ -98,7 +106,7 @@ public class IO {
return morphemeWordList;
}
static void writeMorphemeWordListToCSVFile(String filepath, Map<String, ArrayList> morphemeWordList)
public void writeMorphemeWordListToCSVFile(String filepath, Map<String, ArrayList<String>> morphemeWordList)
{
try
{
......@@ -125,57 +133,51 @@ public class IO {
}
}
static void appendResultsToCSVFile(String filepath, String postprocessingfilepath, Set<String> wordClassTypes, ArrayList<String> allWordClassOfCorpus, ArrayList<String> allWordsOfCorpus) throws IOException
public void appendResultsToCSVFile(String filepath, String data) throws IOException
{
Map<String, ArrayList> morphemeWordList = new HashMap<String,ArrayList>();
//read from file the postprocessed data
morphemeWordList = readMorphemeWordListFromCSVFile(postprocessingfilepath);
// header of the result file
String header = "Suffixes: Nouns in PPCMBE\nTotal noun types: " + wordClassTypes.size() + " of "
+ allWordClassOfCorpus.size() + " nouns and of " + allWordsOfCorpus.size()
+ " word in total"
+ "\n\nMorpheme;Contained in Words;Hapaxes;Types (V);Tokens;No Hapaxes;P";
try
{
FileWriter file = new FileWriter(filepath, true); //for overwriting set boolean to false
PrintWriter write = new PrintWriter(file);
write.println(header);
for (String s : morphemeWordList.keySet())
{
String key = s.toString();
ArrayList<String> allWordsContainingAffix = morphemeWordList.get(s);
//System.out.println(key + " " + value);
//System.out.print("From which Hapax: ");
ArrayList<String> HapaxLegonema = new ArrayList<String>();
//ArrayList<String> allWordsContainingAffix = new ArrayList<String>(value);
int numberOfAffixInCorpus = 0;
for (String wordContainingAffix : allWordsContainingAffix)
{
numberOfAffixInCorpus += Collections.frequency(allWordClassOfCorpus, wordContainingAffix);
if (Collections.frequency(allWordClassOfCorpus, wordContainingAffix) == 1)
write.println(data);
write.close();
}
catch (IOException e)
{
HapaxLegonema.add(wordContainingAffix);
//System.out.print(wordContainingAffix + " ");
System.out.println(e.getMessage());
}
}
//calculte the p-value as a productivity measure
int hapaxtypes = HapaxLegonema.size();
double p_value = 0.0;
if (numberOfAffixInCorpus != 0)
/*
* given a REST API URL and credentials, data are read from which the given URL points to
*/
public String requestRESTfulAPI(String restUrl, String app_id, String app_key)
{
p_value = (double)hapaxtypes / (double)numberOfAffixInCorpus;
}
String jsonString = "";
write.println(key + ";" + allWordsContainingAffix + ";" + HapaxLegonema + ";"
+ allWordsContainingAffix.size() + ";" + numberOfAffixInCorpus
+ ";" + hapaxtypes + ";" + p_value);
try
{
URL url = new URL(restUrl);
HttpsURLConnection urlConnection = (HttpsURLConnection) url.openConnection();
urlConnection.setRequestProperty("Accept", "application/json");
urlConnection.setRequestProperty("app_id", app_id);
urlConnection.setRequestProperty("app_key", app_key);
// read the output from the server
BufferedReader reader = new BufferedReader(new InputStreamReader(urlConnection.getInputStream()));
StringBuilder stringBuilder = new StringBuilder();
String line = null;
while ((line = reader.readLine()) != null)
{
stringBuilder.append(line + "\n");
}
write.close();
jsonString = stringBuilder.toString();
//System.out.println("retrieved OED entry: " + stringBuilder.toString());
}
catch (IOException e)
{
System.out.println(e.getMessage());
e.printStackTrace();
}
return jsonString;
}
}
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Collections;
public class Init {
/**
......@@ -23,171 +17,65 @@ public class Init {
*/
public static void main(String[] args) throws IOException
{
// read all texts of the corpus file in list
String directory = "C:\\Users\\Peukert\\Corpora\\PENN Corpus\\PENN-CORPORA\\PPCMBE-RELEASE-1\\corpus\\pos";
// write file to postprocessing file
String postprocessingfile = "C:\\Users\\Peukert\\Documents\\postprocessingfile.csv";
// write file to final result file
String finalresultsfile = "C:\\Users\\Peukert\\Documents\\resultsMorphochron.csv";
IO io = new IO();
ArrayList<String> allWordsOfCorpus = new ArrayList<String>();
allWordsOfCorpus = io.readFilesFromDirectory(directory);
// create 6 lists for each word class (A,V,N) and compounds each
ArrayList<String> allNounsOfCorpus = new ArrayList<String>();
ArrayList<String> allVerbsOfCorpus = new ArrayList<String>();
ArrayList<String> allAdjectivesOfCorpus = new ArrayList<String>();
ArrayList<String> allCompoundNounsOfCorpus = new ArrayList<String>();
ArrayList<String> allComoundVerbsOfCorpus = new ArrayList<String>();
ArrayList<String> allCompoundAdjectivesOfCorpus = new ArrayList<String>();
ArrayList<String> allPluralnounsOfCorpus = new ArrayList<String>();
//remove inflected forms
for (String word : allWordsOfCorpus)
{
//handling for verbs
if (word.endsWith("MD") || word.endsWith("MD0") ||
word.endsWith("VAG") || word.endsWith("VAN") ||
word.endsWith("VB") || word.endsWith("VBI") ||
word.endsWith("VBD") || word.endsWith("VBN") ||
word.endsWith("VBP"))
{
allVerbsOfCorpus.add(word);
}
// handling for compound nouns
else if (word.endsWith("+N") || word.endsWith("+N$")
|| word.endsWith("+NS") || word.endsWith("+NS$")
)
{
allCompoundNounsOfCorpus.add(word);
//System.out.println(word);
}
else if (word.endsWith("/N")
//Proper Nouns of all kinds are excluded
// || word.endsWith("NPR") || word.endsWith("NPR$")
// || word.endsWith("NPRS") || word.endsWith("NPRS$")
// all forms of nominalized other, e.g. the other are excluded
// || word.endsWith("OTHER") || word.endsWith("OTHER$")
// || word.endsWith("OTHERS") || word.endsWith("OTHERS$")
)
{
word = word.replace("/N", "");
allNounsOfCorpus.add(word.toLowerCase());
}
/*get rid of Possessives and Plural
* (Plural nouns cannot be sorted out,
* possible with a second loop but not
* worthwhile since not containing lexical morphemes)*/
else if (word.endsWith("/NS$"))
{
word = word.replace("ies/NS$", "y");
word = word.replace("ies'/NS$", "y");
word = word.replace("ches/NS$", "ch");
word = word.replace("ches'/NS$", "ch");
word = word.replace("ses/NS$", "s");
word = word.replace("ses'/NS$", "s");
word = word.replace("shes/NS$", "sh");
word = word.replace("shes'/NS$", "sh");
word = word.replace("./NS$", "");
word = word.replace("s'/NS$", "");
word = word.replace("'/NS$", "");
word = word.replace("'s/NS$", "");
word = word.replace("s/NS$", "");
allNounsOfCorpus.add(word.toLowerCase());
}
//get rid of Possessives
else if (word.endsWith("/N$"))
{
word = word.replace("'s./N$", "");
word = word.replace("./N$", "");
word = word.replace("'s/N$", "");
word = word.replace("s/N$", "");
word = word.replace("'/N$", "");
word = word.replace("/N$", "");
allNounsOfCorpus.add(word.toLowerCase());
}
//get rid of Plural
else if (word.endsWith("/NS"))
{
//System.out.println(word);
word = word.replace("ies/NS", "y");
word = word.replace("ches/NS", "ch");
word = word.replace("ses/NS", "s");
word = word.replace("shes/NS", "sh");
word = word.replace("./NS", "");
word = word.replace("s/NS", "");
word = word.replace("s'/NS", "");
word = word.replace("'/NS", "");
if (word.endsWith("/NS"))
{
word = word.replace("/NS", "");
allPluralnounsOfCorpus.add(word);
}
allNounsOfCorpus.add(word.toLowerCase());
}
else if (word.endsWith("ADJ") || word.endsWith("ADJR") ||
word.endsWith("ADJS") || word.endsWith("ADV") ||
word.endsWith("ADVR") || word.endsWith("ADVS"))
{
allAdjectivesOfCorpus.add(word);
}
}
/*
System.out.println("Gesamt Wortanzahl: " + allWordsOfCorpus.size());
System.out.println("Anzahl Verben: " + allVerbsOfCorpus.size());
System.out.println("Anzahl Adjektive: " + allAdjectivesOfCorpus.size());
System.out.println("Anzahl Substantive: " + allNounsOfCorpus.size());
*/
// create word frequency list of the nouns
Map<String, Integer> frequencyNouns = new HashMap<String,Integer>();
Set<String> nounTypes = new HashSet<String>(allNounsOfCorpus);
for (String key : nounTypes)
{
frequencyNouns.put(key, Collections.frequency(allNounsOfCorpus, key));
//System.out.println(key + ": " + Collections.frequency(allNounsOfCorpus, key));
}
/*
for (String noun: frequencyNouns.keySet()) {
String key = noun.toString();
String value = frequencyNouns.get(noun).toString();
System.out.println(key + " " + value);
}
* Usage: specify which of the following corpora (2nd argument = period is optional)
* Corpus: period
* ppcme2: m1, m2, m3, m4
* ppceme: e1, e2, e3
* ppcmbe: Emod1, Emod2, Emod3
*
* Word Class: _nn01, _vb01, _jj01
*
* Affix Type: _su01, _pr01
*
* TODO:
* 1. implement GUI to have properties selected
* 2. include credentials and directories as selection
*/
String corpus = "ppcmbe";
String period = "Emod2";
String wordclass ="_nn01";
String affixtype = "_su01";
Map<String, Integer> suffixMorpheme = new HashMap<String,Integer>();
Map<String, ArrayList> morphemeWordList = new HashMap<String,ArrayList>();
System.out.println(
"Selection made\ncorpus: " + corpus + "\nperiod: " + period +
"\nword class: " + wordclass + "\naffixtype: " + affixtype);
for (String noun : nounTypes)
{
AffixStripper as = new AffixStripper(noun);
suffixMorpheme = as.getSuffixMorphem();
if (!suffixMorpheme.isEmpty())
{
Corpus cp = new Corpus(corpus, period);
ArrayList<String> allWordsOfCorpus = new ArrayList<String>();
allWordsOfCorpus = cp.getCorpus();
System.out.println("Corpus read completely and normalized");
for (String morpheme : suffixMorpheme.keySet())
{
ArrayList<String> WordListOfNounsWithSuffix = new ArrayList<String>();
//create normalized word lists (factory pattern)
WordClassFactory wordClassFactory = new WordClassFactory();
WordClass wc = wordClassFactory.normalizeWords(wordclass, allWordsOfCorpus);
if (morphemeWordList.get(morpheme)!=null)//only for the first iteration when the morphemeWordList does not contain any data
{
// keep the values of morphemeWordList that were written to it previously
WordListOfNounsWithSuffix = morphemeWordList.get(morpheme);
}
ArrayList<String> normalizedWords = new ArrayList<String>();
// normalizedWords = wc.getNormalizedWords();
// for (String word : normalizedWords)
// {
// System.out.println(word);
// }
normalizedWords.add("mountainousness");
normalizedWords.add("mountainous");
normalizedWords.add("counterargument");
normalizedWords.add("precondition");
normalizedWords.add("reanimation");
normalizedWords.add("degeneration");
normalizedWords.add("proposition");
WordListOfNounsWithSuffix.add(noun);
morphemeWordList.put(morpheme, WordListOfNounsWithSuffix);
}
//System.out.println(noun + ": " + suffixMorpheme.keySet());
}
}
//write csv file to manually postprocess the data
//io.writeMorphemeWordListToCSVFile(postprocessingfile, morphemeWordList);
System.out.println("All words of type " + wordclass + " selected");
//detect affixes in word list as a pre-processing and countercheck these with OED REST API
Affix aff = new Affix(normalizedWords, cp.getStartDate(), cp.getEndDate(), wordclass, affixtype);
Map<String, ArrayList<String>> morphemeWordList = new HashMap<String,ArrayList<String>>();
morphemeWordList = aff.getMorphemeWordList();
// write all results to CSV file
io.appendResultsToCSVFile(finalresultsfile, postprocessingfile, nounTypes, allNounsOfCorpus, allWordsOfCorpus);
System.out.println("Affixes parsed and validated in OED");
System.out.println("Writing results to file");
//calculate results and write them to file
Result rs = new Result(morphemeWordList, cp, normalizedWords, wordclass, affixtype);
System.out.println("Done!");
}
}
// Placeholder for productivity measures — not yet implemented in this prototype.
public class Measure {
}
import java.util.ArrayList;
/**
 * Word-class handler for nouns: selects all noun-tagged tokens from the
 * corpus and strips plural and possessive inflections from the tags.
 *
 * NOTE: the replace chains below are strictly order-dependent — e.g.
 * "ies/NS$" must be handled before "s/NS$" — do not reorder them.
 */
public class Noun implements WordClass
{
// nouns with tags and inflections removed, lower-cased
private ArrayList<String> allNounsOfCorpus = new ArrayList<String>();
// irregular plurals that could not be reduced by the suffix rules
private ArrayList<String> allPluralnounsOfCorpus = new ArrayList<String>();
// raw (tagged) word list of the corpus, supplied via setWords()
private ArrayList<String> allWordsOfCorpus = new ArrayList<String>();
/**
 * Scans the corpus word list for noun tags (/N, /NS, /N$, /NS$), removes
 * the tag together with plural/possessive endings, and collects the
 * lower-cased results in allNounsOfCorpus.
 */
public void deleteInflections()
{
for (String word : allWordsOfCorpus)
{
// plain singular noun: just strip the tag
if (word.endsWith("/N")
//Proper Nouns of all kinds are excluded
// || word.endsWith("NPR") || word.endsWith("NPR$")
// || word.endsWith("NPRS") || word.endsWith("NPRS$")
// all forms of nominalized other, e.g. the other are excluded
// || word.endsWith("OTHER") || word.endsWith("OTHER$")
// || word.endsWith("OTHERS") || word.endsWith("OTHERS$")
)
{
word = word.replace("/N", "");
allNounsOfCorpus.add(word.toLowerCase());
}
/*get rid of Possessives and Plural
 * (Plural nouns cannot be sorted out,
 * possible with a second loop but not
 * worthwhile since not containing lexical morphemes)*/
else if (word.endsWith("/NS$"))
{
// order matters: longer endings ("ies", "ches", ...) before plain "s"
word = word.replace("ies/NS$", "y");
word = word.replace("ies'/NS$", "y");
word = word.replace("ches/NS$", "ch");
word = word.replace("ches'/NS$", "ch");
word = word.replace("ses/NS$", "s");
word = word.replace("ses'/NS$", "s");
word = word.replace("shes/NS$", "sh");
word = word.replace("shes'/NS$", "sh");
word = word.replace("./NS$", "");
word = word.replace("s'/NS$", "");
word = word.replace("'/NS$", "");
word = word.replace("'s/NS$", "");
word = word.replace("s/NS$", "");
allNounsOfCorpus.add(word.toLowerCase());
}
//get rid of Possessives
else if (word.endsWith("/N$"))
{
word = word.replace("'s./N$", "");
word = word.replace("./N$", "");
word = word.replace("'s/N$", "");
word = word.replace("s/N$", "");
word = word.replace("'/N$", "");
word = word.replace("/N$", "");
allNounsOfCorpus.add(word.toLowerCase());
}
//get rid of Plural
else if (word.endsWith("/NS"))
{
word = word.replace("ies/NS", "y");
word = word.replace("ches/NS", "ch");
word = word.replace("ses/NS", "s");
word = word.replace("shes/NS", "sh");
word = word.replace("./NS", "");
word = word.replace("s/NS", "");
word = word.replace("s'/NS", "");
word = word.replace("'/NS", "");
// if the tag survived all rules, the plural is irregular: record it
// separately before stripping the tag
if (word.endsWith("/NS"))
{
word = word.replace("/NS", "");
allPluralnounsOfCorpus.add(word);
}
allNounsOfCorpus.add(word.toLowerCase());
}
}
}
/**
 * Stores the raw corpus word list and immediately de-inflects it.
 */
public void setWords(ArrayList<String> al)
{
this.allWordsOfCorpus = al;
deleteInflections();
}
/** Returns the de-inflected, lower-cased nouns. */
public ArrayList<String> getNormalizedWords()
{
return allNounsOfCorpus;
}
}
import javax.net.ssl.HttpsURLConnection;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.HashMap;
import java.util.Map;
import org.json.JSONObject;
import org.json.JSONArray;
/**
 * Client for the OED (Oxford English Dictionary) Researcher REST API.
 * For a given word/morpheme pair it checks whether the OED attests the
 * affix in that word within the corpus date range.
 */
public class OED
/**
 * to get the word_ID right, concatenate the word with the following codes
 * - nn01 for Noun
 * - vb01 for Verb
 * - jj01 for Adjective
 * - rb01 for Adverb
 * - su01 for Suffix
 * - pr01 for Prefix
 * the running number orders the entries in the dictionary, i.e. the number of meanings
 */
{
private String word = "";
private String morpheme = "";
private String wordclass = "";
private String affixtype = "";
private int enddateCorpus = 0;
private int startdateCorpus = 0;
// API credentials, read from local files in the constructor
private final String app_id;
private final String app_key;
// helper that performs file reads and the actual HTTPS requests
private IO restapi = new IO();
//for each look up the word and one of its contained affixes is needed
/**
 * @param word            the full word type to look up
 * @param morpheme        the candidate affix morpheme contained in the word
 * @param wordclass       OED word-class code, e.g. "_nn01"
 * @param affixtype       "_su01" (suffix) or "_pr01" (prefix)
 * @param startdateCorpus start year of the corpus period
 * @param enddateCorpus   end year of the corpus period
 */
public OED(String word, String morpheme, String wordclass, String affixtype, int startdateCorpus, int enddateCorpus)
{
// NOTE(review): credentials are re-read from disk for every OED instance —
// consider caching them statically.
this.app_id = restapi.readFile("C:\\Users\\Peukert\\Documents\\Morphochron\\id", false);
this.app_key = restapi.readFile("C:\\Users\\Peukert\\Documents\\Morphochron\\key", false);
this.word = word;
this.morpheme = morpheme;
this.wordclass = wordclass;
this.affixtype = affixtype;
this.startdateCorpus = startdateCorpus;
this.enddateCorpus = enddateCorpus;
}
/*
 * gets word representation of OED REST API as JSON object
 * (builds the /words/?lemma=... lookup URL)
 */
private String getRESTAPIWordRepresentation(String word)
{
return "https://oed-researcher-api.oxfordlanguages.com/oed/api/v0.2/words/?lemma=" + word;
}
/*
 * gets the roots OED representation of a word
 * (builds the /word/{id}/roots/ lookup URL)
 */
private String getRESTAPIRootRepresentation(String wordID)
{
return "https://oed-researcher-api.oxfordlanguages.com/oed/api/v0.2/word/" + wordID + "/roots/";
}
/*
 * extracts ID from the OED word JSON response
 * checks if ID corresponds to given word class
 *
 * NOTE(review): if no entry matches word + wordclass exactly, the id of the
 * LAST entry in the "data" array is returned rather than "" — confirm this
 * fallback is intended; processOEDRequest() will then query that id's roots.
 */
private String processJSonWordID(JSONObject obj)
{
String wordid = "";
JSONArray arr = obj.getJSONArray("data");
for (int i = 0; i < arr.length(); i++)
{
wordid = arr.getJSONObject(i).getString("id");
//System.out.println("Wort-ID:" + wordid);
if (wordid.equals(word + wordclass)) break; //words may be part of several word classes
}
//System.out.println("Wort-ID-vor Rckgabe:" + wordid);
return wordid;
}
/*
 * checks if a word has an extra entry for the given morpheme
 * checks if field daterange.obsolete = false and daterange.end is null
 */
private Boolean processJSonRoot(JSONObject obj)
{
int startyearOED = 0;
int endyearOED = 0;
Boolean occurredIn = false;
Boolean obsolete = true;
String affix = "";
// OED lemma format: prefixes are "pre-", suffixes are "-suf"
if (affixtype.equals("_pr01"))
{
affix = morpheme +"-";
}
else if (affixtype.equals("_su01"))
{
affix = "-" + morpheme;
}
else
{
System.out.println("Affix type not defined");
}
JSONArray arr = obj.getJSONArray("data");
for (int i = 0; i < arr.length(); i++)
{
//System.out.println("The following morpheme is checked for existence: " + morpheme);
occurredIn = arr.getJSONObject(i).getString("lemma").equals(affix);
if (occurredIn)
{
//System.out.println("The following morpheme was actually found: " + morpheme);
// a null end date means "still in use" — default far in the future
endyearOED = arr.getJSONObject(i).getJSONObject("daterange").optInt("end", 10000);//lots of enddates are null,i.e. not set
startyearOED = arr.getJSONObject(i).getJSONObject("daterange").optInt("start", 0);
obsolete = arr.getJSONObject(i).getJSONObject("daterange").getBoolean("obsolete");
break;
}
}
// && obsolete not included because it seems to be today's perspective of obselete
// accept only if the OED attestation range strictly brackets the corpus period
return (occurredIn && startyearOED < startdateCorpus && enddateCorpus < endyearOED);
}
/*
 * processes OED API queries
 * (performs the HTTPS request and parses the body as JSON)
 */
private JSONObject getJSonResponse(String restUrl)
{
return new JSONObject(restapi.requestRESTfulAPI(restUrl, app_id, app_key));
}
/*
 * 1. build URL for Word-REST API --> getRESTAPIWordRepresentation(String) String
 * 2. get word JSON Format from Word-REST API --> getJSonResponse(String) JSONObject
 * 3a. get the wordID out of the JSon --> processJSonWordID(JSONOBject) String
 * 3b. get time range ?
 * ---> same logic again <---
 * 4. build URL for Root-REST API --> getRESTAPIRootRepresentation(String) String
 * 5. get root JSON format from Root-REST API --> getJSonResponse(String) JSONObject
 * 6a. get start date out of JSON --> processJSonRoot(JSONObject)
 *
 */
/**
 * Top-level query: looks the word up in the OED, resolves its id, and
 * checks whether the given morpheme is attested as a root for the corpus
 * period.
 *
 * @return true if the OED confirms the affix for this word and period
 */
public Boolean processOEDRequest()
{
//Map<String, Integer> oedData = new HashMap<String,Integer>();
Boolean entryAvailable = false;
String wordJSON = getRESTAPIWordRepresentation(word.toLowerCase());
JSONObject jo = getJSonResponse(wordJSON);
String id = processJSonWordID(jo);
if (!id.isEmpty())
{
String s = getRESTAPIRootRepresentation(id);
JSONObject o = getJSonResponse(s);
entryAvailable = processJSonRoot(o);
}
else
{
System.out.println("Word does not exist in OED");
}
return entryAvailable;
}
/*
 * delete the main method once programm is finished
 * exist only for test purposes
 */
// public static void main(String[] args)
// {
//
// String word = "mountainousness";
// String morpheme ="ous";
// String wordclass = "_nn01";
// String affixtype = "_su01";
// int startdate = 1570;
// int enddate = 1639;
// OED ox = new OED(word, morpheme, wordclass, affixtype, startdate, enddate);
// ox.processOEDRequest();
//
// }
}
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
public class Result
{
	private Corpus cp;
	private String affixtype = "";
	private String wordclass = "";
	private String postprocessingfile = "";
	private String finalresultsfile = "";
	private ArrayList<String> filteredWords = new ArrayList<String>();
	private ArrayList<String> allWordsOfCorpus = new ArrayList<String>();
	private Map<String, ArrayList<String>> morphemeWordList = new HashMap<String,ArrayList<String>>();
	// token frequency of every word type in filteredWords; filled once in the
	// constructor so the per-morpheme loops do not rescan the whole token list
	// (the previous Collections.frequency calls were O(n) per lookup)
	private Map<String, Integer> filteredWordFrequencies = new HashMap<String,Integer>();
	/**
	 * Computes the productivity measures for all morphemes of one affix type /
	 * word class combination and appends them as a CSV section to the results file.
	 *
	 * @param morphemeWordList mapping of each morpheme to all word types containing it
	 * @param cp               the corpus the words were taken from
	 * @param filteredWords    all word tokens of the requested word class (without tags)
	 * @param wordtype         word class tag: _nn01, _vb01, or _jj01
	 * @param affixtype        affix type tag: _su01 (suffixes) or _pr01 (prefixes)
	 * @throws IOException if the results file cannot be written
	 */
	public Result(Map<String, ArrayList<String>> morphemeWordList, Corpus cp, ArrayList<String> filteredWords, String wordtype, String affixtype) throws IOException
	{
		this.morphemeWordList = morphemeWordList; //the mapping between the morpheme and all word types it is contained in
		this.filteredWords = filteredWords; //these are all the words of the respective word class (without tags)
		this.affixtype = affixtype;
		this.wordclass = wordtype;
		this.cp = cp;
		this.allWordsOfCorpus = cp.getCorpus(); //all words of the corpus irrespective of the word class (words contain tags!)
		this.filteredWordFrequencies = countTokens(filteredWords);
		setFileNames();
		// NOTE: the former extra generateDataSet() call here was redundant —
		// its return value was discarded and writeAllMeasuresFile() regenerates
		// the whole data set anyway.
		writeAllMeasuresFile();
	}
	/** Returns the word types that occur exactly once among the filtered words. */
	private ArrayList<String> findHapaxes(ArrayList<String> allWordsContainingAffix)
	{
		ArrayList<String> hapaxes = new ArrayList<String>();
		for (String wordContainingAffix : allWordsContainingAffix)
		{
			if (filteredWordFrequencies.getOrDefault(wordContainingAffix, 0) == 1)
			{
				hapaxes.add(wordContainingAffix);
			}
		}
		return hapaxes;
	}
	/** Sums the token frequencies of all word types containing the affix. */
	private int calculateNumberOfAffixes(ArrayList<String> allWordsContainingAffix)
	{
		int numberOfAffixInCorpus = 0;
		for (String wordContainingAffix : allWordsContainingAffix)
		{
			numberOfAffixInCorpus += filteredWordFrequencies.getOrDefault(wordContainingAffix, 0);
		}
		return numberOfAffixInCorpus;
	}
	/**
	 * Productivity measure P = hapax types / affix tokens.
	 * Returns 0.0 when the affix does not occur at all (avoids division by zero).
	 */
	private double calculateP_Value(int hapaxtypes, int numberOfAffixes)
	{
		double p_value = 0.0;
		if (numberOfAffixes != 0)
		{
			p_value = (double)hapaxtypes / (double)numberOfAffixes;
		}
		return p_value;
	}
	/** Maps every distinct word type to its token frequency in the given list (one pass). */
	private Map<String, Integer> countTokens(ArrayList<String> words)
	{
		Map<String, Integer> frequencyWords = new HashMap<String,Integer>();
		for (String word : words)
		{
			Integer count = frequencyWords.get(word);
			frequencyWords.put(word, count == null ? 1 : count + 1);
		}
		return frequencyWords;
	}
	/** The distinct word types of the given token list. */
	private Set<String> setWordTypes(ArrayList<String> words)
	{
		return new HashSet<String>(words);
	}
	/*
	 * Defines the header as it appears as a heading in the CSV file.
	 * Uses local display labels instead of overwriting the affixtype/wordclass
	 * fields (the original mutated them, making the method order-dependent);
	 * the produced header text is unchanged.
	 */
	private String createHeader()
	{
		//more appropriate naming for the header output
		String affixLabel = affixtype;
		if (affixLabel.equals("_su01")) affixLabel = "Suffixes";
		if (affixLabel.equals("_pr01")) affixLabel = "Prefixes";
		String classLabel = wordclass;
		if (classLabel.equals("_nn01")) classLabel = "Nouns";
		if (classLabel.equals("_vb01")) classLabel = "Verbs";
		if (classLabel.equals("_jj01")) classLabel = "Adjectives";
		return affixLabel + ": " + classLabel + " in " + cp.getCorpusName() + "/" + cp.getPeriod() + " (" + cp.getStartDate() + "-" + cp.getEndDate() + ")\n" +
				"Total Types (" + classLabel + "): " + setWordTypes(filteredWords).size() + " of " +
				filteredWords.size() + " " + classLabel + " and of " + allWordsOfCorpus.size() +
				" words in total\n\n" + "Morpheme;Contained in Words;Hapaxes;Types (V);Tokens;No Hapaxes;P\n";
	}
	/*
	 * Generates the data string written to the CSV result file: one line per
	 * morpheme with its word list, hapaxes, type/token counts, and P value.
	 */
	private String generateDataSet()
	{
		StringBuilder data = new StringBuilder(createHeader());
		for (String morpheme : morphemeWordList.keySet())
		{
			// all word types containing the morpheme
			ArrayList<String> allWordsContainingAffix = morphemeWordList.get(morpheme);
			ArrayList<String> hapaxes = findHapaxes(allWordsContainingAffix);
			int affixfrequencyForAllWordTokens = calculateNumberOfAffixes(allWordsContainingAffix);
			int affixfrequencyForAllWordTypes = allWordsContainingAffix.size();
			data.append(morpheme).append(";").append(allWordsContainingAffix).append(";").append(hapaxes).append(";")
				.append(affixfrequencyForAllWordTypes).append(";").append(affixfrequencyForAllWordTokens).append(";")
				.append(hapaxes.size()).append(";").append(calculateP_Value(hapaxes.size(), affixfrequencyForAllWordTokens)).append("\n");
		}
		data.append("\n\n");
		return data.toString();
	}
	/*
	 * Sets the output file locations.
	 * NOTE(review): paths are hard-coded to one developer's machine — should be
	 * made configurable.
	 */
	private void setFileNames()
	{
		// location postprocessing file
		postprocessingfile = "C:\\Users\\Peukert\\Documents\\postprocessingfile.csv";
		// location final result file
		finalresultsfile = "C:\\Users\\Peukert\\Documents\\resultsMorphochron.csv";
	}
	/** Writes the morpheme/word mapping to a CSV file for manual postprocessing. */
	private void writePostProcessingFile()
	{
		IO io = new IO();
		//write csv file to manually postprocess the data
		io.writeMorphemeWordListToCSVFile(postprocessingfile, morphemeWordList);
	}
	/** Appends the complete result section to the final CSV file. */
	private void writeAllMeasuresFile() throws IOException
	{
		IO io = new IO();
		// write all results to CSV file
		io.appendResultsToCSVFile(finalresultsfile, generateDataSet());
	}
}
import java.util.ArrayList;
public class Verb implements WordClass{
	// POS tags that mark verbal forms in the tagged corpus
	private static final String[] VERB_TAGS = {
		"MD", "MD0", "VAG", "VAN", "VB", "VBI", "VBD", "VBN", "VBP"
	};
	private ArrayList<String> allVerbsOfCorpus = new ArrayList<String>();
	private ArrayList<String> allWordsOfCorpus = new ArrayList<String>();
	/** Collects every corpus token whose tag marks it as a verbal form. */
	public void deleteInflections()
	{
		for (String token : allWordsOfCorpus)
		{
			if (hasVerbTag(token))
			{
				allVerbsOfCorpus.add(token);
			}
		}
	}
	/** True when the token ends with one of the verb POS tags. */
	private static boolean hasVerbTag(String token)
	{
		for (String tag : VERB_TAGS)
		{
			if (token.endsWith(tag))
			{
				return true;
			}
		}
		return false;
	}
	/** Supplies the full (tagged) token list of the corpus. */
	public void setWords(ArrayList<String> al)
	{
		this.allWordsOfCorpus = al;
	}
	/** Returns the verb tokens gathered by deleteInflections(). */
	public ArrayList<String> getNormalizedWords()
	{
		return allVerbsOfCorpus;
	}
}
import java.util.ArrayList;
/**
 * Common contract for word-class specific corpus filters (nouns, verbs,
 * adjectives). Implementations receive the full tagged token list via
 * {@link #setWords(ArrayList)} and expose the tokens of their word class
 * via {@link #getNormalizedWords()}.
 */
public interface WordClass {
	/** Filters the supplied tokens down to this implementation's word class. */
	public void deleteInflections();
	/** Supplies the full (tagged) token list of the corpus. */
	public void setWords(ArrayList<String> al);
	/** Returns the tokens selected by {@link #deleteInflections()}. */
	public ArrayList<String> getNormalizedWords();
}
import java.util.ArrayList;
public class WordClassFactory {
	/**
	 * Creates the WordClass implementation matching the given tag and hands it
	 * the corpus words for normalization.
	 *
	 * @param type word class tag: _nn01 (nouns), _vb01 (verbs), or _jj01 (adjectives)
	 * @param allWordsOfCorpus all tagged word tokens of the corpus
	 * @return the initialized WordClass instance
	 * @throws IllegalArgumentException if the tag is unknown (the previous code
	 *         printed a message and then crashed with a NullPointerException)
	 */
	public WordClass normalizeWords(String type, ArrayList<String> allWordsOfCorpus)
	{
		WordClass wc;
		if (type.equals("_nn01"))
		{
			wc = new Noun();
		}
		else if (type.equals("_vb01"))
		{
			wc = new Verb();
		}
		else if (type.equals("_jj01"))
		{
			wc = new Adjective();
		}
		else
		{
			// fail fast with a descriptive exception instead of println + NPE
			throw new IllegalArgumentException(
					"Undefined word class '" + type + "'! Use _nn01, _vb01, or _jj01");
		}
		wc.setWords(allWordsOfCorpus);
		return wc;
	}
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment