diff --git a/Morphochron/.classpath b/Morphochron/.classpath index c0faa259f17fb9437f86dffca9f6396c0cbcb598..0e8fc5ba08e5af26b6c6c7fa1c28bf28a9c6bb61 100644 --- a/Morphochron/.classpath +++ b/Morphochron/.classpath @@ -1,10 +1,21 @@ <?xml version="1.0" encoding="UTF-8"?> <classpath> + <classpathentry including="**/*.java" kind="src" output="target/classes" path="src"> + <attributes> + <attribute name="optional" value="true"/> + <attribute name="maven.pomderived" value="true"/> + </attributes> + </classpathentry> <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-16"> <attributes> <attribute name="module" value="true"/> + <attribute name="maven.pomderived" value="true"/> + </attributes> + </classpathentry> + <classpathentry kind="con" path="org.eclipse.m2e.MAVEN2_CLASSPATH_CONTAINER"> + <attributes> + <attribute name="maven.pomderived" value="true"/> </attributes> </classpathentry> - <classpathentry kind="src" path="src"/> - <classpathentry kind="output" path="bin"/> + <classpathentry kind="output" path="target/classes"/> </classpath> diff --git a/Morphochron/.gitignore b/Morphochron/.gitignore index ae3c1726048cd06b9a143e0376ed46dd9b9a8d53..09e3bc9b241c477ea341af9ee029becad0c2148c 100644 --- a/Morphochron/.gitignore +++ b/Morphochron/.gitignore @@ -1 +1,2 @@ /bin/ +/target/ diff --git a/Morphochron/.project b/Morphochron/.project index ff09e0642ede5ab4acd25600fec9bee032194f86..6bdb594c9784ea143e91d3c5feaa278f3cf57101 100644 --- a/Morphochron/.project +++ b/Morphochron/.project @@ -10,8 +10,14 @@ <arguments> </arguments> </buildCommand> + <buildCommand> + <name>org.eclipse.m2e.core.maven2Builder</name> + <arguments> + </arguments> + </buildCommand> </buildSpec> <natures> + <nature>org.eclipse.m2e.core.maven2Nature</nature> <nature>org.eclipse.jdt.core.javanature</nature> </natures> </projectDescription> diff --git a/Morphochron/.settings/org.eclipse.core.resources.prefs 
b/Morphochron/.settings/org.eclipse.core.resources.prefs new file mode 100644 index 0000000000000000000000000000000000000000..c5b47a0634c9bd27699cb22a1b076fcf61e470da --- /dev/null +++ b/Morphochron/.settings/org.eclipse.core.resources.prefs @@ -0,0 +1,2 @@ +eclipse.preferences.version=1 +encoding//src/Corpus.java=UTF-8 diff --git a/Morphochron/.settings/org.eclipse.jdt.core.prefs b/Morphochron/.settings/org.eclipse.jdt.core.prefs index ae7f7b393112f044fb4f4f25019353a5fa38c6b5..787d811d1fdfb1ffc869848809626537704e3576 100644 --- a/Morphochron/.settings/org.eclipse.jdt.core.prefs +++ b/Morphochron/.settings/org.eclipse.jdt.core.prefs @@ -9,6 +9,7 @@ org.eclipse.jdt.core.compiler.debug.sourceFile=generate org.eclipse.jdt.core.compiler.problem.assertIdentifier=error org.eclipse.jdt.core.compiler.problem.enablePreviewFeatures=disabled org.eclipse.jdt.core.compiler.problem.enumIdentifier=error +org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning org.eclipse.jdt.core.compiler.problem.reportPreviewFeatures=warning org.eclipse.jdt.core.compiler.release=enabled org.eclipse.jdt.core.compiler.source=16 diff --git a/Morphochron/.settings/org.eclipse.m2e.core.prefs b/Morphochron/.settings/org.eclipse.m2e.core.prefs new file mode 100644 index 0000000000000000000000000000000000000000..14b697b7bbb0d85e8d8ee19141a2a92d9ce211be --- /dev/null +++ b/Morphochron/.settings/org.eclipse.m2e.core.prefs @@ -0,0 +1,4 @@ +activeProfiles= +eclipse.preferences.version=1 +resolveWorkspaceProjects=true +version=1 diff --git a/Morphochron/pom.xml b/Morphochron/pom.xml new file mode 100644 index 0000000000000000000000000000000000000000..5b9c3254827a0c9a8bf395b2f00fa4733c0d789c --- /dev/null +++ b/Morphochron/pom.xml @@ -0,0 +1,33 @@ +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + 
<groupId>Morphochron</groupId> + <artifactId>Morphochron</artifactId> + <version>0.0.1-SNAPSHOT</version> + <build> + <sourceDirectory>src</sourceDirectory> + <resources> + <resource> + <directory>src</directory> + <excludes> + <exclude>**/*.java</exclude> + </excludes> + </resource> + </resources> + <plugins> + <plugin> + <artifactId>maven-compiler-plugin</artifactId> + <version>3.8.1</version> + <configuration> + <release>16</release> + </configuration> + </plugin> + </plugins> + </build> + <dependencies> + <dependency> + <groupId>org.json</groupId> + <artifactId>json</artifactId> + <version>20160810</version> + </dependency> +</dependencies> +</project> \ No newline at end of file diff --git a/Morphochron/src/Adjective.java b/Morphochron/src/Adjective.java new file mode 100644 index 0000000000000000000000000000000000000000..d45d4aeec8b4931242e938b04335302879085f35 --- /dev/null +++ b/Morphochron/src/Adjective.java @@ -0,0 +1,31 @@ +import java.util.ArrayList; + +public class Adjective implements WordClass{ + + + private ArrayList<String> allAdjectivesOfCorpus = new ArrayList<String>(); + private ArrayList<String> allWordsOfCorpus = new ArrayList<String>(); + + public void deleteInflections() + { + for (String word : allWordsOfCorpus) + { + if (word.endsWith("ADJ") || word.endsWith("ADJR") || + word.endsWith("ADJS") || word.endsWith("ADV") || + word.endsWith("ADVR") || word.endsWith("ADVS")) + { + allAdjectivesOfCorpus.add(word); + } + } + } + + public void setWords(ArrayList<String> al) + { + this.allWordsOfCorpus = al; + } + + public ArrayList<String> getNormalizedWords() + { + return allAdjectivesOfCorpus; + } +} diff --git a/Morphochron/src/Affix.java b/Morphochron/src/Affix.java new file mode 100644 index 0000000000000000000000000000000000000000..7683798a35bcd3bdb04a28393d4c81b252ef8349 --- /dev/null +++ b/Morphochron/src/Affix.java @@ -0,0 +1,84 @@ +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; 
+import java.util.Set; + +public class Affix +{ + private Map<String, ArrayList<String>> morphemeWordList = new HashMap<String,ArrayList<String>>(); + private ArrayList<String> filteredWords = new ArrayList<String>(); + private String wordclass = ""; + private String affixtype = ""; + private int startdate = 0; + private int enddate = 0; + + public Affix(ArrayList<String> filteredWords, int startdate, int enddate, String wordclass, String affixtype) + { + this.filteredWords = filteredWords; + this.affixtype = affixtype; + this.wordclass = wordclass; + this.startdate = startdate; + this.enddate = enddate; + processMorphemes(); + } + + public Map<String, ArrayList<String>> getMorphemeWordList() + { + return morphemeWordList; + } + + private void processMorphemes() + { + Set<String> wordTypes = new HashSet<String>(filteredWords); + + Map<String, Integer> affixMorpheme = new HashMap<String,Integer>(); + //Map<String, ArrayList<String>> morphemeWordList = new HashMap<String,ArrayList<String>>(); + + for (String word : wordTypes) + { + AffixStripper as = new AffixStripper(word); + if (affixtype.equals("_su01")) + { + //System.out.println("Suffix morpheme list will be generated"); + affixMorpheme = as.getSuffixMorphem(); //contains all suffix morphemes found in noun + } + else if (affixtype.equals("_pr01")) + { + affixMorpheme = as.getPrefixMorphem(); //contains all prefix morphemes found in noun + } + else + { + System.out.println("Affixtype not known"); + } + if (!affixMorpheme.isEmpty()) + { + + for (String morpheme : affixMorpheme.keySet()) + { + ArrayList<String> wordsWithAffix = new ArrayList<String>(); + + if (morphemeWordList.get(morpheme)!=null)//only for the first iteration when the morphemeWordList does not contain any data + { + // keep the values of morphemeWordList that were written to it previously + wordsWithAffix = morphemeWordList.get(morpheme); + //System.out.println("First Iteration: " + morphemeWordList.get(morpheme)); + } + //System.out.println(word 
+ " " + morpheme); + //call the Oxford class and check if the morpheme occurs in the noun + OED ox = new OED(word, morpheme, wordclass, affixtype, startdate, enddate); + + if (ox.processOEDRequest()) + { + wordsWithAffix.add(word); + morphemeWordList.put(morpheme, wordsWithAffix); + //System.out.println("when OED was consulted: " + word + ": " + morpheme); + } + + //if (number_of_queries == 1000) break; + } + //System.out.println("Outside the second for-loop: " + word + ": " + affixMorpheme.keySet()); + } + } +} +} diff --git a/Morphochron/src/Compositum.java b/Morphochron/src/Compositum.java new file mode 100644 index 0000000000000000000000000000000000000000..8cd02bd2987d71944f4be95becb5bba475731efd --- /dev/null +++ b/Morphochron/src/Compositum.java @@ -0,0 +1,24 @@ +import java.util.ArrayList; + +public class Compositum implements WordClass{ + + private ArrayList<String> allWordsOfCorpus = new ArrayList<String>(); + private ArrayList<String> allCompoundNounsOfCorpus = new ArrayList<String>(); + private ArrayList<String> allComoundVerbsOfCorpus = new ArrayList<String>(); + private ArrayList<String> allCompoundAdjectivesOfCorpus = new ArrayList<String>(); + + public void deleteInflections() + { + + } + + public void setWords(ArrayList<String> al) + { + this.allWordsOfCorpus = al; + } + + public ArrayList<String> getNormalizedWords() + { + return allCompoundNounsOfCorpus; + } +} diff --git a/Morphochron/src/Corpus.java b/Morphochron/src/Corpus.java new file mode 100644 index 0000000000000000000000000000000000000000..a918319ca9a9c18447fce6ada0a7ba1f461ea3e1 --- /dev/null +++ b/Morphochron/src/Corpus.java @@ -0,0 +1,190 @@ +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; + +public class Corpus +{ + private String corpusname = ""; + private String directory = ""; + private String period = ""; + private String filter = ""; + private int startdate = 0; + private int enddate = 0; + private ArrayList<String> allWordsOfCorpus = new 
ArrayList<String>(); + + public Corpus(String corpusname, String period) + { + this.corpusname = corpusname; + this.period = period; + setCorpusDirectory(); + readCorpus(); + } + + public String getCorpusName() + { + return corpusname; + } + + public String getPeriod() + { + return period; + } + + public ArrayList<String> getCorpus() + { + return allWordsOfCorpus; + } + + public int getStartDate() + { + return startdate; + } + + public int getEndDate() + { + return enddate; + } + + // location directory of corpus + private void setCorpusDirectory() + { + switch (corpusname) + { + case "ppcmbe": + startdate = 1700; + enddate = 1914; + directory = "C:\\Users\\Peukert\\Corpora\\PENN Corpus\\PENN-CORPORA\\PPCMBE-RELEASE-1\\corpus\\pos"; + break; + case "ppceme": + startdate = 1500; + enddate = 1710; + directory = "C:\\Users\\Peukert\\Corpora\\PENN Corpus\\PENN-CORPORA\\PPCEME-RELEASE-2\\corpus\\pos\\penn2"; + break; + case "ppcme2": + startdate = 1150; + enddate = 1500; + directory = "C:\\Users\\Peukert\\Corpora\\PENN Corpus\\PENN-CORPORA\\PPCME2-RELEASE-3\\corpus\\pos"; + break; + } + //System.out.println(directory); + } + + // set time information, method to read which files, + private void readCorpus() + { + IO io = new IO(); + if (period.isEmpty()) + { + filter = ".*\\.pos"; + allWordsOfCorpus = io.readFilesFromDirectory(directory, filter); + } + else + { + switch (period) + { + case "m1": + startdate = 1150; + enddate = 1250; + filter = "[a-z0-9]+\\.[m][x]?[1]\\.pos"; + allWordsOfCorpus = io.readFilesFromDirectory(directory, filter); + break; + case "m2": + startdate = 1250; + enddate = 1350; + filter = "[a-z0-9]+\\.[m][2]\\d?\\.pos"; + allWordsOfCorpus = io.readFilesFromDirectory(directory, filter); + break; + case "m3": + startdate = 1350; + enddate = 1420; + filter = "[a-z0-9]+\\.[m][3]\\d?\\.pos"; + allWordsOfCorpus = io.readFilesFromDirectory(directory, filter); + break; + case "m4": + startdate = 1420; + enddate = 1500; + filter = 
"[a-z0-9]+\\.[m][x]?[4]\\.pos"; + allWordsOfCorpus = io.readFilesFromDirectory(directory, filter); + break; + case "e1": + startdate = 1500; + enddate = 1569; + filter = "[a-z0-9]+-e1-p2\\.pos"; + allWordsOfCorpus = io.readFilesFromDirectory(directory, filter); + break; + case "e2": + startdate = 1570; + enddate = 1639; + filter = "[a-z0-9]+-e2-p2\\.pos"; + allWordsOfCorpus = io.readFilesFromDirectory(directory, filter); + break; + case "e3": + startdate = 1640; + enddate = 1710; + filter = "[a-z0-9]+-e3-p2\\.pos"; + allWordsOfCorpus = io.readFilesFromDirectory(directory, filter); + break; + case "Emod1": + startdate = 1700; + enddate = 1769; + filter = "[a-z0-9]+-[1][7][^789](\\d|[x]?)\\.pos"; + allWordsOfCorpus = io.readFilesFromDirectory(directory, filter); + break; + case "Emod2": + startdate = 1770; + enddate = 1839; + filter = "[a-z0-9]+-[1]([7][789]|[8][0123])(\\d|[x]?)\\.pos"; + allWordsOfCorpus = io.readFilesFromDirectory(directory, filter); + break; + case "Emod3": + startdate = 1840; + enddate = 1914; + filter = "[a-z0-9]+-[1]([8][456789]|[9][01])(\\d|[x]?)\\.pos";; + allWordsOfCorpus = io.readFilesFromDirectory(directory, filter); + break; + } + normalizeText(); + } + } + + + + private void normalizeText() + { + //+a for æ; +t for þ; +d for ð, +g for ȝ + for (int i = 0; i < allWordsOfCorpus.size(); i++) + { + String word = allWordsOfCorpus.get(i); + //delete $-sign of manually corrected words (see corpus docu) + if (word.startsWith("$")) + { + word = word.replaceFirst("\\$", ""); + //System.out.println(word); + } + // normalize text data (only applicable for m1-m4) + if (word.matches(".*\\+[agdtAGDT][a-zA-Z0-9]*[_|/][A-Z]+") && corpusname.equals("ppcme2")) + { + word = word.replaceAll("\\+[tT]", "th");//þ + word = word.replaceAll("\\+[gG]", "z");//ȝ + word = word.replaceAll("\\+[dD]", "th");//ð + word = word.replaceAll("\\+[aA]", "ae");//æ + allWordsOfCorpus.add(word); + //System.out.println(word); + allWordsOfCorpus.remove(i); + } + } + } + + /* + * 
delete the main method once programm is finished + * exist only for test purposes + */ +// public static void main(String[] args) +// { +// Corpus cp = new Corpus("ppcme2", "m1"); +// cp.getCorpus(); +// } + + + +} diff --git a/Morphochron/src/IO.java b/Morphochron/src/IO.java index a1a2cf0f7b26292b0637e483d42e34b84dee5efd..5ecef15cffa29b16309cd99755c49fe98072060f 100644 --- a/Morphochron/src/IO.java +++ b/Morphochron/src/IO.java @@ -10,43 +10,51 @@ import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.io.PrintWriter; +import java.net.URL; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.util.stream.Stream; + +import javax.net.ssl.HttpsURLConnection; + +import org.json.JSONObject; + import java.util.ArrayList; -import java.util.Collections; import java.util.HashMap; -import java.util.List; import java.util.Map; import java.util.Scanner; -import java.util.Set; public class IO { - - static ArrayList<String> readFilesFromDirectory(String path) + // read the files of entire directory + public ArrayList<String> readFilesFromDirectory(String path, String filter) { ArrayList<String> allWordsOfTexts = new ArrayList<String>(); try (Stream<Path> paths = Files.walk(Paths.get(path))) { - paths.filter(Files::isRegularFile).forEach(item -> + paths.filter(Files::isRegularFile). + filter(p -> p.getFileName().toString().matches(filter)). 
+ forEach(item -> { + //System.out.println(item.getFileName().toString() ); try (Scanner s = new Scanner(new File(item.toString())).useDelimiter("\\s+")) - { - while (s.hasNext()) - { - allWordsOfTexts.add(s.next()); - } - } - catch (FileNotFoundException e) {} + { + while (s.hasNext()) + { + allWordsOfTexts.add(s.next()); + } + } + catch (FileNotFoundException e) {} }); + } catch (IOException e) {} return allWordsOfTexts; } - static String readFile(String file, boolean unicode) + + public String readFile(String file, boolean unicode) { String data = ""; @@ -76,10 +84,10 @@ public class IO { } catch(IOException e){} } - static Map<String, ArrayList> readMorphemeWordListFromCSVFile(String filepath) throws IOException + static Map<String, ArrayList<String>> readMorphemeWordListFromCSVFile(String filepath) throws IOException { - Map<String, ArrayList> morphemeWordList = new HashMap<String, ArrayList>(); + Map<String, ArrayList<String>> morphemeWordList = new HashMap<String, ArrayList<String>>(); try(BufferedReader br = new BufferedReader(new FileReader(filepath))) { String line = ""; while ((line = br.readLine()) != null) { @@ -98,7 +106,7 @@ public class IO { return morphemeWordList; } - static void writeMorphemeWordListToCSVFile(String filepath, Map<String, ArrayList> morphemeWordList) + public void writeMorphemeWordListToCSVFile(String filepath, Map<String, ArrayList<String>> morphemeWordList) { try { @@ -125,52 +133,13 @@ public class IO { } } - static void appendResultsToCSVFile(String filepath, String postprocessingfilepath, Set<String> wordClassTypes, ArrayList<String> allWordClassOfCorpus, ArrayList<String> allWordsOfCorpus) throws IOException + public void appendResultsToCSVFile(String filepath, String data) throws IOException { - Map<String, ArrayList> morphemeWordList = new HashMap<String,ArrayList>(); - //read from file the postprocessed data - morphemeWordList = readMorphemeWordListFromCSVFile(postprocessingfilepath); - // header of the result file - 
String header = "Suffixes: Nouns in PPCMBE\nTotal noun types: " + wordClassTypes.size() + " of " - + allWordClassOfCorpus.size() + " nouns and of " + allWordsOfCorpus.size() - + " word in total" - + "\n\nMorpheme;Contained in Words;Hapaxes;Types (V);Tokens;No Hapaxes;P"; try { FileWriter file = new FileWriter(filepath, true); //for overwriting set boolean to false PrintWriter write = new PrintWriter(file); - write.println(header); - for (String s : morphemeWordList.keySet()) - { - String key = s.toString(); - ArrayList<String> allWordsContainingAffix = morphemeWordList.get(s); - - //System.out.println(key + " " + value); - //System.out.print("From which Hapax: "); - ArrayList<String> HapaxLegonema = new ArrayList<String>(); - //ArrayList<String> allWordsContainingAffix = new ArrayList<String>(value); - int numberOfAffixInCorpus = 0; - for (String wordContainingAffix : allWordsContainingAffix) - { - numberOfAffixInCorpus += Collections.frequency(allWordClassOfCorpus, wordContainingAffix); - if (Collections.frequency(allWordClassOfCorpus, wordContainingAffix) == 1) - { - HapaxLegonema.add(wordContainingAffix); - //System.out.print(wordContainingAffix + " "); - } - } - //calculte the p-value as a productivity measure - int hapaxtypes = HapaxLegonema.size(); - double p_value = 0.0; - if (numberOfAffixInCorpus != 0) - { - p_value = (double)hapaxtypes / (double)numberOfAffixInCorpus; - } - - write.println(key + ";" + allWordsContainingAffix + ";" + HapaxLegonema + ";" - + allWordsContainingAffix.size() + ";" + numberOfAffixInCorpus - + ";" + hapaxtypes + ";" + p_value); - } + write.println(data); write.close(); } catch (IOException e) @@ -178,4 +147,37 @@ public class IO { System.out.println(e.getMessage()); } } + /* + * given a REST API URL and credentials, data are read from which the given URL points to + */ + public String requestRESTfulAPI(String restUrl, String app_id, String app_key) + { + String jsonString = ""; + + try + { + URL url = new URL(restUrl); + 
HttpsURLConnection urlConnection = (HttpsURLConnection) url.openConnection(); + urlConnection.setRequestProperty("Accept", "application/json"); + urlConnection.setRequestProperty("app_id", app_id); + urlConnection.setRequestProperty("app_key", app_key); + + // read the output from the server + BufferedReader reader = new BufferedReader(new InputStreamReader(urlConnection.getInputStream())); + StringBuilder stringBuilder = new StringBuilder(); + String line = null; + while ((line = reader.readLine()) != null) + { + stringBuilder.append(line + "\n"); + } + jsonString = stringBuilder.toString(); + + //System.out.println("retrieved OED entry: " + stringBuilder.toString()); + } + catch (IOException e) + { + e.printStackTrace(); + } + return jsonString; + } } diff --git a/Morphochron/src/Init.java b/Morphochron/src/Init.java index f51480cdeb4cc2e8662245c2425667c66448898a..d12cb75991e1b47a545075ea2b5aa92cc3e69d5e 100644 --- a/Morphochron/src/Init.java +++ b/Morphochron/src/Init.java @@ -1,13 +1,7 @@ - import java.io.IOException; import java.util.ArrayList; -import java.util.Arrays; import java.util.HashMap; -import java.util.HashSet; -import java.util.List; import java.util.Map; -import java.util.Set; -import java.util.Collections; public class Init { /** @@ -23,171 +17,65 @@ public class Init { */ public static void main(String[] args) throws IOException { - // read all texts of the corpus file in list - String directory = "C:\\Users\\Peukert\\Corpora\\PENN Corpus\\PENN-CORPORA\\PPCMBE-RELEASE-1\\corpus\\pos"; - // write file to postprocessing file - String postprocessingfile = "C:\\Users\\Peukert\\Documents\\postprocessingfile.csv"; - // write file to final result file - String finalresultsfile = "C:\\Users\\Peukert\\Documents\\resultsMorphochron.csv"; - IO io = new IO(); + /* + * Usage: specify which of the following corpora (2nd argument = period is optional) + * Corpus: period + * ppcme2: m1, m2, m3, m4 + * ppceme: e1, e2, e3 + * ppcmbe: Emod1, Emod2, Emod3 + * + * 
Word Class: _nn01, _vb01, _jj01 + * + * Affix Type: _su01, _pr01 + * + * TODO: + * 1. implement GUI to have properties selected + * 2. include credentials and directories as selection + */ + String corpus = "ppcmbe"; + String period = "Emod2"; + String wordclass ="_nn01"; + String affixtype = "_su01"; + + System.out.println( + "Selection made\ncorpus: " + corpus + "\nperiod: " + period + + "\nword class: " + wordclass + "\naffixtype: " + affixtype); + + Corpus cp = new Corpus(corpus, period); ArrayList<String> allWordsOfCorpus = new ArrayList<String>(); - allWordsOfCorpus = io.readFilesFromDirectory(directory); + allWordsOfCorpus = cp.getCorpus(); + System.out.println("Corpus read completely and normalized"); - // create 6 lists for each word class (A,V,N) and compounds each - ArrayList<String> allNounsOfCorpus = new ArrayList<String>(); - ArrayList<String> allVerbsOfCorpus = new ArrayList<String>(); - ArrayList<String> allAdjectivesOfCorpus = new ArrayList<String>(); - ArrayList<String> allCompoundNounsOfCorpus = new ArrayList<String>(); - ArrayList<String> allComoundVerbsOfCorpus = new ArrayList<String>(); - ArrayList<String> allCompoundAdjectivesOfCorpus = new ArrayList<String>(); - ArrayList<String> allPluralnounsOfCorpus = new ArrayList<String>(); - //remove inflected forms - for (String word : allWordsOfCorpus) - { - //handling for verbs - if (word.endsWith("MD") || word.endsWith("MD0") || - word.endsWith("VAG") || word.endsWith("VAN") || - word.endsWith("VB") || word.endsWith("VBI") || - word.endsWith("VBD") || word.endsWith("VBN") || - word.endsWith("VBP")) - { - allVerbsOfCorpus.add(word); - } - // handling for compound nouns - else if (word.endsWith("+N") || word.endsWith("+N$") - || word.endsWith("+NS") || word.endsWith("+NS$") - ) - { - allCompoundNounsOfCorpus.add(word); - //System.out.println(word); - } - - else if (word.endsWith("/N") - //Proper Nouns of all kinds are excluded - // || word.endsWith("NPR") || word.endsWith("NPR$") - // || 
word.endsWith("NPRS") || word.endsWith("NPRS$") - // all forms of nominalized other, e.g. the other are excluded - // || word.endsWith("OTHER") || word.endsWith("OTHER$") - // || word.endsWith("OTHERS") || word.endsWith("OTHERS$") - ) - { - word = word.replace("/N", ""); - allNounsOfCorpus.add(word.toLowerCase()); - } - /*get rid of Possessives and Plural - * (Plural nouns cannot be sorted out, - * possible with a second loop but not - * worthwhile since not containing lexical morphemes)*/ - - else if (word.endsWith("/NS$")) - { - word = word.replace("ies/NS$", "y"); - word = word.replace("ies'/NS$", "y"); - word = word.replace("ches/NS$", "ch"); - word = word.replace("ches'/NS$", "ch"); - word = word.replace("ses/NS$", "s"); - word = word.replace("ses'/NS$", "s"); - word = word.replace("shes/NS$", "sh"); - word = word.replace("shes'/NS$", "sh"); - word = word.replace("./NS$", ""); - word = word.replace("s'/NS$", ""); - word = word.replace("'/NS$", ""); - word = word.replace("'s/NS$", ""); - word = word.replace("s/NS$", ""); - allNounsOfCorpus.add(word.toLowerCase()); - } - //get rid of Possessives - else if (word.endsWith("/N$")) - { - word = word.replace("'s./N$", ""); - word = word.replace("./N$", ""); - word = word.replace("'s/N$", ""); - word = word.replace("s/N$", ""); - word = word.replace("'/N$", ""); - word = word.replace("/N$", ""); - allNounsOfCorpus.add(word.toLowerCase()); - } - //get rid of Plural - else if (word.endsWith("/NS")) - { - //System.out.println(word); - word = word.replace("ies/NS", "y"); - word = word.replace("ches/NS", "ch"); - word = word.replace("ses/NS", "s"); - word = word.replace("shes/NS", "sh"); - word = word.replace("./NS", ""); - word = word.replace("s/NS", ""); - word = word.replace("s'/NS", ""); - word = word.replace("'/NS", ""); - if (word.endsWith("/NS")) - { - word = word.replace("/NS", ""); - allPluralnounsOfCorpus.add(word); - } - allNounsOfCorpus.add(word.toLowerCase()); - } - else if (word.endsWith("ADJ") || 
word.endsWith("ADJR") || - word.endsWith("ADJS") || word.endsWith("ADV") || - word.endsWith("ADVR") || word.endsWith("ADVS")) - { - allAdjectivesOfCorpus.add(word); - } - } - /* - System.out.println("Gesamt Wortanzahl: " + allWordsOfCorpus.size()); - System.out.println("Anzahl Verben: " + allVerbsOfCorpus.size()); - System.out.println("Anzahl Adjektive: " + allAdjectivesOfCorpus.size()); - System.out.println("Anzahl Substantive: " + allNounsOfCorpus.size()); - */ - // create word frequency list of the nouns - Map<String, Integer> frequencyNouns = new HashMap<String,Integer>(); - Set<String> nounTypes = new HashSet<String>(allNounsOfCorpus); - for (String key : nounTypes) - { - frequencyNouns.put(key, Collections.frequency(allNounsOfCorpus, key)); - //System.out.println(key + ": " + Collections.frequency(allNounsOfCorpus, key)); - } - /* - for (String noun: frequencyNouns.keySet()) { - String key = noun.toString(); - String value = frequencyNouns.get(noun).toString(); - System.out.println(key + " " + value); - } - */ - - Map<String, Integer> suffixMorpheme = new HashMap<String,Integer>(); - Map<String, ArrayList> morphemeWordList = new HashMap<String,ArrayList>(); + //create normalized word lists (factory pattern) + WordClassFactory wordClassFactory = new WordClassFactory(); + WordClass wc = wordClassFactory.normalizeWords(wordclass, allWordsOfCorpus); + + ArrayList<String> normalizedWords = new ArrayList<String>(); +// normalizedWords = wc.getNormalizedWords(); +// for (String word : normalizedWords) +// { +// System.out.println(word); +// } + normalizedWords.add("mountainousness"); + normalizedWords.add("mountainous"); + normalizedWords.add("counterargument"); + normalizedWords.add("precondition"); + normalizedWords.add("reanimation"); + normalizedWords.add("degeneration"); + normalizedWords.add("proposition"); - for (String noun : nounTypes) - { - AffixStripper as = new AffixStripper(noun); - suffixMorpheme = as.getSuffixMorphem(); - if 
(!suffixMorpheme.isEmpty()) - { - - for (String morpheme : suffixMorpheme.keySet()) - { - ArrayList<String> WordListOfNounsWithSuffix = new ArrayList<String>(); - - if (morphemeWordList.get(morpheme)!=null)//only for the first iteration when the morphemeWordList does not contain any data - { - // keep the values of morphemeWordList that were written to it previously - WordListOfNounsWithSuffix = morphemeWordList.get(morpheme); - } - - WordListOfNounsWithSuffix.add(noun); - - morphemeWordList.put(morpheme, WordListOfNounsWithSuffix); - } - //System.out.println(noun + ": " + suffixMorpheme.keySet()); - } - } - //write csv file to manually postprocess the data - //io.writeMorphemeWordListToCSVFile(postprocessingfile, morphemeWordList); - // write all results to CSV file - io.appendResultsToCSVFile(finalresultsfile, postprocessingfile, nounTypes, allNounsOfCorpus, allWordsOfCorpus); + System.out.println("All words of type " + wordclass + " selected"); + //detect affixes in word list as a pre-processing and countercheck these with OED REST API + Affix aff = new Affix(normalizedWords, cp.getStartDate(), cp.getEndDate(), wordclass, affixtype); + Map<String, ArrayList<String>> morphemeWordList = new HashMap<String,ArrayList<String>>(); + morphemeWordList = aff.getMorphemeWordList(); + System.out.println("Affixes parsed and validated in OED"); + + System.out.println("Writing results to file"); + //calculate results and write them to file + Result rs = new Result(morphemeWordList, cp, normalizedWords, wordclass, affixtype); + System.out.println("Done!"); } - } diff --git a/Morphochron/src/Measure.java b/Morphochron/src/Measure.java deleted file mode 100644 index 7c27eab14730d070a1bae9497fa1c286c1df46ee..0000000000000000000000000000000000000000 --- a/Morphochron/src/Measure.java +++ /dev/null @@ -1,4 +0,0 @@ - -public class Measure { - -} diff --git a/Morphochron/src/Noun.java b/Morphochron/src/Noun.java new file mode 100644 index 
0000000000000000000000000000000000000000..f5ee7c98541c46f060f25b8455b10e134871bbe9
--- /dev/null
+++ b/Morphochron/src/Noun.java
@@ -0,0 +1,117 @@
+import java.util.ArrayList;
+
+/**
+ * Collects the nouns of a tagged corpus and strips their inflectional endings.
+ *
+ * Only words ending in one of the tags /N, /NS, /N$ or /NS$ are processed;
+ * all other words of the corpus are ignored.
+ */
+public class Noun implements WordClass
+{
+    private ArrayList<String> allNounsOfCorpus = new ArrayList<String>();
+    private ArrayList<String> allPluralnounsOfCorpus = new ArrayList<String>();
+    private ArrayList<String> allWordsOfCorpus = new ArrayList<String>();
+
+    /**
+     * Rebuilds the list of normalized nouns from the words handed to setWords().
+     *
+     * Tags and plural/possessive endings are removed by literal string
+     * replacement and the result is stored in lower case. Both result lists
+     * are recreated on every call, so calling this public method again
+     * (setWords() already calls it) does not duplicate entries.
+     */
+    public void deleteInflections()
+    {
+        allNounsOfCorpus = new ArrayList<String>();
+        allPluralnounsOfCorpus = new ArrayList<String>();
+
+        for (String word : allWordsOfCorpus)
+        {
+            if (word.endsWith("/N")
+                //Proper Nouns of all kinds are excluded
+                // || word.endsWith("NPR") || word.endsWith("NPR$")
+                // || word.endsWith("NPRS") || word.endsWith("NPRS$")
+                // all forms of nominalized other, e.g. the other are excluded
+                // || word.endsWith("OTHER") || word.endsWith("OTHER$")
+                // || word.endsWith("OTHERS") || word.endsWith("OTHERS$")
+                )
+            {
+                word = word.replace("/N", "");
+                allNounsOfCorpus.add(word.toLowerCase());
+            }
+            /* get rid of Possessives and Plural
+             * (Plural nouns cannot be sorted out,
+             * possible with a second loop but not
+             * worthwhile since not containing lexical morphemes) */
+            else if (word.endsWith("/NS$"))
+            {
+                word = word.replace("ies/NS$", "y");
+                word = word.replace("ies'/NS$", "y");
+                word = word.replace("ches/NS$", "ch");
+                word = word.replace("ches'/NS$", "ch");
+                word = word.replace("ses/NS$", "s");
+                word = word.replace("ses'/NS$", "s");
+                word = word.replace("shes/NS$", "sh");
+                word = word.replace("shes'/NS$", "sh");
+                word = word.replace("./NS$", "");
+                word = word.replace("s'/NS$", "");
+                word = word.replace("'/NS$", "");
+                word = word.replace("'s/NS$", "");
+                word = word.replace("s/NS$", "");
+                // fallback as in the /N$ branch: without it a word matching none
+                // of the endings above would keep its tag in the noun list
+                word = word.replace("/NS$", "");
+                allNounsOfCorpus.add(word.toLowerCase());
+            }
+            //get rid of Possessives
+            else if (word.endsWith("/N$"))
+            {
+                word = word.replace("'s./N$", "");
+                word = word.replace("./N$", "");
+                word = word.replace("'s/N$", "");
+                word = word.replace("s/N$", "");
+                word = word.replace("'/N$", "");
+                word = word.replace("/N$", "");
+                allNounsOfCorpus.add(word.toLowerCase());
+            }
+            //get rid of Plural
+            else if (word.endsWith("/NS"))
+            {
+                word = word.replace("ies/NS", "y");
+                word = word.replace("ches/NS", "ch");
+                word = word.replace("ses/NS", "s");
+                word = word.replace("shes/NS", "sh");
+                word = word.replace("./NS", "");
+                word = word.replace("s/NS", "");
+                word = word.replace("s'/NS", "");
+                word = word.replace("'/NS", "");
+                // tag still present: none of the plural endings above matched
+                if (word.endsWith("/NS"))
+                {
+                    word = word.replace("/NS", "");
+                    allPluralnounsOfCorpus.add(word);
+                }
+                allNounsOfCorpus.add(word.toLowerCase());
+            }
+        }
+    }
+
+    /**
+     * Stores the tagged words of the corpus and normalizes them right away.
+     *
+     * @param al all words of the corpus, each ending in its tag
+     */
+    public void setWords(ArrayList<String> al)
+    {
+        this.allWordsOfCorpus = al;
+        deleteInflections();
+    }
+
+    /**
+     * @return the lower-cased nouns collected by deleteInflections()
+     */
+    public ArrayList<String> getNormalizedWords()
+    {
+        return allNounsOfCorpus;
+    }
+}
diff --git a/Morphochron/src/OED.java b/Morphochron/src/OED.java
new file mode 100644
index 0000000000000000000000000000000000000000..153e2e8152c32a728cd91ac39b35e926c31dd5e7
--- /dev/null
+++ b/Morphochron/src/OED.java
@@ -0,0 +1,194 @@
+
+import javax.net.ssl.HttpsURLConnection;
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.net.URL;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.json.JSONObject;
+import org.json.JSONArray;
+
+/**
+ * Checks with the OED Researcher REST API whether a word has a given affix
+ * among its roots and whether that affix entry covers the corpus period.
+ *
+ * to get the word_ID right, concatenate the word with the following codes
+ * (this class compares them with a leading underscore, e.g. "_su01")
+ * - nn01 for Noun
+ * - vb01 for Verb
+ * - jj01 for Adjective
+ * - rb01 for Adverb
+ * - su01 for Suffix
+ * - pr01 for Prefix
+ * the running number orders the entries in the dictionary, i.e. the number of meanings
+ */
+public class OED
+{
+    private String word = "";
+    private String morpheme = "";
+    private String wordclass = "";
+    private String affixtype = "";
+    private int enddateCorpus = 0;
+    private int startdateCorpus = 0;
+    private final String app_id;
+    private final String app_key;
+    private IO restapi = new IO();
+
+    /**
+     * For each look-up the word and one of its contained affixes is needed.
+     *
+     * @param word            the word to look up
+     * @param morpheme        the affix (without hyphen) searched among the roots of the word
+     * @param wordclass       code appended to the word when comparing OED IDs, e.g. "_nn01"
+     * @param affixtype       "_pr01" for a prefix or "_su01" for a suffix
+     * @param startdateCorpus start year of the corpus period
+     * @param enddateCorpus   end year of the corpus period
+     */
+    public OED(String word, String morpheme, String wordclass, String affixtype, int startdateCorpus, int enddateCorpus)
+    {
+        this.app_id = restapi.readFile("C:\\Users\\Peukert\\Documents\\Morphochron\\id", false);
+        this.app_key = restapi.readFile("C:\\Users\\Peukert\\Documents\\Morphochron\\key", false);
+        this.word = word;
+        this.morpheme = morpheme;
+        this.wordclass = wordclass;
+        this.affixtype = affixtype;
+        this.startdateCorpus = startdateCorpus;
+        this.enddateCorpus = enddateCorpus;
+    }
+
+    /*
+     * builds the URL of the OED REST API word look-up for the given lemma
+     * NOTE(review): the lemma is inserted without URL encoding - confirm that
+     * this is safe for every word of the corpus
+     */
+    private String getRESTAPIWordRepresentation(String word)
+    {
+        return "https://oed-researcher-api.oxfordlanguages.com/oed/api/v0.2/words/?lemma=" + word;
+    }
+
+    /*
+     * builds the URL of the OED REST API roots look-up for the given word ID
+     */
+    private String getRESTAPIRootRepresentation(String wordID)
+    {
+        return "https://oed-researcher-api.oxfordlanguages.com/oed/api/v0.2/word/" + wordID + "/roots/";
+    }
+
+    /*
+     * extracts the ID from the OED word JSON response
+     * returns the ID equal to word + wordclass if there is one (words may be
+     * part of several word classes), otherwise the ID of the last entry, or ""
+     * if "data" is empty
+     * NOTE(review): the request uses the lower-cased word, this comparison the
+     * word as given - confirm that falling back to the last ID is intended
+     */
+    private String processJSonWordID(JSONObject obj)
+    {
+        String wordid = "";
+        JSONArray arr = obj.getJSONArray("data");
+        for (int i = 0; i < arr.length(); i++)
+        {
+            wordid = arr.getJSONObject(i).getString("id");
+            if (wordid.equals(word + wordclass)) break;
+        }
+        return wordid;
+    }
+
+    /*
+     * checks if the roots of a word contain an entry for the given morpheme
+     * whose date range starts before the corpus period and ends after it
+     * (lots of end dates are null, i.e. not set; 10000 is used for them)
+     * daterange.obsolete is not evaluated because it seems to be today's
+     * perspective of obsolete
+     */
+    private Boolean processJSonRoot(JSONObject obj)
+    {
+        String affix = "";
+        if (affixtype.equals("_pr01"))
+        {
+            affix = morpheme + "-";
+        }
+        else if (affixtype.equals("_su01"))
+        {
+            affix = "-" + morpheme;
+        }
+        else
+        {
+            System.out.println("Affix type not defined");
+        }
+
+        JSONArray arr = obj.getJSONArray("data");
+        for (int i = 0; i < arr.length(); i++)
+        {
+            JSONObject root = arr.getJSONObject(i);
+            if (root.getString("lemma").equals(affix))
+            {
+                // only the first root matching the affix is evaluated
+                JSONObject daterange = root.getJSONObject("daterange");
+                int endyearOED = daterange.optInt("end", 10000);
+                int startyearOED = daterange.optInt("start", 0);
+                return startyearOED < startdateCorpus && enddateCorpus < endyearOED;
+            }
+        }
+        return false;
+    }
+
+    /*
+     * processes OED API queries
+     */
+    private JSONObject getJSonResponse(String restUrl)
+    {
+        return new JSONObject(restapi.requestRESTfulAPI(restUrl, app_id, app_key));
+    }
+
+    /*
+     * 1. build URL for Word-REST API --> getRESTAPIWordRepresentation(String) String
+     * 2. get word JSON Format from Word-REST API --> getJSonResponse(String) JSONObject
+     * 3a. get the wordID out of the JSon --> processJSonWordID(JSONOBject) String
+     * 3b. get time range ?
+     * ---> same logic again <---
+     * 4. build URL for Root-REST API --> getRESTAPIRootRepresentation(String) String
+     * 5. get root JSON format from Root-REST API --> getJSonResponse(String) JSONObject
+     * 6a. get start date out of JSON --> processJSonRoot(JSONObject)
+     *
+     * returns true if the affix was found with a date range enclosing the corpus period
+     */
+    public Boolean processOEDRequest()
+    {
+        Boolean entryAvailable = false;
+        String wordJSON = getRESTAPIWordRepresentation(word.toLowerCase());
+        JSONObject jo = getJSonResponse(wordJSON);
+        String id = processJSonWordID(jo);
+        if (!id.isEmpty())
+        {
+            String s = getRESTAPIRootRepresentation(id);
+            JSONObject o = getJSonResponse(s);
+            entryAvailable = processJSonRoot(o);
+        }
+        else
+        {
+            System.out.println("Word does not exist in OED");
+        }
+        return entryAvailable;
+    }
+
+    /*
+     * delete the main method once the program is finished
+     * exists only for test purposes
+     */
+//    public static void main(String[] args)
+//    {
+//
+//        String word = "mountainousness";
+//        String morpheme ="ous";
+//        String wordclass = "_nn01";
+//        String affixtype = "_su01";
+//        int startdate = 1570;
+//        int enddate = 1639;
+//        OED ox = new OED(word, morpheme, wordclass, affixtype, startdate, enddate);
+//        ox.processOEDRequest();
+//
+//    }
+}
diff --git a/Morphochron/src/Result.java b/Morphochron/src/Result.java
new file mode 100644
index 0000000000000000000000000000000000000000..b2fc8498f42566138ac4b720d78e28d3f65f4939
--- /dev/null
+++ b/Morphochron/src/Result.java
@@ -0,0 +1,183 @@
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * Computes the measures per morpheme (types, tokens, hapaxes, P) and
+ * appends them as one CSV block to the results file.
+ */
+public class Result
+{
+
+    private Corpus cp;
+    private String affixtype = "";
+    private String wordclass = "";
+    private String postprocessingfile = "";
+    private String finalresultsfile = "";
+    private ArrayList<String> filteredWords = new ArrayList<String>();
+    private ArrayList<String> allWordsOfCorpus = new ArrayList<String>();
+    private Map<String, ArrayList<String>> morphemeWordList = new HashMap<String,ArrayList<String>>();
+
+    /**
+     * Stores the input and directly appends the resulting data set to the results file.
+     *
+     * @param morphemeWordList mapping between a morpheme and all word types it is contained in
+     * @param cp               the corpus the words were taken from
+     * @param filteredWords    all words of the respective word class (without tags)
+     * @param wordtype         word class code, e.g. "_nn01"
+     * @param affixtype        affix type code, "_su01" or "_pr01"
+     * @throws IOException passed on from writeAllMeasuresFile()
+     */
+    public Result(Map<String, ArrayList<String>> morphemeWordList, Corpus cp, ArrayList<String> filteredWords, String wordtype, String affixtype) throws IOException
+    {
+        this.morphemeWordList = morphemeWordList;
+        this.filteredWords = filteredWords;
+        this.affixtype = affixtype;
+        this.wordclass = wordtype;
+        this.cp = cp;
+        this.allWordsOfCorpus = cp.getCorpus(); //all words of the corpus irrespective of the word class (words contain tags!)
+        setFileNames();
+        // the data set is generated exactly once, inside writeAllMeasuresFile();
+        // no separate generateDataSet() call is needed here
+        writeAllMeasuresFile();
+    }
+
+    /*
+     * returns those of the given words that occur exactly once among the filtered words
+     */
+    private ArrayList<String> findHapaxes(ArrayList<String> allWordsContainingAffix, Map<String, Integer> tokenCounts)
+    {
+        ArrayList<String> hapaxes = new ArrayList<String>();
+
+        for (String wordContainingAffix : allWordsContainingAffix)
+        {
+            if (tokenCounts.getOrDefault(wordContainingAffix, 0) == 1)
+            {
+                hapaxes.add(wordContainingAffix);
+            }
+        }
+
+        return hapaxes;
+    }
+
+    /*
+     * sums up the token frequencies of all given words
+     */
+    private int calculateNumberOfAffixes(ArrayList<String> allWordsContainingAffix, Map<String, Integer> tokenCounts)
+    {
+        int numberOfAffixInCorpus = 0;
+        for (String wordContainingAffix : allWordsContainingAffix)
+        {
+            numberOfAffixInCorpus += tokenCounts.getOrDefault(wordContainingAffix, 0);
+        }
+
+        return numberOfAffixInCorpus;
+    }
+
+    /*
+     * P = number of hapaxes / number of tokens; 0.0 if there are no tokens
+     */
+    private double calculateP_Value(int hapaxtypes, int numberOfAffixes)
+    {
+        double p_value = 0.0;
+        if (numberOfAffixes != 0)
+        {
+            p_value = (double)hapaxtypes / (double)numberOfAffixes;
+        }
+
+        return p_value;
+    }
+
+    /*
+     * counts in a single pass how often each word occurs in the given list
+     */
+    private Map<String, Integer> countTokens(ArrayList<String> words)
+    {
+        Map<String, Integer> frequencyWords = new HashMap<String,Integer>();
+
+        for (String word : words)
+        {
+            frequencyWords.merge(word, 1, Integer::sum);
+        }
+
+        return frequencyWords;
+    }
+
+    private Set<String> setWordTypes(ArrayList<String> words)
+    {
+        Set<String> wordTypes = new HashSet<String>(words);
+        return wordTypes;
+    }
+
+    /*
+     * Defines Header as it appears as a heading in the CSV file
+     */
+    private String createHeader()
+    {
+        //more appropriate Naming for Header Output; kept in locals so that
+        //the codes stored in the fields are not overwritten
+        String affixName = affixtype;
+        String wordclassName = wordclass;
+        if (affixtype.equals("_su01")) affixName = "Suffixes";
+        if (affixtype.equals("_pr01")) affixName = "Prefixes";
+        if (wordclass.equals("_nn01")) wordclassName = "Nouns";
+        if (wordclass.equals("_vb01")) wordclassName = "Verbs";
+        if (wordclass.equals("_jj01")) wordclassName = "Adjectives";
+
+        return affixName + ": " + wordclassName + " in " + cp.getCorpusName() + "/" + cp.getPeriod() + " (" + cp.getStartDate() + "-" + cp.getEndDate() + ")\n"
+                + "Total Types (" + wordclassName + "): " + setWordTypes(filteredWords).size() + " of "
+                + filteredWords.size() + " " + wordclassName + " and of " + allWordsOfCorpus.size()
+                + " words in total\n\n" + "Morpheme;Contained in Words;Hapaxes;Types (V);Tokens;No Hapaxes;P\n";
+    }
+
+    /*
+     * Generates the Data string written to the CSV result file
+     */
+    private String generateDataSet()
+    {
+        StringBuilder data = new StringBuilder(createHeader());
+        // counted once up front instead of scanning all filtered words again for every word
+        Map<String, Integer> tokenCounts = countTokens(filteredWords);
+
+        for (Map.Entry<String, ArrayList<String>> entry : morphemeWordList.entrySet())
+        {
+            // allWordsContainingAffix is a list with all wordtypes containing the affix entry.getKey()
+            ArrayList<String> allWordsContainingAffix = entry.getValue();
+            ArrayList<String> hapaxes = findHapaxes(allWordsContainingAffix, tokenCounts);
+            int affixfrequencyForAllWordTokens = calculateNumberOfAffixes(allWordsContainingAffix, tokenCounts);
+            int affixfrequencyForAllWordTypes = allWordsContainingAffix.size();
+
+            data.append(entry.getKey()).append(";").append(allWordsContainingAffix).append(";").append(hapaxes).append(";")
+                .append(affixfrequencyForAllWordTypes).append(";").append(affixfrequencyForAllWordTokens).append(";")
+                .append(hapaxes.size()).append(";").append(calculateP_Value(hapaxes.size(), affixfrequencyForAllWordTokens)).append("\n");
+        }
+
+        return data.append("\n\n").toString();
+    }
+
+    private void setFileNames()
+    {
+        // location postprocessing file
+        postprocessingfile = "C:\\Users\\Peukert\\Documents\\postprocessingfile.csv";
+        // location final result file
+        finalresultsfile = "C:\\Users\\Peukert\\Documents\\resultsMorphochron.csv";
+    }
+
+    private void writePostProcessingFile()
+    {
+        IO io = new IO();
+        //write csv file to manually postprocess the data
+        io.writeMorphemeWordListToCSVFile(postprocessingfile, morphemeWordList);
+    }
+
+    private void writeAllMeasuresFile() throws IOException
+    {
+        IO io = new IO();
+        // write all results to CSV file
+        io.appendResultsToCSVFile(finalresultsfile, generateDataSet());
+    }
+}
diff --git a/Morphochron/src/Verb.java b/Morphochron/src/Verb.java
new file mode 100644
index 0000000000000000000000000000000000000000..ef8edf47e90115f913199fa135cbe0f9a64a3629
--- /dev/null
+++ b/Morphochron/src/Verb.java
@@ -0,0 +1,57 @@
+import java.util.ArrayList;
+
+/**
+ * Collects the verbs of a tagged corpus.
+ *
+ * NOTE(review): unlike Noun, the words are stored with their tag and in their
+ * inflected form - confirm whether the normalization is still to be added.
+ */
+public class Verb implements WordClass{
+
+    private ArrayList<String> allVerbsOfCorpus = new ArrayList<String>();
+    private ArrayList<String> allWordsOfCorpus = new ArrayList<String>();
+
+    /**
+     * Rebuilds the verb list from the words handed to setWords(): every word
+     * ending in one of the verb tags below is kept unchanged. The list is
+     * recreated on every call, so repeated calls do not duplicate entries.
+     */
+    public void deleteInflections()
+    {
+        allVerbsOfCorpus = new ArrayList<String>();
+
+        for (String word : allWordsOfCorpus)
+        {
+            if (word.endsWith("MD") || word.endsWith("MD0") ||
+                word.endsWith("VAG") || word.endsWith("VAN") ||
+                word.endsWith("VB") || word.endsWith("VBI") ||
+                word.endsWith("VBD") || word.endsWith("VBN") ||
+                word.endsWith("VBP"))
+            {
+                allVerbsOfCorpus.add(word);
+            }
+        }
+    }
+
+    /**
+     * Stores the tagged words of the corpus and collects the verbs right away,
+     * as Noun.setWords() does, so that getNormalizedWords() is filled without
+     * a separate deleteInflections() call.
+     *
+     * @param al all words of the corpus, each ending in its tag
+     */
+    public void setWords(ArrayList<String> al)
+    {
+        this.allWordsOfCorpus = al;
+        deleteInflections();
+    }
+
+    /**
+     * @return the verbs collected by deleteInflections()
+     */
+    public ArrayList<String> getNormalizedWords()
+    {
+        return allVerbsOfCorpus;
+    }
+
+}
diff --git a/Morphochron/src/WordClass.java b/Morphochron/src/WordClass.java
new file mode 100644
index 0000000000000000000000000000000000000000..903575d1d6daccf8185ba1ae2f9fdfe6659ec624
--- /dev/null
+++ b/Morphochron/src/WordClass.java
@@ -0,0 +1,17 @@
+import java.util.ArrayList;
+
+/**
+ * Common interface of the word classes (Noun, Verb, ...) that pick their
+ * words out of the tagged corpus.
+ */
+public interface WordClass {
+
+    /** Builds the list returned by getNormalizedWords() from the words set before. */
+    public void deleteInflections();
+
+    /** Hands over all (tagged) words of the corpus. */
+    public void setWords(ArrayList<String> al);
+
+    /** Returns the words of this word class found in the corpus. */
+    public ArrayList<String> getNormalizedWords();
+}
diff --git a/Morphochron/src/WordClassFactory.java b/Morphochron/src/WordClassFactory.java
new file mode 100644
index 0000000000000000000000000000000000000000..79d4fcbadbd2c7f902b9d37464015c4fd6554221
--- /dev/null
+++ b/Morphochron/src/WordClassFactory.java
@@ -0,0 +1,41 @@
+import java.util.ArrayList;
+
+/**
+ * Creates the WordClass implementation that belongs to a word class code.
+ */
+public class WordClassFactory {
+
+    /**
+     * Creates the word class for the given code and hands the corpus words to it.
+     *
+     * @param type             "_nn01" (Noun), "_vb01" (Verb) or "_jj01" (Adjective)
+     * @param allWordsOfCorpus all (tagged) words of the corpus
+     * @return the word class after setWords() has been called on it
+     * @throws IllegalArgumentException if type is none of the codes above
+     */
+    public WordClass normalizeWords(String type, ArrayList<String> allWordsOfCorpus)
+    {
+        WordClass wc;
+        if (type.equals("_nn01"))
+        {
+            wc = new Noun();
+        }
+        else if (type.equals("_vb01"))
+        {
+            wc = new Verb();
+        }
+        else if (type.equals("_jj01"))
+        {
+            wc = new Adjective();
+        }
+        else
+        {
+            // fail fast: going on without a word class would end in a
+            // NullPointerException at the setWords() call below
+            throw new IllegalArgumentException("Undefined word class! Use _nn01, _vb01, or _jj01 (was: " + type + ")");
+        }
+        wc.setWords(allWordsOfCorpus);
+
+        return wc;
+    }
+}