diff --git a/Morphochron/.classpath b/Morphochron/.classpath index 0e8fc5ba08e5af26b6c6c7fa1c28bf28a9c6bb61..fdfa97b5124f24df94acec4b9c81a1fe8b13ca52 100644 --- a/Morphochron/.classpath +++ b/Morphochron/.classpath @@ -17,5 +17,6 @@ <attribute name="maven.pomderived" value="true"/> </attributes> </classpathentry> + <classpathentry kind="lib" path="jgoodies-forms-1.8.0.jar" sourcepath="jgoodies-forms-1.8.0-sources.jar"/> <classpathentry kind="output" path="target/classes"/> </classpath> diff --git a/Morphochron/.project b/Morphochron/.project index 6bdb594c9784ea143e91d3c5feaa278f3cf57101..8f88abfd554dcda8675404117463435ed475ec46 100644 --- a/Morphochron/.project +++ b/Morphochron/.project @@ -15,9 +15,15 @@ <arguments> </arguments> </buildCommand> + <buildCommand> + <name>net.sourceforge.metrics.builder</name> + <arguments> + </arguments> + </buildCommand> </buildSpec> <natures> <nature>org.eclipse.m2e.core.maven2Nature</nature> <nature>org.eclipse.jdt.core.javanature</nature> + <nature>net.sourceforge.metrics.nature</nature> </natures> </projectDescription> diff --git a/Morphochron/jgoodies-forms-1.8.0-sources.jar b/Morphochron/jgoodies-forms-1.8.0-sources.jar new file mode 100644 index 0000000000000000000000000000000000000000..dc57032cd1240284bbdfabfb3548cc496c4f79f5 Binary files /dev/null and b/Morphochron/jgoodies-forms-1.8.0-sources.jar differ diff --git a/Morphochron/jgoodies-forms-1.8.0.jar b/Morphochron/jgoodies-forms-1.8.0.jar new file mode 100644 index 0000000000000000000000000000000000000000..0abf5991a74855b817da98c14533827610efb117 Binary files /dev/null and b/Morphochron/jgoodies-forms-1.8.0.jar differ diff --git a/Morphochron/src/Adjective.java b/Morphochron/src/Adjective.java index d45d4aeec8b4931242e938b04335302879085f35..fbe693e64db52e63409a4c3a1a515fba423d1e50 100644 --- a/Morphochron/src/Adjective.java +++ b/Morphochron/src/Adjective.java @@ -10,11 +10,12 @@ public class Adjective implements WordClass{ { for (String word : allWordsOfCorpus) { - 
if (word.endsWith("ADJ") || word.endsWith("ADJR") || - word.endsWith("ADJS") || word.endsWith("ADV") || - word.endsWith("ADVR") || word.endsWith("ADVS")) + //the inflections -er/est and -ier/iest are left because inflected word 1. is found in OED and 2. is monomorphemic + if (word.matches("[a-zA-Z]+[_|/](ADVR|ADJ|ADJR|ADJS|ADV|ADVS)")) { - allAdjectivesOfCorpus.add(word); + word = word.replaceAll("[_|/](ADVR|ADJR|ADJS|ADVS)", ""); + word = word.replaceAll("[_|/](ADJ|ADV)", ""); + allAdjectivesOfCorpus.add(word.toLowerCase()); } } } @@ -22,6 +23,7 @@ public class Adjective implements WordClass{ public void setWords(ArrayList<String> al) { this.allWordsOfCorpus = al; + deleteInflections(); } public ArrayList<String> getNormalizedWords() diff --git a/Morphochron/src/Affix.java b/Morphochron/src/Affix.java index 7683798a35bcd3bdb04a28393d4c81b252ef8349..c65cd574bf4e93d94163e52781260f5682480655 100644 --- a/Morphochron/src/Affix.java +++ b/Morphochron/src/Affix.java @@ -7,19 +7,32 @@ import java.util.Set; public class Affix { private Map<String, ArrayList<String>> morphemeWordList = new HashMap<String,ArrayList<String>>(); + private Map<String, ArrayList<String>> notInOEDWordList = new HashMap<String,ArrayList<String>>(); private ArrayList<String> filteredWords = new ArrayList<String>(); private String wordclass = ""; private String affixtype = ""; private int startdate = 0; private int enddate = 0; + private String app_key = ""; + private String app_id = ""; + private String resultPath = ""; + private Corpus corpus; + private String corpusName = ""; + private String corpusPeriod = ""; - public Affix(ArrayList<String> filteredWords, int startdate, int enddate, String wordclass, String affixtype) + public Affix(ArrayList<String> filteredWords, Corpus corpus, String wordclass, String affixtype, String app_key, String app_id, String resultPath) { + this.corpus = corpus; this.filteredWords = filteredWords; this.affixtype = affixtype; this.wordclass = wordclass; - 
this.startdate = startdate; - this.enddate = enddate; + this.startdate = corpus.getStartDate(); + this.enddate = corpus.getEndDate(); + this.app_key = app_key; + this.app_id = app_id; + this.resultPath = resultPath; + this.corpusName = corpus.getCorpusName(); + this.corpusPeriod = corpus.getPeriod(); processMorphemes(); } @@ -28,57 +41,85 @@ public class Affix return morphemeWordList; } + public Map<String, ArrayList<String>> getNotInOEDWordList() + { + return notInOEDWordList; + } + private void processMorphemes() { Set<String> wordTypes = new HashSet<String>(filteredWords); - Map<String, Integer> affixMorpheme = new HashMap<String,Integer>(); + //Map<String, Integer> affixMorpheme = new HashMap<String,Integer>(); //Map<String, ArrayList<String>> morphemeWordList = new HashMap<String,ArrayList<String>>(); - + + //these ints are only for feedback in the System.output + int numberOfWordTypes = wordTypes.size(); + int lookups = 0; + for (String word : wordTypes) { - AffixStripper as = new AffixStripper(word); - if (affixtype.equals("_su01")) - { - //System.out.println("Suffix morpheme list will be generated"); - affixMorpheme = as.getSuffixMorphem(); //contains all suffix morphemes found in noun - } - else if (affixtype.equals("_pr01")) - { - affixMorpheme = as.getPrefixMorphem(); //contains all prefix morphemes found in noun - } - else - { - System.out.println("Affixtype not known"); - } - if (!affixMorpheme.isEmpty()) + Map<String,Integer> affixMorpheme = new HashMap<String,Integer>(); + OED ox = new OED(word, "", wordclass, affixtype, corpus, app_key, app_id, resultPath); + Set<String> affixInWord = ox.getMorphemesOEDRequest(); + lookups++; + + if (affixInWord.size() > 0) { - for (String morpheme : affixMorpheme.keySet()) + for (String morpheme : affixInWord) { - ArrayList<String> wordsWithAffix = new ArrayList<String>(); - - if (morphemeWordList.get(morpheme)!=null)//only for the first iteration when the morphemeWordList does not contain any data + 
//System.out.println(morpheme); + if (morphemeWordList.containsKey(morpheme)) { - // keep the values of morphemeWordList that were written to it previously - wordsWithAffix = morphemeWordList.get(morpheme); - //System.out.println("First Iteration: " + morphemeWordList.get(morpheme)); + morphemeWordList.get(morpheme).add(word); } - //System.out.println(word + " " + morpheme); - //call the Oxford class and check if the morpheme occurs in the noun - OED ox = new OED(word, morpheme, wordclass, affixtype, startdate, enddate); - - if (ox.processOEDRequest()) + else { + ArrayList<String> wordsWithAffix = new ArrayList<String>(); wordsWithAffix.add(word); morphemeWordList.put(morpheme, wordsWithAffix); - //System.out.println("when OED was consulted: " + word + ": " + morpheme); } - - //if (number_of_queries == 1000) break; + } + } + else + { + System.out.println("No Morpheme Representation in OED: " + word); + AffixStripper as = new AffixStripper(word); + + if (affixtype.equals("_su01")) + { + affixMorpheme = as.getSuffixMorphem(); //contains all suffix morphemes found in noun + } + else if (affixtype.equals("_pr01")) + { + affixMorpheme = as.getPrefixMorphem(); //contains all prefix morphemes found in noun + } + else + { + System.out.println("Affixtype not known"); + } + if (!affixMorpheme.isEmpty()) + { + for (String morpheme : affixMorpheme.keySet()) + { + if (notInOEDWordList.containsKey(morpheme)) //words are iterated from a Set, so a word is never seen twice for one morpheme + { + notInOEDWordList.get(morpheme).add(word); + } + else + { + ArrayList<String> wordsWithAffixEstimate = new ArrayList<String>(); //this list contains only one word + wordsWithAffixEstimate.add(word); + notInOEDWordList.put(morpheme, wordsWithAffixEstimate); + } + } } - //System.out.println("Outside the second for-loop: " + word + ": " + affixMorpheme.keySet()); + + } + + System.out.println("Word " + lookups + " from " + numberOfWordTypes + " mapped."); + } } } -} diff --git a/Morphochron/src/Corpus.java 
b/Morphochron/src/Corpus.java index a918319ca9a9c18447fce6ada0a7ba1f461ea3e1..3c71fdf48aba40b92741aa599d11d62fca628ed0 100644 --- a/Morphochron/src/Corpus.java +++ b/Morphochron/src/Corpus.java @@ -12,11 +12,12 @@ public class Corpus private int enddate = 0; private ArrayList<String> allWordsOfCorpus = new ArrayList<String>(); - public Corpus(String corpusname, String period) + public Corpus(String corpusname, String period, String directory) { this.corpusname = corpusname; this.period = period; - setCorpusDirectory(); + this.directory = directory; + setCorpusDates(); readCorpus(); } @@ -46,24 +47,24 @@ public class Corpus } // location directory of corpus - private void setCorpusDirectory() + private void setCorpusDates() { switch (corpusname) { case "ppcmbe": startdate = 1700; enddate = 1914; - directory = "C:\\Users\\Peukert\\Corpora\\PENN Corpus\\PENN-CORPORA\\PPCMBE-RELEASE-1\\corpus\\pos"; + //directory = "C:\\Users\\Peukert\\Corpora\\PENN Corpus\\PENN-CORPORA\\PPCMBE-RELEASE-1\\corpus\\pos"; break; case "ppceme": startdate = 1500; enddate = 1710; - directory = "C:\\Users\\Peukert\\Corpora\\PENN Corpus\\PENN-CORPORA\\PPCEME-RELEASE-2\\corpus\\pos\\penn2"; + //directory = "C:\\Users\\Peukert\\Corpora\\PENN Corpus\\PENN-CORPORA\\PPCEME-RELEASE-2\\corpus\\pos\\penn2"; break; case "ppcme2": startdate = 1150; enddate = 1500; - directory = "C:\\Users\\Peukert\\Corpora\\PENN Corpus\\PENN-CORPORA\\PPCME2-RELEASE-3\\corpus\\pos"; + //directory = "C:\\Users\\Peukert\\Corpora\\PENN Corpus\\PENN-CORPORA\\PPCME2-RELEASE-3\\corpus\\pos"; break; } //System.out.println(directory); diff --git a/Morphochron/src/GUI_mainMenu.java b/Morphochron/src/GUI_mainMenu.java new file mode 100644 index 0000000000000000000000000000000000000000..23b98bbd90ddf1e5328895eb8cf729a491711535 --- /dev/null +++ b/Morphochron/src/GUI_mainMenu.java @@ -0,0 +1,783 @@ +import java.awt.BorderLayout; +import java.awt.EventQueue; + +import javax.swing.JFrame; +import javax.swing.JPanel; +import 
javax.swing.border.EmptyBorder; +import javax.swing.JLabel; +import javax.swing.JOptionPane; +import javax.swing.JTextField; +import javax.swing.JButton; +import javax.swing.border.LineBorder; +import javax.swing.filechooser.FileNameExtensionFilter; + +import java.awt.Color; +import javax.swing.JDesktopPane; +import javax.swing.JFileChooser; + +import java.awt.GridLayout; +import java.awt.GridBagLayout; +import java.awt.GridBagConstraints; +import java.awt.Insets; +import java.awt.FlowLayout; +import javax.swing.GroupLayout; +import javax.swing.GroupLayout.Alignment; +import javax.swing.BoxLayout; +import javax.swing.ButtonGroup; + +import com.jgoodies.forms.layout.FormLayout; +import com.jgoodies.forms.layout.ColumnSpec; +import com.jgoodies.forms.layout.FormSpecs; +import com.jgoodies.forms.layout.RowSpec; +import java.awt.event.ActionListener; +import java.io.File; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.Set; +import java.awt.event.ActionEvent; +import javax.swing.JCheckBox; +import javax.swing.JRadioButton; +import javax.swing.JTextArea; +import javax.swing.JTextPane; +import javax.swing.JScrollBar; +import javax.swing.JScrollPane; + +public class GUI_mainMenu extends JFrame { + + private JPanel contentPane; + private JTextField textFieldCorpusDir; + private JTextField textFieldResultDir; + private javax.swing.JFileChooser selectCorpus; + private javax.swing.JFileChooser selectResultDir; + private JTextArea resultTextArea; + private JTextField textFieldID; + private JTextField textFieldKey; + private javax.swing.JFileChooser selectCredentialsID; + private javax.swing.JFileChooser selectCredentialsKey; + private JCheckBox chckbxM1; + private JCheckBox chckbxM2; + private JCheckBox chckbxM3; + private JCheckBox chckbxM4; + private JCheckBox chckbxE1; + private JCheckBox chckbxE2; + private JCheckBox chckbxE3; + private JCheckBox chckbxEmod1; + private JCheckBox chckbxEmod2; + private JCheckBox chckbxEmod3; + private JRadioButton 
rdbtnPrefix; + private JRadioButton rdbtnSuffix; + private JRadioButton rdbtnAdjective; + private JRadioButton rdbtnNoun; + private JRadioButton rdbtnVerb; + private String corpus = ""; + private String wordclass =""; + private String affixtype = ""; + private String corpusPath = ""; + private String resultPath = ""; + private String app_id = ""; + private String app_key = ""; + private Set<String> period = new HashSet<String>(); + private JButton btnRun = new JButton("Run"); + private ButtonGroup affixGroup; + private ButtonGroup wordclassGroup; + + /** + * Launch the application. + */ +// public static void main(String[] args) { +// EventQueue.invokeLater(new Runnable() { +// public void run() { +// try { +// GUI_mainMenu frame = new GUI_mainMenu(); +// frame.setVisible(true); +// } catch (Exception e) { +// e.printStackTrace(); +// } +// } +// }); +// } + + public void setMessage(String msg) + { + resultTextArea.append(msg); + } + + public String getAppID() + { + return app_id; + } + + public String getAppKey() + { + return app_key; + } + + public String getWordClass() + { + return wordclass; + } + + public String getAffixType() + { + return affixtype; + } + + public Set<String> getPeriods() + { + return period; + } + + public String getCorpus() + { + return corpus; + } + + public String getResultPath() + { + return textFieldResultDir.getText(); + } + + public String getCorpusPath() + { + return textFieldCorpusDir.getText(); + } + + public JButton getRunButton() + { + return btnRun; + } + + private String getOSPathCorpora() + { + if (System.getProperty("os.name").startsWith("Windows")) + { + return "C:\\Users\\Peukert\\Corpora\\PENN Corpus\\PENN-CORPORA"; + } + else + { + return "/"; + } + } + + private String getOSPathID() + { + if (System.getProperty("os.name").startsWith("Windows")) + { + return "C:\\Users\\Peukert\\Documents\\Morphochron"; + } + else + { + return "/"; + } + } + /** + * Create the frame. 
+ */ + public GUI_mainMenu() { + setTitle("Morphochron"); + setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE); + setBounds(200, 200, 650, 600); + contentPane = new JPanel(); + contentPane.setBorder(new LineBorder(new Color(0, 0, 0))); + setContentPane(contentPane); + + textFieldCorpusDir = new JTextField(); + textFieldCorpusDir.setBounds(22, 46, 498, 20); + textFieldCorpusDir.setColumns(10); + + textFieldResultDir = new JTextField(); + textFieldResultDir.setBounds(22, 96, 498, 20); + textFieldResultDir.setColumns(10); + + selectCorpus = new javax.swing.JFileChooser(); + selectResultDir = new javax.swing.JFileChooser(); + selectCredentialsID = new javax.swing.JFileChooser(); + selectCredentialsKey = new javax.swing.JFileChooser(); + + JLabel lblCorpusDirectory = new JLabel("Select Corpus Directory"); + lblCorpusDirectory.setBounds(22, 27, 116, 14); + + JButton btnSelectResultDirectory = new JButton("Select Result"); + btnSelectResultDirectory.addActionListener(new ActionListener() + { + public void actionPerformed(ActionEvent e) + { + selectResultDir.setCurrentDirectory(new File(getOSPathID())); + //selectResultDir.setFileFilter( new FileNameExtensionFilter(".","pos")) ; + selectResultDir.setFileSelectionMode(JFileChooser.FILES_AND_DIRECTORIES); + int action = selectResultDir.showOpenDialog(btnSelectResultDirectory); + if (action == JFileChooser.APPROVE_OPTION) + { + File location = selectResultDir.getSelectedFile(); +// dictName = location.getAbsolutePath().toString(); + resultPath = location.getAbsolutePath();//getParent(); + } + + //selection is displayed at the left + textFieldResultDir.setText(resultPath); + } + }); + btnSelectResultDirectory.setBounds(527, 95, 99, 23); + contentPane.setLayout(null); + + JButton btnSelectCorpusDirectory = new JButton("Select Corpus"); + btnSelectCorpusDirectory.addActionListener(new ActionListener() { + public void actionPerformed(ActionEvent e) + { + selectCorpus.setCurrentDirectory(new File(getOSPathCorpora())); + 
//selectCorpus.setFileFilter( new FileNameExtensionFilter(".","pos")) ; + selectCorpus.setFileSelectionMode(JFileChooser.FILES_AND_DIRECTORIES); + int action = selectCorpus.showOpenDialog(btnSelectCorpusDirectory); + if (action == JFileChooser.APPROVE_OPTION) + { + File location = selectCorpus.getSelectedFile(); +// dictName = location.getAbsolutePath().toString(); + corpusPath = location.getParent(); + } + + //selection is displayed at the left + textFieldCorpusDir.setText(corpusPath); + if (corpusPath.matches(".*PPCMBE-RELEASE-1.*")) + { + corpus = "ppcmbe"; + chckbxM1.setEnabled(false); + chckbxM2.setEnabled(false); + chckbxM3.setEnabled(false); + chckbxM4.setEnabled(false); + chckbxM1.setSelected(false); + chckbxM2.setSelected(false); + chckbxM3.setSelected(false); + chckbxM4.setSelected(false); + + chckbxE1.setEnabled(false); + chckbxE2.setEnabled(false); + chckbxE3.setEnabled(false); + chckbxE1.setSelected(false); + chckbxE2.setSelected(false); + chckbxE3.setSelected(false); + + chckbxEmod1.setSelected(false); + chckbxEmod2.setSelected(false); + chckbxEmod3.setSelected(false); + chckbxEmod1.setEnabled(true); + chckbxEmod2.setEnabled(true); + chckbxEmod3.setEnabled(true); + period.clear(); + } + else if (corpusPath.matches(".*PPCEME-RELEASE-2.*")) + { + corpus = "ppceme"; + chckbxM1.setEnabled(false); + chckbxM2.setEnabled(false); + chckbxM3.setEnabled(false); + chckbxM4.setEnabled(false); + chckbxM1.setSelected(false); + chckbxM2.setSelected(false); + chckbxM3.setSelected(false); + chckbxM4.setSelected(false); + + chckbxE1.setEnabled(true); + chckbxE2.setEnabled(true); + chckbxE3.setEnabled(true); + chckbxE1.setSelected(false); + chckbxE2.setSelected(false); + chckbxE3.setSelected(false); + + chckbxEmod1.setSelected(false); + chckbxEmod2.setSelected(false); + chckbxEmod3.setSelected(false); + chckbxEmod1.setEnabled(false); + chckbxEmod2.setEnabled(false); + chckbxEmod3.setEnabled(false); + period.clear(); + } + else if 
(corpusPath.matches(".*PPCME2-RELEASE-3.*")) + { + corpus = "ppcme2"; //must equal the case label in Corpus.setCorpusDates(), otherwise start/end dates stay 0 + chckbxM1.setEnabled(true); + chckbxM2.setEnabled(true); + chckbxM3.setEnabled(true); + chckbxM4.setEnabled(true); + chckbxM1.setSelected(false); + chckbxM2.setSelected(false); + chckbxM3.setSelected(false); + chckbxM4.setSelected(false); + + chckbxE1.setEnabled(false); + chckbxE2.setEnabled(false); + chckbxE3.setEnabled(false); + chckbxE1.setSelected(false); + chckbxE2.setSelected(false); + chckbxE3.setSelected(false); + + chckbxEmod1.setSelected(false); + chckbxEmod2.setSelected(false); + chckbxEmod3.setSelected(false); + chckbxEmod1.setEnabled(false); + chckbxEmod2.setEnabled(false); + chckbxEmod3.setEnabled(false); + period.clear(); + } + else + { + chckbxM1.setEnabled(false); + chckbxM2.setEnabled(false); + chckbxM3.setEnabled(false); + chckbxM4.setEnabled(false); + chckbxM1.setSelected(false); + chckbxM2.setSelected(false); + chckbxM3.setSelected(false); + chckbxM4.setSelected(false); + + chckbxE1.setEnabled(false); + chckbxE2.setEnabled(false); + chckbxE3.setEnabled(false); + chckbxE1.setSelected(false); + chckbxE2.setSelected(false); + chckbxE3.setSelected(false); + + chckbxEmod1.setSelected(false); + chckbxEmod2.setSelected(false); + chckbxEmod3.setSelected(false); + chckbxEmod1.setEnabled(false); + chckbxEmod2.setEnabled(false); + chckbxEmod3.setEnabled(false); + period.clear(); + } + } + }); + btnSelectCorpusDirectory.setBounds(527, 45, 99, 23); + contentPane.add(btnSelectCorpusDirectory, "9, 1, left, top"); + contentPane.add(textFieldCorpusDir, "5, 2, left, center"); + contentPane.add(textFieldResultDir, "9, 2, left, center"); + contentPane.add(lblCorpusDirectory, "11, 2, left, center"); + + JLabel lblSelectResultDirectory = new JLabel("Select Result Directory"); + lblSelectResultDirectory.setBounds(25, 77, 113, 14); + contentPane.add(lblSelectResultDirectory, "12, 2, left, center"); + contentPane.add(btnSelectResultDirectory, "13, 2, left, top"); + + JLabel lblCredentialID = new 
JLabel("Enter OED ID or load from file"); + lblCredentialID.setBounds(28, 130, 269, 14); + contentPane.add(lblCredentialID); + + textFieldID = new JTextField(); + textFieldID.setBounds(22, 152, 498, 20); + contentPane.add(textFieldID); + textFieldID.setColumns(10); + + JButton btnCredentialsID = new JButton("Load ID"); + btnCredentialsID.addActionListener(new ActionListener() + { + public void actionPerformed(ActionEvent e) + { + selectCredentialsID.setCurrentDirectory(new File(getOSPathID())); + //selectCredentialsID.setFileFilter( new FileNameExtensionFilter(".","txt")) ; + selectCredentialsID.setFileSelectionMode(JFileChooser.FILES_AND_DIRECTORIES); + int action = selectCredentialsID.showOpenDialog(btnCredentialsID); + if (action == JFileChooser.APPROVE_OPTION) + { + File location = selectCredentialsID.getSelectedFile(); + String idName = location.getAbsolutePath().toString(); + IO credentials = new IO(); + app_id = credentials.readFile(idName, false); + } + //selection is displayed at the left + textFieldID.setText(app_id); + } + }); + btnCredentialsID.setBounds(527, 151, 99, 23); + contentPane.add(btnCredentialsID); + + textFieldKey = new JTextField(); + textFieldKey.setColumns(10); + textFieldKey.setBounds(22, 205, 498, 20); + contentPane.add(textFieldKey); + + JLabel lblCredentialKey = new JLabel("Enter OED key or load from file"); + lblCredentialKey.setBounds(22, 183, 269, 14); + contentPane.add(lblCredentialKey); + + JButton btnCredentialsKey = new JButton("Load Key"); + btnCredentialsKey.addActionListener(new ActionListener() + { + public void actionPerformed(ActionEvent e) + { + selectCredentialsKey.setCurrentDirectory(new File(getOSPathID())); + //selectCredentialsKey.setFileFilter( new FileNameExtensionFilter(".","txt")) ; + selectCredentialsKey.setFileSelectionMode(JFileChooser.FILES_AND_DIRECTORIES); + int action = selectCredentialsKey.showOpenDialog(btnCredentialsKey); + if (action == JFileChooser.APPROVE_OPTION) + { + File location = 
selectCredentialsKey.getSelectedFile(); + String idName = location.getAbsolutePath().toString(); + IO credentials = new IO(); + app_key = credentials.readFile(idName, false); + } + //selection is displayed at the left + textFieldKey.setText(app_key); + } + }); + btnCredentialsKey.setBounds(527, 204, 99, 23); + contentPane.add(btnCredentialsKey); + + chckbxM1 = new JCheckBox("M1"); + chckbxM1.addActionListener(new ActionListener() + { + public void actionPerformed(ActionEvent e) + { + if (chckbxM1.isSelected()) + { + period.add("M1"); + } + else + { + period.remove("M1"); + } + } + }); + chckbxM1.setBounds(22, 264, 49, 23); + chckbxM1.setEnabled(false); + contentPane.add(chckbxM1); + + chckbxM2 = new JCheckBox("M2"); + chckbxM2.addActionListener(new ActionListener() + { + public void actionPerformed(ActionEvent e) + { + if (chckbxM2.isSelected()) //the M2 period follows the M2 checkbox's own state + { + period.add("M2"); + } + else + { + period.remove("M2"); + } + } + }); + chckbxM2.setEnabled(false); + chckbxM2.setBounds(22, 289, 54, 23); + contentPane.add(chckbxM2); + + chckbxM3 = new JCheckBox("M3"); + chckbxM3.addActionListener(new ActionListener() + { + public void actionPerformed(ActionEvent e) + { + if (chckbxM3.isSelected()) + { + period.add("M3"); + } + else + { + period.remove("M3"); + } + } + }); + chckbxM3.setEnabled(false); + chckbxM3.setBounds(22, 317, 49, 23); + contentPane.add(chckbxM3); + + chckbxM4 = new JCheckBox("M4"); + chckbxM4.addActionListener(new ActionListener() + { + public void actionPerformed(ActionEvent e) + { + if (chckbxM4.isSelected()) + { + period.add("M4"); + } + else + { + period.remove("M4"); + } + } + }); + chckbxM4.setEnabled(false); + chckbxM4.setBounds(22, 343, 49, 23); + contentPane.add(chckbxM4); + + chckbxE1 = new JCheckBox("E1"); + chckbxE1.addActionListener(new ActionListener() + { + public void actionPerformed(ActionEvent e) + { + if (chckbxE1.isSelected()) + { + period.add("E1"); + } + else + { + period.remove("E1"); + } + } + }); + chckbxE1.setBounds(99, 264, 49, 23); 
+ chckbxE1.setEnabled(false); + contentPane.add(chckbxE1); + + chckbxE2 = new JCheckBox("E2"); + chckbxE2.addActionListener(new ActionListener() + { + public void actionPerformed(ActionEvent e) + { + if (chckbxE2.isSelected()) + { + period.add("E2"); + } + else + { + period.remove("E2"); + } + } + }); + chckbxE2.setBounds(99, 289, 41, 23); + chckbxE2.setEnabled(false); + contentPane.add(chckbxE2); + + chckbxE3 = new JCheckBox("E3"); + chckbxE3.addActionListener(new ActionListener() + { + public void actionPerformed(ActionEvent e) + { + if (chckbxE3.isSelected()) + { + period.add("E3"); + } + else + { + period.remove("E3"); + } + } + }); + chckbxE3.setBounds(99, 317, 41, 23); + chckbxE3.setEnabled(false); + contentPane.add(chckbxE3); + + chckbxEmod1 = new JCheckBox("Emod1"); + chckbxEmod1.addActionListener(new ActionListener() + { + public void actionPerformed(ActionEvent e) + { + if (chckbxEmod1.isSelected()) + { + period.add("Emod1"); + } + else + { + period.remove("Emod1"); + } + } + }); + chckbxEmod1.setBounds(171, 264, 59, 23); + chckbxEmod1.setEnabled(false); + contentPane.add(chckbxEmod1); + + chckbxEmod2 = new JCheckBox("Emod2"); + chckbxEmod2.addActionListener(new ActionListener() + { + public void actionPerformed(ActionEvent e) + { + if (chckbxEmod2.isSelected()) + { + period.add("Emod2"); + } + else + { + period.remove("Emod2"); + } + } + }); + chckbxEmod2.setBounds(171, 289, 59, 23); + chckbxEmod2.setEnabled(false); + contentPane.add(chckbxEmod2); + + chckbxEmod3 = new JCheckBox("Emod3"); + chckbxEmod3.addActionListener(new ActionListener() + { + public void actionPerformed(ActionEvent e) + { + if (chckbxEmod3.isSelected()) + { + period.add("Emod3"); + } + else + { + period.remove("Emod3"); + } + } + }); + chckbxEmod3.setBounds(171, 317, 59, 23); + chckbxEmod3.setEnabled(false); + contentPane.add(chckbxEmod3); + + JRadioButton rdbtnPrefix = new JRadioButton("Prefix"); + rdbtnPrefix.addActionListener(new ActionListener() + { + public void 
actionPerformed(ActionEvent e) + { + affixtype = "_pr01"; + } + }); + rdbtnPrefix.setBounds(416, 264, 111, 23); + contentPane.add(rdbtnPrefix); + + JRadioButton rdbtnSuffix = new JRadioButton("Suffix"); + rdbtnSuffix.addActionListener(new ActionListener() + { + public void actionPerformed(ActionEvent e) + { + affixtype = "_su01"; + } + }); + rdbtnSuffix.setBounds(416, 289, 111, 23); + contentPane.add(rdbtnSuffix); + + affixGroup = new ButtonGroup(); + affixGroup.add(rdbtnPrefix); + affixGroup.add(rdbtnSuffix); + + JButton btnCancel = new JButton("Cancel"); + btnCancel.addActionListener(new ActionListener() + { + public void actionPerformed(ActionEvent e) + { + System.exit(0); + } + }); + btnCancel.setBounds(438, 343, 89, 23); + contentPane.add(btnCancel); + + btnRun.setBounds(537, 343, 89, 23); + contentPane.add(btnRun); + + JLabel lblppcme2 = new JLabel("PPCME2"); + lblppcme2.setBounds(22, 243, 59, 14); + contentPane.add(lblppcme2); + + JLabel lblppceme = new JLabel("PPCEME"); + lblppceme.setBounds(99, 243, 62, 14); + contentPane.add(lblppceme); + + JLabel lblppcmbe = new JLabel("PPCMBE"); + lblppcmbe.setBounds(171, 243, 59, 14); + contentPane.add(lblppcmbe); + + JLabel lblAffixType = new JLabel("Affix Type"); + lblAffixType.setBounds(418, 243, 49, 14); + contentPane.add(lblAffixType); + + JLabel lblResults = new JLabel("Results"); + lblResults.setBounds(22, 371, 49, 14); + contentPane.add(lblResults); + //contentPane.add(scrollPane); + + JLabel lblWordclass = new JLabel("Word Class"); + lblWordclass.setBounds(284, 243, 99, 14); + contentPane.add(lblWordclass); + + JRadioButton rdbtnNoun = new JRadioButton("Noun"); + rdbtnNoun.addActionListener(new ActionListener() + { + public void actionPerformed(ActionEvent e) + { + wordclass = "_nn01"; + } + }); + rdbtnNoun.setBounds(291, 264, 111, 23); + contentPane.add(rdbtnNoun); + + JRadioButton rdbtnVerb = new JRadioButton("Verb"); + rdbtnVerb.addActionListener(new ActionListener() + { + public void 
actionPerformed(ActionEvent e) + { + wordclass = "_vb01"; + } + }); + rdbtnVerb.setBounds(291, 289, 111, 23); + contentPane.add(rdbtnVerb); + + JRadioButton rdbtnAdjective = new JRadioButton("Adjective"); + rdbtnAdjective.addActionListener(new ActionListener() + { + public void actionPerformed(ActionEvent e) + { + wordclass = "_jj01"; + } + }); + rdbtnAdjective.setBounds(291, 317, 111, 23); + contentPane.add(rdbtnAdjective); + + wordclassGroup = new ButtonGroup(); + wordclassGroup.add(rdbtnNoun); + wordclassGroup.add(rdbtnVerb); + wordclassGroup.add(rdbtnAdjective); + + JScrollPane scrollPane = new JScrollPane(); + scrollPane.setBounds(25, 396, 601, 140); + contentPane.add(scrollPane); + + resultTextArea = new JTextArea(); + resultTextArea.setEditable(false); + scrollPane.setViewportView(resultTextArea); + resultTextArea.setAutoscrolls(true); + + } + + public boolean validateForm() { + + StringBuilder errors = new StringBuilder(); + IO testapi = new IO(); + // Confirm mandatory fields are filled out + if (textFieldCorpusDir.getText().trim().isEmpty()) { + errors.append("- Please enter a PPC directory.\n"); + textFieldCorpusDir.requestFocusInWindow(); + JOptionPane.showMessageDialog(null, errors, "Directory not specified!", JOptionPane.ERROR_MESSAGE); + return false; + } + else + if (textFieldResultDir.getText().trim().isEmpty()) { + errors.append("- Please enter a result directory.\n"); + textFieldResultDir.requestFocusInWindow(); + JOptionPane.showMessageDialog(null, errors, "Directory not specified!", JOptionPane.ERROR_MESSAGE); + return false; + } + else if (textFieldID.getText().trim().isEmpty()) + { + errors.append("- Please enter a valid OED ID.\n"); + textFieldID.requestFocusInWindow(); + JOptionPane.showMessageDialog(null, errors, "No OED ID available!", JOptionPane.ERROR_MESSAGE); + return false; + } + else if (textFieldKey.getText().trim().isEmpty()) + { + errors.append("- Please enter a valid OED key.\n"); + textFieldKey.requestFocusInWindow(); + 
JOptionPane.showMessageDialog(null, errors, "No OED key available!", JOptionPane.ERROR_MESSAGE); + return false; + } + else if ( affixGroup.getSelection() == null ) + { + errors.append("- Please select the affix type.\n"); + JOptionPane.showMessageDialog(null, errors, "No Affix type available!", JOptionPane.ERROR_MESSAGE); + return false; + } + else if (wordclassGroup.getSelection() == null) + { + errors.append("- Please select the word class.\n"); + JOptionPane.showMessageDialog(null, errors, "No word class available!", JOptionPane.ERROR_MESSAGE); + return false; + } + else if (testapi.requestRESTfulAPI("https://oed-researcher-api.oxfordlanguages.com/oed/api/v0.2/words/?lemma=test", app_id, app_key).isEmpty()) + { + errors.append("- Your OED credentials are invalid.\n"); + JOptionPane.showMessageDialog(null, errors, "No valid OED access!", JOptionPane.ERROR_MESSAGE); + return false; + } + else + { + return true; + } + } +} diff --git a/Morphochron/src/Init.java b/Morphochron/src/Init.java index d12cb75991e1b47a545075ea2b5aa92cc3e69d5e..2bbf14c989d3d0e97ad6c06c82964ab071b06ad6 100644 --- a/Morphochron/src/Init.java +++ b/Morphochron/src/Init.java @@ -1,9 +1,14 @@ +import java.awt.EventQueue; +import java.awt.event.ActionEvent; +import java.awt.event.ActionListener; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.Map; +import java.util.Set; -public class Init { +public class Init +{ /** * Short description of the algorithm given a PENN tagged text corpus * loop through list: for each word do: @@ -15,6 +20,8 @@ public class Init { * 5. write all words that contain affix to list * 6. Check with Token-Hashmap if word in 5 is hapax legonoma */ + + public static void main(String[] args) throws IOException { /* @@ -29,53 +36,88 @@ public class Init { * Affix Type: _su01, _pr01 * * TODO: - * 1. implement GUI to have properties selected - * 2. include credentials and directories as selection + * 1. 
implement GUI to have properties selected -- Done + * 2. include credentials and directories as selection -- Done + * 3. check if selected corpus is indeed ppc + * 4. procedure to incorporate postprocessingfiles in the result (i.e. merge list from postprocessing in morphemeWordList */ - String corpus = "ppcmbe"; - String period = "Emod2"; - String wordclass ="_nn01"; - String affixtype = "_su01"; - - System.out.println( - "Selection made\ncorpus: " + corpus + "\nperiod: " + period + - "\nword class: " + wordclass + "\naffixtype: " + affixtype); - - Corpus cp = new Corpus(corpus, period); - ArrayList<String> allWordsOfCorpus = new ArrayList<String>(); - allWordsOfCorpus = cp.getCorpus(); - System.out.println("Corpus read completely and normalized"); - - //create normalized word lists (factory pattern) - WordClassFactory wordClassFactory = new WordClassFactory(); - WordClass wc = wordClassFactory.normalizeWords(wordclass, allWordsOfCorpus); - - ArrayList<String> normalizedWords = new ArrayList<String>(); -// normalizedWords = wc.getNormalizedWords(); -// for (String word : normalizedWords) -// { -// System.out.println(word); -// } - normalizedWords.add("mountainousness"); - normalizedWords.add("mountainous"); - normalizedWords.add("counterargument"); - normalizedWords.add("precondition"); - normalizedWords.add("reanimation"); - normalizedWords.add("degeneration"); - normalizedWords.add("proposition"); - - - System.out.println("All words of type " + wordclass + " selected"); - //detect affixes in word list as a pre-processing and countercheck these with OED REST API - Affix aff = new Affix(normalizedWords, cp.getStartDate(), cp.getEndDate(), wordclass, affixtype); - Map<String, ArrayList<String>> morphemeWordList = new HashMap<String,ArrayList<String>>(); - morphemeWordList = aff.getMorphemeWordList(); - - System.out.println("Affixes parsed and validated in OED"); - - System.out.println("Writing results to file"); - //calculate results and write them to file - Result 
rs = new Result(morphemeWordList, cp, normalizedWords, wordclass, affixtype); - System.out.println("Done!"); + EventQueue.invokeLater(new Runnable() { + public void run() { + + GUI_mainMenu frame = new GUI_mainMenu(); + frame.setVisible(true); + + frame.getRunButton().addActionListener(new ActionListener() + { + public void actionPerformed(ActionEvent e) + { + if (frame.validateForm()) + { + String corpus = frame.getCorpus(); + Set<String> periods = frame.getPeriods(); + String wordclass = frame.getWordClass(); + String affixtype = frame.getAffixType(); + String app_key = frame.getAppKey(); + String app_id = frame.getAppID(); + String corpusPath = frame.getCorpusPath(); + String resultPath = frame.getResultPath(); + + for (String period : periods) + { + frame.setMessage( + "Selection made\ncorpus: " + corpus + "\nperiod: " + period + + "\nword class: " + wordclass + "\naffixtype: " + affixtype + "\n"); + + Corpus cp = new Corpus(corpus, period, corpusPath); + ArrayList<String> allWordsOfCorpus = new ArrayList<String>(); + allWordsOfCorpus = cp.getCorpus(); + frame.setMessage("Corpus read completely and normalized\n"); + + //System.getProperties().list(System.out); + //create normalized word lists (factory pattern) + WordClassFactory wordClassFactory = new WordClassFactory(); + WordClass wc = wordClassFactory.normalizeWords(wordclass, allWordsOfCorpus); + + ArrayList<String> normalizedWords = new ArrayList<String>(); + normalizedWords = wc.getNormalizedWords(); + // for (String word : normalizedWords) + // { + // System.out.println(word); + // } +// normalizedWords.add("mountainousness"); +// normalizedWords.add("mountainous"); +// normalizedWords.add("consideration"); +// normalizedWords.add("precondition"); +// normalizedWords.add("restlessness"); +// normalizedWords.add("dignitary"); +// normalizedWords.add("proposition"); +// normalizedWords.add("daskommtnichtvor"); + + + frame.setMessage("All words of type " + wordclass + " selected\n"); + //detect affixes in 
word list as a pre-processing and countercheck these with OED REST API + Affix aff = new Affix(normalizedWords, cp, wordclass, affixtype, app_key, app_id, resultPath); + Map<String, ArrayList<String>> morphemeWordList = new HashMap<String,ArrayList<String>>(); + Map<String, ArrayList<String>> notInOEDWordList = new HashMap<String,ArrayList<String>>(); + morphemeWordList = aff.getMorphemeWordList(); + notInOEDWordList = aff.getNotInOEDWordList(); + + frame.setMessage("Affixes parsed and validated in OED\n"); + + frame.setMessage("Writing results to file\n"); + //calculate results and write them to file + try + { + Result rs = new Result(morphemeWordList, notInOEDWordList, cp, normalizedWords, wordclass, affixtype, resultPath); + } catch (Exception ex) { + ex.printStackTrace(); + } + frame.setMessage("Done!\n"); + } + } + } + }); + } + }); } } diff --git a/Morphochron/src/OED.java b/Morphochron/src/OED.java index 153e2e8152c32a728cd91ac39b35e926c31dd5e7..d46a583cb482e94b817eb80875e81aa4d7e79246 100644 --- a/Morphochron/src/OED.java +++ b/Morphochron/src/OED.java @@ -1,11 +1,15 @@ import javax.net.ssl.HttpsURLConnection; import java.io.BufferedReader; +import java.io.File; import java.io.IOException; import java.io.InputStreamReader; import java.net.URL; +import java.util.ArrayList; import java.util.HashMap; +import java.util.HashSet; import java.util.Map; +import java.util.Set; import org.json.JSONObject; import org.json.JSONArray; @@ -32,18 +36,22 @@ public class OED private final String app_id; private final String app_key; private IO restapi = new IO(); + private String resultPath = ""; + private Corpus corpus; //for each look up the word and one of its contained affixes is needed - public OED(String word, String morpheme, String wordclass, String affixtype, int startdateCorpus, int enddateCorpus) + public OED(String word, String morpheme, String wordclass, String affixtype, Corpus corpus, String app_key, String app_id, String resultPath) { - this.app_id = 
restapi.readFile("C:\\Users\\Peukert\\Documents\\Morphochron\\id", false); - this.app_key = restapi.readFile("C:\\Users\\Peukert\\Documents\\Morphochron\\key", false); + this.corpus = corpus; + this.app_id = app_id;//restapi.readFile("C:\\Users\\Peukert\\Documents\\Morphochron\\id", false); + this.app_key = app_key;//restapi.readFile("C:\\Users\\Peukert\\Documents\\Morphochron\\key", false); this.word = word; this.morpheme = morpheme; this.wordclass = wordclass; this.affixtype = affixtype; - this.startdateCorpus = startdateCorpus; - this.enddateCorpus = enddateCorpus; + this.startdateCorpus = corpus.getStartDate(); + this.enddateCorpus = corpus.getEndDate(); + this.resultPath = resultPath; } /* @@ -80,7 +88,7 @@ public class OED return wordid; } - /* + /* @deprecated * checks if a word has an extra entry for the given morpheme * checks if field daterange.obsolete = false and daterange.end is null */ @@ -108,6 +116,7 @@ public class OED JSONArray arr = obj.getJSONArray("data"); for (int i = 0; i < arr.length(); i++) { + //System.out.println("The following morpheme is checked for existence: " + morpheme); occurredIn = arr.getJSONObject(i).getString("lemma").equals(affix); if (occurredIn) @@ -119,10 +128,36 @@ public class OED break; } } + // && obsolete not included because it seems to be today's perspective of obselete return (occurredIn && startyearOED < startdateCorpus && enddateCorpus < endyearOED); } + private Set<String> getAllAffixesFromJSonRoot(JSONObject obj) + { + int startyearOED = 0; + int endyearOED = 0; + Set<String> affixTypes = new HashSet<String>(); + + JSONArray arr = obj.getJSONArray("data"); + for (int i = 0; i < arr.length(); i++) + { + String s = arr.getJSONObject(i).getString("lemma"); + if ((s.startsWith("-") && affixtype.equals("_su01") ) || (s.endsWith("-") && affixtype.equals("_pr01"))) + { + endyearOED = arr.getJSONObject(i).getJSONObject("daterange").optInt("end", 10000);//lots of enddates are null,i.e. 
not set + startyearOED = arr.getJSONObject(i).getJSONObject("daterange").optInt("start", 0); + //obsolete = arr.getJSONObject(i).getJSONObject("daterange").getBoolean("obsolete"); + if (startyearOED < startdateCorpus && enddateCorpus < endyearOED) + { + affixTypes.add(s.replace("-", "")); + } + } + } + + return affixTypes; + } + /* * processes OED API queries */ @@ -155,13 +190,40 @@ public class OED JSONObject o = getJSonResponse(s); entryAvailable = processJSonRoot(o); } +// else +// { +// entryAvailable = false; +// } + return entryAvailable; + } + + public Set<String> getMorphemesOEDRequest() + { + String wordJSON = getRESTAPIWordRepresentation(word.toLowerCase()); + JSONObject jo = getJSonResponse(wordJSON); + String id = processJSonWordID(jo); + Set<String> affixes = new HashSet<String>(); + if (!id.isEmpty()) + { + String s = getRESTAPIRootRepresentation(id); + JSONObject o = getJSonResponse(s); + affixes = getAllAffixesFromJSonRoot(o); + } else { - System.out.println("Word does not exist in OED"); + System.out.println("Word does not exist in OED: " + word); + String file = resultPath + File.separator + "wordsNotExistentInOED" + wordclass + "" + affixtype + "_" + corpus.getPeriod() + "-" + corpus.getCorpusName() + ".csv"; + try + { + restapi.appendResultsToCSVFile(file, word); + } + catch (IOException e) + { + System.out.println(e.getMessage()); + } } - return entryAvailable; + return affixes; } - /* * delete the main method once programm is finished * exist only for test purposes diff --git a/Morphochron/src/Result.java b/Morphochron/src/Result.java index b2fc8498f42566138ac4b720d78e28d3f65f4939..3432486ea04380cc14f6539ccde9b313461a2d1a 100644 --- a/Morphochron/src/Result.java +++ b/Morphochron/src/Result.java @@ -1,3 +1,4 @@ +import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; @@ -14,21 +15,26 @@ public class Result private String wordclass = ""; private String postprocessingfile = ""; private String 
finalresultsfile = ""; + private String resultPath = ""; private ArrayList<String> filteredWords = new ArrayList<String>(); private ArrayList<String> allWordsOfCorpus = new ArrayList<String>(); private Map<String, ArrayList<String>> morphemeWordList = new HashMap<String,ArrayList<String>>(); + private Map<String, ArrayList<String>> notInOEDWordList = new HashMap<String,ArrayList<String>>(); - public Result(Map<String, ArrayList<String>> morphemeWordList, Corpus cp, ArrayList<String> filteredWords, String wordtype, String affixtype) throws IOException + public Result(Map<String, ArrayList<String>> morphemeWordList, Map<String, ArrayList<String>> notInOEDWordList, Corpus cp, ArrayList<String> filteredWords, String wordtype, String affixtype, String resultPath) throws IOException { this.morphemeWordList = morphemeWordList; //the mapping between the morpheme and all word types it is contained in + this.notInOEDWordList = notInOEDWordList; this.filteredWords = filteredWords; //these are all the words of the respective word class without tags) this.affixtype = affixtype; this.wordclass = wordtype; this.cp = cp; this.allWordsOfCorpus = cp.getCorpus(); //all words of the corpus irrespective of the word class (words contain tags!) 
+ this.resultPath = resultPath; setFileNames(); - generateDataSet(); + //generateDataSet(morphemeWordList); writeAllMeasuresFile(); + writePostProcessingFile(); } private ArrayList<String> findHapaxes(ArrayList<String> allWordsContainingAffix) @@ -115,15 +121,15 @@ public class Result /* * Generates the Data string written to the CSV result file */ - private String generateDataSet() + private String generateDataSet(Map<String, ArrayList<String>> list) { String data = createHeader(); - for (String s : morphemeWordList.keySet()) + for (String s : list.keySet()) { String key = s.toString(); // allWordsContainingAffix is a list with all wordtypes containing one affix s.toString - ArrayList<String> allWordsContainingAffix = morphemeWordList.get(s); + ArrayList<String> allWordsContainingAffix = list.get(s); ArrayList<String> hapaxes = findHapaxes(allWordsContainingAffix); int affixfrequencyForAllWordTokens = calculateNumberOfAffixes(allWordsContainingAffix); int affixfrequencyForAllWordTypes = allWordsContainingAffix.size(); @@ -139,22 +145,22 @@ public class Result private void setFileNames() { // location postprocessing file - postprocessingfile = "C:\\Users\\Peukert\\Documents\\postprocessingfile.csv"; + postprocessingfile = resultPath + File.separator + "postprocessingfile" + wordclass + "" + affixtype + "_" + cp.getPeriod() + "-" + cp.getCorpusName() + ".csv"; // location final result file - finalresultsfile = "C:\\Users\\Peukert\\Documents\\resultsMorphochron.csv"; + finalresultsfile = resultPath + File.separator + "resultsMorphochron.csv"; } private void writePostProcessingFile() { IO io = new IO(); //write csv file to manually postprocess the data - io.writeMorphemeWordListToCSVFile(postprocessingfile, morphemeWordList); + io.writeMorphemeWordListToCSVFile(postprocessingfile, notInOEDWordList); } private void writeAllMeasuresFile() throws IOException { IO io = new IO(); // write all results to CSV file - io.appendResultsToCSVFile(finalresultsfile, 
generateDataSet()); + io.appendResultsToCSVFile(finalresultsfile, generateDataSet(morphemeWordList)); } } diff --git a/Morphochron/src/Verb.java b/Morphochron/src/Verb.java index ef8edf47e90115f913199fa135cbe0f9a64a3629..07cf75355fd51a92fc5553e11c447f1aaecea784 100644 --- a/Morphochron/src/Verb.java +++ b/Morphochron/src/Verb.java @@ -9,20 +9,21 @@ public class Verb implements WordClass{ { for (String word : allWordsOfCorpus) { - if (word.endsWith("MD") || word.endsWith("MD0") || - word.endsWith("VAG") || word.endsWith("VAN") || - word.endsWith("VB") || word.endsWith("VBI") || - word.endsWith("VBD") || word.endsWith("VBN") || - word.endsWith("VBP")) - { - allVerbsOfCorpus.add(word); - } + //inflections are left since word are found in OED and monomorphemic except 3.P Sing, which is not marked in PENN + //[a-zA-Z]+[_|/](MD|MD0|VAG|VAN|VB|VBI|VBD|VBN|VBP) + if (word.matches("[a-zA-Z]+[_|/](VBN)")) + { + word = word.replaceAll("[_|/](MD0|VAG|VAN|VBI|VBD|VBN|VBP)",""); + word = word.replaceAll("[_|/](MD|VB)",""); + allVerbsOfCorpus.add(word.toLowerCase()); + } } } public void setWords(ArrayList<String> al) { this.allWordsOfCorpus = al; + deleteInflections(); } public ArrayList<String> getNormalizedWords()