// Morphochron/src/IO.java
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UncheckedIOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.stream.Stream;
import java.util.ArrayList;
import java.util.List;
import java.util.Scanner;

/**
 * Static file helpers: tokenising every regular file below a directory and
 * reading or writing a single text file in one of two fixed encodings.
 *
 * <p>All methods are best-effort: an I/O failure is reported on
 * {@code System.err} and the method returns what it has (an empty or partial
 * result) instead of throwing.
 */
public class IO {

    /** Charset name used when the {@code unicode} flag is set (UTF-16 little-endian with BOM). */
    private static final String UNICODE_CHARSET = "UnicodeLittle";

    /** Charset name used by {@link #readFile} when the {@code unicode} flag is not set. */
    private static final String LATIN1_CHARSET = "ISO8859_1";

    /** Chunk size, in chars, used while draining a reader. */
    private static final int READ_CHUNK_CHARS = 8192;

    /**
     * Collects the whitespace-separated tokens of every regular file found
     * under {@code path} (the directory tree is walked recursively).
     *
     * <p>Files are decoded with the platform default charset, as before.
     *
     * @param path root directory (or single file) to read
     * @return all tokens in traversal order; empty if {@code path} cannot be walked
     */
    static ArrayList<String> readFilesFromDirectory(String path)
    {
        ArrayList<String> allWordsOfTexts = new ArrayList<String>();
        try (Stream<Path> paths = Files.walk(Paths.get(path)))
        {
            paths.filter(Files::isRegularFile)
                 .forEach(item -> appendTokens(item, allWordsOfTexts));
        }
        catch (IOException | UncheckedIOException e)
        {
            // Keep whatever was collected so far; just make the failure visible.
            System.err.println("IO.readFilesFromDirectory: cannot traverse '" + path + "': " + e);
        }
        return allWordsOfTexts;
    }

    /** Appends every whitespace-separated token of {@code file} to {@code sink}. */
    private static void appendTokens(Path file, List<String> sink)
    {
        try (Scanner scanner = new Scanner(file.toFile()).useDelimiter("\\s+"))
        {
            while (scanner.hasNext())
            {
                sink.add(scanner.next());
            }
            // Scanner hides read errors behind hasNext() == false; surface them.
            IOException readFailure = scanner.ioException();
            if (readFailure != null)
            {
                System.err.println("IO.readFilesFromDirectory: error while reading '" + file + "': " + readFailure);
            }
        }
        catch (FileNotFoundException e)
        {
            System.err.println("IO.readFilesFromDirectory: cannot open '" + file + "': " + e);
        }
    }

    /**
     * Reads a whole text file into a string.
     *
     * <p>The file is read in chunks until end of stream, so it is neither
     * truncated at a fixed buffer size nor does an empty file (first read
     * returns -1) cause an exception.
     *
     * @param file    path of the file to read
     * @param unicode {@code true} to decode as "UnicodeLittle", {@code false} for "ISO8859_1"
     * @return the file content, or {@code ""} if the file is empty or cannot be read
     */
    static String readFile(String file, boolean unicode)
    {
        String charsetName = unicode ? UNICODE_CHARSET : LATIN1_CHARSET;
        StringBuilder data = new StringBuilder();
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(file), charsetName)))
        {
            char[] chunk = new char[READ_CHUNK_CHARS];
            int count;
            while ((count = reader.read(chunk)) != -1)
            {
                data.append(chunk, 0, count);
            }
        }
        catch (IOException e)
        {
            System.err.println("IO.readFile: cannot read '" + file + "': " + e);
            // A failed read yields the empty string, never a partial document.
            return "";
        }
        return data.toString();
    }

    /**
     * Writes {@code data} to {@code file}, replacing any existing content.
     *
     * @param file    path of the file to write
     * @param data    text to write
     * @param unicode {@code true} to encode as "UnicodeLittle"; {@code false} to use
     *                {@link FileWriter}'s default encoding
     */
    static void writeFile(String file, String data, boolean unicode)
    {
        // Both resources are declared so the underlying stream is closed even
        // if writing or flushing fails.
        try (OutputStreamWriter target = unicode
                 ? new OutputStreamWriter(new FileOutputStream(file), UNICODE_CHARSET)
                 : new FileWriter(file);
             BufferedWriter writer = new BufferedWriter(target))
        {
            writer.write(data);
        }
        catch (IOException e)
        {
            System.err.println("IO.writeFile: cannot write '" + file + "': " + e);
        }
    }
}
b/Morphochron/src/Init.java index 2546fb6b4aa8155d758758a5e302ff26ce6457df..16deed83c8a3da7d6c30306d649163871d06ebca 100644 --- a/Morphochron/src/Init.java +++ b/Morphochron/src/Init.java @@ -1,9 +1,67 @@ +import java.util.ArrayList; + public class Init { - public static void main(String[] args) { - // TODO Auto-generated method stub - System.out.println("Hello World!"); + public static void main(String[] args) + { + // read corpus file as list + String directory = "C:\\Users\\Peukert\\Corpora\\PENN Corpus\\PENN-CORPORA\\PPCMBE-RELEASE-1\\corpus\\pos"; + IO io = new IO(); + ArrayList<String> allWordsOfCorpus = new ArrayList<String>(); + ArrayList<String> allNounsOfCorpus = new ArrayList<String>(); + ArrayList<String> allVerbsOfCorpus = new ArrayList<String>(); + ArrayList<String> allAdjectivesOfCorpus = new ArrayList<String>(); + allWordsOfCorpus = io.readFilesFromDirectory(directory); + // make 3 lists for A,V,N + //String nl = System.getProperty("line.separator"); + for (String word : allWordsOfCorpus) + { + if (word.endsWith("MD") || word.endsWith("MD0") || + word.endsWith("VAG") || word.endsWith("VAN") || + word.endsWith("VB") || word.endsWith("VBI") || + word.endsWith("VBD") || word.endsWith("VBN") || + word.endsWith("VBP")) + { + allVerbsOfCorpus.add(word); + } + else if (word.endsWith("N") || word.endsWith("N$") || + word.endsWith("NPR") || word.endsWith("NPR$") || + word.endsWith("NPRS") || word.endsWith("NPRS$") || + word.endsWith("NS") || word.endsWith("NS$") || + word.endsWith("OTHER") || word.endsWith("OTHER$") || + word.endsWith("OTHERS$") || word.endsWith("OTHERS$")) + { + allNounsOfCorpus.add(word); + } + else if (word.endsWith("ADJ") || word.endsWith("ADJR") || + word.endsWith("ADJS") || word.endsWith("ADV") || + word.endsWith("ADVR") || word.endsWith("ADVS")) + { + allAdjectivesOfCorpus.add(word); + } + //System.out.println(word); + } + System.out.println("Gesamt Wortanzahl: " + allWordsOfCorpus.size()); + System.out.println("Anzahl Verben: " + 
allVerbsOfCorpus.size()); + System.out.println("Anzahl Adjektive: " + allAdjectivesOfCorpus.size()); + System.out.println("Anzahl Substantive: " + allNounsOfCorpus.size()); + for (String noun : allVerbsOfCorpus) + { + System.out.println(noun); + } + /** + * loop through list: for each word do: + * 0. reduce to verbs, nouns, and adjectives in three different lists + * 1. Hashmap with number of each word + * 2. reduce to word types + * 3. instantiate AffixStripper + * 4. delete if no lexical affix is present + * 5. write all words that contain affix to list + * 6. Check with Token-Hashmap if word in 5 is hapax legonoma + */ + + // write to CSV file } diff --git a/Morphochron/src/TestInit.java b/Morphochron/src/TestInit.java deleted file mode 100644 index 95b05fb5a62df4c164124d1a70c1f6f4a71e4d5d..0000000000000000000000000000000000000000 --- a/Morphochron/src/TestInit.java +++ /dev/null @@ -1,9 +0,0 @@ - -public class TestInit { - - public static void main(String[] args) { - // TODO Auto-generated method stub - System.out.println("Hello World!"); - } - -}