Skip to content
Snippets Groups Projects
Commit 48436ba6 authored by Peukert's avatar Peukert
Browse files

Textinput fertig

ArrayListen mit V, N, A erstellt
parent 253ec15a
Branches
Tags
No related merge requests found
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.stream.Stream;
import java.util.ArrayList;
import java.util.Scanner;
/**
 * Small collection of static file helpers: reading every whitespace-separated
 * word below a directory, and reading/writing a whole file as one string.
 *
 * <p>All methods are best-effort: I/O failures are reported on
 * {@code System.err} and never propagated to the caller.
 */
public class IO {

    /** Encoding name used when the {@code unicode} flag is set. */
    private static final String UNICODE_ENCODING = "UnicodeLittle";

    /** Encoding name used by {@link #readFile} when the {@code unicode} flag is not set. */
    private static final String LATIN1_ENCODING = "ISO8859_1";

    /**
     * Collects the whitespace-separated words of every regular file found
     * below {@code path} (the directory tree is walked recursively).
     *
     * <p>Files are decoded with the platform default charset, because the
     * {@link Scanner} is created without an explicit one.
     *
     * @param path directory (or single file) to start walking from
     * @return all words found; whatever was collected before a failure is
     *         still returned, and the list is empty if nothing could be read
     */
    static ArrayList<String> readFilesFromDirectory(String path)
    {
        ArrayList<String> allWordsOfTexts = new ArrayList<String>();
        try (Stream<Path> paths = Files.walk(Paths.get(path))) {
            paths.filter(Files::isRegularFile)
                 .forEach(item -> appendWords(item, allWordsOfTexts));
        }
        catch (IOException e) {
            System.err.println("IO.readFilesFromDirectory: cannot walk '" + path + "': " + e);
        }
        catch (java.io.UncheckedIOException e) {
            // Files.walk reports failures that happen during traversal unchecked.
            System.err.println("IO.readFilesFromDirectory: error while walking '" + path + "': " + e);
        }
        return allWordsOfTexts;
    }

    /** Appends every whitespace-separated token of {@code file} to {@code target}. */
    private static void appendWords(Path file, ArrayList<String> target)
    {
        try (Scanner s = new Scanner(new File(file.toString())).useDelimiter("\\s+")) {
            while (s.hasNext()) {
                target.add(s.next());
            }
        }
        catch (FileNotFoundException e) {
            // The file vanished or is unreadable: skip it, keep the other files.
            System.err.println("IO.readFilesFromDirectory: skipping '" + file + "': " + e);
        }
    }

    /**
     * Reads the complete content of a file into a string.
     *
     * @param file    path of the file to read
     * @param unicode {@code true} to decode as "UnicodeLittle",
     *                {@code false} to decode as "ISO8859_1"
     * @return the file content; the empty string if the file is empty or
     *         could not be read
     */
    static String readFile(String file, boolean unicode)
    {
        StringBuilder data = new StringBuilder();
        char[] buffer = new char[8192];
        String encoding = unicode ? UNICODE_ENCODING : LATIN1_ENCODING;
        try (FileInputStream in = new FileInputStream(file);
             BufferedReader fr = new BufferedReader(new InputStreamReader(in, encoding)))
        {
            int count;
            // Loop until end of stream: one read() call is not guaranteed to
            // deliver the whole file, and it returns -1 for an empty file.
            while ((count = fr.read(buffer)) != -1) {
                data.append(buffer, 0, count);
            }
        }
        catch (IOException e) {
            System.err.println("IO.readFile: cannot read '" + file + "': " + e);
            return "";
        }
        return data.toString();
    }

    /**
     * Writes {@code data} to {@code file}, replacing any existing content.
     *
     * <p>NOTE(review): with {@code unicode == false} the platform default
     * charset is used (plain {@link FileWriter}), whereas {@link #readFile}
     * decodes as "ISO8859_1" -- non-ASCII text may not round-trip; confirm
     * which encoding is intended.
     *
     * @param file    path of the file to write
     * @param data    text to write
     * @param unicode {@code true} to encode as "UnicodeLittle",
     *                {@code false} to use the platform default charset
     */
    static void writeFile(String file, String data, boolean unicode)
    {
        try (BufferedWriter bw = new BufferedWriter(
                unicode
                    ? new OutputStreamWriter(new FileOutputStream(file), UNICODE_ENCODING)
                    : new FileWriter(file)))
        {
            bw.write(data);
        }
        catch (IOException e) {
            System.err.println("IO.writeFile: cannot write '" + file + "': " + e);
        }
    }
}
import java.util.ArrayList;
public class Init { public class Init {
public static void main(String[] args) { public static void main(String[] args)
// TODO Auto-generated method stub {
System.out.println("Hello World!"); // read corpus file as list
String directory = "C:\\Users\\Peukert\\Corpora\\PENN Corpus\\PENN-CORPORA\\PPCMBE-RELEASE-1\\corpus\\pos";
IO io = new IO();
ArrayList<String> allWordsOfCorpus = new ArrayList<String>();
ArrayList<String> allNounsOfCorpus = new ArrayList<String>();
ArrayList<String> allVerbsOfCorpus = new ArrayList<String>();
ArrayList<String> allAdjectivesOfCorpus = new ArrayList<String>();
allWordsOfCorpus = io.readFilesFromDirectory(directory);
// make 3 lists for A,V,N
//String nl = System.getProperty("line.separator");
for (String word : allWordsOfCorpus)
{
if (word.endsWith("MD") || word.endsWith("MD0") ||
word.endsWith("VAG") || word.endsWith("VAN") ||
word.endsWith("VB") || word.endsWith("VBI") ||
word.endsWith("VBD") || word.endsWith("VBN") ||
word.endsWith("VBP"))
{
allVerbsOfCorpus.add(word);
}
else if (word.endsWith("N") || word.endsWith("N$") ||
word.endsWith("NPR") || word.endsWith("NPR$") ||
word.endsWith("NPRS") || word.endsWith("NPRS$") ||
word.endsWith("NS") || word.endsWith("NS$") ||
word.endsWith("OTHER") || word.endsWith("OTHER$") ||
word.endsWith("OTHERS$") || word.endsWith("OTHERS$"))
{
allNounsOfCorpus.add(word);
}
else if (word.endsWith("ADJ") || word.endsWith("ADJR") ||
word.endsWith("ADJS") || word.endsWith("ADV") ||
word.endsWith("ADVR") || word.endsWith("ADVS"))
{
allAdjectivesOfCorpus.add(word);
}
//System.out.println(word);
}
System.out.println("Gesamt Wortanzahl: " + allWordsOfCorpus.size());
System.out.println("Anzahl Verben: " + allVerbsOfCorpus.size());
System.out.println("Anzahl Adjektive: " + allAdjectivesOfCorpus.size());
System.out.println("Anzahl Substantive: " + allNounsOfCorpus.size());
for (String noun : allVerbsOfCorpus)
{
System.out.println(noun);
}
/**
* loop through list: for each word do:
* 0. reduce to verbs, nouns, and adjectives in three different lists
* 1. Hashmap with number of each word
* 2. reduce to word types
* 3. instantiate AffixStripper
* 4. delete if no lexical affix is present
* 5. write all words that contain affix to list
* 6. Check with Token-Hashmap if word in 5 is hapax legonoma
*/
// write to CSV file
} }
......
/**
 * Minimal stand-alone program that only prints a fixed greeting.
 */
public class TestInit {

    /**
     * Prints the greeting line to standard output.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args)
    {
        final String greeting = "Hello World!";
        System.out.println(greeting);
    }
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment