Select Git revision
Corpus.java
Code owners
Assign users and groups as approvers for specific file changes. Learn more.
Corpus.java 4.86 KiB
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.regex.Pattern;
public class Corpus
{
private String corpusname = "";
private String directory = "";
private String period = "";
private String filter = "";
private int startdate = 0;
private int enddate = 0;
private ArrayList<String> allWordsOfCorpus = new ArrayList<String>();
public Corpus(String corpusname, String period, String directory)
{
this.corpusname = corpusname;
this.period = period;
this.directory = directory;
setCorpusDates();
readCorpus();
}
public String getCorpusName()
{
return corpusname;
}
public String getPeriod()
{
return period;
}
public ArrayList<String> getCorpus()
{
return allWordsOfCorpus;
}
public int getStartDate()
{
return startdate;
}
public int getEndDate()
{
return enddate;
}
// location directory of corpus
private void setCorpusDates()
{
switch (corpusname)
{
case "ppcmbe":
startdate = 1700;
enddate = 1914;
//directory = "C:\\Users\\Peukert\\Corpora\\PENN Corpus\\PENN-CORPORA\\PPCMBE-RELEASE-1\\corpus\\pos";
break;
case "ppceme":
startdate = 1500;
enddate = 1710;
//directory = "C:\\Users\\Peukert\\Corpora\\PENN Corpus\\PENN-CORPORA\\PPCEME-RELEASE-2\\corpus\\pos\\penn2";
break;
case "ppcme2":
startdate = 1150;
enddate = 1500;
//directory = "C:\\Users\\Peukert\\Corpora\\PENN Corpus\\PENN-CORPORA\\PPCME2-RELEASE-3\\corpus\\pos";
break;
}
//System.out.println(directory);
}
// set time information, method to read which files,
private void readCorpus()
{
IO io = new IO();
if (period.isEmpty())
{
filter = ".*\\.pos";
allWordsOfCorpus = io.readFilesFromDirectory(directory, filter);
}
else
{
switch (period)
{
case "m1":
startdate = 1150;
enddate = 1250;
filter = "[a-z0-9]+\\.[m][x]?[1]\\.pos";
allWordsOfCorpus = io.readFilesFromDirectory(directory, filter);
break;
case "m2":
startdate = 1250;
enddate = 1350;
filter = "[a-z0-9]+\\.[m][2]\\d?\\.pos";
allWordsOfCorpus = io.readFilesFromDirectory(directory, filter);
break;
case "m3":
startdate = 1350;
enddate = 1420;
filter = "[a-z0-9]+\\.[m][3]\\d?\\.pos";
allWordsOfCorpus = io.readFilesFromDirectory(directory, filter);
break;
case "m4":
startdate = 1420;
enddate = 1500;
filter = "[a-z0-9]+\\.[m][x]?[4]\\.pos";
allWordsOfCorpus = io.readFilesFromDirectory(directory, filter);
break;
case "e1":
startdate = 1500;
enddate = 1569;
filter = "[a-z0-9]+-e1-p2\\.pos";
allWordsOfCorpus = io.readFilesFromDirectory(directory, filter);
break;
case "e2":
startdate = 1570;
enddate = 1639;
filter = "[a-z0-9]+-e2-p2\\.pos";
allWordsOfCorpus = io.readFilesFromDirectory(directory, filter);
break;
case "e3":
startdate = 1640;
enddate = 1710;
filter = "[a-z0-9]+-e3-p2\\.pos";
allWordsOfCorpus = io.readFilesFromDirectory(directory, filter);
break;
case "Emod1":
startdate = 1700;
enddate = 1769;
filter = "[a-z0-9]+-[1][7][^789](\\d|[x]?)\\.pos";
allWordsOfCorpus = io.readFilesFromDirectory(directory, filter);
break;
case "Emod2":
startdate = 1770;
enddate = 1839;
filter = "[a-z0-9]+-[1]([7][789]|[8][0123])(\\d|[x]?)\\.pos";
allWordsOfCorpus = io.readFilesFromDirectory(directory, filter);
break;
case "Emod3":
startdate = 1840;
enddate = 1914;
filter = "[a-z0-9]+-[1]([8][456789]|[9][01])(\\d|[x]?)\\.pos";;
allWordsOfCorpus = io.readFilesFromDirectory(directory, filter);
break;
}
normalizeText();
}
}
private void normalizeText()
{
//+a for æ; +t for þ; +d for ð, +g for ȝ
for (int i = 0; i < allWordsOfCorpus.size(); i++)
{
String word = allWordsOfCorpus.get(i);
//delete $-sign of manually corrected words (see corpus docu)
if (word.startsWith("$"))
{
word = word.replaceFirst("\\$", "");
//System.out.println(word);
}
// normalize text data (only applicable for m1-m4)
if (word.matches(".*\\+[agdtAGDT][a-zA-Z0-9]*[_|/][A-Z]+") && corpusname.equals("ppcme2"))
{
word = word.replaceAll("\\+[tT]", "th");//þ
word = word.replaceAll("\\+[gG]", "z");//ȝ
word = word.replaceAll("\\+[dD]", "th");//ð
word = word.replaceAll("\\+[aA]", "ae");//æ
allWordsOfCorpus.add(word);
//System.out.println(word);
allWordsOfCorpus.remove(i);
}
}
}
/*
* delete the main method once programm is finished
* exist only for test purposes
*/
// public static void main(String[] args)
// {
// Corpus cp = new Corpus("ppcme2", "m1");
// cp.getCorpus();
// }
}