Skip to content
Snippets Groups Projects
Select Git revision
  • 0d577bccdc21dc9309d51088452138880c4f2dab
  • master default protected
2 results

Corpus.java

Blame
  • Code owners
    Assign users and groups as approvers for specific file changes. Learn more.
    Corpus.java 4.86 KiB
    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.Collections;
    import java.util.regex.Pattern;
    
    /**
     * Holds the tokens of one corpus, optionally restricted to a single period.
     *
     * <p>On construction the date range is derived from the corpus name (and narrowed
     * by the period, when one is given), the files whose names match the resulting
     * file-name filter are read from {@code directory} through {@code IO}, and, for
     * period-restricted reads, the tokens are normalized.</p>
     */
    public class Corpus
    {
    	/** File-name filter used when no period is given: every {@code .pos} file. */
    	private static final String ALL_POS_FILES = ".*\\.pos";
    
    	/**
    	 * Matches a token that contains one of the escape sequences +a, +g, +d or +t
    	 * (either case) and ends in a '_', '|' or '/' separator followed by an
    	 * upper-case tag. Compiled once because it is applied to every token.
    	 */
    	private static final Pattern SPECIAL_CHARACTER_TOKEN =
    			Pattern.compile(".*\\+[agdtAGDT][a-zA-Z0-9]*[_|/][A-Z]+");
    
    	private String corpusname = "";
    	private String directory = "";
    	private String period = "";
    	private String filter = "";
    	private int startdate = 0;
    	private int enddate = 0;
    	private ArrayList<String> allWordsOfCorpus = new ArrayList<>();
    
    	/**
    	 * Creates the corpus and immediately reads its files.
    	 *
    	 * @param corpusname corpus identifier; "ppcmbe", "ppceme" and "ppcme2" have a
    	 *                   predefined date range, any other name leaves the range at 0
    	 * @param period     period key ("m1".."m4", "e1".."e3", "Emod1".."Emod3"), or
    	 *                   the empty string to read every {@code .pos} file
    	 * @param directory  directory handed to {@code IO.readFilesFromDirectory}
    	 */
    	public Corpus(String corpusname, String period, String directory)
    	{
    		this.corpusname = corpusname;
    		this.period = period;
    		this.directory = directory;
    		setCorpusDates();
    		readCorpus();
    	}
    
    	/** @return the corpus identifier passed to the constructor */
    	public String getCorpusName()
    	{
    		return corpusname;
    	}
    
    	/** @return the period key passed to the constructor (may be empty) */
    	public String getPeriod()
    	{
    		return period;
    	}
    
    	/**
    	 * @return the tokens that were read; this is the internal list itself, not a
    	 *         copy, so changes made by the caller are visible to this object
    	 */
    	public ArrayList<String> getCorpus()
    	{
    		return allWordsOfCorpus;
    	}
    
    	/** @return first year of the corpus, or of the selected period if one was recognised */
    	public int getStartDate()
    	{
    		return startdate;
    	}
    
    	/** @return last year of the corpus, or of the selected period if one was recognised */
    	public int getEndDate()
    	{
    		return enddate;
    	}
    
    	/**
    	 * Sets the overall date range of the corpus from its name. Unknown corpus names
    	 * leave both dates unchanged.
    	 */
    	private void setCorpusDates()
    	{
    		switch (corpusname)
    		{
    			case "ppcmbe":
    				startdate = 1700;
    				enddate = 1914;
    				break;
    			case "ppceme":
    				startdate = 1500;
    				enddate = 1710;
    				break;
    			case "ppcme2":
    				startdate = 1150;
    				enddate = 1500;
    				break;
    			default:
    				// unknown corpus: keep the current values
    				break;
    		}
    	}
    
    	/**
    	 * Reads the corpus files. Without a period every {@code .pos} file is read and
    	 * the tokens are left as they are. With a period, only the files matching that
    	 * period's filter are read and the tokens are normalized afterwards; an
    	 * unrecognised period reads nothing and leaves the token list empty.
    	 */
    	private void readCorpus()
    	{
    		IO io = new IO();
    		if (period.isEmpty())
    		{
    			filter = ALL_POS_FILES;
    			allWordsOfCorpus = io.readFilesFromDirectory(directory, filter);
    			// NOTE(review): a whole-corpus read is not normalized, only
    			// period-restricted reads are -- confirm that this is intended.
    			return;
    		}
    		if (selectPeriod())
    		{
    			allWordsOfCorpus = io.readFilesFromDirectory(directory, filter);
    		}
    		normalizeText();
    	}
    
    	/**
    	 * Narrows the date range and sets the file-name filter for the current period.
    	 *
    	 * @return {@code true} if the period key was recognised, {@code false} if it was
    	 *         not (dates and filter are then left untouched)
    	 */
    	private boolean selectPeriod()
    	{
    		switch (period)
    		{
    			case "m1":
    				applyPeriod(1150, 1250, "[a-z0-9]+\\.[m][x]?[1]\\.pos");
    				return true;
    			case "m2":
    				applyPeriod(1250, 1350, "[a-z0-9]+\\.[m][2]\\d?\\.pos");
    				return true;
    			case "m3":
    				applyPeriod(1350, 1420, "[a-z0-9]+\\.[m][3]\\d?\\.pos");
    				return true;
    			case "m4":
    				applyPeriod(1420, 1500, "[a-z0-9]+\\.[m][x]?[4]\\.pos");
    				return true;
    			case "e1":
    				applyPeriod(1500, 1569, "[a-z0-9]+-e1-p2\\.pos");
    				return true;
    			case "e2":
    				applyPeriod(1570, 1639, "[a-z0-9]+-e2-p2\\.pos");
    				return true;
    			case "e3":
    				applyPeriod(1640, 1710, "[a-z0-9]+-e3-p2\\.pos");
    				return true;
    			case "Emod1":
    				applyPeriod(1700, 1769, "[a-z0-9]+-[1][7][^789](\\d|[x]?)\\.pos");
    				return true;
    			case "Emod2":
    				applyPeriod(1770, 1839, "[a-z0-9]+-[1]([7][789]|[8][0123])(\\d|[x]?)\\.pos");
    				return true;
    			case "Emod3":
    				applyPeriod(1840, 1914, "[a-z0-9]+-[1]([8][456789]|[9][01])(\\d|[x]?)\\.pos");
    				return true;
    			default:
    				return false;
    		}
    	}
    
    	/** Stores the date range and file-name filter of one period. */
    	private void applyPeriod(int start, int end, String fileNameFilter)
    	{
    		startdate = start;
    		enddate = end;
    		filter = fileNameFilter;
    	}
    
    	/**
    	 * Normalizes the tokens in place, keeping their order:
    	 * <ul>
    	 *   <li>a leading '$' (marks manually corrected words, see corpus
    	 *       documentation) is removed from every token;</li>
    	 *   <li>for corpus "ppcme2" only, the escape sequences of tokens matching
    	 *       {@link #SPECIAL_CHARACTER_TOKEN} are spelled out:
    	 *       +t -> th (þ), +g -> z (ȝ), +d -> th (ð), +a -> ae (æ).</li>
    	 * </ul>
    	 */
    	private void normalizeText()
    	{
    		boolean isMiddleEnglish = corpusname.equals("ppcme2");
    		for (int i = 0; i < allWordsOfCorpus.size(); i++)
    		{
    			String original = allWordsOfCorpus.get(i);
    			String word = original;
    			if (word.startsWith("$"))
    			{
    				word = word.substring(1);
    			}
    			if (isMiddleEnglish && SPECIAL_CHARACTER_TOKEN.matcher(word).matches())
    			{
    				word = word.replaceAll("\\+[tT]", "th"); // þ
    				word = word.replaceAll("\\+[gG]", "z");  // ȝ
    				word = word.replaceAll("\\+[dD]", "th"); // ð
    				word = word.replaceAll("\\+[aA]", "ae"); // æ
    			}
    			// Overwrite the slot instead of add-at-end + remove(i): removing
    			// shifts the following token into slot i, where the loop would skip
    			// it, and appending would destroy the token order.
    			if (!word.equals(original))
    			{
    				allWordsOfCorpus.set(i, word);
    			}
    		}
    	}
    }