Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
M
morphochron
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Iterations
Wiki
Requirements
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Test cases
Artifacts
Deploy
Releases
Package registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Insights
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
softwaretools
morphochron
Commits
83b0a69a
Commit
83b0a69a
authored
3 years ago
by
Peukert
Browse files
Options
Downloads
Patches
Plain Diff
data extraction for nouns complete
parent
48436ba6
No related branches found
No related tags found
No related merge requests found
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
Morphochron/src/AffixStripper.java
+4
-2
4 additions, 2 deletions
Morphochron/src/AffixStripper.java
Morphochron/src/Init.java
+163
-14
163 additions, 14 deletions
Morphochron/src/Init.java
Morphochron/src/SuffixEnum.java
+1
-1
1 addition, 1 deletion
Morphochron/src/SuffixEnum.java
with
168 additions
and
17 deletions
Morphochron/src/AffixStripper.java
+
4
−
2
View file @
83b0a69a
...
...
@@ -215,7 +215,7 @@ public class AffixStripper {
private
void
analyzeWord
()
{
//analyze inflection first because it always occurs at the end of a word
inflection
=
analyzeInflection
(
wordtoken
);
inflection
=
""
;
//
analyzeInflection(wordtoken);
lemma
=
analyzeLemma
(
wordtoken
,
inflection
);
analyzePrefix
(
lemma
);
analyzeSuffix
(
lemma
);
...
...
@@ -390,9 +390,11 @@ public class AffixStripper {
for
(
SuffixEnum
sufEnum
:
SuffixEnum
.
values
())
{
String
s
=
sufEnum
.
toString
();
//System.out.println("morpheme: " + sufEnum.name() + " allomorph: " + sufEnum.getMorpheme());
if
(
restword
.
endsWith
(
s
))
{
suffixMorpheme
.
put
(
s
,
suffixMorpheme
.
size
()
+
1
);
//if the allomorphs are supposed be given to the map, use s instead of sufEnum.getMorpheme()
suffixMorpheme
.
put
(
sufEnum
.
getMorpheme
(),
suffixMorpheme
.
size
()
+
1
);
//suffixAllomorph.add(0, restword.substring(sufEnum.toString().length()));
//cut off the suffix that is added to the list
analyzeSuffix
(
restword
.
substring
(
0
,
restword
.
length
()
-
s
.
length
()));
...
...
This diff is collapsed.
Click to expand it.
Morphochron/src/Init.java
+
163
−
14
View file @
83b0a69a
import
java.util.ArrayList
;
import
java.util.Arrays
;
import
java.util.HashMap
;
import
java.util.HashSet
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Set
;
import
java.util.Collections
;
public
class
Init
{
public
static
void
main
(
String
[]
args
)
{
// read corpus file
as
list
// read
all texts of the
corpus file
in
list
String
directory
=
"C:\\Users\\Peukert\\Corpora\\PENN Corpus\\PENN-CORPORA\\PPCMBE-RELEASE-1\\corpus\\pos"
;
IO
io
=
new
IO
();
ArrayList
<
String
>
allWordsOfCorpus
=
new
ArrayList
<
String
>();
allWordsOfCorpus
=
io
.
readFilesFromDirectory
(
directory
);
// create 6 lists for each word class (A,V,N) and compounds each
ArrayList
<
String
>
allNounsOfCorpus
=
new
ArrayList
<
String
>();
ArrayList
<
String
>
allVerbsOfCorpus
=
new
ArrayList
<
String
>();
ArrayList
<
String
>
allAdjectivesOfCorpus
=
new
ArrayList
<
String
>();
allWordsOfCorpus
=
io
.
readFilesFromDirectory
(
directory
);
// make 3 lists for A,V,N
//String nl = System.getProperty("line.separator");
ArrayList
<
String
>
allCompoundNounsOfCorpus
=
new
ArrayList
<
String
>();
ArrayList
<
String
>
allComoundVerbsOfCorpus
=
new
ArrayList
<
String
>();
ArrayList
<
String
>
allCompoundAdjectivesOfCorpus
=
new
ArrayList
<
String
>();
ArrayList
<
String
>
allPluralnounsOfCorpus
=
new
ArrayList
<
String
>();
//remove inflected forms
for
(
String
word
:
allWordsOfCorpus
)
{
//handling for verbs
if
(
word
.
endsWith
(
"MD"
)
||
word
.
endsWith
(
"MD0"
)
||
word
.
endsWith
(
"VAG"
)
||
word
.
endsWith
(
"VAN"
)
||
word
.
endsWith
(
"VB"
)
||
word
.
endsWith
(
"VBI"
)
||
...
...
@@ -25,14 +38,78 @@ public class Init {
{
allVerbsOfCorpus
.
add
(
word
);
}
else
if
(
word
.
endsWith
(
"N"
)
||
word
.
endsWith
(
"N$"
)
||
word
.
endsWith
(
"NPR"
)
||
word
.
endsWith
(
"NPR$"
)
||
word
.
endsWith
(
"NPRS"
)
||
word
.
endsWith
(
"NPRS$"
)
||
word
.
endsWith
(
"NS"
)
||
word
.
endsWith
(
"NS$"
)
||
word
.
endsWith
(
"OTHER"
)
||
word
.
endsWith
(
"OTHER$"
)
||
word
.
endsWith
(
"OTHERS$"
)
||
word
.
endsWith
(
"OTHERS$"
))
// handling for compound nouns
else
if
(
word
.
endsWith
(
"+N"
)
||
word
.
endsWith
(
"+N$"
)
||
word
.
endsWith
(
"+NS"
)
||
word
.
endsWith
(
"+NS$"
)
)
{
allCompoundNounsOfCorpus
.
add
(
word
);
//System.out.println(word);
}
else
if
(
word
.
endsWith
(
"/N"
)
//Proper Nouns of all kinds are excluded
// || word.endsWith("NPR") || word.endsWith("NPR$")
// || word.endsWith("NPRS") || word.endsWith("NPRS$")
// all forms of nominalized other, e.g. the other are excluded
// || word.endsWith("OTHER") || word.endsWith("OTHER$")
// || word.endsWith("OTHERS") || word.endsWith("OTHERS$")
)
{
word
=
word
.
replace
(
"/N"
,
""
);
allNounsOfCorpus
.
add
(
word
.
toLowerCase
());
}
/*get rid of Possessives and Plural
* (Plural nouns cannot be sorted out,
* possible with a second loop but not
* worthwhile since not containing lexical morphemes)*/
else
if
(
word
.
endsWith
(
"/NS$"
))
{
word
=
word
.
replace
(
"ies/NS$"
,
"y"
);
word
=
word
.
replace
(
"ies'/NS$"
,
"y"
);
word
=
word
.
replace
(
"ches/NS$"
,
"ch"
);
word
=
word
.
replace
(
"ches'/NS$"
,
"ch"
);
word
=
word
.
replace
(
"ses/NS$"
,
"s"
);
word
=
word
.
replace
(
"ses'/NS$"
,
"s"
);
word
=
word
.
replace
(
"shes/NS$"
,
"sh"
);
word
=
word
.
replace
(
"shes'/NS$"
,
"sh"
);
word
=
word
.
replace
(
"./NS$"
,
""
);
word
=
word
.
replace
(
"s'/NS$"
,
""
);
word
=
word
.
replace
(
"'/NS$"
,
""
);
word
=
word
.
replace
(
"'s/NS$"
,
""
);
word
=
word
.
replace
(
"s/NS$"
,
""
);
allNounsOfCorpus
.
add
(
word
.
toLowerCase
());
}
//get rid of Possessives
else
if
(
word
.
endsWith
(
"/N$"
))
{
allNounsOfCorpus
.
add
(
word
);
word
=
word
.
replace
(
"'s./N$"
,
""
);
word
=
word
.
replace
(
"./N$"
,
""
);
word
=
word
.
replace
(
"'s/N$"
,
""
);
word
=
word
.
replace
(
"s/N$"
,
""
);
word
=
word
.
replace
(
"'/N$"
,
""
);
word
=
word
.
replace
(
"/N$"
,
""
);
allNounsOfCorpus
.
add
(
word
.
toLowerCase
());
}
//get rid of Plural
else
if
(
word
.
endsWith
(
"/NS"
))
{
//System.out.println(word);
word
=
word
.
replace
(
"ies/NS"
,
"y"
);
word
=
word
.
replace
(
"ches/NS"
,
"ch"
);
word
=
word
.
replace
(
"ses/NS"
,
"s"
);
word
=
word
.
replace
(
"shes/NS"
,
"sh"
);
word
=
word
.
replace
(
"./NS"
,
""
);
word
=
word
.
replace
(
"s/NS"
,
""
);
word
=
word
.
replace
(
"s'/NS"
,
""
);
word
=
word
.
replace
(
"'/NS"
,
""
);
if
(
word
.
endsWith
(
"/NS"
))
{
word
=
word
.
replace
(
"/NS"
,
""
);
allPluralnounsOfCorpus
.
add
(
word
);
}
allNounsOfCorpus
.
add
(
word
.
toLowerCase
());
}
else
if
(
word
.
endsWith
(
"ADJ"
)
||
word
.
endsWith
(
"ADJR"
)
||
word
.
endsWith
(
"ADJS"
)
||
word
.
endsWith
(
"ADV"
)
||
...
...
@@ -40,16 +117,88 @@ public class Init {
{
allAdjectivesOfCorpus
.
add
(
word
);
}
//System.out.println(word);
}
/*
System.out.println("Gesamt Wortanzahl: " + allWordsOfCorpus.size());
System.out.println("Anzahl Verben: " + allVerbsOfCorpus.size());
System.out.println("Anzahl Adjektive: " + allAdjectivesOfCorpus.size());
System.out.println("Anzahl Substantive: " + allNounsOfCorpus.size());
for
(
String
noun
:
allVerbsOfCorpus
)
*/
// create word frequency list of the nouns
Map
<
String
,
Integer
>
frequencyNouns
=
new
HashMap
<
String
,
Integer
>();
Set
<
String
>
nounTypes
=
new
HashSet
<
String
>(
allNounsOfCorpus
);
for
(
String
key
:
nounTypes
)
{
System
.
out
.
println
(
noun
);
frequencyNouns
.
put
(
key
,
Collections
.
frequency
(
allNounsOfCorpus
,
key
));
//System.out.println(key + ": " + Collections.frequency(allNounsOfCorpus, key));
}
/*
for (String noun: frequencyNouns.keySet()) {
String key = noun.toString();
String value = frequencyNouns.get(noun).toString();
System.out.println(key + " " + value);
}
*/
Map
<
String
,
Integer
>
suffixMorpheme
=
new
HashMap
<
String
,
Integer
>();
Map
<
String
,
ArrayList
>
morphemeWordList
=
new
HashMap
<
String
,
ArrayList
>();
for
(
String
noun
:
nounTypes
)
{
AffixStripper
as
=
new
AffixStripper
(
noun
);
suffixMorpheme
=
as
.
getSuffixMorphem
();
if
(!
suffixMorpheme
.
isEmpty
())
{
for
(
String
morpheme
:
suffixMorpheme
.
keySet
())
{
ArrayList
<
String
>
WordListOfNounsWithSuffix
=
new
ArrayList
<
String
>();
if
(
morphemeWordList
.
get
(
morpheme
)!=
null
)
//only for the first iteration when the morphemeWordList does not contain any data
{
// keep the values of morphemeWordList that were written to it previously
WordListOfNounsWithSuffix
=
morphemeWordList
.
get
(
morpheme
);
}
WordListOfNounsWithSuffix
.
add
(
noun
);
morphemeWordList
.
put
(
morpheme
,
WordListOfNounsWithSuffix
);
}
//System.out.println(noun + ": " + suffixMorpheme.keySet());
}
}
for
(
String
s
:
morphemeWordList
.
keySet
())
{
String
key
=
s
.
toString
();
String
value
=
morphemeWordList
.
get
(
s
).
toString
();
System
.
out
.
println
(
key
+
" "
+
value
);
System
.
out
.
print
(
"From which Hapax: "
);
ArrayList
<
String
>
HapaxLegonoma
=
new
ArrayList
<
String
>();
ArrayList
<
String
>
allWordsContainingAffix
=
new
ArrayList
<
String
>(
morphemeWordList
.
get
(
s
));
int
numberOfAffixInCorpus
=
0
;
for
(
String
wordContainingAffix
:
allWordsContainingAffix
)
{
numberOfAffixInCorpus
+=
Collections
.
frequency
(
allNounsOfCorpus
,
wordContainingAffix
);
if
(
Collections
.
frequency
(
allNounsOfCorpus
,
wordContainingAffix
)
==
1
)
{
HapaxLegonoma
.
add
(
wordContainingAffix
);
System
.
out
.
print
(
wordContainingAffix
+
" "
);
}
}
System
.
out
.
println
();
System
.
out
.
println
(
"Number of Hapaxes: "
+
HapaxLegonoma
.
size
());
System
.
out
.
println
(
"Total number of word types containing the "
+
key
+
"-morpheme: "
+
morphemeWordList
.
get
(
s
).
size
());
System
.
out
.
println
(
"Total number of word tokens containing the "
+
key
+
"-morpheme: "
+
numberOfAffixInCorpus
);
}
System
.
out
.
println
(
"size noun types: "
+
nounTypes
.
size
());
System
.
out
.
println
(
"size morphemes: "
+
morphemeWordList
.
size
());
//remove inflected forms
/**
* loop through list: for each word do:
* 0. reduce to verbs, nouns, and adjectives in three different lists
...
...
This diff is collapsed.
Click to expand it.
Morphochron/src/SuffixEnum.java
+
1
−
1
View file @
83b0a69a
...
...
@@ -88,7 +88,7 @@ public enum SuffixEnum {
ward
(
"ward"
),
wards
(
"wards"
),
ware
(
"ware"
),
uaeras
(
"ware"
),
uaras
(
"ware"
),
uaro
(
"ware"
),
waeras
(
"ware"
),
wara
(
"ware"
),
waran
(
"ware"
),
waras
(
"ware"
),
waru
(
"ware"
),
wearan
(
"ware"
),
waeren
(
"ware"
),
warae
(
"ware"
),
wick
(
"wick"
),
y
(
"y"
),
ig
(
"y"
),
ye
(
"y"
),
igan
(
"y"
),
izen
(
"y"
),
ezen
(
"y"
),
yen
(
"y"
),
ey
(
"y"
),
yl
(
"yl"
),
yne
(
"yne"
);
private
String
morpheme
;
private
final
String
morpheme
;
//constructor
SuffixEnum
(
String
morpheme
)
{
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment