Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
corpus-services
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Requirements
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Test cases
Artifacts
Deploy
Releases
Package registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Insights
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Lange, Dr. Herbert
corpus-services
Commits
b5ee6ac5
Commit
b5ee6ac5
authored
May 16, 2022
by
Lange, Dr. Herbert
Browse files
Options
Downloads
Patches
Plain Diff
add one parameter and change from hashmap to frequency list for all the statistics
parent
372b45e2
No related branches found
No related tags found
1 merge request
!6
add feature to load criteria file from resource and place all criteria files...
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
src/main/java/de/uni_hamburg/corpora/validation/quest/RefcoChecker.java
+34
-17
34 additions, 17 deletions
...de/uni_hamburg/corpora/validation/quest/RefcoChecker.java
with
34 additions
and
17 deletions
src/main/java/de/uni_hamburg/corpora/validation/quest/RefcoChecker.java
+
34
−
17
View file @
b5ee6ac5
...
@@ -9,6 +9,7 @@ import com.google.common.primitives.Chars;
...
@@ -9,6 +9,7 @@ import com.google.common.primitives.Chars;
import
de.uni_hamburg.corpora.*
;
import
de.uni_hamburg.corpora.*
;
import
de.uni_hamburg.corpora.utilities.quest.DictionaryAutomaton
;
import
de.uni_hamburg.corpora.utilities.quest.DictionaryAutomaton
;
import
de.uni_hamburg.corpora.utilities.quest.FileTools
;
import
de.uni_hamburg.corpora.utilities.quest.FileTools
;
import
de.uni_hamburg.corpora.utilities.quest.FrequencyList
;
import
de.uni_hamburg.corpora.utilities.quest.XMLTools
;
import
de.uni_hamburg.corpora.utilities.quest.XMLTools
;
import
de.uni_hamburg.corpora.validation.Checker
;
import
de.uni_hamburg.corpora.validation.Checker
;
import
org.apache.commons.lang.time.DurationFormatUtils
;
import
org.apache.commons.lang.time.DurationFormatUtils
;
...
@@ -557,23 +558,26 @@ public class RefcoChecker extends Checker implements CorpusFunction {
...
@@ -557,23 +558,26 @@ public class RefcoChecker extends Checker implements CorpusFunction {
/**
/**
* The frequency list of all transcription tokens in the corpus
* The frequency list of all transcription tokens in the corpus
*/
*/
private
HashMap
<
String
,
Integer
>
tokenFreq
=
new
HashMap
<>();
// private HashMap<String,Integer> tokenFreq = new HashMap<>();
private
FrequencyList
tokenFreq
=
new
FrequencyList
();
/**
/**
* The frequency list of all segmented annotation/morphology glosses in the corpus
* The frequency list of all segmented annotation/morphology glosses in the corpus
*/
*/
private
HashMap
<
String
,
Integer
>
morphemeFreq
=
new
HashMap
<>();
// private HashMap<String,Integer> morphemeFreq = new HashMap<>();
private
FrequencyList
morphemeFreq
=
new
FrequencyList
();
/**
/**
* The frequency list of all non-segmented annotation/morphology glosses in the corpus
* The frequency list of all non-segmented annotation/morphology glosses in the corpus
*/
*/
private
HashMap
<
String
,
Integer
>
glossFreq
=
new
HashMap
<>();
// private HashMap<String,Integer> glossFreq = new HashMap<>();
private
FrequencyList
glossFreq
=
new
FrequencyList
();
/**
/**
* The frequency list of all non-segmentable gloss tokens
* The frequency list of all non-segmentable gloss tokens
*/
*/
private
HashMap
<
String
,
Integer
>
missingGlossFreq
=
new
HashMap
<>();
//
private HashMap<String,Integer> missingGlossFreq = new HashMap<>();
private
FrequencyList
missingGlossFreq
=
new
FrequencyList
();
/**
/**
* The global report, will be filled by the constructor and the function applied to the complete corpus
* The global report, will be filled by the constructor and the function applied to the complete corpus
*/
*/
...
@@ -697,9 +701,9 @@ public class RefcoChecker extends Checker implements CorpusFunction {
...
@@ -697,9 +701,9 @@ public class RefcoChecker extends Checker implements CorpusFunction {
// Set the RefCo corpus
// Set the RefCo corpus
setRefcoCorpus
(
c
);
setRefcoCorpus
(
c
);
// Initialize frequency list for glosses
// Initialize frequency list for glosses
for
(
Gloss
gloss
:
criteria
.
glosses
)
{
//
for (Gloss gloss : criteria.glosses) {
morphemeFreq
.
put
(
gloss
.
gloss
,
0
);
//
morphemeFreq.put(gloss.gloss, 0);
}
//
}
// Run the generic tests and merge their reports into the current report
// Run the generic tests and merge their reports into the current report
// but flag allows skipping it
// but flag allows skipping it
if
(!
props
.
containsKey
(
"skip-documentation-check"
)
if
(!
props
.
containsKey
(
"skip-documentation-check"
)
...
@@ -711,7 +715,7 @@ public class RefcoChecker extends Checker implements CorpusFunction {
...
@@ -711,7 +715,7 @@ public class RefcoChecker extends Checker implements CorpusFunction {
function
(
cdata
,
fix
);
function
(
cdata
,
fix
);
}
}
// Check for morpheme glosses that never occurred in the complete corpus
// Check for morpheme glosses that never occurred in the complete corpus
for
(
Map
.
Entry
<
String
,
Integer
>
e
:
morphemeFreq
.
entrySet
())
{
for
(
Map
.
Entry
<
String
,
Integer
>
e
:
morphemeFreq
.
getMap
().
entrySet
())
{
if
(
e
.
getValue
()
==
0
)
if
(
e
.
getValue
()
==
0
)
report
.
addWarning
(
getFunction
(),
ReportItem
.
newParamMap
(
new
String
[]{
"function"
,
"filename"
,
"description"
,
report
.
addWarning
(
getFunction
(),
ReportItem
.
newParamMap
(
new
String
[]{
"function"
,
"filename"
,
"description"
,
"howtoFix"
},
"howtoFix"
},
...
@@ -719,11 +723,18 @@ public class RefcoChecker extends Checker implements CorpusFunction {
...
@@ -719,11 +723,18 @@ public class RefcoChecker extends Checker implements CorpusFunction {
"Corpus data: Morpheme gloss never encountered in corpus: "
+
e
.
getKey
(),
"Corpus data: Morpheme gloss never encountered in corpus: "
+
e
.
getKey
(),
"Check for potential errors or remove gloss from documentation"
}));
"Check for potential errors or remove gloss from documentation"
}));
}
}
if
(!
missingGlossFreq
.
isEmpty
()
&&
props
.
containsKey
(
"missing-gloss-stats"
)
&&
if
(!
missingGlossFreq
.
isEmpty
())
props
.
getProperty
(
"missing-gloss-stats"
).
equalsIgnoreCase
(
"true"
))
report
.
addNote
(
getFunction
(),
"Corpus data: Morpheme glosses missing from documentations:\n"
+
report
.
addNote
(
getFunction
(),
"Corpus data: Morpheme glosses missing from documentations:\n"
+
missingGlossFreq
.
keySet
().
stream
().
map
((
k
)
->
k
+
":"
+
missingGlossFreq
.
get
(
k
))
missingGlossFreq
.
toString
());
.
collect
(
Collectors
.
joining
(
"\n"
)));
// missingGlossFreq.keySet().stream().map((k) -> k + ":" + missingGlossFreq.get(k))
// .collect(Collectors.joining("\n")));
if
(!
glossFreq
.
isEmpty
()
&&
props
.
containsKey
(
"gloss-stats"
)
&&
props
.
getProperty
(
"gloss-stats"
).
equalsIgnoreCase
(
"true"
))
{
report
.
addNote
(
getFunction
(),
"Corpus data: Glosses encountered in the corpus:\n"
+
glossFreq
.
toString
());
// glossFreq.keySet().stream().map((k) -> k + ":" + glossFreq.get(k))
// .collect(Collectors.joining("\n")));
}
// Check all gloss tokens (not-segmented) for rare ones very similar to quite common ones, i.e. tokens with
// Check all gloss tokens (not-segmented) for rare ones very similar to quite common ones, i.e. tokens with
// Levenshtein difference 1 with a higher frequency count
// Levenshtein difference 1 with a higher frequency count
/*DictionaryAutomaton glossDictionary =
/*DictionaryAutomaton glossDictionary =
...
@@ -2029,7 +2040,8 @@ public class RefcoChecker extends Checker implements CorpusFunction {
...
@@ -2029,7 +2040,8 @@ public class RefcoChecker extends Checker implements CorpusFunction {
// Check if token either is a gloss or each character is in the valid characters
// Check if token either is a gloss or each character is in the valid characters
mismatch
=
false
;
mismatch
=
false
;
// Update frequency list
// Update frequency list
tokenFreq
.
compute
(
token
,(
k
,
v
)
->
(
v
==
null
)
?
1
:
v
+
1
);
//tokenFreq.compute(token,(k,v) -> (v == null) ? 1 : v + 1);
tokenFreq
.
put
(
token
);
// Token is not one of the glosses
// Token is not one of the glosses
if
(!
glosses
.
contains
(
token
))
{
if
(!
glosses
.
contains
(
token
))
{
// Check if we can segment the token using the chunks
// Check if we can segment the token using the chunks
...
@@ -2236,7 +2248,8 @@ public class RefcoChecker extends Checker implements CorpusFunction {
...
@@ -2236,7 +2248,8 @@ public class RefcoChecker extends Checker implements CorpusFunction {
List
<
String
>
segments
=
glossAutomaton
.
segmentWord
(
normalizedMorpheme
);
List
<
String
>
segments
=
glossAutomaton
.
segmentWord
(
normalizedMorpheme
);
if
(
segments
==
null
||
segments
.
isEmpty
())
{
if
(
segments
==
null
||
segments
.
isEmpty
())
{
missing
+=
1
;
missing
+=
1
;
missingGlossFreq
.
compute
(
normalizedMorpheme
,
(
k
,
v
)
->
(
v
==
null
)
?
1
:
v
+
1
);
// missingGlossFreq.compute(normalizedMorpheme, (k, v) -> (v == null) ? 1 : v + 1);
missingGlossFreq
.
put
(
normalizedMorpheme
);
// This would lead to a large number of warnings
// This would lead to a large number of warnings
try
{
try
{
// Location l = getLocation((ELANData) cd, morpheme);
// Location l = getLocation((ELANData) cd, morpheme);
...
@@ -2260,14 +2273,17 @@ public class RefcoChecker extends Checker implements CorpusFunction {
...
@@ -2260,14 +2273,17 @@ public class RefcoChecker extends Checker implements CorpusFunction {
matched
+=
1
;
matched
+=
1
;
for
(
String
segment
:
segments
)
{
for
(
String
segment
:
segments
)
{
// Remove initial periods and keep track of the count
// Remove initial periods and keep track of the count
morphemeFreq
.
compute
(
segment
.
replaceAll
(
"^\\."
,
""
),
(
k
,
v
)
->
(
v
==
null
)
?
1
:
v
+
1
);
//morphemeFreq.compute(segment.replaceAll("^\\.",""), (k, v) -> (v == null) ? 1 : v +
// 1);
morphemeFreq
.
put
(
segment
.
replaceAll
(
"^\\."
,
""
));
}
}
}
}
}
}
// OLD
// OLD
// morphemeFreq.compute(normalizedMorpheme,(k, v) -> (v == null) ? 1 : v + 1);
// morphemeFreq.compute(normalizedMorpheme,(k, v) -> (v == null) ? 1 : v + 1);
}
}
glossFreq
.
compute
(
token
,(
k
,
v
)
->
(
v
==
null
)
?
1
:
v
+
1
);
// glossFreq.compute(token,(k, v) -> (v == null) ? 1 : v + 1);
glossFreq
.
put
(
token
);
}
}
}
}
float
percentValid
=
(
float
)
matched
/(
matched
+
missing
)
;
float
percentValid
=
(
float
)
matched
/(
matched
+
missing
)
;
...
@@ -2573,6 +2589,7 @@ public class RefcoChecker extends Checker implements CorpusFunction {
...
@@ -2573,6 +2589,7 @@ public class RefcoChecker extends Checker implements CorpusFunction {
params
.
put
(
"skip-documentation-check"
,
"Flag to skip the documentation check"
);
params
.
put
(
"skip-documentation-check"
,
"Flag to skip the documentation check"
);
params
.
put
(
"skip-transcription-check"
,
"Flag to skip the transcription check"
);
params
.
put
(
"skip-transcription-check"
,
"Flag to skip the transcription check"
);
params
.
put
(
"skip-gloss-check"
,
"Flag to skip the gloss check"
);
params
.
put
(
"skip-gloss-check"
,
"Flag to skip the gloss check"
);
params
.
put
(
"gloss-stats"
,
"Includes stats about all glosses"
);
return
params
;
return
params
;
}
}
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment