Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
corpus-services
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Requirements
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Test cases
Artifacts
Deploy
Releases
Package registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Insights
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Lange, Dr. Herbert
corpus-services
Commits
b5ee6ac5
Commit
b5ee6ac5
authored
3 years ago
by
Lange, Dr. Herbert
Browse files
Options
Downloads
Patches
Plain Diff
add one parameter and change from hashmap to frequency list for all the statistics
parent
372b45e2
Branches
Branches containing commit
No related tags found
1 merge request
!6
add feature to load criteria file from resource and place all criteria files...
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
src/main/java/de/uni_hamburg/corpora/validation/quest/RefcoChecker.java
+34
-17
34 additions, 17 deletions
...de/uni_hamburg/corpora/validation/quest/RefcoChecker.java
with
34 additions
and
17 deletions
src/main/java/de/uni_hamburg/corpora/validation/quest/RefcoChecker.java
+
34
−
17
View file @
b5ee6ac5
...
...
@@ -9,6 +9,7 @@ import com.google.common.primitives.Chars;
import
de.uni_hamburg.corpora.*
;
import
de.uni_hamburg.corpora.utilities.quest.DictionaryAutomaton
;
import
de.uni_hamburg.corpora.utilities.quest.FileTools
;
import
de.uni_hamburg.corpora.utilities.quest.FrequencyList
;
import
de.uni_hamburg.corpora.utilities.quest.XMLTools
;
import
de.uni_hamburg.corpora.validation.Checker
;
import
org.apache.commons.lang.time.DurationFormatUtils
;
...
...
@@ -557,23 +558,26 @@ public class RefcoChecker extends Checker implements CorpusFunction {
/**
* The frequency list of all transcription tokens in the corpus
*/
private
HashMap
<
String
,
Integer
>
tokenFreq
=
new
HashMap
<>();
// private HashMap<String,Integer> tokenFreq = new HashMap<>();
private
FrequencyList
tokenFreq
=
new
FrequencyList
();
/**
* The frequency list of all segmented annotation/morphology glosses in the corpus
*/
private
HashMap
<
String
,
Integer
>
morphemeFreq
=
new
HashMap
<>();
// private HashMap<String,Integer> morphemeFreq = new HashMap<>();
private
FrequencyList
morphemeFreq
=
new
FrequencyList
();
/**
* The frequency list of all non-segmented annotation/morphology glosses in the corpus
*/
private
HashMap
<
String
,
Integer
>
glossFreq
=
new
HashMap
<>();
// private HashMap<String,Integer> glossFreq = new HashMap<>();
private
FrequencyList
glossFreq
=
new
FrequencyList
();
/**
* The frequency list of all non-segmentable gloss tokens
*/
private
HashMap
<
String
,
Integer
>
missingGlossFreq
=
new
HashMap
<>();
// private HashMap<String,Integer> missingGlossFreq = new HashMap<>();
private
FrequencyList
missingGlossFreq
=
new
FrequencyList
();
/**
* The global report, will be filled by the constructor and the function applied to the complete corpus
*/
...
...
@@ -697,9 +701,9 @@ public class RefcoChecker extends Checker implements CorpusFunction {
// Set the RefCo corpus
setRefcoCorpus
(
c
);
// Initialize frequency list for glosses
for
(
Gloss
gloss
:
criteria
.
glosses
)
{
morphemeFreq
.
put
(
gloss
.
gloss
,
0
);
}
// for (Gloss gloss : criteria.glosses) {
//     morphemeFreq.put(gloss.gloss, 0);
// }
// Run the generic tests and merge their reports into the current report
// but flag allows skipping it
if
(!
props
.
containsKey
(
"skip-documentation-check"
)
...
...
@@ -711,7 +715,7 @@ public class RefcoChecker extends Checker implements CorpusFunction {
function
(
cdata
,
fix
);
}
// Check for morpheme glosses that never occurred in the complete corpus
for
(
Map
.
Entry
<
String
,
Integer
>
e
:
morphemeFreq
.
entrySet
())
{
for
(
Map
.
Entry
<
String
,
Integer
>
e
:
morphemeFreq
.
getMap
().
entrySet
())
{
if
(
e
.
getValue
()
==
0
)
report
.
addWarning
(
getFunction
(),
ReportItem
.
newParamMap
(
new
String
[]{
"function"
,
"filename"
,
"description"
,
"howtoFix"
},
...
...
@@ -719,11 +723,18 @@ public class RefcoChecker extends Checker implements CorpusFunction {
"Corpus data: Morpheme gloss never encountered in corpus: "
+
e
.
getKey
(),
"Check for potential errors or remove gloss from documentation"
}));
}
if
(!
missingGlossFreq
.
isEmpty
()
&&
props
.
containsKey
(
"missing-gloss-stats"
)
&&
props
.
getProperty
(
"missing-gloss-stats"
).
equalsIgnoreCase
(
"true"
))
if
(!
missingGlossFreq
.
isEmpty
())
report
.
addNote
(
getFunction
(),
"Corpus data: Morpheme glosses missing from documentations:\n"
+
missingGlossFreq
.
keySet
().
stream
().
map
((
k
)
->
k
+
":"
+
missingGlossFreq
.
get
(
k
))
.
collect
(
Collectors
.
joining
(
"\n"
)));
missingGlossFreq
.
toString
());
// missingGlossFreq.keySet().stream().map((k) -> k + ":" + missingGlossFreq.get(k))
// .collect(Collectors.joining("\n")));
if
(!
glossFreq
.
isEmpty
()
&&
props
.
containsKey
(
"gloss-stats"
)
&&
props
.
getProperty
(
"gloss-stats"
).
equalsIgnoreCase
(
"true"
))
{
report
.
addNote
(
getFunction
(),
"Corpus data: Glosses encountered in the corpus:\n"
+
glossFreq
.
toString
());
// glossFreq.keySet().stream().map((k) -> k + ":" + glossFreq.get(k))
// .collect(Collectors.joining("\n")));
}
// Check all gloss tokens (not-segmented) for rare ones very similar to quite common ones, i.e. tokens with
// Levenshtein difference 1 with a higher frequency count
/*DictionaryAutomaton glossDictionary =
...
...
@@ -2029,7 +2040,8 @@ public class RefcoChecker extends Checker implements CorpusFunction {
// Check if token either is a gloss or each character is in the valid characters
mismatch
=
false
;
// Update frequency list
tokenFreq
.
compute
(
token
,(
k
,
v
)
->
(
v
==
null
)
?
1
:
v
+
1
);
//tokenFreq.compute(token,(k,v) -> (v == null) ? 1 : v + 1);
tokenFreq
.
put
(
token
);
// Token is not one of the glosses
if
(!
glosses
.
contains
(
token
))
{
// Check if we can segment the token using the chunks
...
...
@@ -2236,7 +2248,8 @@ public class RefcoChecker extends Checker implements CorpusFunction {
List
<
String
>
segments
=
glossAutomaton
.
segmentWord
(
normalizedMorpheme
);
if
(
segments
==
null
||
segments
.
isEmpty
())
{
missing
+=
1
;
missingGlossFreq
.
compute
(
normalizedMorpheme
,
(
k
,
v
)
->
(
v
==
null
)
?
1
:
v
+
1
);
// missingGlossFreq.compute(normalizedMorpheme, (k, v) -> (v == null) ? 1 : v + 1);
missingGlossFreq
.
put
(
normalizedMorpheme
);
// This would lead to a large number of warnings
try
{
// Location l = getLocation((ELANData) cd, morpheme);
...
...
@@ -2260,14 +2273,17 @@ public class RefcoChecker extends Checker implements CorpusFunction {
matched
+=
1
;
for
(
String
segment
:
segments
)
{
// Remove initial periods and keep track of the count
morphemeFreq
.
compute
(
segment
.
replaceAll
(
"^\\."
,
""
),
(
k
,
v
)
->
(
v
==
null
)
?
1
:
v
+
1
);
//morphemeFreq.compute(segment.replaceAll("^\\.",""), (k, v) -> (v == null) ? 1 : v +
// 1);
morphemeFreq
.
put
(
segment
.
replaceAll
(
"^\\."
,
""
));
}
}
}
// OLD
// morphemeFreq.compute(normalizedMorpheme,(k, v) -> (v == null) ? 1 : v + 1);
}
glossFreq
.
compute
(
token
,(
k
,
v
)
->
(
v
==
null
)
?
1
:
v
+
1
);
// glossFreq.compute(token,(k, v) -> (v == null) ? 1 : v + 1);
glossFreq
.
put
(
token
);
}
}
float
percentValid
=
(
float
)
matched
/(
matched
+
missing
)
;
...
...
@@ -2573,6 +2589,7 @@ public class RefcoChecker extends Checker implements CorpusFunction {
params
.
put
(
"skip-documentation-check"
,
"Flag to skip the documentation check"
);
params
.
put
(
"skip-transcription-check"
,
"Flag to skip the transcription check"
);
params
.
put
(
"skip-gloss-check"
,
"Flag to skip the gloss check"
);
params
.
put
(
"gloss-stats"
,
"Includes stats about all glosses"
);
return
params
;
}
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment