Skip to content
Snippets Groups Projects
Commit b5ee6ac5 authored by Lange, Dr. Herbert
Browse files

add one parameter and change from hashmap to frequency list for all the statistics

parent 372b45e2
No related branches found
No related tags found
1 merge request: !6 add feature to load criteria file from resource and place all criteria files...
...@@ -9,6 +9,7 @@ import com.google.common.primitives.Chars; ...@@ -9,6 +9,7 @@ import com.google.common.primitives.Chars;
import de.uni_hamburg.corpora.*; import de.uni_hamburg.corpora.*;
import de.uni_hamburg.corpora.utilities.quest.DictionaryAutomaton; import de.uni_hamburg.corpora.utilities.quest.DictionaryAutomaton;
import de.uni_hamburg.corpora.utilities.quest.FileTools; import de.uni_hamburg.corpora.utilities.quest.FileTools;
import de.uni_hamburg.corpora.utilities.quest.FrequencyList;
import de.uni_hamburg.corpora.utilities.quest.XMLTools; import de.uni_hamburg.corpora.utilities.quest.XMLTools;
import de.uni_hamburg.corpora.validation.Checker; import de.uni_hamburg.corpora.validation.Checker;
import org.apache.commons.lang.time.DurationFormatUtils; import org.apache.commons.lang.time.DurationFormatUtils;
...@@ -557,23 +558,26 @@ public class RefcoChecker extends Checker implements CorpusFunction { ...@@ -557,23 +558,26 @@ public class RefcoChecker extends Checker implements CorpusFunction {
/** /**
* The frequency list of all transcription tokens in the corpus * The frequency list of all transcription tokens in the corpus
*/ */
private HashMap<String,Integer> tokenFreq = new HashMap<>(); // private HashMap<String,Integer> tokenFreq = new HashMap<>();
private FrequencyList tokenFreq = new FrequencyList();
/** /**
* The frequency list of all segmented annotation/morphology glosses in the corpus * The frequency list of all segmented annotation/morphology glosses in the corpus
*/ */
private HashMap<String,Integer> morphemeFreq = new HashMap<>(); // private HashMap<String,Integer> morphemeFreq = new HashMap<>();
private FrequencyList morphemeFreq = new FrequencyList();
/** /**
* The frequency list of all non-segmented annotation/morphology glosses in the corpus * The frequency list of all non-segmented annotation/morphology glosses in the corpus
*/ */
private HashMap<String,Integer> glossFreq = new HashMap<>(); // private HashMap<String,Integer> glossFreq = new HashMap<>();
private FrequencyList glossFreq = new FrequencyList();
/** /**
* The frequency list of all non-segmentable gloss tokens * The frequency list of all non-segmentable gloss tokens
*/ */
private HashMap<String,Integer> missingGlossFreq = new HashMap<>(); // private HashMap<String,Integer> missingGlossFreq = new HashMap<>();
private FrequencyList missingGlossFreq = new FrequencyList();
/** /**
* The global report, will be filled by the constructor and the function applied to the complete corpus * The global report, will be filled by the constructor and the function applied to the complete corpus
*/ */
...@@ -697,9 +701,9 @@ public class RefcoChecker extends Checker implements CorpusFunction { ...@@ -697,9 +701,9 @@ public class RefcoChecker extends Checker implements CorpusFunction {
// Set the RefCo corpus // Set the RefCo corpus
setRefcoCorpus(c); setRefcoCorpus(c);
// Initialize frequency list for glosses // Initialize frequency list for glosses
for (Gloss gloss : criteria.glosses) { // for (Gloss gloss : criteria.glosses) {
morphemeFreq.put(gloss.gloss, 0); // morphemeFreq.put(gloss.gloss, 0);
} // }
// Run the generic tests and merge their reports into the current report // Run the generic tests and merge their reports into the current report
// but flag allows skipping it // but flag allows skipping it
if (!props.containsKey("skip-documentation-check") if (!props.containsKey("skip-documentation-check")
...@@ -711,7 +715,7 @@ public class RefcoChecker extends Checker implements CorpusFunction { ...@@ -711,7 +715,7 @@ public class RefcoChecker extends Checker implements CorpusFunction {
function(cdata, fix); function(cdata, fix);
} }
// Check for morpheme glosses that never occurred in the complete corpus // Check for morpheme glosses that never occurred in the complete corpus
for (Map.Entry<String, Integer> e : morphemeFreq.entrySet()) { for (Map.Entry<String, Integer> e : morphemeFreq.getMap().entrySet()) {
if (e.getValue() == 0) if (e.getValue() == 0)
report.addWarning(getFunction(), ReportItem.newParamMap(new String[]{"function", "filename", "description", report.addWarning(getFunction(), ReportItem.newParamMap(new String[]{"function", "filename", "description",
"howtoFix"}, "howtoFix"},
...@@ -719,11 +723,18 @@ public class RefcoChecker extends Checker implements CorpusFunction { ...@@ -719,11 +723,18 @@ public class RefcoChecker extends Checker implements CorpusFunction {
"Corpus data: Morpheme gloss never encountered in corpus: " + e.getKey(), "Corpus data: Morpheme gloss never encountered in corpus: " + e.getKey(),
"Check for potential errors or remove gloss from documentation"})); "Check for potential errors or remove gloss from documentation"}));
} }
if (!missingGlossFreq.isEmpty() && props.containsKey("missing-gloss-stats") && if (!missingGlossFreq.isEmpty())
props.getProperty("missing-gloss-stats").equalsIgnoreCase("true"))
report.addNote(getFunction(),"Corpus data: Morpheme glosses missing from documentations:\n" + report.addNote(getFunction(),"Corpus data: Morpheme glosses missing from documentations:\n" +
missingGlossFreq.keySet().stream().map((k) -> k + ":" + missingGlossFreq.get(k)) missingGlossFreq.toString());
.collect(Collectors.joining("\n"))); // missingGlossFreq.keySet().stream().map((k) -> k + ":" + missingGlossFreq.get(k))
// .collect(Collectors.joining("\n")));
if (!glossFreq.isEmpty() && props.containsKey("gloss-stats") &&
props.getProperty("gloss-stats").equalsIgnoreCase("true")) {
report.addNote(getFunction(), "Corpus data: Glosses encountered in the corpus:\n" +
glossFreq.toString());
// glossFreq.keySet().stream().map((k) -> k + ":" + glossFreq.get(k))
// .collect(Collectors.joining("\n")));
}
// Check all gloss tokens (not-segmented) for rare ones very similar to quite common ones, i.e. tokens with // Check all gloss tokens (not-segmented) for rare ones very similar to quite common ones, i.e. tokens with
// Levenshtein difference 1 with a higher frequency count // Levenshtein difference 1 with a higher frequency count
/*DictionaryAutomaton glossDictionary = /*DictionaryAutomaton glossDictionary =
...@@ -2029,7 +2040,8 @@ public class RefcoChecker extends Checker implements CorpusFunction { ...@@ -2029,7 +2040,8 @@ public class RefcoChecker extends Checker implements CorpusFunction {
// Check if token either is a gloss or each character is in the valid characters // Check if token either is a gloss or each character is in the valid characters
mismatch = false ; mismatch = false ;
// Update frequency list // Update frequency list
tokenFreq.compute(token,(k,v) -> (v == null) ? 1 : v + 1); //tokenFreq.compute(token,(k,v) -> (v == null) ? 1 : v + 1);
tokenFreq.put(token);
// Token is not one of the glosses // Token is not one of the glosses
if (!glosses.contains(token)) { if (!glosses.contains(token)) {
// Check if we can segment the token using the chunks // Check if we can segment the token using the chunks
...@@ -2236,7 +2248,8 @@ public class RefcoChecker extends Checker implements CorpusFunction { ...@@ -2236,7 +2248,8 @@ public class RefcoChecker extends Checker implements CorpusFunction {
List<String> segments = glossAutomaton.segmentWord(normalizedMorpheme); List<String> segments = glossAutomaton.segmentWord(normalizedMorpheme);
if (segments == null || segments.isEmpty()) { if (segments == null || segments.isEmpty()) {
missing += 1; missing += 1;
missingGlossFreq.compute(normalizedMorpheme, (k, v) -> (v == null) ? 1 : v + 1); // missingGlossFreq.compute(normalizedMorpheme, (k, v) -> (v == null) ? 1 : v + 1);
missingGlossFreq.put(normalizedMorpheme);
// This would lead to a large amount of warnings // This would lead to a large amount of warnings
try { try {
// Location l = getLocation((ELANData) cd, morpheme); // Location l = getLocation((ELANData) cd, morpheme);
...@@ -2260,14 +2273,17 @@ public class RefcoChecker extends Checker implements CorpusFunction { ...@@ -2260,14 +2273,17 @@ public class RefcoChecker extends Checker implements CorpusFunction {
matched += 1; matched += 1;
for (String segment : segments) { for (String segment : segments) {
// Remove initial periods and keep track of the count // Remove initial periods and keep track of the count
morphemeFreq.compute(segment.replaceAll("^\\.",""), (k, v) -> (v == null) ? 1 : v + 1); //morphemeFreq.compute(segment.replaceAll("^\\.",""), (k, v) -> (v == null) ? 1 : v +
// 1);
morphemeFreq.put(segment.replaceAll("^\\.",""));
} }
} }
} }
// OLD // OLD
// morphemeFreq.compute(normalizedMorpheme,(k, v) -> (v == null) ? 1 : v + 1); // morphemeFreq.compute(normalizedMorpheme,(k, v) -> (v == null) ? 1 : v + 1);
} }
glossFreq.compute(token,(k, v) -> (v == null) ? 1 : v + 1); // glossFreq.compute(token,(k, v) -> (v == null) ? 1 : v + 1);
glossFreq.put(token);
} }
} }
float percentValid = (float)matched/(matched+missing) ; float percentValid = (float)matched/(matched+missing) ;
...@@ -2573,6 +2589,7 @@ public class RefcoChecker extends Checker implements CorpusFunction { ...@@ -2573,6 +2589,7 @@ public class RefcoChecker extends Checker implements CorpusFunction {
params.put("skip-documentation-check", "Flag to skip the documentation check"); params.put("skip-documentation-check", "Flag to skip the documentation check");
params.put("skip-transcription-check", "Flag to skip the transcription check"); params.put("skip-transcription-check", "Flag to skip the transcription check");
params.put("skip-gloss-check", "Flag to skip the gloss check"); params.put("skip-gloss-check", "Flag to skip the gloss check");
params.put("gloss-stats", "Includes stats about all glosses");
return params; return params;
} }
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment