Add the "gloss-stats" parameter and switch all frequency statistics from HashMap to FrequencyList

b5ee6ac5 · Lange, Dr. Herbert · 372b45e2 · b5ee6ac5
Commit b5ee6ac5 authored May 16, 2022 by Lange, Dr. Herbert
--- a/src/main/java/de/uni_hamburg/corpora/validation/quest/RefcoChecker.java
+++ b/src/main/java/de/uni_hamburg/corpora/validation/quest/RefcoChecker.java
@@ -9,6 +9,7 @@ import com.google.common.primitives.Chars;
 import de.uni_hamburg.corpora.*;
 import de.uni_hamburg.corpora.utilities.quest.DictionaryAutomaton;
 import de.uni_hamburg.corpora.utilities.quest.FileTools;
+import de.uni_hamburg.corpora.utilities.quest.FrequencyList;
 import de.uni_hamburg.corpora.utilities.quest.XMLTools;
 import de.uni_hamburg.corpora.validation.Checker;
 import org.apache.commons.lang.time.DurationFormatUtils;
@@ -557,23 +558,26 @@ public class RefcoChecker extends Checker implements CorpusFunction {
    /**
     *  The frequency list of all transcription tokens in the corpus
     */
-    private HashMap<String,Integer> tokenFreq = new HashMap<>();
+    // private HashMap<String,Integer> tokenFreq = new HashMap<>();
+    private FrequencyList tokenFreq = new FrequencyList();
    /**
     * The frequency list of all segmented annotation/morphology glosses in the corpus
     */
-    private HashMap<String,Integer> morphemeFreq = new HashMap<>();
+    // private HashMap<String,Integer> morphemeFreq = new HashMap<>();
+    private FrequencyList morphemeFreq = new FrequencyList();
    /**
     * The frequency list of all non-segmented annotation/morphology glosses in the corpus
     */
-    private HashMap<String,Integer> glossFreq = new HashMap<>();
+    // private HashMap<String,Integer> glossFreq = new HashMap<>();
+    private FrequencyList glossFreq = new FrequencyList();
    /**
     * The frequency list of all non-segmentable gloss tokens
     */
-    private HashMap<String,Integer> missingGlossFreq = new HashMap<>();
+    // private HashMap<String,Integer> missingGlossFreq = new HashMap<>();
+    private FrequencyList missingGlossFreq = new FrequencyList();
    /**
     * The global report, will be filled by the constructor and the function applied to the complete corpus
     */
@@ -697,9 +701,9 @@ public class RefcoChecker extends Checker implements CorpusFunction {
            // Set the RefCo corpus
            setRefcoCorpus(c);
            // Initialize frequency list for glosses
-            for (Gloss gloss : criteria.glosses) {
+//            for (Gloss gloss : criteria.glosses) {
-                morphemeFreq.put(gloss.gloss, 0);
+//                morphemeFreq.put(gloss.gloss, 0);
-            }
+//            }
            // Run the generic tests and merge their reports into the current report
            // but flag allows skipping it
            if (!props.containsKey("skip-documentation-check")
@@ -711,7 +715,7 @@ public class RefcoChecker extends Checker implements CorpusFunction {
                function(cdata, fix);
            }
            // Check for morpheme glosses that never occurred in the complete corpus
-            for (Map.Entry<String, Integer> e : morphemeFreq.entrySet()) {
+            for (Map.Entry<String, Integer> e : morphemeFreq.getMap().entrySet()) {
                if (e.getValue() == 0)
                    report.addWarning(getFunction(), ReportItem.newParamMap(new String[]{"function", "filename", "description",
                                    "howtoFix"},
@@ -719,11 +723,18 @@ public class RefcoChecker extends Checker implements CorpusFunction {
                                    "Corpus data: Morpheme gloss never encountered in corpus: " + e.getKey(),
                                    "Check for potential errors or remove gloss from documentation"}));
            }
-            if (!missingGlossFreq.isEmpty() && props.containsKey("missing-gloss-stats") &&
+            if (!missingGlossFreq.isEmpty())
-                    props.getProperty("missing-gloss-stats").equalsIgnoreCase("true"))
                report.addNote(getFunction(),"Corpus data: Morpheme glosses missing from documentations:\n" +
-                                            missingGlossFreq.keySet().stream().map((k) -> k + ":" + missingGlossFreq.get(k))
+                        missingGlossFreq.toString());
-                                                    .collect(Collectors.joining("\n")));
+//                                            missingGlossFreq.keySet().stream().map((k) -> k + ":" + missingGlossFreq.get(k))
+//                                                    .collect(Collectors.joining("\n")));
+            if (!glossFreq.isEmpty() && props.containsKey("gloss-stats") &&
+                    props.getProperty("gloss-stats").equalsIgnoreCase("true")) {
+                report.addNote(getFunction(), "Corpus data: Glosses encountered in the corpus:\n" +
+                        glossFreq.toString());
+//                        glossFreq.keySet().stream().map((k) -> k + ":" + glossFreq.get(k))
+//                                .collect(Collectors.joining("\n")));
+            }
            // Check all gloss tokens (not-segmented) for rare ones very similar to quite common ones, i.e. tokens with
            // Levenshtein difference 1 with a higher frequency count
        /*DictionaryAutomaton glossDictionary =
@@ -2029,7 +2040,8 @@ public class RefcoChecker extends Checker implements CorpusFunction {
                // Check if token either is a gloss or each character is in the valid characters
                mismatch = false ;
                // Update frequency list
-                tokenFreq.compute(token,(k,v) -> (v == null) ? 1 : v + 1);
+                //tokenFreq.compute(token,(k,v) -> (v == null) ? 1 : v + 1);
+                tokenFreq.put(token);
                // Token is not one of the glosses
                if (!glosses.contains(token)) {
                    // Check if we can segment the token using the chunks
@@ -2236,7 +2248,8 @@ public class RefcoChecker extends Checker implements CorpusFunction {
                        List<String> segments = glossAutomaton.segmentWord(normalizedMorpheme);
                        if (segments == null || segments.isEmpty()) {
                            missing += 1;
-                             missingGlossFreq.compute(normalizedMorpheme, (k, v) -> (v == null) ? 1 : v + 1);
+//                             missingGlossFreq.compute(normalizedMorpheme, (k, v) -> (v == null) ? 1 : v + 1);
+                            missingGlossFreq.put(normalizedMorpheme);
                            // his would lead to large amount of warnings
                            try {
                                // Location l = getLocation((ELANData) cd, morpheme);
@@ -2260,14 +2273,17 @@ public class RefcoChecker extends Checker implements CorpusFunction {
                            matched += 1;
                            for (String segment : segments) {
                                // Remove initial periods and keep track of the count
-                                morphemeFreq.compute(segment.replaceAll("^\\.",""), (k, v) -> (v == null) ? 1 : v + 1);
+                                //morphemeFreq.compute(segment.replaceAll("^\\.",""), (k, v) -> (v == null) ? 1 : v +
+                                // 1);
+                                morphemeFreq.put(segment.replaceAll("^\\.",""));
                            }
                        }
                    }
                // OLD
 //                    morphemeFreq.compute(normalizedMorpheme,(k, v) -> (v == null) ? 1 : v + 1);
                }
-                glossFreq.compute(token,(k, v) -> (v == null) ? 1 : v + 1);
+//                glossFreq.compute(token,(k, v) -> (v == null) ? 1 : v + 1);
+                glossFreq.put(token);
            }
        }
        float percentValid = (float)matched/(matched+missing) ;
@@ -2573,6 +2589,7 @@ public class RefcoChecker extends Checker implements CorpusFunction {
        params.put("skip-documentation-check", "Flag to skip the documentation check");
        params.put("skip-transcription-check", "Flag to skip the transcription check");
        params.put("skip-gloss-check", "Flag to skip the gloss check");
+        params.put("gloss-stats", "Includes stats about all glosses");
        return params;
    }