diff --git a/README.RefCo.md b/README.RefCo.md deleted file mode 100644 index 10a905be03bfeb0544ba8358680042b9c65423ea..0000000000000000000000000000000000000000 --- a/README.RefCo.md +++ /dev/null @@ -1,6 +0,0 @@ -The Refco Checker - -How to use: -... -Source: -... diff --git a/doc/README.RefCo.md b/doc/README.RefCo.md new file mode 100644 index 0000000000000000000000000000000000000000..043aa8a02b3cfa643de424e85b128c9c0fef35fe --- /dev/null +++ b/doc/README.RefCo.md @@ -0,0 +1,11 @@ +# The Refco Checker + +## How to use: + +The RefCo checker is part of the corpus services. Compile using `mvn clean compile assembly:single` or following the instructions in [Build_with_Maven.md]. +After running `mvn assembly:single` you can run the corpus services using `java -jar target/corpus-services-1.0.jar` which gives you a list of all included checkers. +To run the RefCo checks use the command `java -jar target/corpus-services-1.0.jar -c RefcoChecker -i <PathToYourCorpus> -o <ReportOutputFile> -p refco-file=<RefCoCorpusDocumentationFile> --corpus <CorpusName>` after adjusting the paths and file names to your corpus. 
+ +## Resources: +- The source file for the RefCoChecker: [../src/main/java/de/uni_hamburg/corpora/validation/quest/RefcoChecker.java] +- The RefCo documentation is available on Zenodo: [https://zenodo.org/record/5825304] diff --git a/pom.xml b/pom.xml index e880749e60b57b15b35eb791a31d26ad27321ba9..b42baa7b1dc34006d0b6820d522c3dad64672684 100644 --- a/pom.xml +++ b/pom.xml @@ -103,12 +103,20 @@ <directory>src/main/java/de/uni_hamburg/corpora/utilities/resources</directory> </resource> <resource> - <directory>src/test/java/de/uni_hamburg/corpora/utilities/resources</directory> - </resource> - <resource> - <directory>src/test/java/de/uni_hamburg/corpora/resources</directory> + <directory>src/main/java/de/uni_hamburg/corpora/validation/quest/resources</directory> </resource> </resources> + <testResources> + <testResource> + <directory>src/test/java/de/uni_hamburg/corpora/utilities/resources</directory> + </testResource> + <testResource> + <directory>src/test/java/de/uni_hamburg/corpora/resources</directory> + </testResource> + <testResource> + <directory>src/test/java/de/uni_hamburg/corpora/validation/quest/resources</directory> + </testResource> + </testResources> </build> <properties> <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> @@ -260,5 +268,11 @@ <artifactId>jmimemagic</artifactId> <version>0.1.3</version> </dependency> + <!-- https://mvnrepository.com/artifact/com.fasterxml.jackson.module/jackson-module-jsonSchema --> + <dependency> + <groupId>com.fasterxml.jackson.module</groupId> + <artifactId>jackson-module-jsonSchema</artifactId> + <version>2.13.2</version> + </dependency> </dependencies> </project> diff --git a/src/main/java/de/uni_hamburg/corpora/validation/quest/EXMARaLDATierFinder.java b/src/main/java/de/uni_hamburg/corpora/validation/quest/EXMARaLDATierFinder.java index 81772955219be666f76b5c258aad45970559ffcc..7568f56a7c28ccb2d1c5ea7483700fb345f4ca20 100644 --- 
a/src/main/java/de/uni_hamburg/corpora/validation/quest/EXMARaLDATierFinder.java +++ b/src/main/java/de/uni_hamburg/corpora/validation/quest/EXMARaLDATierFinder.java @@ -40,19 +40,22 @@ public class EXMARaLDATierFinder extends TierFinder { @Override void findTiers(CorpusData cd, String pattern) throws JDOMException { - Document dom = ((EXMARaLDATranscriptionData) cd).getJdom(); // Get all id attributes for tiers matching the pattern, get the values and add them to a new list List<String> tierIds = new ArrayList<>(); - if (cd instanceof EXMARaLDATranscriptionData) + if (cd instanceof EXMARaLDATranscriptionData) { + Document dom = ((EXMARaLDATranscriptionData) cd).getJdom(); tierIds.addAll(((List<Attribute>) Collections.checkedList(XPath.newInstance( String.format("//tier[contains(@%s,\"%s\")]/@id", attribute, pattern)).selectNodes(dom), Attribute.class)) .stream().map(Attribute::getValue).collect(Collectors.toList())); - else if (cd instanceof EXMARaLDASegmentedTranscriptionData) + } + else if (cd instanceof EXMARaLDASegmentedTranscriptionData) { + Document dom = ((EXMARaLDASegmentedTranscriptionData) cd).getJdom(); tierIds.addAll(((List<Attribute>) Collections.checkedList(XPath.newInstance( String.format("//segmented-tier[contains(@%s,\"%s\")]/@id", attribute, pattern)).selectNodes(dom), Attribute.class)) .stream().map(Attribute::getValue).collect(Collectors.toList())); + } // Add found tiers to frequency list tiers.putAll(tierIds); } diff --git a/src/main/java/de/uni_hamburg/corpora/validation/quest/RefcoChecker.java b/src/main/java/de/uni_hamburg/corpora/validation/quest/RefcoChecker.java index 2579da0158f8b418ffba67226465b5f4fb84d95f..f3f66396d485d483e8076ce6edc4f1510f274a5a 100644 --- a/src/main/java/de/uni_hamburg/corpora/validation/quest/RefcoChecker.java +++ b/src/main/java/de/uni_hamburg/corpora/validation/quest/RefcoChecker.java @@ -2,6 +2,7 @@ package de.uni_hamburg.corpora.validation.quest; import com.fasterxml.jackson.annotation.JsonAutoDetect; import 
com.fasterxml.jackson.annotation.PropertyAccessor; +import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.SerializationFeature; import com.google.common.primitives.Chars; @@ -19,8 +20,15 @@ import org.jdom.output.XMLOutputter; import org.jdom.xpath.XPath; import org.xml.sax.SAXException; +import javax.xml.bind.JAXBContext; +import javax.xml.bind.JAXBException; +import javax.xml.bind.SchemaOutputResolver; +import javax.xml.bind.annotation.XmlElement; +import javax.xml.bind.annotation.XmlElementWrapper; import javax.xml.parsers.ParserConfigurationException; +import javax.xml.transform.Result; import javax.xml.transform.TransformerException; +import javax.xml.transform.stream.StreamResult; import javax.xml.xpath.XPathExpressionException; import java.io.*; import java.net.*; @@ -39,6 +47,9 @@ import java.util.stream.Stream; import java.util.zip.ZipEntry; import java.util.zip.ZipFile; +import com.fasterxml.jackson.module.jsonSchema.JsonSchema; +import com.fasterxml.jackson.module.jsonSchema.customProperties.HyperSchemaFactoryWrapper; + /** * @author bba1792 Dr. 
Herbert Lange * @version 20220324 @@ -46,6 +57,8 @@ import java.util.zip.ZipFile; */ public class RefcoChecker extends Checker implements CorpusFunction { + private final String REFCO_CHECKER_VERSION="20220324"; + // The local logger that can be used for debugging private final Logger logger = Logger.getLogger(this.getClass().toString()); @@ -87,8 +100,9 @@ public class RefcoChecker extends Checker implements CorpusFunction { this.information = information; this.notes = notes; } - + @XmlElement(required=true) String information ; + @XmlElement(required=true) String notes; @Override @@ -103,6 +117,14 @@ public class RefcoChecker extends Checker implements CorpusFunction { public int hashCode() { return Objects.hash(information, notes); } + + public String getInformation() { + return information; + } + + public String getNotes() { + return notes; + } } /** @@ -110,13 +132,21 @@ public class RefcoChecker extends Checker implements CorpusFunction { * speaker information, location and date */ static class Session { + @XmlElement(required=true) String sessionName ; + @XmlElement(required=true) String fileName ; + @XmlElement(required=true) String speakerName ; + @XmlElement(required=true) String speakerAge ; + @XmlElement(required=true) String speakerGender ; + @XmlElement(required=true) String recordingLocation ; + @XmlElement(required=true) String recordingDate ; + @XmlElement(required=true) String genre ; // is this a controlled vocabulary? // String ageGroup ; // is this a controlled vocabulary? 
@@ -133,6 +163,38 @@ public class RefcoChecker extends Checker implements CorpusFunction { public int hashCode() { return Objects.hash(sessionName, fileName, speakerName, speakerAge, speakerGender, recordingLocation, recordingDate, genre); } + + public String getSessionName() { + return sessionName; + } + + public String getFileName() { + return fileName; + } + + public String getSpeakerName() { + return speakerName; + } + + public String getSpeakerAge() { + return speakerAge; + } + + public String getSpeakerGender() { + return speakerGender; + } + + public String getRecordingLocation() { + return recordingLocation; + } + + public String getRecordingDate() { + return recordingDate; + } + + public String getGenre() { + return genre; + } } @@ -141,9 +203,13 @@ public class RefcoChecker extends Checker implements CorpusFunction { * tier functions and languages */ static class Tier { + @XmlElement(required=true) String tierName ; + @XmlElement(required=true) List<String> tierFunctions; + @XmlElement(required=true) String segmentationStrategy ; + @XmlElement(required=true) String languages ; @Override @@ -158,6 +224,22 @@ public class RefcoChecker extends Checker implements CorpusFunction { public int hashCode() { return Objects.hash(tierName, tierFunctions, segmentationStrategy, languages); } + + public String getTierName() { + return tierName; + } + + public List<String> getTierFunctions() { + return tierFunctions; + } + + public String getSegmentationStrategy() { + return segmentationStrategy; + } + + public String getLanguages() { + return languages; + } } /** @@ -165,8 +247,11 @@ public class RefcoChecker extends Checker implements CorpusFunction { * the list of valid graphemes used in transcription tiers */ static class Transcription { + @XmlElement(required=true) String grapheme ; + @XmlElement(required=true) String linguisticValue ; + @XmlElement(required=true) String linguisticConvention ; @Override @@ -181,6 +266,18 @@ public class RefcoChecker extends Checker 
implements CorpusFunction { public int hashCode() { return Objects.hash(grapheme, linguisticValue, linguisticConvention); } + + public String getGrapheme() { + return grapheme; + } + + public String getLinguisticValue() { + return linguisticValue; + } + + public String getLinguisticConvention() { + return linguisticConvention; + } } /** @@ -188,9 +285,13 @@ public class RefcoChecker extends Checker implements CorpusFunction { * list of expected glosses and the tiers they are valid in */ static class Gloss { + @XmlElement(required=true) String gloss ; + @XmlElement(required=true) String meaning ; + @XmlElement String comments ; + @XmlElement(required=true) String tiers ; @Override @@ -205,6 +306,22 @@ public class RefcoChecker extends Checker implements CorpusFunction { public int hashCode() { return Objects.hash(gloss, meaning, comments, tiers); } + + public String getGloss() { + return gloss; + } + + public String getMeaning() { + return meaning; + } + + public String getComments() { + return comments; + } + + public String getTiers() { + return tiers; + } } /** @@ -212,10 +329,15 @@ public class RefcoChecker extends Checker implements CorpusFunction { * valid punctuation characters and the tiers they are valid in */ static class Punctuation { + @XmlElement(required=true) String character ; + @XmlElement(required=true) String meaning ; + @XmlElement String comments ; + @XmlElement(required=true) String tiers ; + @XmlElement(required=true) String function; @Override @@ -230,6 +352,26 @@ public class RefcoChecker extends Checker implements CorpusFunction { public int hashCode() { return Objects.hash(character, meaning, comments, tiers, function); } + + public String getCharacter() { + return character; + } + + public String getMeaning() { + return meaning; + } + + public String getComments() { + return comments; + } + + public String getTiers() { + return tiers; + } + + public String getFunction() { + return function; + } } /** @@ -238,33 +380,57 @@ public class 
RefcoChecker extends Checker implements CorpusFunction { public static class RefcoCriteria { // Tab: Overview // Corpus information + @XmlElement(required=true) String corpusTitle ; + @XmlElement(required=true) String subjectLanguages ; + @XmlElement(required=true) String archive ; + @XmlElement(required=true) String persistentId ; // should be an url to either a doi or handle + @XmlElement(required=true) String annotationLicense ; + @XmlElement(required=true) String recordingLicense ; + @XmlElement(required=true) String creatorName ; + @XmlElement(required=true) String creatorContact ; // usually mail address + @XmlElement(required=true) String creatorInstitution ; // Certification information + @XmlElement(required=true) InformationNotes refcoVersion ; // Quantitative Summary + @XmlElement(required=true) InformationNotes numberSessions ; + @XmlElement(required=true) InformationNotes numberTranscribedWords ; + @XmlElement(required=true) InformationNotes numberAnnotatedWords ; // Annotation Strategies // All languages are in a single cell + @XmlElement(required=true) InformationNotes translationLanguages ; // Tab: Corpus Compositions + @XmlElementWrapper(name="sessions") + @XmlElement(required=true,name="session") ArrayList<Session> sessions = new ArrayList<>() ; // Tab: Annotation Tiers + @XmlElementWrapper(name="tiers") + @XmlElement(required = true,name="tier") ArrayList<Tier> tiers = new ArrayList<>() ; // Tab: Transcriptions + @XmlElementWrapper(name="transcriptions") + @XmlElement(required = true, name="transcription") ArrayList<Transcription> transcriptions = new ArrayList<>() ; // Tab: Glosses + @XmlElementWrapper(name="glosses") + @XmlElement(required = true, name="gloss") ArrayList<Gloss> glosses = new ArrayList<>() ; // Tab: Punctuation + @XmlElementWrapper(name="punctuations") + @XmlElement(required = true, name="punctuation") ArrayList<Punctuation> punctuations = new ArrayList<>() ; public String getCorpusTitle() { @@ -450,6 +616,17 @@ public class 
RefcoChecker extends Checker implements CorpusFunction { else { report.addCritical(getFunction(),"Missing corpus documentation file property"); } + if (properties.containsKey("get-schema")) { + + try { + System.out.println(deriveXMLSpecification()); + System.out.println(); + System.out.println(deriveJSONSpecification()); + System.exit(0); + } catch (Exception e) { + e.printStackTrace(); + } + } } /** @@ -511,6 +688,9 @@ public class RefcoChecker extends Checker implements CorpusFunction { @Override public Report function(Corpus c, Boolean fix) throws NoSuchAlgorithmException, ClassNotFoundException, FSMException, URISyntaxException, SAXException, IOException, ParserConfigurationException, JexmaraldaException, TransformerException, XPathExpressionException, JDOMException { if (refcoFileLoaded) { + report.addNote(getFunction(),"Report created by RefCo checker version " + REFCO_CHECKER_VERSION + + " based on documentation following RefCo " + criteria.refcoVersion + + " specification version"); System.out.println("... 
running the corpus function"); // Create the current report //Report report = new Report(); @@ -2636,4 +2816,26 @@ public class RefcoChecker extends Checker implements CorpusFunction { return false; } } + + public String deriveXMLSpecification() throws JAXBException, IOException { + JAXBContext ctx = JAXBContext.newInstance(RefcoCriteria.class); + StringWriter sw = new StringWriter(); + ctx.generateSchema(new SchemaOutputResolver() { + @Override + public Result createOutput(String namespaceUri, String suggestedFileName) throws IOException { + StreamResult result = new StreamResult(sw); + result.setSystemId("StringWriter"); + return result; + } + }); + return sw.toString(); + } + + private String deriveJSONSpecification() throws JsonProcessingException { + ObjectMapper om = new ObjectMapper(); + HyperSchemaFactoryWrapper schemaVisitor = new HyperSchemaFactoryWrapper(); + om.acceptJsonFormatVisitor(RefcoCriteria.class, schemaVisitor); + JsonSchema jsonSchema = schemaVisitor.finalSchema(); + return om.writerWithDefaultPrettyPrinter().writeValueAsString(jsonSchema); + } } diff --git a/src/main/java/de/uni_hamburg/corpora/validation/quest/TranscriptionChecker.java b/src/main/java/de/uni_hamburg/corpora/validation/quest/TranscriptionChecker.java index d105dd80baae53da0ba5c8f6e1e483cd87ea8662..8459d244890197de2e7fe9727cca112a62c31219 100644 --- a/src/main/java/de/uni_hamburg/corpora/validation/quest/TranscriptionChecker.java +++ b/src/main/java/de/uni_hamburg/corpora/validation/quest/TranscriptionChecker.java @@ -75,7 +75,38 @@ abstract class TranscriptionChecker extends Checker implements CorpusFunction { // "))" // doppelte runde schließende Klammer }).map((c) -> String.valueOf(Character.toChars(Integer.decode(c.replace("U+","0x"))))) .collect(Collectors.toList())); - + private final Set<String> didaSpecial = new HashSet<>( + Arrays.asList("←", "→", "*") + ); + + private final Set<String> gatSpecial = new HashSet<>( + Arrays.asList( + "(", ".", ")", + "(","-",")", + 
"(","-","-",")", + "(","-","-","-",")", + "'", + "?", + ",", + "-", + ";", + ".", + "↑", + "↓", + "ˋ", + "ˊ", + "ˉ", + "ˆ", + "ˇ", + "<", ">") + ); + + private final Set<String> ipaSpecial = new HashSet<>( + Arrays.asList("ɐ", "ɑ", "ɒ", "ɓ", "ɔ", "ɕ", "ɖ", "ɗ", "ɘ", "ə", "ɚ", "ɛ", "ɜ", "ɝ", "ɞ", "ɟ", "ɠ", "ɡ", + "ɢ", "ɣ", "ɤ", "ɥ", "ɦ", "ɧ", "ɨ", "ɩ", "ɪ", "ɫ", "ɬ", "ɭ", "ɮ", "ɯ", "ɰ", "ɱ", "ɲ", "ɳ", "ɴ", "ɵ", + "ɶ", "ɷ", "ɸ", "ɹ", "ɺ", "ɻ", "ɼ", "ɽ", "ɾ", "ɿ", "ʀ", "ʁ", "ʂ", "ʃ", "ʄ", "ʅ", "ʆ", "ʇ", "ʈ", "ʉ", + "ʊ", "ʋ", "ʌ", "ʍ", "ʎ", "ʏ", "ʐ", "ʑ", "ʒ", "ʓ", "ʔ", "ʕ", "ʖ", "ʗ", "ʘ", "ʙ", "ʚ", "ʛ", "ʜ", "ʝ", + "ʞ", "ʟ", "ʠ", "ʡ", "ʢ", "ʣ", "ʤ", "ʥ", "ʦ", "ʧ", "ʨ")); /** * Function to enumerate all alphabetic characters * @return all alphabetic characters in the unicode standard @@ -100,13 +131,33 @@ abstract class TranscriptionChecker extends Checker implements CorpusFunction { knownGraphemes.addAll(digitChars); knownGraphemes.addAll(hiatSpecial); } + else if (properties.getProperty("transcription-method").equalsIgnoreCase("dida")) { + knownGraphemes.addAll(alphaChars); + knownGraphemes.addAll(digitChars); + knownGraphemes.addAll(didaSpecial); + } + else if (properties.getProperty("transcription-method").equalsIgnoreCase("gat")) { + knownGraphemes.addAll(alphaChars); + knownGraphemes.addAll(digitChars); + knownGraphemes.addAll(gatSpecial); + } + else if (properties.getProperty("transcription-method").equalsIgnoreCase("ipa")) { + knownGraphemes.addAll(ipaSpecial); + knownGraphemes.addAll( + Arrays.asList("abcdefghijklmnopqrstuvwxyz".split("")) + ); + knownGraphemes.addAll( + Arrays.asList("abcdefghijklmnopqrstuvwxyz".toUpperCase().split("")) + ); + } + + } } @Override public Report function(CorpusData cd, Boolean fix) throws NoSuchAlgorithmException, ClassNotFoundException, FSMException, URISyntaxException, SAXException, IOException, ParserConfigurationException, JexmaraldaException, TransformerException, XPathExpressionException, JDOMException { Report report = 
new Report(); - logger.info("Checking: " + cd.getURL()); try { // Find transcription tiers List<Element> transcriptionTiers = getTranscriptionTiers(cd); @@ -175,7 +226,8 @@ abstract class TranscriptionChecker extends Checker implements CorpusFunction { public Map<String, String> getParameters() { Map<String,String> params = super.getParameters(); params.put("transcription-graphemes","List of transcription graphemes, separated by commas"); - params.put("transcription-method", "Standard transcription method used, if any. Currently only HIAT"); + params.put("transcription-method", "Standard transcription method used, if any. Currently HIAT, DIDA, GAT and" + + " IPA"); return params; } }