diff --git a/doc/README.RefCo.md b/doc/README.RefCo.md index b5e57803d6d2299fa5a4f306b67101f117e1d174..660d42f5b5ab3856771e44ca0bdb81826c64c80a 100644 --- a/doc/README.RefCo.md +++ b/doc/README.RefCo.md @@ -20,4 +20,4 @@ To run the RefCo checks use the command `java -jar corpus-services-1.0.jar -c Re ## Resources: - The souce file for the RefCoChecker: [../src/main/java/de/uni_hamburg/corpora/validation/quest/RefcoChecker.java](../src/main/java/de/uni_hamburg/corpora/validation/quest/RefcoChecker.java) -- The RefCo documentation is available on Zenodo: [https://zenodo.org/record/5825304](https://zenodo.org/record/5825304) +- The RefCo documentation is available on Zenodo: [https://zenodo.org/record/6470807](https://zenodo.org/record/6470807) diff --git a/pom.xml b/pom.xml index b42baa7b1dc34006d0b6820d522c3dad64672684..dfb04e5e2ce30e86a27335c0e5718611f6dac834 100644 --- a/pom.xml +++ b/pom.xml @@ -85,6 +85,19 @@ <!--mainClass>de.uni_hamburg.corpora.utilities.MediaFileChecker</mainClass--> </configuration> </plugin> + <plugin> + <groupId>net.alchim31.maven</groupId> + <artifactId>scala-maven-plugin</artifactId> + <version>4.5.6</version> + <executions> + <execution> + <goals> + <goal>compile</goal> + <goal>testCompile</goal> + </goals> + </execution> + </executions> + </plugin> </plugins> <resources> <resource> @@ -274,5 +287,11 @@ <artifactId>jackson-module-jsonSchema</artifactId> <version>2.13.2</version> </dependency> + <!-- https://mvnrepository.com/artifact/org.scala-lang/scala-compiler --> + <dependency> + <groupId>org.scala-lang</groupId> + <artifactId>scala-library</artifactId> + <version>2.13.8</version> + </dependency> </dependencies> </project> diff --git a/src/main/java/de/uni_hamburg/corpora/CorpusMagician.java b/src/main/java/de/uni_hamburg/corpora/CorpusMagician.java index f9feb36f39bd754bb4a2e6fcdbdc650d08c0044d..c537caf105058ad9e6de2460eb68a9d4caa4ff0d 100644 --- a/src/main/java/de/uni_hamburg/corpora/CorpusMagician.java +++ 
b/src/main/java/de/uni_hamburg/corpora/CorpusMagician.java @@ -1058,6 +1058,7 @@ public class CorpusMagician { String desc; String hasfix; StringBuilder usable ; + String params; for (CorpusFunction cf : getAllExistingCFsAsCFs()) { desc = cf.getFunction() + ": " + cf.getDescription(); usable = new StringBuilder("\nThe function can be used on:\n"); @@ -1065,7 +1066,16 @@ public class CorpusMagician { usable.append(cl.getSimpleName() + " "); } hasfix = "\nThe function has a fixing option: " + cf.getCanFix().toString(); - footerverbose.append(desc + hasfix + usable + "\n\n"); + if (cf.getParameters().isEmpty()) { + params = ""; + } + else { + params = + "\nThe function accepts the following parameters:\n" + cf.getParameters().keySet() + .stream().map((k) -> k + ": " + cf.getParameters().get(k)) + .collect(Collectors.joining("\n")); + } + footerverbose.append(desc + hasfix + usable + params + "\n\n"); } footerverbose.append("\n\nPlease report issues at https://lab.multilingua.uni-hamburg" + ".de/redmine/projects/corpus-services/issues"); diff --git a/src/main/java/de/uni_hamburg/corpora/Report.java b/src/main/java/de/uni_hamburg/corpora/Report.java index eda5fab3a2f93c40edce9e9c213fad9cc5d631c0..7facbd436f61607e2529d03f023af4eec108669e 100644 --- a/src/main/java/de/uni_hamburg/corpora/Report.java +++ b/src/main/java/de/uni_hamburg/corpora/Report.java @@ -9,10 +9,18 @@ */ package de.uni_hamburg.corpora; +import com.fasterxml.jackson.annotation.JsonAutoDetect; +import com.fasterxml.jackson.annotation.PropertyAccessor; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.SerializationFeature; import de.uni_hamburg.corpora.ReportItem.Severity; + +import java.io.File; +import java.io.IOException; import java.text.MessageFormat; import java.text.SimpleDateFormat; import java.util.*; + import org.jdom.JDOMException; /** @@ -679,4 +687,20 @@ public class Report { return line; } + /** + * Dumps the complete report into a JSON file + * 
@param filename the filename of the target JSON file + */ + public void dump(String filename) { + // Generate pretty-printed json + ObjectMapper mapper = new ObjectMapper(); + // Allows serialization even when getters are missing + mapper.setVisibility(PropertyAccessor.FIELD, JsonAutoDetect.Visibility.ANY); + mapper.configure(SerializationFeature.INDENT_OUTPUT,true); + try { + mapper.writeValue(new File(filename),this.getRawStatistics()); + } catch (IOException e) { + e.printStackTrace(); + } + } } diff --git a/src/main/java/de/uni_hamburg/corpora/publication/HandlePidRegistration.java b/src/main/java/de/uni_hamburg/corpora/publication/HandlePidRegistration.java index 4d0ef6fb610f055944038cae2477ea61955fd550..950a9091421bfafc986e72cecb8a578a0ae8a9fc 100644 --- a/src/main/java/de/uni_hamburg/corpora/publication/HandlePidRegistration.java +++ b/src/main/java/de/uni_hamburg/corpora/publication/HandlePidRegistration.java @@ -5,7 +5,6 @@ */ package de.uni_hamburg.corpora.publication; -import com.sun.org.apache.xerces.internal.impl.dv.util.Base64; import de.uni_hamburg.corpora.CmdiData; import de.uni_hamburg.corpora.Corpus; import de.uni_hamburg.corpora.CorpusData; @@ -22,6 +21,7 @@ import java.net.HttpURLConnection; import java.net.URL; import java.text.DateFormat; import java.text.SimpleDateFormat; +import java.util.Base64; import java.util.Calendar; import java.util.Collection; import java.util.Date; @@ -160,7 +160,7 @@ public class HandlePidRegistration extends Publisher implements CorpusFunction { //http://pid.gwdg.de/handles/11022?URL=http://www.corpora.uni-hamburg.de/repository String authString = EpicApiUser + ":" + EpicApiPass; - String authStringEnc = Base64.encode(authString.getBytes("UTF-8")); + String authStringEnc = Base64.getEncoder().encodeToString(authString.getBytes("UTF-8")); URL url = new URL(HandleEndpoint + HandlePrefix + "?URL=" + handleURL); HttpURLConnection urlConnection = (HttpURLConnection) url.openConnection(); @@ -193,7 +193,7 @@ public 
class HandlePidRegistration extends Publisher implements CorpusFunction { //http://pid.gwdg.de/handles/11022?URL=http://www.corpora.uni-hamburg.de/repository String authString = EpicApiUser + ":" + EpicApiPass; - String authStringEnc = Base64.encode(authString.getBytes("UTF-8")); + String authStringEnc = Base64.getEncoder().encodeToString(authString.getBytes("UTF-8")); URL object=new URL(HandleEndpoint + HandlePrefix + "/"); HttpURLConnection con = (HttpURLConnection) object.openConnection(); diff --git a/src/main/java/de/uni_hamburg/corpora/validation/Checker.java b/src/main/java/de/uni_hamburg/corpora/validation/Checker.java index 0c8b29f884c923d0f7b93efe5a0f097dbe696461..13e2321deb44f5b07d7bcc745845da33d1a429f8 100644 --- a/src/main/java/de/uni_hamburg/corpora/validation/Checker.java +++ b/src/main/java/de/uni_hamburg/corpora/validation/Checker.java @@ -13,10 +13,7 @@ import de.uni_hamburg.corpora.Report; import java.io.IOException; import java.net.URISyntaxException; import java.security.NoSuchAlgorithmException; -import java.util.Collection; -import java.util.Collections; -import java.util.Map; -import java.util.Properties; +import java.util.*; import javax.xml.parsers.ParserConfigurationException; import javax.xml.transform.TransformerException; import javax.xml.xpath.XPathExpressionException; @@ -166,6 +163,6 @@ public abstract class Checker implements CorpusFunction { * @return The map of all parameters and their description */ public Map<String, String> getParameters() { - return Collections.EMPTY_MAP; + return new HashMap<>(); } } diff --git a/src/main/java/de/uni_hamburg/corpora/validation/quest/AnnotationChecker.java b/src/main/java/de/uni_hamburg/corpora/validation/quest/AnnotationChecker.java index 0b4a611a80f75c90f8b18a5fa25134018201b5fe..94facd7e6993daee0635a4fe11c95f940d396de2 100644 --- a/src/main/java/de/uni_hamburg/corpora/validation/quest/AnnotationChecker.java +++ b/src/main/java/de/uni_hamburg/corpora/validation/quest/AnnotationChecker.java 
@@ -43,17 +43,17 @@ abstract class AnnotationChecker extends Checker implements CorpusFunction { private boolean showTagStats = false; // List of tiers to be checked - private final List<String> tierIds = new ArrayList<>(); + protected final List<String> tierIds = new ArrayList<>(); // Regex to separate tokens private final String tokenSeparator = "\\s+" ; // Check if the minimal setup is done - private boolean setUp = false; + protected boolean setUp = false; public AnnotationChecker(Properties properties) { super(false, properties); - if (properties.containsKey("tier-ids")) { - tierIds.addAll(Arrays.asList(properties.getProperty("tier-ids").split(","))); + if (properties.containsKey("annotation-tier-ids")) { + tierIds.addAll(Arrays.asList(properties.getProperty("annotation-tier-ids").split(","))); setUp = true; } // Tags as list in parameter @@ -73,14 +73,15 @@ abstract class AnnotationChecker extends Checker implements CorpusFunction { /** * Loads the tags from an annotation specification file * see https://exmaralda.org/en/utilities/ Templates for working with the Annotation Panel - * @param fileName the name of the annotation specification file + * @param fileName the name of the annotation specification file as a resource * @return the list of tags specified */ private Collection<String> loadAnnotationSpecification(String fileName) { SAXBuilder sb = new SAXBuilder(); List<String> tags = new ArrayList<>(); try { - Document dom = sb.build(new File(fileName)); + //Document dom = sb.build(new File(fileName)); + Document dom = sb.build(this.getClass().getClassLoader().getResourceAsStream(fileName)); List<Attribute> names = Collections.checkedList(XPath.newInstance("//tag/@name").selectNodes(dom), Attribute.class); // Extract attribute values and add them to the tags list @@ -99,10 +100,8 @@ abstract class AnnotationChecker extends Checker implements CorpusFunction { String text = getTierText(cd, tier); if (!text.isEmpty()) { List<String> tokens = 
Arrays.asList(text.split(tokenSeparator)); - if (tokens.size() == 1) - - // Put all tokens into the summary - tagStats.putAll(tokens); + // Put all tokens into the summary + tagStats.putAll(tokens); for (String token : tokens) { // Check if the token is in the tag list if (!tags.isEmpty() && !tags.contains(token)) { @@ -161,7 +160,7 @@ abstract class AnnotationChecker extends Checker implements CorpusFunction { @Override public Map<String, String> getParameters() { Map<String,String> params = super.getParameters(); - params.put("tier-ids","Mandatory identificator(s) for the tiers to be checked, separated by commas"); + params.put("annotation-tier-ids","Mandatory identifier(s) for the tiers to be checked, separated by commas"); params.put("annotation-tags", "Optional list of expected annotation tags, separated by comma"); params.put("annotation-specification", "Optional list of expected annotation tags, in the EXMARaLDA " + "Annotation Panel compatible format"); diff --git a/src/main/java/de/uni_hamburg/corpora/validation/quest/CMDIGenericMetadataChecker.java b/src/main/java/de/uni_hamburg/corpora/validation/quest/CMDIGenericMetadataChecker.java index 3d177e91121058010bac2076ca5a9d868cdef4a5..6310efec8f423e1c7eb608a6dec300df5d5b98a6 100644 --- a/src/main/java/de/uni_hamburg/corpora/validation/quest/CMDIGenericMetadataChecker.java +++ b/src/main/java/de/uni_hamburg/corpora/validation/quest/CMDIGenericMetadataChecker.java @@ -25,6 +25,9 @@ public class CMDIGenericMetadataChecker extends GenericMetadataChecker implement super(properties); if (properties != null && !properties.isEmpty() && properties.containsKey("cmdi-criteria-file")) setCriteriaFile(properties.getProperty("cmdi-criteria-file")); + else { + loadCriteriaResource("cmdi-generic.csv"); + } } diff --git a/src/main/java/de/uni_hamburg/corpora/validation/quest/ChildesGenericMetadataChecker.java b/src/main/java/de/uni_hamburg/corpora/validation/quest/ChildesGenericMetadataChecker.java index 
eabc7ae668a905fa66694aa3585c74bc72a20c89..bc108ec01b5779ee50caca1409ba9824ee60846f 100644 --- a/src/main/java/de/uni_hamburg/corpora/validation/quest/ChildesGenericMetadataChecker.java +++ b/src/main/java/de/uni_hamburg/corpora/validation/quest/ChildesGenericMetadataChecker.java @@ -16,6 +16,11 @@ public class ChildesGenericMetadataChecker extends GenericMetadataChecker implem public ChildesGenericMetadataChecker(Properties properties) { super(properties); + if (properties != null && !properties.isEmpty() && properties.containsKey("childes-criteria-file")) + setCriteriaFile(properties.getProperty("childes-criteria-file")); + else { + loadCriteriaResource("childes-generic.csv"); + } } diff --git a/src/main/java/de/uni_hamburg/corpora/validation/quest/ComaGenericMetadataChecker.java b/src/main/java/de/uni_hamburg/corpora/validation/quest/ComaGenericMetadataChecker.java index 0ce0d4813487c74d498b92a89aa62e7f6c0c8e46..f378603867c2e6204c7725d629ce2fba61308d3b 100644 --- a/src/main/java/de/uni_hamburg/corpora/validation/quest/ComaGenericMetadataChecker.java +++ b/src/main/java/de/uni_hamburg/corpora/validation/quest/ComaGenericMetadataChecker.java @@ -22,6 +22,11 @@ public class ComaGenericMetadataChecker extends GenericMetadataChecker { */ public ComaGenericMetadataChecker(Properties properties) { super(properties); + if (properties != null && !properties.isEmpty() && properties.containsKey("coma-criteria-file")) + setCriteriaFile(properties.getProperty("coma-criteria-file")); + else { + loadCriteriaResource("coma-generic.csv"); + } } /** * Function providing a description of a checker @@ -100,4 +105,11 @@ public class ComaGenericMetadataChecker extends GenericMetadataChecker { Optional<String> optPath = path.stream().reduce((s1, s2) -> s1 + "/" + s2); return optPath.orElse("") ; } + + @Override + public Map<String, String> getParameters() { + Map<String,String> p = super.getParameters(); + p.put("coma-criteria-file", "The file for Coma generic metadata criteria"); + 
return p; + } } diff --git a/src/main/java/de/uni_hamburg/corpora/validation/quest/ELANTierFinder.java b/src/main/java/de/uni_hamburg/corpora/validation/quest/ELANTierFinder.java index 61f3f50ca4fa126d9cbd2c4d1a260e0fab75f687..f3967a66e0cb198459e6be3a3a0b9426b2326698 100644 --- a/src/main/java/de/uni_hamburg/corpora/validation/quest/ELANTierFinder.java +++ b/src/main/java/de/uni_hamburg/corpora/validation/quest/ELANTierFinder.java @@ -2,8 +2,6 @@ package de.uni_hamburg.corpora.validation.quest; import de.uni_hamburg.corpora.CorpusData; import de.uni_hamburg.corpora.ELANData; -import de.uni_hamburg.corpora.EXMARaLDASegmentedTranscriptionData; -import de.uni_hamburg.corpora.EXMARaLDATranscriptionData; import org.jdom.Attribute; import org.jdom.Document; import org.jdom.JDOMException; @@ -21,8 +19,8 @@ public class ELANTierFinder extends TierFinder { public ELANTierFinder(Properties properties) { super(properties); - if (attribute == null || attribute.isEmpty()) { - attribute = "TIER_ID"; + if (attribute_name == null || attribute_name.isEmpty()) { + attribute_name = "TIER_ID"; } } @@ -42,7 +40,7 @@ public class ELANTierFinder extends TierFinder { // Get all id attributes for tiers matching the pattern, get the values and add them to a new list List<String> tierIds = new ArrayList<>(((List<Attribute>) Collections.checkedList(XPath.newInstance( String.format("//TIER[contains(@%s,\"%s\")]/@TIER_ID", - attribute, pattern)).selectNodes(dom), Attribute.class)) + attribute_name, pattern)).selectNodes(dom), Attribute.class)) .stream().map(Attribute::getValue).collect(Collectors.toList())); // Add found tiers to frequency list tiers.putAll(tierIds); diff --git a/src/main/java/de/uni_hamburg/corpora/validation/quest/ELANTierStructureChecker.java b/src/main/java/de/uni_hamburg/corpora/validation/quest/ELANTierStructureChecker.java index ef249b42a88505b2d7a8a6bf73a5883ed903652a..83fd2ec516c0d9c549fde39a1b6be15f4abde8b2 100644 --- 
a/src/main/java/de/uni_hamburg/corpora/validation/quest/ELANTierStructureChecker.java +++ b/src/main/java/de/uni_hamburg/corpora/validation/quest/ELANTierStructureChecker.java @@ -55,7 +55,8 @@ public class ELANTierStructureChecker extends TierStructureChecker { String.join("|", speakers),"speaker")); // Also get constraints Element linguisticTypeElement = - (Element) XPath.newInstance("//LINGUISTIC_TYPE[@LINGUISTIC_TYPE_ID=" + id.getValue() + "]") + (Element) XPath.newInstance("//LINGUISTIC_TYPE[@LINGUISTIC_TYPE_ID=\"" + + id.getValue() + "\"]") .selectSingleNode(((ELANData) cd).getJdom()); if (linguisticTypeElement != null) { Attribute constraints = linguisticTypeElement.getAttribute("CONSTRAINTS"); diff --git a/src/main/java/de/uni_hamburg/corpora/validation/quest/ELANTranscriptionChecker.java b/src/main/java/de/uni_hamburg/corpora/validation/quest/ELANTranscriptionChecker.java index e43d85a4a9bc60c297d3833a0fadb1057fa20980..ecf61f17d5074bfd376b818cad4fd55f232e9a29 100644 --- a/src/main/java/de/uni_hamburg/corpora/validation/quest/ELANTranscriptionChecker.java +++ b/src/main/java/de/uni_hamburg/corpora/validation/quest/ELANTranscriptionChecker.java @@ -1,13 +1,24 @@ package de.uni_hamburg.corpora.validation.quest; +import de.uni_hamburg.corpora.Corpus; import de.uni_hamburg.corpora.CorpusData; import de.uni_hamburg.corpora.ELANData; +import de.uni_hamburg.corpora.Report; import de.uni_hamburg.corpora.utilities.quest.XMLTools; +import org.exmaralda.partitureditor.fsm.FSMException; +import org.exmaralda.partitureditor.jexmaralda.JexmaraldaException; import org.jdom.Document; import org.jdom.Element; import org.jdom.JDOMException; import org.jdom.xpath.XPath; +import org.xml.sax.SAXException; +import javax.xml.parsers.ParserConfigurationException; +import javax.xml.transform.TransformerException; +import javax.xml.xpath.XPathExpressionException; +import java.io.IOException; +import java.net.URISyntaxException; +import java.security.NoSuchAlgorithmException; import 
java.util.*; import java.util.logging.Logger; @@ -20,14 +31,22 @@ public class ELANTranscriptionChecker extends TranscriptionChecker { private final Logger logger = Logger.getLogger(getFunction()); - private final Set<String> tierIds = new HashSet<>(); - public ELANTranscriptionChecker(Properties properties) { super(properties); - logger.info("PROPS: " + props); - if (properties.containsKey("transcription-tiers")) { - tierIds.addAll(Arrays.asList(properties.getProperty("transcription-tiers").split(","))); + + } + + @Override + public Report function(Corpus c, Boolean fix) throws NoSuchAlgorithmException, ClassNotFoundException, FSMException, URISyntaxException, SAXException, IOException, ParserConfigurationException, JexmaraldaException, TransformerException, XPathExpressionException, JDOMException { + Report report = new Report(); + if (props.containsKey("transcription-tier-pattern")) { + Properties properties = new Properties(props); + properties.put("tier-pattern", props.getProperty("transcription-tier-pattern")); + ELANTierFinder etf = new ELANTierFinder(props); + report.merge(etf.function(c, fix)); } + report.merge(super.function(c, fix)); + return report; } @Override @@ -56,21 +75,10 @@ public class ELANTranscriptionChecker extends TranscriptionChecker { else if (props.containsKey("transcription-method") && props.getProperty("transcription-method").equalsIgnoreCase("hiat")) { logger.info("HIAT"); - tiers = Collections.checkedList(XPath.newInstance("//TIER[@LINGUISTIC_TYPE_REF=\"v\"]").selectNodes(dom), - Element.class); + tiers.addAll(Collections.checkedList(XPath.newInstance("//TIER[@LINGUISTIC_TYPE_REF=\"v\"]").selectNodes(dom), + Element.class)); } return tiers; } - @Override - String getTranscriptionText(Element tier) { - return XMLTools.showAllText(tier); - } - - @Override - public Map<String, String> getParameters() { - Map<String,String> params = super.getParameters(); - params.put("transcription-tiers","List of transcription tier IDs separated by 
commas"); - return params; - } } diff --git a/src/main/java/de/uni_hamburg/corpora/validation/quest/ELANValidatorChecker.java b/src/main/java/de/uni_hamburg/corpora/validation/quest/ELANValidatorChecker.java index ce88b2e2418af93207f46d5ad91f3d9841692107..a9ae4f79bcfdb69cafd5428232307139189c1655 100644 --- a/src/main/java/de/uni_hamburg/corpora/validation/quest/ELANValidatorChecker.java +++ b/src/main/java/de/uni_hamburg/corpora/validation/quest/ELANValidatorChecker.java @@ -21,6 +21,7 @@ import java.security.NoSuchAlgorithmException; import java.util.Collection; import java.util.Collections; import java.util.Properties; +import java.util.logging.Logger; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -36,7 +37,7 @@ import mpi.eudico.server.corpora.clomimpl.util.EAFValidator ; public class ELANValidatorChecker extends Checker implements CorpusFunction { // The local logger that can be used for debugging - // Logger logger = Logger.getLogger(this.getClass().toString()); + Logger logger = Logger.getLogger(this.getClass().toString()); public ELANValidatorChecker(Properties properties) { super(false, properties) ; @@ -55,22 +56,31 @@ public class ELANValidatorChecker extends Checker implements CorpusFunction { EAFValidator eafValidator = new EAFValidator(fileUri); eafValidator.validate(); ProcessReport eafReport = eafValidator.getReport(); + // TODO: Analyse report to give better feedback // Extract the errors and warnings from the validator report - Matcher m = Pattern.compile("Received (\\d+) warnings and (\\d+) errors").matcher(eafReport.getReportAsString()); - if (m.matches()) { - int warningCount = Integer.parseInt(m.group(1)); - int errorCount = Integer.parseInt(m.group(2)); - if (warningCount == 0 && errorCount == 0) - report.addNote(getFunction(), "No errors and warnings"); - else if (warningCount > 0 && errorCount == 0) - report.addWarning(getFunction(), "Encountered " + warningCount + " warnings and no errors"); - else if (warningCount == 0 
&& errorCount > 0) - report.addCritical(getFunction(), "Encountered no warnings and " + errorCount + " errors"); - else - report.addCritical(getFunction(), "Encountered " + warningCount + " warnings and " + errorCount + " errors"); - } - else - report.addCritical(getFunction(), "Error extracting warning and error counts"); +// Boolean matched = false; +// for (String line : eafReport.getReportAsString().split("\n+")) { +// Matcher m = Pattern.compile("Received (\\d+) warnings and (\\d+) errors").matcher(line); +// if (m.matches()) { +// matched = true; +// int warningCount = Integer.parseInt(m.group(1)); +// int errorCount = Integer.parseInt(m.group(2)); +// if (warningCount == 0 && errorCount == 0) +// report.addNote(getFunction(), "No errors and warnings"); +// else if (warningCount > 0 && errorCount == 0) +// report.addWarning(getFunction(), "Encountered " + warningCount + " warnings and no errors"); +// else if (warningCount == 0 && errorCount > 0) +// report.addCritical(getFunction(), "Encountered no warnings and " + errorCount + " errors"); +// else +// report.addCritical(getFunction(), "Encountered " + warningCount + " warnings and " + errorCount + " errors"); +// } +// if (!matched) +// report.addCritical(getFunction(), +// ReportItem.newParamMap( +// new String[]{"function", "filename", "description"}, +// new Object[]{getFunction(), cd.getFilename(), "Error extracting warning and error counts"})); +// } + report.addNote(getFunction(),cd,"EAF Validator report:\n" + eafReport.getReportAsString()); return report; } diff --git a/src/main/java/de/uni_hamburg/corpora/validation/quest/EXMARaLDAAnnotationChecker.java b/src/main/java/de/uni_hamburg/corpora/validation/quest/EXMARaLDAAnnotationChecker.java index a9bbe682b7e023bbd992264abd563f1edf7b9733..2d4b456d5b3de55d7b77e7dd3a69a9e7becaf3bf 100644 --- a/src/main/java/de/uni_hamburg/corpora/validation/quest/EXMARaLDAAnnotationChecker.java +++ 
b/src/main/java/de/uni_hamburg/corpora/validation/quest/EXMARaLDAAnnotationChecker.java @@ -1,14 +1,21 @@ package de.uni_hamburg.corpora.validation.quest; -import de.uni_hamburg.corpora.CorpusData; -import de.uni_hamburg.corpora.EXMARaLDATranscriptionData; -import de.uni_hamburg.corpora.EXMARaLDASegmentedTranscriptionData; +import de.uni_hamburg.corpora.*; import de.uni_hamburg.corpora.utilities.quest.XMLTools; +import org.exmaralda.partitureditor.fsm.FSMException; +import org.exmaralda.partitureditor.jexmaralda.JexmaraldaException; import org.jdom.Document; import org.jdom.Element; import org.jdom.JDOMException; import org.jdom.xpath.XPath; +import org.xml.sax.SAXException; +import javax.xml.parsers.ParserConfigurationException; +import javax.xml.transform.TransformerException; +import javax.xml.xpath.XPathExpressionException; +import java.io.IOException; +import java.net.URISyntaxException; +import java.security.NoSuchAlgorithmException; import java.util.*; import java.util.stream.Collectors; @@ -18,6 +25,26 @@ public class EXMARaLDAAnnotationChecker extends AnnotationChecker { super(properties); } + @Override + public Report function(Corpus c, Boolean fix) throws NoSuchAlgorithmException, ClassNotFoundException, FSMException, URISyntaxException, SAXException, IOException, ParserConfigurationException, JexmaraldaException, TransformerException, XPathExpressionException, JDOMException { + Report report = new Report(); + // Check if we have a tier pattern. 
if yes we use the tier finder to get all tier ids + if (props.containsKey("annotation-tier-pattern")) { + // Copy old properties + Properties newProperties = new Properties(); + newProperties.putAll(props); + // convert tier pattern + newProperties.put("tier-pattern", props.getProperty("annotation-tier-pattern")); + // run tier finder + EXMARaLDATierFinder etf = new EXMARaLDATierFinder(newProperties); + report.merge(etf.function(c, fix)); + tierIds.addAll(etf.getTierList()); + setUp = true; + } + report.merge(super.function(c, fix)); + return report; + } + @Override public String getDescription() { return "Either checks the annotation in a list of tiers or generates statistics about the tags used in an " + diff --git a/src/main/java/de/uni_hamburg/corpora/validation/quest/EXMARaLDASpeakerChecker.java b/src/main/java/de/uni_hamburg/corpora/validation/quest/EXMARaLDASpeakerChecker.java index 0f8c53802143efdc3e7465a80d6783070145e0df..5dd5e6faed861479cc32fbdc0fafd5d19d1b7eae 100644 --- a/src/main/java/de/uni_hamburg/corpora/validation/quest/EXMARaLDASpeakerChecker.java +++ b/src/main/java/de/uni_hamburg/corpora/validation/quest/EXMARaLDASpeakerChecker.java @@ -42,11 +42,18 @@ public class EXMARaLDASpeakerChecker extends SpeakerChecker { */ @Override protected List<String> getCorpusSpeakerList(Corpus c) throws JDOMException { - Document dom = c.getComaData().getJdom(); - uniqueSpeakerDistinction = dom.getRootElement().getAttributeValue("uniqueSpeakerDistinction"); - List<Text> sigles = Collections.checkedList(XPath.newInstance("//Speaker/Sigle/text()").selectNodes(dom), - Text.class); - return sigles.stream().map(Text::getText).collect(Collectors.toList()); + List<String> speakers = new ArrayList<>(); + for (CorpusData cd : c.getCorpusData()) { + if (cd.getClass().equals(ComaData.class)) { + Document dom = ((ComaData) cd).getJdom(); + uniqueSpeakerDistinction = dom.getRootElement().getAttributeValue("uniqueSpeakerDistinction"); + List<Text> sigles = 
Collections.checkedList(XPath.newInstance("//Speaker/Sigle/text()").selectNodes(dom), + Text.class); + speakers.addAll(sigles.stream().map(Text::getText).collect(Collectors.toList())); + } + + } + return speakers; } /** diff --git a/src/main/java/de/uni_hamburg/corpora/validation/quest/EXMARaLDATierFinder.java b/src/main/java/de/uni_hamburg/corpora/validation/quest/EXMARaLDATierFinder.java index 7568f56a7c28ccb2d1c5ea7483700fb345f4ca20..9a65105f376e84374fa045d3f6a350c0b97cdb4b 100644 --- a/src/main/java/de/uni_hamburg/corpora/validation/quest/EXMARaLDATierFinder.java +++ b/src/main/java/de/uni_hamburg/corpora/validation/quest/EXMARaLDATierFinder.java @@ -20,8 +20,9 @@ public class EXMARaLDATierFinder extends TierFinder { public EXMARaLDATierFinder(Properties properties) { super(properties); - if (attribute == null || attribute.isEmpty()) { - attribute = "id"; + // Use default attribute + if (attribute_name == null || attribute_name.isEmpty()) { + attribute_name = "id"; } } @@ -44,16 +45,16 @@ public class EXMARaLDATierFinder extends TierFinder { List<String> tierIds = new ArrayList<>(); if (cd instanceof EXMARaLDATranscriptionData) { Document dom = ((EXMARaLDATranscriptionData) cd).getJdom(); - tierIds.addAll(((List<Attribute>) Collections.checkedList(XPath.newInstance( - String.format("//tier[contains(@%s,\"%s\")]/@id", - attribute, pattern)).selectNodes(dom), Attribute.class)) + String xpath = String.format("//tier[contains(@%s,\"%s\")]/@id", + attribute_name, pattern); + tierIds.addAll(((List<Attribute>) Collections.checkedList(XPath.newInstance(xpath).selectNodes(dom), Attribute.class)) .stream().map(Attribute::getValue).collect(Collectors.toList())); } else if (cd instanceof EXMARaLDASegmentedTranscriptionData) { Document dom = ((EXMARaLDASegmentedTranscriptionData) cd).getJdom(); tierIds.addAll(((List<Attribute>) Collections.checkedList(XPath.newInstance( String.format("//segmented-tier[contains(@%s,\"%s\")]/@id", - attribute, pattern)).selectNodes(dom), 
Attribute.class)) + attribute_name, pattern)).selectNodes(dom), Attribute.class)) .stream().map(Attribute::getValue).collect(Collectors.toList())); } // Add found tiers to frequency list diff --git a/src/main/java/de/uni_hamburg/corpora/validation/quest/EXMARaLDATranscriptionChecker.java b/src/main/java/de/uni_hamburg/corpora/validation/quest/EXMARaLDATranscriptionChecker.java index 1f061c01132b2a51905e5822ac5dd55b8a2b8218..f1a612964c1e5c8abe3d733ab91edb7cc0699b0f 100644 --- a/src/main/java/de/uni_hamburg/corpora/validation/quest/EXMARaLDATranscriptionChecker.java +++ b/src/main/java/de/uni_hamburg/corpora/validation/quest/EXMARaLDATranscriptionChecker.java @@ -1,19 +1,55 @@ package de.uni_hamburg.corpora.validation.quest; -import de.uni_hamburg.corpora.CorpusData; -import de.uni_hamburg.corpora.EXMARaLDATranscriptionData; +import de.uni_hamburg.corpora.*; +import org.exmaralda.partitureditor.fsm.FSMException; +import org.exmaralda.partitureditor.jexmaralda.JexmaraldaException; +import org.jdom.Document; import org.jdom.Element; +import org.jdom.JDOMException; +import org.jdom.xpath.XPath; +import org.xml.sax.SAXException; -import java.util.Collection; -import java.util.Collections; -import java.util.List; -import java.util.Properties; +import javax.xml.parsers.ParserConfigurationException; +import javax.xml.transform.TransformerException; +import javax.xml.xpath.XPathExpressionException; +import java.io.IOException; +import java.net.URISyntaxException; +import java.security.NoSuchAlgorithmException; +import java.util.*; +import java.util.logging.Logger; +/** + * Checker for transcription data in an EXMARaLDA file + * @author bba1792, Dr. 
Herbert Lange + * @version 20220516 + */ public class EXMARaLDATranscriptionChecker extends TranscriptionChecker { + + private final Logger logger = Logger.getLogger(getFunction()); + public EXMARaLDATranscriptionChecker(Properties properties) { super(properties); } + @Override + public Report function(Corpus c, Boolean fix) throws NoSuchAlgorithmException, ClassNotFoundException, FSMException, URISyntaxException, SAXException, IOException, ParserConfigurationException, JexmaraldaException, TransformerException, XPathExpressionException, JDOMException { + Report report = new Report(); + // Check if we have a tier pattern. if yes we use the tier finder to get all tier ids + if (props.containsKey("transcription-tier-pattern")) { + // Copy old properties + Properties newProperties = new Properties(); + newProperties.putAll(props); + // convert tier pattern + newProperties.put("tier-pattern", props.getProperty("transcription-tier-pattern")); + // run tier finder + EXMARaLDATierFinder etf = new EXMARaLDATierFinder(newProperties); + report.merge(etf.function(c, fix)); + tierIds.addAll(etf.getTierList()); + } + report.merge(super.function(c, fix)); + return report; + } + @Override public String getDescription() { return "Checker for the transcription in an EXMARaLDA transcription file"; @@ -25,12 +61,27 @@ public class EXMARaLDATranscriptionChecker extends TranscriptionChecker { } @Override - List<Element> getTranscriptionTiers(CorpusData cd) { - return null; - } + List<Element> getTranscriptionTiers(CorpusData cd) throws JDOMException { + List<Element> tiers = new ArrayList<>(); + Document dom = ((EXMARaLDATranscriptionData) cd).getJdom(); + // Explicit list of tiers + if (!tierIds.isEmpty()) + for (String id : tierIds) { + Element tier = + (Element) XPath.newInstance(String.format("//tier[@id=\"%s\"]",id)).selectSingleNode(dom); + if (tier != null) + tiers.add(tier); + } + // HIAT tiers of category v (verbal) + else if (props.containsKey("transcription-method") && + 
props.getProperty("transcription-method").equalsIgnoreCase("hiat")) { + logger.info("HIAT"); + tiers.addAll(Collections.checkedList(XPath.newInstance("//tier[@category=\"v\"]").selectNodes(dom), + Element.class)); + } + return tiers; + - @Override - String getTranscriptionText(Element tier) { - return null; } + } diff --git a/src/main/java/de/uni_hamburg/corpora/validation/quest/EXMARaLDAValidatorChecker.java b/src/main/java/de/uni_hamburg/corpora/validation/quest/EXMARaLDAValidatorChecker.java index a2d2059625796ec93fb637b53f0f1a0d0b70eee5..fa224ae28dbef961c8eb31e02eebfe78eb1b29f9 100644 --- a/src/main/java/de/uni_hamburg/corpora/validation/quest/EXMARaLDAValidatorChecker.java +++ b/src/main/java/de/uni_hamburg/corpora/validation/quest/EXMARaLDAValidatorChecker.java @@ -21,6 +21,7 @@ import java.nio.file.Paths; import java.security.NoSuchAlgorithmException; import java.util.*; import java.util.logging.Logger; +import java.util.stream.Collectors; /** @@ -46,6 +47,7 @@ public class EXMARaLDAValidatorChecker extends Checker implements CorpusFunction @Override public Report function(CorpusData cd, Boolean fix) throws NoSuchAlgorithmException, ClassNotFoundException, FSMException, URISyntaxException, SAXException, IOException, ParserConfigurationException, JexmaraldaException, TransformerException, XPathExpressionException, JDOMException { logger.info("Running function"); + boolean problem = false; Report report = new Report(); URL fileUri = Paths.get(cd.getURL().toURI()).toAbsolutePath().toUri().toURL(); BasicTranscription bt = new BasicTranscription(); @@ -53,6 +55,7 @@ public class EXMARaLDAValidatorChecker extends Checker implements CorpusFunction bt.BasicTranscriptionFromJDOMDocument(((EXMARaLDATranscriptionData) cio.readFileURL(fileUri)).getJdom()); String[] duplicateTranscriptionTiers = bt.getDuplicateTranscriptionTiers(); if (duplicateTranscriptionTiers.length > 0) { + problem = true; report.addCritical(getFunction(), ReportItem.newParamMap(new 
String[]{"function", "filename", "description"}, new String[]{getFunction(), cd.getFilename(), "Duplicate transcription tiers: " + String.join( @@ -60,6 +63,7 @@ public class EXMARaLDAValidatorChecker extends Checker implements CorpusFunction } String[] orphanedTranscriptionTiers = bt.getOrphanedTranscriptionTiers(); if (orphanedTranscriptionTiers.length > 0) { + problem = true; report.addWarning(getFunction(), ReportItem.newParamMap(new String[]{"function", "filename", "description"}, new String[]{getFunction(), cd.getFilename(), "Orphaned transcription tiers: " + String.join( @@ -67,6 +71,7 @@ public class EXMARaLDAValidatorChecker extends Checker implements CorpusFunction } String[] orphanedAnnotationTiers = bt.getOrphanedAnnotationTiers(); if (orphanedAnnotationTiers.length > 0) { + problem = true; report.addWarning(getFunction(), ReportItem.newParamMap(new String[]{"function", "filename", "description"}, new String[]{getFunction(), cd.getFilename(), "Orphaned annotation tiers: " + String.join( @@ -74,6 +79,7 @@ public class EXMARaLDAValidatorChecker extends Checker implements CorpusFunction } String[] inconsistencies = bt.getBody().getCommonTimeline().getInconsistencies(); if (inconsistencies.length > 0) { + problem = true; report.addCritical(getFunction(), ReportItem.newParamMap(new String[]{"function", "filename", "description"}, new String[]{getFunction(), cd.getFilename(), @@ -81,18 +87,30 @@ public class EXMARaLDAValidatorChecker extends Checker implements CorpusFunction ",", inconsistencies)})); } Hashtable<String, String[]> annotationMismatches = bt.getAnnotationMismatches(); - if (!annotationMismatches.isEmpty()) - report.addCritical(getFunction(),ReportItem.newParamMap(new String[]{"function", "filename", "description"}, - new String[]{getFunction(), cd.getFilename(),"Annotation mismatch in tiers: " + String.join(",", - annotationMismatches.keySet())})); + // Only check tiers where we have a non-empty list in the map + Set<String> missmatchTiers = 
annotationMismatches.keySet().stream().filter((k) -> + annotationMismatches.get(k).length != 0).collect(Collectors.toSet()); + if (!missmatchTiers.isEmpty()) { + problem = true; + report.addCritical(getFunction(), ReportItem.newParamMap(new String[]{"function", "filename", "description"}, + new String[]{getFunction(), cd.getFilename(), "Annotation mismatch in tiers: " + String.join(",", + missmatchTiers)})); + } Vector segmentationErrors = new HIATSegmentation().getSegmentationErrors(bt); // TODO the exact reason and form of segmentation errors is not clear if (!segmentationErrors.isEmpty()) { + problem = true; for (Object o : segmentationErrors) { report.addCritical(getFunction(),ReportItem.newParamMap(new String[]{"function", "filename", "description"}, new String[]{getFunction(), cd.getFilename(),"HIAT Segmentation error: " + o.toString()})); } } + if (!problem) { + report.addCorrect(getFunction(),ReportItem.newParamMap( + new String[]{"function","filename","description"}, + new Object[]{getFunction(),cd.getFilename(),"No problems found in file"} + )); + } return report; } diff --git a/src/main/java/de/uni_hamburg/corpora/validation/quest/ExbLangCodes.java b/src/main/java/de/uni_hamburg/corpora/validation/quest/ExbLangCodes.java index c14ccb392f875947188666c70327dd1a9db9a043..5b73854a98c868837f5e6a303e7b3b5a4c4ebe51 100644 --- a/src/main/java/de/uni_hamburg/corpora/validation/quest/ExbLangCodes.java +++ b/src/main/java/de/uni_hamburg/corpora/validation/quest/ExbLangCodes.java @@ -76,7 +76,8 @@ public class ExbLangCodes extends Checker implements CorpusFunction { public Report function(Corpus c, Boolean fix) throws NoSuchAlgorithmException, ClassNotFoundException, FSMException, URISyntaxException, SAXException, IOException, ParserConfigurationException, JexmaraldaException, TransformerException, XPathExpressionException, JDOMException { Report stats = new Report(); for (CorpusData cdata : c.getBasicTranscriptionData()) { - stats.merge(function(cdata, fix)); + if 
(getIsUsableFor().contains(cdata.getClass())) + stats.merge(function(cdata, fix)); } return stats; } diff --git a/src/main/java/de/uni_hamburg/corpora/validation/quest/FileListChecker.java b/src/main/java/de/uni_hamburg/corpora/validation/quest/FileListChecker.java index 9b791a674da63dfb0108e44e251f97d3c0717567..ec36096e9d62902cdf11014b08a236c0b2b0bee4 100644 --- a/src/main/java/de/uni_hamburg/corpora/validation/quest/FileListChecker.java +++ b/src/main/java/de/uni_hamburg/corpora/validation/quest/FileListChecker.java @@ -48,17 +48,39 @@ public class FileListChecker extends Checker implements CorpusFunction { return uris; } + /** + * Splits a file list on commas and converts to URIs + * @param fileList the comma-separated list + * @return the set of URIs + * @throws MalformedURLException if the file name cannot be converted into a URL + * @throws URISyntaxException if the URL cannot be converted into a URI + */ + private static Set<URI> splitFileList(String fileList) throws MalformedURLException, URISyntaxException { + Set<URI> uris = new HashSet<>(); + for (String fname : fileList.split(",")) { + uris.add(new URL(fname).toURI().normalize()); + } + return uris; + } + Set<URI> expectedFiles = new HashSet<>(); Set<URI> presentFiles = new HashSet<>(); public FileListChecker(Properties properties) throws FileNotFoundException, MalformedURLException, URISyntaxException { super(false, properties); - if (properties.containsKey("expected-files-list")) { - expectedFiles = readFileList(properties.getProperty("expected-files-list")); + if (properties.containsKey("expected-files-file")) { + expectedFiles = readFileList(properties.getProperty("expected-files-file")); + } + else if (properties.containsKey("expected-files-list")) { + expectedFiles = splitFileList(properties.getProperty("expected-files-list")); + } + if (properties.containsKey("present-files-file")) { + presentFiles = readFileList(properties.getProperty("present-files-file")); } - if 
(properties.containsKey("present-files-list")) { - presentFiles = readFileList(properties.getProperty("present-files-list")); + else if (properties.containsKey("present-files-list")) { + expectedFiles = splitFileList(properties.getProperty("present-files-list")); } + } /** @@ -95,17 +117,22 @@ public class FileListChecker extends Checker implements CorpusFunction { @Override public Report function(Corpus c, Boolean fix) throws NoSuchAlgorithmException, ClassNotFoundException, FSMException, URISyntaxException, SAXException, IOException, ParserConfigurationException, JexmaraldaException, TransformerException, XPathExpressionException, JDOMException { + if (props.containsKey("expected-files-linked")) { + LinkedFileChecker lfc = new LinkedFileChecker(props); + lfc.function(c, fix); + expectedFiles.addAll(lfc.getFileList()); + } Report report = new Report(); // Try to read corpus directory instead if (presentFiles.isEmpty()){ presentFiles.addAll(FileTools.listFiles(Paths.get(c.getBaseDirectory().toURI()))); } Set<URI> unexpectedFiles = - presentFiles.stream().filter((f) -> !expectedFiles.contains(f)).collect(Collectors.toSet()); + presentFiles.stream().filter((f) -> !(expectedFiles.contains(f) || new File(f).isDirectory())).collect(Collectors.toSet()); Set<URI> missingFiles = expectedFiles.stream().filter((f) -> !presentFiles.contains(f)).collect(Collectors.toSet()); if (!unexpectedFiles.isEmpty()) - report.addCritical(getFunction(), ReportItem.newParamMap(new String[]{"function", + report.addWarning(getFunction(), ReportItem.newParamMap(new String[]{"function", "description", "howtoFix"}, new Object[]{getFunction(), "Unexpected files encountered:\n" + @@ -116,7 +143,7 @@ public class FileListChecker extends Checker implements CorpusFunction { report.addCritical(getFunction(), ReportItem.newParamMap(new String[]{"function", "description", "howtoFix"}, new Object[]{getFunction(), - "Files does not exist:\n" + + "Files do not exist:\n" + 
missingFiles.stream().map(URI::toString).collect(Collectors.joining("\n")), "Check the file references in the documentation and remove the reference to " + "the files if they have been removed intentionally"})); @@ -131,9 +158,13 @@ public class FileListChecker extends Checker implements CorpusFunction { @Override public Map<String, String> getParameters() { Map<String,String> params = super.getParameters(); - params.put("expected-files-list","A file containing names of all expected file names, one name per line"); - params.put("present-files-list", "A file containing names of all file names of files present, one name per " + + params.put("expected-files-file","A file containing names of all expected file names, one name per line"); + params.put("present-files-file", "A file containing names of all file names of files present, one name per " + "line"); + params.put("expected-files-list","A list containing names of all expected file names, separated by commas"); + params.put("present-files-list", "A list containing names of all file names of files present, separated by " + + "commas"); + params.put("expected-files-linked", "Flag to use a files linked in corpus files as expected files"); return params; } } diff --git a/src/main/java/de/uni_hamburg/corpora/validation/quest/GenericMetadataChecker.java b/src/main/java/de/uni_hamburg/corpora/validation/quest/GenericMetadataChecker.java index 7e3e65874a0f9529d69836baf0d9de6794020bca..e94878058e7c330769700bcf92df8220cadb04c3 100644 --- a/src/main/java/de/uni_hamburg/corpora/validation/quest/GenericMetadataChecker.java +++ b/src/main/java/de/uni_hamburg/corpora/validation/quest/GenericMetadataChecker.java @@ -13,6 +13,8 @@ import javax.xml.transform.TransformerException; import javax.xml.xpath.XPathExpressionException; import java.io.FileReader; import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; import java.net.MalformedURLException; import java.net.URI; import 
java.net.URISyntaxException; @@ -101,11 +103,6 @@ abstract class GenericMetadataChecker extends Checker implements CorpusFunction @Override public Report function(CorpusData cd, Boolean fix) throws NoSuchAlgorithmException, ClassNotFoundException, FSMException, URISyntaxException, SAXException, IOException, ParserConfigurationException, JexmaraldaException, TransformerException, XPathExpressionException, JDOMException { Report report = new Report(); - // Validate XML data if possible - if (cd instanceof XMLData) { - XsdChecker xsdc = new XsdChecker(new Properties()); - report.merge(xsdc.function(cd,false)); - } // Only work if properly set up if (setUp && shouldBeChecked(cd.getURL())) { for (GenericMetadataCriterion c : criteria) { @@ -376,6 +373,27 @@ abstract class GenericMetadataChecker extends Checker implements CorpusFunction } } + /** + * Loads the criteria as a resource + * @param name the name of the resource + */ + public void loadCriteriaResource(String name) { + try { + InputStream resourceStream = this.getClass().getClassLoader().getResourceAsStream("metadata/"+name); + if (resourceStream != null) { + // Read CSV file + criteria = new CsvToBeanBuilder<GenericMetadataCriterion>(new InputStreamReader(resourceStream)) + .withType(GenericMetadataCriterion.class) + .withSkipLines(1) // skip header + .build() + .parse(); + setUp = true; + } + } catch (Exception e) { + logger.log(Level.SEVERE, "Encountered exception when loading criteria ", e); + } + } + @Override public Map<String, String> getParameters() { Map<String, String> params = super.getParameters(); diff --git a/src/main/java/de/uni_hamburg/corpora/validation/quest/GlossChecker.java b/src/main/java/de/uni_hamburg/corpora/validation/quest/GlossChecker.java index d6b098e0aeaf0f48fbde90f9dafaf6594a8489c8..ba2bacb0c3fb85fb2af25db631cb1cae06637b92 100644 --- a/src/main/java/de/uni_hamburg/corpora/validation/quest/GlossChecker.java +++ 
b/src/main/java/de/uni_hamburg/corpora/validation/quest/GlossChecker.java @@ -186,7 +186,13 @@ public abstract class GlossChecker extends Checker implements CorpusFunction { @Override public Report function(Corpus c, Boolean fix) throws NoSuchAlgorithmException, ClassNotFoundException, FSMException, URISyntaxException, SAXException, IOException, ParserConfigurationException, JexmaraldaException, TransformerException, XPathExpressionException, JDOMException { - return null; + Report report = new Report(); + for (CorpusData cd : c.getCorpusData()) { + if (getIsUsableFor().contains(cd.getClass())) { + report.merge(function(cd,fix)); + } + } + return report; } @Override diff --git a/src/main/java/de/uni_hamburg/corpora/validation/quest/IMDIGenericMetadataChecker.java b/src/main/java/de/uni_hamburg/corpora/validation/quest/IMDIGenericMetadataChecker.java index 2c16918decea77e8e3252067b8db13097f298026..ea6b0a253f576b92726cd1a5325465b8c9bc6705 100644 --- a/src/main/java/de/uni_hamburg/corpora/validation/quest/IMDIGenericMetadataChecker.java +++ b/src/main/java/de/uni_hamburg/corpora/validation/quest/IMDIGenericMetadataChecker.java @@ -24,9 +24,11 @@ public class IMDIGenericMetadataChecker extends GenericMetadataChecker implement */ public IMDIGenericMetadataChecker(Properties properties) { super(properties); - logger.info(properties.toString()); if (properties != null && !properties.isEmpty() && properties.containsKey("imdi-criteria-file")) setCriteriaFile(properties.getProperty("imdi-criteria-file")); + else { + loadCriteriaResource("imdi-generic.csv"); + } } /** diff --git a/src/main/java/de/uni_hamburg/corpora/validation/quest/LinkedFileChecker.java b/src/main/java/de/uni_hamburg/corpora/validation/quest/LinkedFileChecker.java index 307df798ae8b4447073478762bfc5473ce5ca704..fe37d5b47fbbe93168dd0bf87be3070bfa3b20a9 100644 --- a/src/main/java/de/uni_hamburg/corpora/validation/quest/LinkedFileChecker.java +++ 
b/src/main/java/de/uni_hamburg/corpora/validation/quest/LinkedFileChecker.java @@ -138,13 +138,15 @@ public class LinkedFileChecker extends Checker implements CorpusFunction { List<URI> refFiles = new ArrayList<>(); try { if (cd instanceof ComaData) - refFiles.addAll(getReferencedFiles(report, (ComaData) cd)); + refFiles.addAll(getReferencedFiles((ComaData) cd)); else if (cd instanceof EXMARaLDATranscriptionData) - refFiles.addAll(getReferencedFiles(report, (EXMARaLDATranscriptionData) cd)); + refFiles.addAll(getReferencedFiles((EXMARaLDATranscriptionData) cd)); + else if (cd instanceof EXMARaLDASegmentedTranscriptionData) + refFiles.addAll(getReferencedFiles((EXMARaLDASegmentedTranscriptionData) cd)); else if (cd instanceof ELANData) - refFiles.addAll(getReferencedFiles(report, (ELANData) cd)); + refFiles.addAll(getReferencedFiles((ELANData) cd)); else if (cd instanceof IMDIData) - refFiles.addAll(getReferencedFiles(report, (IMDIData) cd)); + refFiles.addAll(getReferencedFiles((IMDIData) cd)); } catch (JDOMException | MalformedURLException | URISyntaxException e) { report.addCritical(getFunction(), ReportItem.newParamMap( @@ -163,7 +165,8 @@ public class LinkedFileChecker extends Checker implements CorpusFunction { public Report function(Corpus c, Boolean fix) throws NoSuchAlgorithmException, ClassNotFoundException, FSMException, URISyntaxException, SAXException, IOException, ParserConfigurationException, JexmaraldaException, TransformerException, XPathExpressionException, JDOMException { Report report = new Report(); for (CorpusData cd : c.getCorpusData()) - report.merge(function(cd, fix)); + if (getIsUsableFor().contains(cd.getClass())) + report.merge(function(cd, fix)); return report; } @@ -172,6 +175,7 @@ public class LinkedFileChecker extends Checker implements CorpusFunction { Set<Class<? 
extends CorpusData>> usableFor = new HashSet<>(); usableFor.add(ComaData.class); usableFor.add(EXMARaLDATranscriptionData.class); + usableFor.add(EXMARaLDASegmentedTranscriptionData.class); usableFor.add(ELANData.class); usableFor.add(IMDIData.class); // usableFor.add(TEIData.class); @@ -184,14 +188,13 @@ public class LinkedFileChecker extends Checker implements CorpusFunction { /** * Gets the list of files from an Coma corpus file * - * @param report the report to store potential problems * @param cd the corpus file * @return the list of URIs for all referenced files * @throws JDOMException on problems accessing information using xpath * @throws MalformedURLException on problems creating URIs * @throws URISyntaxException on problems creating URIs */ - private List<URI> getReferencedFiles(Report report, ComaData cd) throws JDOMException, MalformedURLException, URISyntaxException { + private List<URI> getReferencedFiles(ComaData cd) throws JDOMException, MalformedURLException, URISyntaxException { ArrayList<URI> files = new ArrayList<>(); Set<String> part1 = new HashSet<>(Arrays.asList("Transcription", "transcription", "Media", "media")); @@ -216,17 +219,17 @@ public class LinkedFileChecker extends Checker implements CorpusFunction { /** * Gets the list of files from an EXMARaLDA corpus file - * - * @param report the report to store potential problems + * * @param cd the corpus file * @return the list of URIs for all referenced files * @throws JDOMException on problems accessing information using xpath * @throws MalformedURLException on problems creating URIs * @throws URISyntaxException on problems creating URIs */ - private List<URI> getReferencedFiles(Report report, EXMARaLDATranscriptionData cd) throws JDOMException, MalformedURLException, URISyntaxException { + private List<URI> getReferencedFiles(EXMARaLDATranscriptionData cd) throws JDOMException, MalformedURLException, URISyntaxException { ArrayList<URI> files = new ArrayList<>(); - List<Element> 
referencedFiles = XPath.newInstance("//referenced-file").selectNodes(cd.getJdom()); + List<Element> referencedFiles = + new ArrayList<>(XPath.newInstance("//referenced-file").selectNodes(cd.getJdom())); for (Element file : referencedFiles) { File tmpFile = new File(new URL(cd.getParentURL() + file.getAttribute("url").getValue()).toURI()); @@ -237,38 +240,62 @@ public class LinkedFileChecker extends Checker implements CorpusFunction { } /** - * Gets the list of files from an ELAN corpus file + * Gets the list of files from an EXMARaLDA segmented corpus file * - * @param report the report to store potential problems * @param cd the corpus file * @return the list of URIs for all referenced files * @throws JDOMException on problems accessing information using xpath * @throws MalformedURLException on problems creating URIs * @throws URISyntaxException on problems creating URIs */ - private List<URI> getReferencedFiles(Report report, ELANData cd) throws JDOMException, MalformedURLException, URISyntaxException { + private List<URI> getReferencedFiles(EXMARaLDASegmentedTranscriptionData cd) throws JDOMException, MalformedURLException, URISyntaxException { ArrayList<URI> files = new ArrayList<>(); - List<Element> referencedFiles = XPath.newInstance("//MEDIA_DESCRIPTOR").selectNodes(cd.getJdom()); + List<Element> referencedFiles = + new ArrayList<>(XPath.newInstance("//referenced-file").selectNodes(cd.getJdom())); for (Element file : referencedFiles) { File tmpFile = new File(new URL(cd.getParentURL() + - file.getAttribute("RELATIVE_MEDIA_URL").getValue()).toURI()); + file.getAttribute("url").getValue()).toURI()); URI fileUri = tmpFile.toURI(); files.add(fileUri); } return files; } + /** + * Gets the list of files from an ELAN corpus file + * + * @param cd the corpus file + * @return the list of URIs for all referenced files + * @throws JDOMException on problems accessing information using xpath + * @throws MalformedURLException on problems creating URIs + * @throws 
URISyntaxException on problems creating URIs + */ + private List<URI> getReferencedFiles(ELANData cd) throws JDOMException, MalformedURLException, URISyntaxException { + ArrayList<URI> files = new ArrayList<>(); + List<Element> referencedFiles = + new ArrayList<>(XPath.newInstance("//MEDIA_DESCRIPTOR").selectNodes(cd.getJdom())); + for (Element file : referencedFiles) { + logger.info(file.toString()); + if (file.getAttributeValue("RELATIVE_MEDIA_URL") != null) { + File tmpFile = new File(new URL(cd.getParentURL() + + file.getAttributeValue("RELATIVE_MEDIA_URL")).toURI()); + URI fileUri = tmpFile.toURI(); + files.add(fileUri); + } + } + return files; + } + /** * Gets the list of files from an IMDI corpus file * - * @param report the report to store potential problems * @param cd the corpus file * @return the list of URIs for all referenced files * @throws JDOMException on problems accessing information using xpath * @throws MalformedURLException on problems creating URIs * @throws URISyntaxException on problems creating URIs */ - private List<URI> getReferencedFiles(Report report, IMDIData cd) throws JDOMException, MalformedURLException, URISyntaxException { + private List<URI> getReferencedFiles(IMDIData cd) throws JDOMException, MalformedURLException, URISyntaxException { ArrayList<URI> files = new ArrayList<>(); List<Element> referencedFiles = XPath.newInstance("//MEDIA_DESCRIPTOR").selectNodes(cd.getJdom()); for (Element file : referencedFiles) { diff --git a/src/main/java/de/uni_hamburg/corpora/validation/quest/NullChecker.java b/src/main/java/de/uni_hamburg/corpora/validation/quest/NullChecker.java index 9e927071de3184a2da4c34efd96836e3631daada..2ea6e57242da793a70d4e9e8b021aa4ddcf2934b 100644 --- a/src/main/java/de/uni_hamburg/corpora/validation/quest/NullChecker.java +++ b/src/main/java/de/uni_hamburg/corpora/validation/quest/NullChecker.java @@ -32,7 +32,8 @@ public class NullChecker extends Checker implements CorpusFunction { public Report function(Corpus 
c, Boolean fix) { Report stats = new Report(); for (CorpusData cdata : c.getCorpusData()) { - stats.merge(execute(cdata)); + if (getIsUsableFor().contains(cdata.getClass())) + stats.merge(execute(cdata)); } return stats; } diff --git a/src/main/java/de/uni_hamburg/corpora/validation/quest/RefcoChecker.java b/src/main/java/de/uni_hamburg/corpora/validation/quest/RefcoChecker.java index f3f66396d485d483e8076ce6edc4f1510f274a5a..e344b59cdaee6e34b4f3f7e0b0a0ea818530331f 100644 --- a/src/main/java/de/uni_hamburg/corpora/validation/quest/RefcoChecker.java +++ b/src/main/java/de/uni_hamburg/corpora/validation/quest/RefcoChecker.java @@ -9,6 +9,7 @@ import com.google.common.primitives.Chars; import de.uni_hamburg.corpora.*; import de.uni_hamburg.corpora.utilities.quest.DictionaryAutomaton; import de.uni_hamburg.corpora.utilities.quest.FileTools; +import de.uni_hamburg.corpora.utilities.quest.FrequencyList; import de.uni_hamburg.corpora.utilities.quest.XMLTools; import de.uni_hamburg.corpora.validation.Checker; import org.apache.commons.lang.time.DurationFormatUtils; @@ -557,23 +558,26 @@ public class RefcoChecker extends Checker implements CorpusFunction { /** * The frequency list of all transcription tokens in the corpus */ - private HashMap<String,Integer> tokenFreq = new HashMap<>(); + // private HashMap<String,Integer> tokenFreq = new HashMap<>(); + private FrequencyList tokenFreq = new FrequencyList(); /** * The frequency list of all segmented annotation/morphology glosses in the corpus */ - private HashMap<String,Integer> morphemeFreq = new HashMap<>(); + // private HashMap<String,Integer> morphemeFreq = new HashMap<>(); + private FrequencyList morphemeFreq = new FrequencyList(); /** * The frequency list of all non-segmented annotation/morphology glosses in the corpus */ - private HashMap<String,Integer> glossFreq = new HashMap<>(); + // private HashMap<String,Integer> glossFreq = new HashMap<>(); + private FrequencyList glossFreq = new FrequencyList(); /** * The 
frequency list of all non-segmentable gloss tokens */ - private HashMap<String,Integer> missingGlossFreq = new HashMap<>(); - + // private HashMap<String,Integer> missingGlossFreq = new HashMap<>(); + private FrequencyList missingGlossFreq = new FrequencyList(); /** * The global report, will be filled by the constructor and the function applied to the complete corpus */ @@ -689,7 +693,7 @@ public class RefcoChecker extends Checker implements CorpusFunction { public Report function(Corpus c, Boolean fix) throws NoSuchAlgorithmException, ClassNotFoundException, FSMException, URISyntaxException, SAXException, IOException, ParserConfigurationException, JexmaraldaException, TransformerException, XPathExpressionException, JDOMException { if (refcoFileLoaded) { report.addNote(getFunction(),"Report created by RefCo checker version " + REFCO_CHECKER_VERSION + - " based on documentation following RefCo " + criteria.refcoVersion + + " based on documentation following RefCo " + criteria.refcoVersion.information + " specification version"); System.out.println("... 
running the corpus function"); // Create the current report @@ -697,21 +701,27 @@ public class RefcoChecker extends Checker implements CorpusFunction { // Set the RefCo corpus setRefcoCorpus(c); // Initialize frequency list for glosses - for (Gloss gloss : criteria.glosses) { - morphemeFreq.put(gloss.gloss, 0); - } +// for (Gloss gloss : criteria.glosses) { +// morphemeFreq.put(gloss.gloss, 0); +// } // Run the generic tests and merge their reports into the current report // but flag allows skipping it if (!props.containsKey("skip-documentation-check") || !props.getProperty("skip-documentation-check").equalsIgnoreCase("true")) report.merge(refcoDocumentationCheck()); + // Generic tier test + props.put("elan-speakers", + criteria.sessions.stream().map((s) -> + s.speakerName).collect(Collectors.joining(","))); + ELANTierStructureChecker etsc = new ELANTierStructureChecker(props); + report.merge(etsc.function(c,false)); // Apply function for each of the supported file. Again merge the reports for (CorpusData cdata : c.getCorpusData()) { //report.merge(function(cdata, fix)); function(cdata, fix); } // Check for morpheme glosses that never occurred in the complete corpus - for (Map.Entry<String, Integer> e : morphemeFreq.entrySet()) { + for (Map.Entry<String, Integer> e : morphemeFreq.getMap().entrySet()) { if (e.getValue() == 0) report.addWarning(getFunction(), ReportItem.newParamMap(new String[]{"function", "filename", "description", "howtoFix"}, @@ -719,11 +729,18 @@ public class RefcoChecker extends Checker implements CorpusFunction { "Corpus data: Morpheme gloss never encountered in corpus: " + e.getKey(), "Check for potential errors or remove gloss from documentation"})); } - if (!missingGlossFreq.isEmpty() && props.containsKey("missing-gloss-stats") && - props.getProperty("missing-gloss-stats").equalsIgnoreCase("true")) + if (!missingGlossFreq.isEmpty()) report.addNote(getFunction(),"Corpus data: Morpheme glosses missing from documentations:\n" + - 
missingGlossFreq.keySet().stream().map((k) -> k + ":" + missingGlossFreq.get(k)) - .collect(Collectors.joining("\n"))); + missingGlossFreq.toString()); +// missingGlossFreq.keySet().stream().map((k) -> k + ":" + missingGlossFreq.get(k)) +// .collect(Collectors.joining("\n"))); + if (!glossFreq.isEmpty() && props.containsKey("gloss-stats") && + props.getProperty("gloss-stats").equalsIgnoreCase("true")) { + report.addNote(getFunction(), "Corpus data: Glosses encountered in the corpus:\n" + + glossFreq.toString()); +// glossFreq.keySet().stream().map((k) -> k + ":" + glossFreq.get(k)) +// .collect(Collectors.joining("\n"))); + } // Check all gloss tokens (not-segmented) for rare ones very similar to quite common ones, i.e. tokens with // Levenshtein difference 1 with a higher frequency count /*DictionaryAutomaton glossDictionary = @@ -1286,10 +1303,12 @@ public class RefcoChecker extends Checker implements CorpusFunction { if (punctuation.function.equalsIgnoreCase("morpheme break")) glossSeparator.add(punctuation.character); criteria.punctuations.add(punctuation); - } else if (columns.size() > 0 && !safeGetText(columns.get(0).getChild("p", textNamespace)).equals( - "Characters")) { - missingData = true; } + // TODO: this is weird +// else if (columns.size() > 0 && !safeGetText(columns.get(0).getChild("p", textNamespace)).equals( +// "Characters")) { +// missingData = true; +// } } if (missingData || rowList.size() <= 1) report.addCritical(getFunction(),ReportItem.newParamMap(new String[]{"function","filename", @@ -2029,7 +2048,8 @@ public class RefcoChecker extends Checker implements CorpusFunction { // Check if token either is a gloss or each character is in the valid characters mismatch = false ; // Update frequency list - tokenFreq.compute(token,(k,v) -> (v == null) ? 1 : v + 1); + //tokenFreq.compute(token,(k,v) -> (v == null) ? 
1 : v + 1); + tokenFreq.put(token); // Token is not one of the glosses if (!glosses.contains(token)) { // Check if we can segment the token using the chunks @@ -2236,7 +2256,8 @@ public class RefcoChecker extends Checker implements CorpusFunction { List<String> segments = glossAutomaton.segmentWord(normalizedMorpheme); if (segments == null || segments.isEmpty()) { missing += 1; - missingGlossFreq.compute(normalizedMorpheme, (k, v) -> (v == null) ? 1 : v + 1); +// missingGlossFreq.compute(normalizedMorpheme, (k, v) -> (v == null) ? 1 : v + 1); + missingGlossFreq.put(normalizedMorpheme); // his would lead to large amount of warnings try { // Location l = getLocation((ELANData) cd, morpheme); @@ -2260,14 +2281,17 @@ public class RefcoChecker extends Checker implements CorpusFunction { matched += 1; for (String segment : segments) { // Remove initial periods and keep track of the count - morphemeFreq.compute(segment.replaceAll("^\\.",""), (k, v) -> (v == null) ? 1 : v + 1); + //morphemeFreq.compute(segment.replaceAll("^\\.",""), (k, v) -> (v == null) ? 1 : v + + // 1); + morphemeFreq.put(segment.replaceAll("^\\.","")); } } } // OLD // morphemeFreq.compute(normalizedMorpheme,(k, v) -> (v == null) ? 1 : v + 1); } - glossFreq.compute(token,(k, v) -> (v == null) ? 1 : v + 1); +// glossFreq.compute(token,(k, v) -> (v == null) ? 
1 : v + 1); + glossFreq.put(token); } } float percentValid = (float)matched/(matched+missing) ; @@ -2363,6 +2387,21 @@ public class RefcoChecker extends Checker implements CorpusFunction { Report report = new Report() ; // Check for ELAN data if (cd instanceof ELANData) { + // Generic checks + try { + // Part of ELANValidatorCheckes + // XsdChecker xc = new XsdChecker(props); + // report.merge(xc.function(cd, false)); + ELANValidatorChecker evc = new ELANValidatorChecker(props); + report.merge(evc.function(cd,false)); + } + catch (Exception e){ + report.addCritical(getFunction(), + ReportItem.newParamMap(new String[]{"function","filename","description", "exception"}, + new Object[]{getFunction(), cd.getFilename(), "Exception encountered when calling " + + "validation checker ", e})); + } + // Check the transcription // but parameter allows skipping if (!props.containsKey("skip-transcription-check") || @@ -2573,6 +2612,7 @@ public class RefcoChecker extends Checker implements CorpusFunction { params.put("skip-documentation-check", "Flag to skip the documentation check"); params.put("skip-transcription-check", "Flag to skip the transcription check"); params.put("skip-gloss-check", "Flag to skip the gloss check"); + params.put("gloss-stats", "Includes stats about all glosses"); return params; } diff --git a/src/main/java/de/uni_hamburg/corpora/validation/quest/TEIGenericMetadataChecker.java b/src/main/java/de/uni_hamburg/corpora/validation/quest/TEIGenericMetadataChecker.java index 4d171de9ae7fd771b1d47318d6eb84ba7e5f7c44..0a0388500e927beee1b0efc3ad7c1f13b5372a18 100644 --- a/src/main/java/de/uni_hamburg/corpora/validation/quest/TEIGenericMetadataChecker.java +++ b/src/main/java/de/uni_hamburg/corpora/validation/quest/TEIGenericMetadataChecker.java @@ -21,6 +21,11 @@ public class TEIGenericMetadataChecker extends GenericMetadataChecker implements */ public TEIGenericMetadataChecker(Properties properties) { super(properties); + if (properties != null && 
!properties.isEmpty() && properties.containsKey("tei-criteria-file")) + setCriteriaFile(properties.getProperty("tei-criteria-file")); + else { + loadCriteriaResource("tei-generic.csv"); + } } /** diff --git a/src/main/java/de/uni_hamburg/corpora/validation/quest/TierFinder.java b/src/main/java/de/uni_hamburg/corpora/validation/quest/TierFinder.java index 3fcde3bb7ab42415b19e032a62319af3ca636c17..ba7c228bafc46a7dcc51de1d511dc4872dce6ce6 100644 --- a/src/main/java/de/uni_hamburg/corpora/validation/quest/TierFinder.java +++ b/src/main/java/de/uni_hamburg/corpora/validation/quest/TierFinder.java @@ -40,7 +40,7 @@ abstract class TierFinder extends Checker implements CorpusFunction { private boolean summary = false; // The attribute used for matching - protected String attribute; + protected String attribute_name; // Frequency list of tiers found protected FrequencyList tiers = new FrequencyList(); @@ -54,8 +54,8 @@ abstract class TierFinder extends Checker implements CorpusFunction { if (properties.containsKey("tier-summary") && properties.getProperty("tier-summary").equalsIgnoreCase("true")) { summary = true; } - if (properties.containsKey("attribute-name")) { - attribute = properties.getProperty("attribute-name"); + if (properties.containsKey("tier-attribute-name")) { + attribute_name = properties.getProperty("tier-attribute-name"); } } @@ -115,7 +115,8 @@ abstract class TierFinder extends Checker implements CorpusFunction { public Map<String, String> getParameters() { Map<String,String> params = super.getParameters(); params.put("tier-pattern","Pattern to identify the tier"); - params.put("attribute-name","Optional attribute name used for the matching, case-insensitive, defaults to ID"); + params.put("tier-attribute-name","Optional attribute name used for the matching, case-insensitive, defaults " + + "to ID"); params.put("tier-summary", "Optional flag if the summary over matching tiers in a corpus should be included " + "in the report"); return params; diff --git 
a/src/main/java/de/uni_hamburg/corpora/validation/quest/TierStructureChecker.java b/src/main/java/de/uni_hamburg/corpora/validation/quest/TierStructureChecker.java index b915d9651525b7d69a62074c3a1f2e17cde77ade..04fbbaefe9c99f89ac6a31fe12aa88be55539c54 100644 --- a/src/main/java/de/uni_hamburg/corpora/validation/quest/TierStructureChecker.java +++ b/src/main/java/de/uni_hamburg/corpora/validation/quest/TierStructureChecker.java @@ -16,6 +16,7 @@ import java.net.URI; import java.net.URISyntaxException; import java.security.NoSuchAlgorithmException; import java.util.*; +import java.util.logging.Logger; import java.util.stream.Collectors; /** @@ -25,6 +26,8 @@ import java.util.stream.Collectors; */ abstract class TierStructureChecker extends Checker implements CorpusFunction { + Logger logger = Logger.getLogger(this.getClass().toString()); + // All tier structures Map<URI,Set<Map<String,String>>> tierStructure = new HashMap<>(); @@ -55,9 +58,10 @@ abstract class TierStructureChecker extends Checker implements CorpusFunction { Report report = new Report(); Set<Map<String,String>> tiers = getTierStructure(report, cd); tierStructure.put(cd.getURL().toURI(),tiers); - if (individualStructure) - report.addNote(getFunction(),cd,"All tiers:\n" + tiers.stream().map((o) -> o.toString()) + if (individualStructure) { + report.addNote(getFunction(), cd, "All tiers:\n" + tiers.stream().map((o) -> o.toString()) .collect(Collectors.joining("\n"))); + } return report; } @@ -66,13 +70,15 @@ abstract class TierStructureChecker extends Checker implements CorpusFunction { Report report = new Report(); Set<Map<String,String>> commonTiers = new HashSet<>(); for (CorpusData cd : c.getCorpusData()){ - report.merge(function(cd,getCanFix())); - if (commonTiers.isEmpty()) - commonTiers.addAll(tierStructure.get(cd.getURL().toURI())); - else - commonTiers = - Sets.intersection(commonTiers,tierStructure.get(cd.getURL().toURI())) - .stream().collect(Collectors.toSet()); + if 
(getIsUsableFor().contains(cd.getClass())) { + report.merge(function(cd, getCanFix())); + if (commonTiers.isEmpty()) + commonTiers.addAll(tierStructure.get(cd.getURL().toURI())); + else + commonTiers = + Sets.intersection(commonTiers, tierStructure.get(cd.getURL().toURI())) + .stream().collect(Collectors.toSet()); + } } for (URI file : tierStructure.keySet()) { Sets.SetView<Map<String,String>> missingTiers = Sets.difference(tierStructure.get(file),commonTiers); @@ -85,7 +91,7 @@ abstract class TierStructureChecker extends Checker implements CorpusFunction { } if (sharedStructure) - report.addNote(getFunction(),commonTiers.stream().map((o) -> o.toString()) + report.addNote(getFunction(),"Common tiers: " + commonTiers.stream().map((o) -> o.toString()) .collect(Collectors.joining("\n"))); return report; } diff --git a/src/main/java/de/uni_hamburg/corpora/validation/quest/TranscriptionChecker.java b/src/main/java/de/uni_hamburg/corpora/validation/quest/TranscriptionChecker.java index 8459d244890197de2e7fe9727cca112a62c31219..46107f7c9c61aa77130ef4b418df04f595b0c76a 100644 --- a/src/main/java/de/uni_hamburg/corpora/validation/quest/TranscriptionChecker.java +++ b/src/main/java/de/uni_hamburg/corpora/validation/quest/TranscriptionChecker.java @@ -3,6 +3,7 @@ package de.uni_hamburg.corpora.validation.quest; import com.google.common.collect.Sets; import de.uni_hamburg.corpora.*; import de.uni_hamburg.corpora.utilities.quest.FrequencyList; +import de.uni_hamburg.corpora.utilities.quest.XMLTools; import de.uni_hamburg.corpora.validation.Checker; import org.exmaralda.partitureditor.fsm.FSMException; import org.exmaralda.partitureditor.jexmaralda.JexmaraldaException; @@ -29,6 +30,9 @@ abstract class TranscriptionChecker extends Checker implements CorpusFunction { private final Logger logger = Logger.getLogger(this.getFunction()); + // List of all interesting tiers + protected final Set<String> tierIds = new HashSet<>(); + // Regex to split tokens private final String 
tokenSeparator = " "; @@ -150,7 +154,9 @@ abstract class TranscriptionChecker extends Checker implements CorpusFunction { Arrays.asList("abcdefghijklmnopqrstuvwzyz".toUpperCase().split("")) ); } - + if (properties.containsKey("transcription-tiers")) { + tierIds.addAll(Arrays.asList(properties.getProperty("transcription-tiers").split(","))); + } } } @@ -185,7 +191,9 @@ abstract class TranscriptionChecker extends Checker implements CorpusFunction { abstract List<Element> getTranscriptionTiers(CorpusData cd) throws JDOMException; - abstract String getTranscriptionText(Element tier) throws JDOMException; + public String getTranscriptionText(Element tier) throws JDOMException { + return XMLTools.showAllText(tier); + } @Override public Report function(Corpus c, Boolean fix) throws NoSuchAlgorithmException, ClassNotFoundException, FSMException, URISyntaxException, SAXException, IOException, ParserConfigurationException, JexmaraldaException, TransformerException, XPathExpressionException, JDOMException { @@ -228,6 +236,9 @@ abstract class TranscriptionChecker extends Checker implements CorpusFunction { params.put("transcription-graphemes","List of transcription graphemes, separated by commas"); params.put("transcription-method", "Standard transcription method used, if any. Currently HIAT, DIDA, GAT and" + " IPA"); + params.put("transcription-tiers","List of transcription tier IDs separated by commas"); + params.put("transcription-tier-pattern","A pattern, i.e. 
substring of tier IDs to identify transcription " + + "tiers"); return params; } } diff --git a/src/main/java/de/uni_hamburg/corpora/validation/quest/XsdChecker.java b/src/main/java/de/uni_hamburg/corpora/validation/quest/XsdChecker.java index c49846ea58b6aa4ae148790f54889a27f8839f2b..2296818afe45c2be054d4705a0030f75ac6c262c 100644 --- a/src/main/java/de/uni_hamburg/corpora/validation/quest/XsdChecker.java +++ b/src/main/java/de/uni_hamburg/corpora/validation/quest/XsdChecker.java @@ -11,13 +11,15 @@ import org.xml.sax.ErrorHandler; import org.xml.sax.SAXException; import org.xml.sax.SAXParseException; +import javax.xml.XMLConstants; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import javax.xml.transform.TransformerException; +import javax.xml.transform.stream.StreamSource; +import javax.xml.validation.SchemaFactory; import javax.xml.xpath.XPathExpressionException; -import java.io.File; -import java.io.IOException; +import java.io.*; import java.lang.reflect.Modifier; import java.net.URISyntaxException; import java.security.NoSuchAlgorithmException; @@ -29,6 +31,8 @@ public class XsdChecker extends Checker implements CorpusFunction { private final Logger logger = Logger.getLogger(getFunction()); + private Map<String,String> schemas = new HashMap<>(); + static final String JAXP_SCHEMA_LANGUAGE = "http://java.sun.com/xml/jaxp/properties/schemaLanguage"; static final String W3C_XML_SCHEMA = @@ -36,6 +40,10 @@ public class XsdChecker extends Checker implements CorpusFunction { public XsdChecker(Properties properties) { super(false, properties); + // Map for external schema files needed if the schema is not linked in the file format + schemas.put(ELANData.class.getSimpleName(),"xsd/eaf.xsd"); + schemas.put(EXMARaLDATranscriptionData.class.getSimpleName(),"xsd/exmaralda_exb.xsd"); + 
schemas.put(EXMARaLDASegmentedTranscriptionData.class.getSimpleName(),"xsd/exmaralda_exs.xsd"); } @Override @@ -48,9 +56,20 @@ public class XsdChecker extends Checker implements CorpusFunction { Report report = new Report(); DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); dbf.setNamespaceAware(true); - dbf.setValidating(true); try { - dbf.setAttribute(JAXP_SCHEMA_LANGUAGE, W3C_XML_SCHEMA); + // Add external schema if necessary + if (schemas.containsKey(cd.getClass().getSimpleName())) { + logger.info(schemas.get(cd.getClass().getSimpleName())); + InputStream is = + this.getClass().getClassLoader().getResourceAsStream(schemas.get(cd.getClass().getSimpleName())); + dbf.setSchema(SchemaFactory.newInstance(XMLConstants.W3C_XML_SCHEMA_NS_URI) + .newSchema(new StreamSource(is))); + } + else { + // Otherwise set the schema language + dbf.setAttribute(JAXP_SCHEMA_LANGUAGE, W3C_XML_SCHEMA); + dbf.setValidating(true); + } DocumentBuilder db = dbf.newDocumentBuilder(); db.setErrorHandler(new ErrorHandler() { @Override diff --git a/src/main/java/de/uni_hamburg/corpora/validation/quest/resources/AnnotationPanel_STTS.xml b/src/main/java/de/uni_hamburg/corpora/validation/quest/resources/AnnotationPanel_STTS.xml new file mode 100644 index 0000000000000000000000000000000000000000..6f165c6dbbf2e14be40e94d23b30e46a38c95d50 --- /dev/null +++ b/src/main/java/de/uni_hamburg/corpora/validation/quest/resources/AnnotationPanel_STTS.xml @@ -0,0 +1,271 @@ +<?xml version="1.0" encoding="UTF-8"?> +<annotation-specification> + <annotation-set exmaralda-tier-category="POS"> + <category name="POS-tags"> + <description>reine POS-tags</description> + <category name="Nomina"> + <tag name="N"/> + <description/> + <category name="Apellativa"> + <tag name="NN"/> + <description>Tisch, Herr, [das] Reisen</description> + </category> + <category name="Eigennamen"> + <tag name="NE"/> + <description>Hans, Hamburg, HSV</description> + </category> + </category> + <category name="Verben"> + 
<tag name="V"/> + <description/> + <category name="finites Verb, voll"> + <tag name="VVFIN"/> + <description>[du] gehst, [wir] kommen [an]</description> + </category> + <category name="Imperativ, voll"> + <tag name="VVIMP"/> + <description>komm [!]</description> + </category> + <category name="Infinitiv, voll"> + <tag name="VVINF"/> + <description>gehen, ankommen</description> + </category> + <category name="Infinitiv mit 'zu', voll"> + <tag name="VVIZU"/> + <description>anzukommen, loszulassen</description> + </category> + <category name="Partizip Perfekt, voll"> + <tag name="VVPP"/> + <description>gegangen, angekommen</description> + </category> + <category name="finites Verb, aux"> + <tag name="VAFIN"/> + <description>[du] bist, [wir] werden</description> + </category> + <category name="Imperativ, aux"> + <tag name="VAIMP"/> + <description>sei [ruhig!]</description> + </category> + <category name="Infinitiv, aux"> + <tag name="VAINF"/> + <description>werden, sein</description> + </category> + <category name="Partizip Perfekt, aux"> + <tag name="VAPP"/> + <description>gewesen</description> + </category> + <category name="finites Verb, modal"> + <tag name="VMFIN"/> + <description>dürfen</description> + </category> + <category name="Infinitiv, modal"> + <tag name="VMINF"/> + <description>wollen</description> + </category> + <category name="Partizip Perfekt, modal"> + <tag name="VMPP"/> + <description>[er hat] gekonnt</description> + </category> + </category> + <category name="Artikel"> + <tag name="ART"/> + <description/> + <category name="bestimmter oder unbestimmter Artikel"> + <tag name="ART"/> + <description>der, die, das, ein, eine</description> + </category> + </category> + <category name="Adjektive"> + <tag name="ADJ"/> + <description/> + <category name="attributives Adjektiv"> + <tag name="ADJA"/> + <description>[das] große [Haus]</description> + </category> + <category name="adverbiales oder prädikatives Adjektiv"> + <tag name="ADJD"/> + <description>[er
fährt] schnell, [er ist] schnell</description> + </category> + </category> + <category name="Pronomina"> + <tag name="P"/> + <description/> + <category name="substituierendes Demonstrativpronomen"> + <tag name="PDS"/> + <description>dieser, jener</description> + </category> + <category name="attribuierendes Demonstrativpronomen"> + <tag name="PDAT"/> + <description>jener [Mensch]</description> + </category> + <category name="substituierendes Indefinitpronomen"> + <tag name="PIS"/> + <description>keiner, viele, man, niemand</description> + </category> + <category name="attribuierendes Indefinitpronomen ohne Determiner"> + <tag name="PIAT"/> + <description>kein [Mensch], irgendein [Glas]</description> + </category> + <category name="attribuierendes Indefinitpronomen mit Determiner"> + <tag name="PIDAT"/> + <description>[ein] wenig [Wasser], [die] beiden [Brüder]</description> + </category> + <category name="irreflexives Personalpronomen"> + <tag name="PPER"/> + <description>ich, er, ihm, mich, der</description> + </category> + <category name="substituierendes Possesivpronomen"> + <tag name="PPOSS"/> + <description>meins, deiner</description> + </category> + <category name="attribuierendes Possesivpronomen"> + <tag name="PPOSAT"/> + <description>mein [Buch], deine [Mutter]</description> + </category> + <category name="substituierendes Relativpronomen"> + <tag name="PRELS"/> + <description>[der Hund,] der</description> + </category> + <category name="attribuierendes Relativpronomen"> + <tag name="PRELAT"/> + <description>[der Mann,] dessen [Hund]</description> + </category> + <category name="reflexives Personalpronomen"> + <tag name="PRF"/> + <description>sich, einander, dich, mir</description> + </category> + <category name="substituierendes Interrogativpronomen"> + <tag name="PWS"/> + <description>wer, was</description> + </category> + <category name="attribuierendes Interrogativpronomen"> + <tag name="PWAT"/> + <description>welche [Farbe], wessen [Hut]</description> 
+ </category> + <category name="adverbiales Interrogativ- oder Relativpronomen"> + <tag name="PWAV"/> + <description>warum, wo, wann, worüber, wobei</description> + </category> + <category name="Pronominaladverb"> + <tag name="PAV"/> + <description>dafür, dabei, deswegen, trotzdem</description> + </category> + </category> + <category name="Kardinalzahlen"> + <tag name="CARD"/> + <description/> + <category name="Kardinalzahl"> + <tag name="CARD"/> + <description>zwei [Männer], [im Jahre] 1994</description> + </category> + </category> + <category name="Adverbien"> + <tag name="ADV"/> + <description/> + <category name="Adverb"> + <tag name="ADV"/> + <description>schon, bald, doch</description> + </category> + </category> + <category name="Konjunktionen"> + <tag name="KO"/> + <description/> + <category name="unterordnende Konjunktion mit 'zu' und Infinitiv"> + <tag name="KOUI"/> + <description>um [zu leben], anstatt [zu fragen]</description> + </category> + <category name="unterordnende Konjunktion mit Satz"> + <tag name="KOUS"/> + <description>weil, dass, damit, wenn, ob</description> + </category> + <category name="nebenordnende Konjunktion"> + <tag name="KON"/> + <description>und, oder, aber</description> + </category> + <category name="Vergleichspartikel, ohne Satz"> + <tag name="KOKOM"/> + <description>als, wie</description> + </category> + </category> + <category name="Adpositionen"> + <tag name="AP"/> + <description/> + <category name="Präposition; Zirkumposition links"> + <tag name="APPR"/> + <description>in [der Stadt], ohne [mich]</description> + </category> + <category name="Präposition mit Artikel"> + <tag name="APPRART"/> + <description>im [Haus], zur [Sache]</description> + </category> + <category name="Postposition"> + <tag name="APPO"/> + <description>[ihm] zufolge, [der Sache] wegen</description> + </category> + <category name="Zirkumposition rechts"> + <tag name="APZR"/> + <description>[von jetzt] an</description> + </category> + </category> +
<category name="Interjektionen"> + <tag name="ITJ"/> + <description/> + <category name="Interjektion"> + <tag name="ITJ"/> + <description>mhm, ach, tja</description> + </category> + </category> + <category name="Partikeln"> + <tag name="PTK"/> + <description/> + <category name="'zu' vor Infinitiv"> + <tag name="PTKZU"/> + <description>zu [gehen]</description> + </category> + <category name="Negationspartikel"> + <tag name="PTKNEG"/> + <description>nicht</description> + </category> + <category name="abgetrennter Verbzusatz"> + <tag name="PTKVZ"/> + <description>[er kommt] an, [er fährt] rad</description> + </category> + <category name="Antwortpartikel"> + <tag name="PTKANT"/> + <description>ja, nein, danke, bitte</description> + </category> + <category name="Partikel bei Adjektiv oder Adverb"> + <tag name="PTKA"/> + <description>am [schönsten], zu [schnell]</description> + </category> + </category> + </category> + <category name="Extra-tags"> + <description>zusätzliche tags</description> + <category name="Fremdsprachliches Material"> + <tag name="FM"/> + <description>[er hat das mit "] A big fish [" übersetzt]</description> + </category> + <category name="Kompositions-Erstglied"> + <tag name="TRUNC"/> + <description>an [und Abreise]</description> + </category> + <category name="Nichtwort, Sonderzeichen enthaltend"> + <tag name="XY"/> + <description>D2XW3</description> + </category> + <category name="Komma"> + <tag name="$,"/> + <description>,</description> + </category> + <category name="Satzbeendende Interpunktion"> + <tag name="$."/> + <description>. ? ! 
; :</description> + </category> + <category name="sonstige Satzzeichen; satzintern"> + <tag name="$("/> + <description>- ( ) [ ]</description> + </category> + </category> + </annotation-set> +</annotation-specification> \ No newline at end of file diff --git a/src/main/java/de/uni_hamburg/corpora/validation/quest/resources/imdi-generic.csv b/src/main/java/de/uni_hamburg/corpora/validation/quest/resources/imdi-generic.csv deleted file mode 100644 index ad13a447eebed92e7732fd8f8a9827bbabb8b82c..0000000000000000000000000000000000000000 --- a/src/main/java/de/uni_hamburg/corpora/validation/quest/resources/imdi-generic.csv +++ /dev/null @@ -1,35 +0,0 @@ -Property,Requirement,Value type,IMDI,Comments -Identifier,1 - unbounded,URI,/imdi:METATRANSCRIPT/@ArchiveHandle,imdi: is making the implicit namespace http://www.mpi.nl/IMDI/Schema/IMDI exmplicit -Session,1 - 1, N/A,//imdi:Session,There is only one session supposed to be in an IMDI metadata file -Title,1 - unbounded,string,//imdi:Session//imdi:Name OR //imdi:Session//imdi:Title,Could be /imdi:METATRANSCRIPT/imdi:Session/imdi:Title or /imdi:METATRANSCRIPT/imdi:Session/imdi:MDGroup/imdi:Project/imdi:Title -Description,1 - unbounded,string,//imdi:Session/imdi:Description, -Version,0 - 1,string,/imdi:METATRANSCRIPT/@Version, -Keywords,0 - 1,string,//imdi:Session/imdi:Keys OR Session//imdi:Content/imdi:Keys, -License,1 - 1,URI OR string,//imdi:Session/imdi:Access/imdi:Description,Access does not seem to be a valid element directly within session; only for ressources -Rightsholder,0 - 1,string,//imdi:Session/imdi:Access/imdi:Owner,Access does not seem to be a valid element directly within session; only for ressources -Access Rights,0 - 1,string,//imdi:Session/imdi:Access/imdi:Availability,Access does not seem to be a valid element directly within session; only for ressources -PublicationYear,1 - 1,date,//imdi:Session/imdi:Date, -Publisher,1 - 1,string,//imdi:Session/imdi:Access/imdi:Publisher,Access does not seem to be a 
valid element directly within session only for ressources -Creator,1 - unbounded,string,//imdi:Session//imdi:Actors/imdi:Actor/imdi:Role[text()="Author" or text()="Researcher"],Limiting the roles of the actor; more roles could be suitable (see: https://www.mpi.nl/IMDI/Schema/Actor-Role.xml); IMDI schema guarantees that each creator has exactly one full name and so on -CreatorName,0 - unbounded,string,//imdi:Session//imdi:Actors/imdi:Actor[imdi:Role/text()="Researcher" or imdi:Role/text()="Author"]/imdi:Name OR //imdi:Session//imdi:Actors/imdi:Actor[imdi:Role/text()="Researcher" or imdi:Role/text()="Author"]/imdi:FullName,Only for the creator roles as above -FamilyName,1 - 1,string,N/A,Not expressable in IMDI -GivenName,0 - 1,string,N/A,Not expressable in IMDI -CreatorIdentifier,1 - unbounded,URI,N/A,Not expressable in IMDI -Affiliation,0 - unbounded,URI OR string,//imdi:Session//imdi:Actors/imdi:Actor[imdi:Role/text()="Researcher" or imdi:Role/text()="Author"]/imdi:Contact/imdi:Organization,Only for the creator roles as above -CreatorEmail,0 - unbounded,URI OR string,//imdi:Session//imdi:Actors/imdi:Actor[imdi:Role/text()="Researcher" or imdi:Role/text()="Author"]/imdi:Contact/imdi:Email,Only for the creator roles as above -Organisation,N/A,N/A,//imdi:Session//imdi:Actors/imdi:Actor[imdi:Role/text()="Researcher" or imdi:Role/text()="Author"]/imdi:Contact/imdi:Organisation OR //imdi:Session//imdi:Project/imdi:Contact/imdi:Organisation,Only for the creator roles as above or for the project -OrganisationName,1 - unbound,string,//imdi:Session//imdi:Actors/imdi:Actor[imdi:Role/text()="Researcher" or imdi:Role/text()="Author"]/imdi:Contact/imdi:Organisation OR //imdi:Session//imdi:Project/imdi:Contact/imdi:Organisation,Only for the creator roles as above or for the project -OrganisationIdentifier,0 - unbounded,N/A,N/A,Not expressable in IMDI -OrganisationURL,0 - 1,URI,N/A,Not expressable in IMDI -OrganisationEmail,0 - 1,URI OR string,N/A,Not expressable in IMDI -sameAs,0 
- unbounded,URI,//imdi:Session/imdi:Resource OR //imdi:Reference/imdi:Link,No tag Resource within Session and Link within Reference; Link only available in ExternalResourceReference -isPartOf,0 - unbounded,URI,//imdi:Session/imdi:Resource OR //imdi:Reference/imdi:Link,No tag Resource within Session and Link within Reference; Link only available in ExternalResourceReference -hasPart,0 - unbounded,URI,//imdi:Session/imdi:Resource OR //imdi:Reference/imdi:Link,No tag Resource within Session and Link within Reference; Link only available in ExternalResourceReference -isBasedOn,0 - unbounded,URI,//imdi:Session/imdi:Resource OR //imdi:Reference/imdi:Link,No tag Resource within Session and Link within Reference; Link only available in ExternalResourceReference -ObjectLanguage,1 - unbounded,string,//imdi:Session//imdi:Content/imdi:Languages/imdi:Language, -LinguisticDataType,0 - unbounded,string,N/A,Not expressable in IMDI -Modality,0 - unbounded,string,//imdi:Session//imdi:Content/imdi:Modalities, -Language,1 - unbounded,N/A,//imdi:Session//imdi:Content/imdi:Languages/imdi:Language, -LanguageName,1 - unbounded,string,//imdi:Session//imdi:Content/imdi:Languages/imdi:Language/imdi:Name, -LanguagePreferredLable,0 - 1,URI OR string,N/A,Not expressable in IMDI -LanguageIdentifier,1 - unbounded,URI,//imdi:Session//imdi:Content/imdi:Languages/imdi:Language/imdi:Id, \ No newline at end of file diff --git a/src/main/java/de/uni_hamburg/corpora/validation/quest/resources/childes-generic.csv b/src/main/java/de/uni_hamburg/corpora/validation/quest/resources/metadata/childes-generic.csv similarity index 100% rename from src/main/java/de/uni_hamburg/corpora/validation/quest/resources/childes-generic.csv rename to src/main/java/de/uni_hamburg/corpora/validation/quest/resources/metadata/childes-generic.csv diff --git a/src/main/java/de/uni_hamburg/corpora/validation/quest/resources/metadata/cmdi-blam.csv 
b/src/main/java/de/uni_hamburg/corpora/validation/quest/resources/metadata/cmdi-blam.csv new file mode 100644 index 0000000000000000000000000000000000000000..5c22a13f15fce9a8a9cce87530b4c6d938dd7035 --- /dev/null +++ b/src/main/java/de/uni_hamburg/corpora/validation/quest/resources/metadata/cmdi-blam.csv @@ -0,0 +1,34 @@ +Property,Requirement,Value type,BLAM,,,,,,,Reference +Identifier,1 - unbounded,URI,"/cmd:CMD/cmd:Components/*[fn:contains(fn:name(),'BLAM-bundle')]/cmd:BundleGeneralInfo/cmd:BundleID OR /cmd:CMD/cmd:Components/*[contains(name(),'BLAM-collection')]/cmd:CollectionGeneralInfo/cmd:CollectionID",,,,,,,https://github.com/fxru/blam-metadata/blob/master/BLAMBundleRepository.md +Title,1 - unbounded,string,"/cmd:CMD/cmd:Components/*[contains(name(),'BLAM-bundle')]/cmd:BundleGeneralInfo/cmd:BundleDisplayTitle OR /cmd:CMD/cmd:Components/*[contains(name(),'BLAM-collection')]/cmd:CollectionGeneralInfo/cmd:CollectionDisplayTitle",,,,,,, +Description,1 - unbounded,string,"/cmd:CMD/cmd:Components/*[contains(name(),'BLAM-bundle')]/cmd:BundleGeneralInfo/cmd:BundleDescription OR /cmd:CMD/cmd:Components/*[contains(name(),'BLAM-collection')]/cmd:CollectionGeneralInfo/cmd:CollectionDescription",,,,,,, +Version,0 - 1,string,N/A,,,,,,, +Keywords,0 - 1,string,"/cmd:CMD/cmd:Components/*[contains(name(),'BLAM-bundle')]/cmd:BundleGeneralInfo/cmd:BundleKeywords OR /cmd:CMD/cmd:Components/*[contains(name(),'BLAM-collection')]/cmd:CollectionGeneralInfo/cmd:CollectionKeywords",,,,,,, +License,1 - 1,URI OR string,"/cmd:CMD/cmd:Components/*[contains(name(),'BLAM-bundle')]/cmd:BundleAdministrativeInfo/cmd:License OR /cmd:CMD/cmd:Components/*[contains(name(),'BLAM-collection')]/cmd:CollectionAdministrativeInfo/cmd:License",,,,,,, +Rightsholder,0 - 1,string,"/cmd:CMD/cmd:Components/*[contains(name(),'BLAM-bundle')]/cmd:BundleAdministrativeInfo/cmd:Rightsholder OR /cmd:CMD/cmd:Components/*[contains(name(),'BLAM-collection')]/cmd:CollectionAdministrativeInfo/cmd:Rightsholder",,,,,, 
+Access Rights,0 - 1,string,"/cmd:CMD/cmd:Components/*[contains(name(),'BLAM-bundle')]/cmd:BundleAdministrativeInfo/cmd:Access OR /cmd:CMD/cmd:Components/*[contains(name(),'BLAM-collection')]/cmd:CollectionAdministrativeInfo/cmd:Access",,,,,, +PublicationYear,1 - 1,date,"/cmd:CMD/cmd:Components/*[contains(name(),'BLAM-bundle')]/cmd:BundlePublicationInfo/cmd:BundlePublicationYear OR /cmd:CMD/cmd:Components/*[contains(name(),'BLAM-collection')]/cmd:CollectionPublicationInfo/cmd:CollectionPublicationYear",,,,,,, +Publisher,1 - 1,string,"/cmd:CMD/cmd:Components/*[contains(name(),'BLAM-bundle')]/cmd:BundlePublicationInfo/cmd:BundleDataProvider OR /cmd:CMD/cmd:Components/*[contains(name(),'BLAM-collection')]/cmd:CollectionPublicationInfo/cmd:CollectionDataProvider",,,,,, +Creator,1 - unbounded,N/A,"/cmd:CMD/cmd:Components/*[contains(name(),'BLAM-bundle')]/cmd:BundlePublicationInfo/cmd:BundleCreators/cmd:BundleCreator OR /cmd:CMD/cmd:Components/*[contains(name(),'BLAM-collection')]/cmd:CollectionPublicationInfo/cmd:CollectionCreators/cmd:CollectionCreator",,,,,, +CreatorName,0 - 1,string,"/cmd:CMD/cmd:Components/*[contains(name(),'BLAM-bundle')]/cmd:BundlePublicationInfo/cmd:BundleCreators/cmd:BundleCreator/cmd:CreatorName OR /cmd:CMD/cmd:Components/*[contains(name(),'BLAM-collection')]/cmd:CollectionPublicationInfo/cmd:CollectionCreators/cmd:CollectionCreator/cmd:CreatorName",,,,,, +FamilyName,1 - 1,string,"/cmd:CMD/cmd:Components/*[contains(name(),'BLAM-bundle')]/cmd:BundlePublicationInfo/cmd:BundleCreators/cmd:BundleCreator/cmd:CreatorName/cmd:CreatorFamilyName OR /cmd:CMD/cmd:Components/*[contains(name(),'BLAM-collection')]/cmd:CollectionPublicationInfo/cmd:CollectionCreators/cmd:CollectionCreator/cmd:CreatorName/cmd:CreatorFamilyName",,,,,, +GivenName,0 - 1,string,"/cmd:CMD/cmd:Components/*[contains(name(),'BLAM-bundle')]/cmd:BundlePublicationInfo/cmd:BundleCreators/cmd:BundleCreator/cmd:CreatorName/cmd:CreatorGivenName OR 
/cmd:CMD/cmd:Components/*[contains(name(),'BLAM-collection')]/cmd:CollectionPublicationInfo/cmd:CollectionCreators/cmd:CollectionCreator/cmd:CreatorName/cmd:CreatorGivenName",,,,,, +CreatorIdentifier,1 - unbounded,URI,"/cmd:CMD/cmd:Components/*[contains(name(),'BLAM-bundle')]/cmd:BundlePublicationInfo/cmd:BundleCreators/cmd:BundleCreator/cmd:CreatorNameIdentifier OR /cmd:CMD/cmd:Components/*[contains(name(),'BLAM-collection')]/cmd:CollectionPublicationInfo/cmd:CollectionCreators/cmd:CollectionCreator/cmd:CreatorNameIdentifier",,,,,, +Affiliation,0 - unbounded,URI OR string,"/cmd:CMD/cmd:Components/*[contains(name(),'BLAM-bundle')]/cmd:BundlePublicationInfo/cmd:BundleCreators/cmd:BundleCreator/cmd:CreatorAffiliation OR /cmd:CMD/cmd:Components/*[contains(name(),'BLAM-collection')]/cmd:CollectionPublicationInfo/cmd:CollectionCreators/cmd:CollectionCreator/cmd:CreatorAffiliation",,,,,, +CreatorEmail,0 - unbounded,URI OR string,N/A,,,,,,, +Organisation,N/A,N/A,N/A,,,,,,, +OrganisationName,1 - 1,string,N/A,,,,,,, +OrganisationIdentifier,0 - unbounded,N/A,N/A,,,,,,, +OrganisationURL,0 - 1,URI,N/A,,,,,,, +OrganisationEmail,0 - 1,URI OR string,N/A,,,,,,, +sameAs,0 - unbounded,URI,N/A,,,,,,, +isPartOf,0 - unbounded,URI,"/cmd:CMD/cmd:Resources/cmd:IsPartOfList/cmd:IsPartOf OR /cmd:CMD/cmd:Components/*[contains(name(),'BLAM-bundle')]/cmd:BundleStructuralInfo/cmd:BundleIsMemberOfCollection OR /cmd:CMD/cmd:Resources/cmd:IsPartOfList/cmd:IsPartOf OR /cmd:CMD/cmd:Components/*[contains(name(),'BLAM-collection')]/cmd:CollectionStructuralInfo/cmd:CollectionIsMemberOfCollection",,,,,, +hasPart,0 - unbounded,URI,N/A,,,,,,, +isBasedOn,0 - unbounded,URI,N/A,,,,,,, +ObjectLanguage,1 - unbounded,N/A,"/cmd:CMD/cmd:Components/*[contains(name(),'BLAM-bundle')]/cmd:BundleGeneralInfo/cmd:BundleObjectLanguages/cmd:BundleObjectLanguage OR /cmd:CMD/cmd:Components/*[contains(name(),'BLAM-collection')]/cmd:CollectionGeneralInfo/cmd:CollectionObjectLanguages/cmd:CollectionObjectLanguage",,,,,, 
+LinguisticDataType,0 - unbounded,string,N/A,,,,,,, +Modality,0 - unbounded,string,N/A,,,,,,, +Language,N/A,N/A,"/cmd:CMD/cmd:Components/*[contains(name(),'BLAM-bundle')]/cmd:BundleGeneralInfo/cmd:BundleObjectLanguages/cmd:BundleObjectLanguage OR /cmd:CMD/cmd:Components/*[contains(name(),'BLAM-collection')]/cmd:CollectionGeneralInfo/cmd:CollectionObjectLanguages/cmd:CollectionObjectLanguage",,,,,, +LanguageName,1 - 1,string,"/cmd:CMD/cmd:Components/*[contains(name(),'BLAM-bundle')]/cmd:BundleGeneralInfo/cmd:BundleObjectLanguages/cmd:BundleObjectLanguage/cmd:ObjectLanguageName OR /cmd:CMD/cmd:Components/*[contains(name(),'BLAM-collection')]/cmd:CollectionGeneralInfo/cmd:CollectionObjectLanguages/cmd:CollectionObjectLanguage/cmd:ObjectLanguageName",,,,,, +LanguagePreferredLable,0 - 1,URI OR string,"/cmd:CMD/cmd:Components/*[contains(name(),'BLAM-bundle')]/cmd:BundleGeneralInfo/cmd:BundleObjectLanguages/cmd:BundleObjectLanguage/cmd:ObjectLanguageDisplayName OR /cmd:CMD/cmd:Components/*[contains(name(),'BLAM-collection')]/cmd:CollectionGeneralInfo/cmd:CollectionObjectLanguages/cmd:CollectionObjectLanguage/cmd:ObjectLanguageDisplayName",,,,,, +LanguageIdentifier,1 - unbounded,URI,"/cmd:CMD/cmd:Components/*[contains(name(),'BLAM-bundle')]/cmd:BundleGeneralInfo/cmd:BundleObjectLanguages/cmd:BundleObjectLanguage/cmd:ObjectLanguageISO639-3Code OR /cmd:CMD/cmd:Components/*[contains(name(),'BLAM-bundle')]/cmd:BundleGeneralInfo/cmd:BundleObjectLanguages/cmd:BundleObjectLanguage/cmd:ObjectLanguageGlottologCode OR /cmd:CMD/cmd:Components/*[contains(name(),'BLAM-collection')]/cmd:CollectionGeneralInfo/cmd:CollectionObjectLanguages/cmd:CollectionObjectLanguage/cmd:ObjectLanguageISO639-3Code OR /cmd:CMD/cmd:Components/*[contains(name(),'BLAM-collection')]/cmd:CollectionGeneralInfo/cmd:CollectionObjectLanguages/cmd:CollectionObjectLanguage/cmd:ObjectLanguageGlottologCode",,,,,, diff --git a/src/main/java/de/uni_hamburg/corpora/validation/quest/resources/metadata/cmdi-sign.csv 
b/src/main/java/de/uni_hamburg/corpora/validation/quest/resources/metadata/cmdi-sign.csv new file mode 100644 index 0000000000000000000000000000000000000000..76de1efd0b49f1221414554f6557d49128bd39cf --- /dev/null +++ b/src/main/java/de/uni_hamburg/corpora/validation/quest/resources/metadata/cmdi-sign.csv @@ -0,0 +1,34 @@ +Property,Requirement,Value type,CMDI,,Reference: https://catalog.clarin.eu/ds/ComponentRegistry#/?itemId=clarin.eu%3Acr1%3Ap_1417617523856&registrySpace=public +Identifier,1 - unbounded,URI,//cmd:MdSelfLink,, +Title,1 - unbounded,string,/cmd:CMD/cmd:Components/cmd:lat-SL-session/cmd:Name OR /cmd:CMD/cmd:Components/cmd:lat-SL-session/cmd:Title,, +Description,1 - unbounded,string,/cmd:CMD/cmd:Components/cmd:lat-SL-session/cmd:Project/cmd:descriptions/cmd:Description OR /cmd:CMD/cmd:Components/cmd:lat-SL-session/cmd:InfoLink/cmd:Description,,description seems to be a child of Project and InfoLink does not seem to appear in data +Version,0 - 1,string,N/A,, +Keywords,0 - 1,string,N/A,, +License,1 - 1,URI OR string,/cmd:CMD/cmd:Components/cmd:lat-SL-session/cmd:SL_CreativeCommonsLicense,, +Rightsholder,0 - unbounded,string,/cmd:CMD/cmd:Components/cmd:lat-SL-session/cmd:Resources/cmd:MediaFile/cmd:Access/cmd:Owner OR /cmd:CMD/cmd:Components/cmd:lat-SL-session/cmd:Resources/cmd:WrittenResource/cmd:Access/cmd:Owner OR /cmd:CMD/cmd:Components/cmd:lat-SL-session/cmd:Resources/cmd:Anonyms/cmd:Access/cmd:Owner,,finds one result for each linked file -> changed to unbounded +Access Rights,0 - unbounded,string,/cmd:CMD/cmd:Components/cmd:lat-SL-session/cmd:SL_CreativeCommonsLicense OR /cmd:CMD/cmd:Components/cmd:lat-SL-session/cmd:Resources/cmd:MediaFile/cmd:Access/cmd:Availability OR /cmd:CMD/cmd:Components/cmd:lat-SL-session/cmd:Resources/cmd:WrittenResource/cmd:Access/cmd:Availability OR /cmd:CMD/cmd:Components/cmd:lat-SL-session/cmd:Resources/cmd:Anonyms/cmd:Access/cmd:Availability,,same cardinality as rightsholder +PublicationYear,1 - 
1,date,/cmd:CMD/cmd:Components/cmd:lat-SL-session/cmd:Date,, +Publisher,1 - unbounded,string,/cmd:CMD/cmd:Components/cmd:lat-SL-session/cmd:Resources/cmd:MediaFile/cmd:Access/cmd:Publisher OR /cmd:CMD/cmd:Components/cmd:lat-SL-session/cmd:Resources/cmd:WrittenResource/cmd:Access/cmd:Publisher OR /cmd:CMD/cmd:Components/cmd:lat-SL-session/cmd:Resources/cmd:Anonyms/cmd:Access/cmd:Publisher,,same cardinality as rightsholder +Creator,1 - unbounded,string,N/A,, +CreatorName,0 - 1,string,/cmd:CMD/cmd:Components/cmd:lat-SL-session/cmd:Project/cmd:Contact/cmd:Name OR /cmd:CMD/cmd:Header/cmd:MdCreator,, +FamilyName,1 - 1,string,N/A,,was the same as creator name +GivenName,0 - 1,string,N/A,,was the same as creator name +CreatorIdentifier,1 - unbounded,URI,N/A,, +Affiliation,0 - unbounded,URI OR string,/cmd:CMD/cmd:Components/cmd:lat-SL-session/cmd:Project/cmd:Contact/cmd:Organisation,, +CreatorEmail,0 - unbounded,URI OR string,/cmd:CMD/cmd:Components/cmd:lat-SL-session/cmd:Project/cmd:Contact/cmd:Email,, +Organisation,N/A,N/A,N/A,, +OrganisationName,1 - 1,string,/cmd:CMD/cmd:Components/cmd:lat-SL-session/cmd:Project/cmd:Contact/cmd:Organisation,, +OrganisationIdentifier,0 - unbounded,N/A,N/A,, +OrganisationURL,0 - 1,URI,N/A,, +OrganisationEmail,0 - 1,URI OR string,N/A,, +sameAs,0 - unbounded,URI,N/A,, +isPartOf,0 - unbounded,URI,N/A,, +hasPart,0 - unbounded,URI,N/A,, +isBasedOn,0 - unbounded,URI,N/A,, +ObjectLanguage,1 - unbounded,string,/cmd:CMD/cmd:Components/cmd:lat-SL-session/cmd:SL-Content/cmd:Content_Languages,, +LinguisticDataType,0 - unbounded,string,N/A,, +Modality,0 - unbounded,string,/cmd:CMD/cmd:Components/cmd:lat-SL-session/cmd:SL-Content/cmd:Modalities,, +Language,N/A,N/A,/cmd:CMD/cmd:Components/cmd:lat-SL-session/cmd:SL-Content/cmd:Content_Languages/cmd:Content_Language,, +LanguageName,1 - 1,string,/cmd:CMD/cmd:Components/cmd:lat-SL-session/cmd:SL-Content/cmd:Content_Languages/cmd:Content_Language/cmd:Name,, +LanguagePreferredLable,0 - 1,URI OR string,N/A,, 
+LanguageIdentifier,1 - unbounded,URI,/cmd:CMD/cmd:Components/cmd:lat-SL-session/cmd:SL-Content/cmd:Content_Languages/cmd:Content_Language/cmd:Id,, \ No newline at end of file diff --git a/src/main/java/de/uni_hamburg/corpora/validation/quest/resources/coma-generic.csv b/src/main/java/de/uni_hamburg/corpora/validation/quest/resources/metadata/coma-generic.csv similarity index 100% rename from src/main/java/de/uni_hamburg/corpora/validation/quest/resources/coma-generic.csv rename to src/main/java/de/uni_hamburg/corpora/validation/quest/resources/metadata/coma-generic.csv diff --git a/src/main/java/de/uni_hamburg/corpora/validation/quest/resources/metadata/imdi-generic.csv b/src/main/java/de/uni_hamburg/corpora/validation/quest/resources/metadata/imdi-generic.csv new file mode 100644 index 0000000000000000000000000000000000000000..49da7323cb94d87452181f7d552cab3a7e824bd0 --- /dev/null +++ b/src/main/java/de/uni_hamburg/corpora/validation/quest/resources/metadata/imdi-generic.csv @@ -0,0 +1,35 @@ +Property,Requirement,Value type,IMDI,Comments,, +Identifier,1 - unbounded,URI,/imdi:METATRANSCRIPT/@ArchiveHandle,imdi: is making the implicit namespace http://www.mpi.nl/IMDI/Schema/IMDI explicit,, +Session,1 - 1, N/A,//imdi:Session,There is only one session supposed to be in an IMDI metadata file,, +Title,1 - unbounded,string,//imdi:Session/imdi:Name OR //imdi:Session//imdi:Title,Could be /imdi:METATRANSCRIPT/imdi:Session/imdi:Title or /imdi:METATRANSCRIPT/imdi:Session/imdi:MDGroup/imdi:Project/imdi:Title,, +Description,1 - unbounded,string,//imdi:Session/imdi:Description,,, +Version,0 - 1,string,/imdi:METATRANSCRIPT/@Version,,, +Keywords,0 - 1,string,//imdi:Session/imdi:Keys OR //imdi:Session//imdi:Content/imdi:Keys,,, +License,1 - 1,URI OR string,//imdi:Session/imdi:Access/imdi:Description,Access does not seem to be a valid element directly within session, only for resources, +Rightsholder,0 - 1,string,//imdi:Session/imdi:Access/imdi:Owner,Access does not seem to be a valid 
element directly within session, only for resources, +Access Rights,0 - 1,string,//imdi:Session/imdi:Access/imdi:Availability,Access does not seem to be a valid element directly within session, only for resources, +PublicationYear,1 - 1,date,//imdi:Session/imdi:Date,,, +Publisher,1 - 1,string,//imdi:Session/imdi:Access/imdi:Publisher,Access does not seem to be a valid element directly within session only for resources,, +Creator,1 - unbounded,string,"//imdi:Session//imdi:Actors/imdi:Actor[imdi:Role/text()=""Author"" or imdi:Role/text()=""Researcher""]",Limiting the roles of the actor, more roles could be suitable (see: https://www.mpi.nl/IMDI/Schema/Actor-Role.xml), IMDI schema guarantees that each creator has exactly one full name and so on +CreatorName,0 - unbounded,string,"//imdi:Session//imdi:Actors/imdi:Actor[imdi:Role/text()=""Researcher"" or imdi:Role/text()=""Author""]/imdi:Name OR //imdi:Session//imdi:Actors/imdi:Actor[imdi:Role/text()=""Researcher"" or imdi:Role/text()=""Author""]/imdi:FullName",Only for the creator roles as above,, +FamilyName,1 - 1,string,N/A,Not expressable in IMDI,, +GivenName,0 - 1,string,N/A,Not expressable in IMDI,, +CreatorIdentifier,1 - unbounded,URI,N/A,Not expressable in IMDI,, +Affiliation,0 - unbounded,URI OR string,"//imdi:Session//imdi:Actors/imdi:Actor[imdi:Role/text()=""Researcher"" or imdi:Role/text()=""Author""]/imdi:Contact/imdi:Organisation",Only for the creator roles as above,, +CreatorEmail,0 - unbounded,URI OR string,"//imdi:Session//imdi:Actors/imdi:Actor[imdi:Role/text()=""Researcher"" or imdi:Role/text()=""Author""]/imdi:Contact/imdi:Email",Only for the creator roles as above,, +Organisation,N/A,N/A,"//imdi:Session//imdi:Actors/imdi:Actor[imdi:Role/text()=""Researcher"" or imdi:Role/text()=""Author""]/imdi:Contact/imdi:Organisation OR //imdi:Session//imdi:Project/imdi:Contact/imdi:Organisation",Only for the creator roles as above or for the project,, +OrganisationName,1 - 
unbounded,string,"//imdi:Session//imdi:Actors/imdi:Actor[imdi:Role/text()=""Researcher"" or imdi:Role/text()=""Author""]/imdi:Contact/imdi:Organisation OR //imdi:Session//imdi:Project/imdi:Contact/imdi:Organisation",Only for the creator roles as above or for the project,, +OrganisationIdentifier,0 - unbounded,N/A,N/A,Not expressable in IMDI,, +OrganisationURL,0 - 1,URI,N/A,Not expressable in IMDI,, +OrganisationEmail,0 - 1,URI OR string,N/A,Not expressable in IMDI,, +sameAs,0 - unbounded,URI,//imdi:Session/imdi:Resource OR //imdi:Reference/imdi:Link,No tag Resource within Session and Link within Reference, Link only available in ExternalResourceReference, +isPartOf,0 - unbounded,URI,//imdi:Session/imdi:Resource OR //imdi:Reference/imdi:Link,No tag Resource within Session and Link within Reference, Link only available in ExternalResourceReference, +hasPart,0 - unbounded,URI,//imdi:Session/imdi:Resource OR //imdi:Reference/imdi:Link,No tag Resource within Session and Link within Reference, Link only available in ExternalResourceReference, +isBasedOn,0 - unbounded,URI,//imdi:Session/imdi:Resource OR //imdi:Reference/imdi:Link,No tag Resource within Session and Link within Reference, Link only available in ExternalResourceReference, +ObjectLanguage,1 - unbounded,string,//imdi:Session//imdi:Content/imdi:Languages/imdi:Language,,, +LinguisticDataType,0 - unbounded,string,N/A,Not expressable in IMDI,, +Modality,0 - unbounded,string,//imdi:Session//imdi:Content/imdi:Modalities,,, +Language,1 - unbounded,N/A,//imdi:Session//imdi:Content/imdi:Languages/imdi:Language,,, +LanguageName,1 - unbounded,string,//imdi:Session//imdi:Content/imdi:Languages/imdi:Language/imdi:Name,,, +LanguagePreferredLable,0 - 1,URI OR string,N/A,Not expressable in IMDI,, +LanguageIdentifier,1 - unbounded,URI,//imdi:Session//imdi:Content/imdi:Languages/imdi:Language/imdi:Id,,, diff --git a/src/main/java/de/uni_hamburg/corpora/validation/quest/resources/tei-generic.csv 
b/src/main/java/de/uni_hamburg/corpora/validation/quest/resources/metadata/tei-generic.csv similarity index 100% rename from src/main/java/de/uni_hamburg/corpora/validation/quest/resources/tei-generic.csv rename to src/main/java/de/uni_hamburg/corpora/validation/quest/resources/metadata/tei-generic.csv diff --git a/src/main/java/de/uni_hamburg/corpora/validation/quest/resources/xsd/coma.xsd b/src/main/java/de/uni_hamburg/corpora/validation/quest/resources/xsd/coma.xsd new file mode 100644 index 0000000000000000000000000000000000000000..559f75a6ec1d7a1ed5ae4da0880fe23d0390756c --- /dev/null +++ b/src/main/java/de/uni_hamburg/corpora/validation/quest/resources/xsd/coma.xsd @@ -0,0 +1,383 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- edited with XMLSPY v2004 rel. 3 U (http://www.xmlspy.com) by eval (eval) --> +<!-- schema version 0.5a040130 --> +<!--changes: + 2.7_20111114: AsocFileType also for speakers and corpora. + 2.6_20110125: AsocFileType. Für Hanna. + 2.5_20101222: replaced all file/URL/path references with FileType (HH) + 2.1_20101216: moved Availability from Recording to Media (HH), renamed "Filestores" to "Mirrors" + 2.0: added "role" as replacement for speaker-linking via "setting" + 1.6_20100412: added FileType for attaching files + 1.6_20090326: elements below personType do not need a specific order anymore. 
removed "related person"-element, made PeriodExact optional + 1.5_20081215: changed schema so Communication and Speakers can appear in any sequence + changed RecodringDuration to milliseconds (xs:long type) + 1.1_20080603: added "Annotation" and "AnnotationType" to "Transcription" to use it with "sextant" + 1.0_20080402: changed "known human" from mandatory to optional since nobody understands it anyway + 0.9_20070320: changed CorpusData from xs:sequence to xs:choice maxOccurs="unbounded" to allow speaker / comm-childs unordered + 0.8_20070320: added attribute "uniqueSpeakerDistinction" to CorpusType for usage with the PartiturEditor + 0.7_20070222: added attribute "Type" to Languagetype + 0.6_20070222: added attribute "Type" to Location-Complextype + 0.5a_20040130 - initial +--> +<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema" elementFormDefault="qualified" attributeFormDefault="qualified"> + <xs:element name="Corpus"> + <xs:annotation> + <xs:documentation>Mother Of All Metadata Elements (MOAME(tm))</xs:documentation> + </xs:annotation> + <xs:complexType> + <xs:complexContent> + <xs:restriction base="CorpusType"> + <xs:sequence> + <xs:element name="DBNode" minOccurs="0"> + <xs:annotation> + <xs:documentation>deprecated: forgot it's purpose</xs:documentation> + </xs:annotation> + </xs:element> + <xs:element name="Description" type="DescriptionType" minOccurs="0"> + <xs:annotation> + <xs:documentation>description of the (sub)corpus</xs:documentation> + </xs:annotation> + </xs:element> + <xs:choice minOccurs="0"> + <xs:element name="CorpusData" type="CorpusData"/> + <xs:element name="Corpus" type="CorpusType" maxOccurs="unbounded"/> + </xs:choice> + </xs:sequence> + </xs:restriction> + </xs:complexContent> + </xs:complexType> + </xs:element> + <xs:complexType name="LocationType"> + <xs:annotation> + <xs:documentation>type for storing addresses</xs:documentation> + </xs:annotation> + <xs:all> + <xs:element name="Street" type="xs:string" minOccurs="0"/> + 
<xs:element name="City" type="xs:string" minOccurs="0"/> + <xs:element name="PostalCode" type="xs:string" minOccurs="0"/> + <xs:element name="Country" type="xs:string" minOccurs="0"/> + <xs:element name="Period" type="PeriodType" minOccurs="0"/> + <xs:element name="Description" type="DescriptionType" minOccurs="0"/> + </xs:all> + <xs:attribute name="Type" type="xs:string" use="optional"/> + </xs:complexType> + <xs:simpleType name="non-empty-string"> + <xs:annotation> + <xs:documentation>should not be empty ;)</xs:documentation> + </xs:annotation> + <xs:restriction base="xs:string"> + <xs:minLength value="1"/> + </xs:restriction> + </xs:simpleType> + <xs:complexType name="CorpusType"> + <xs:annotation> + <xs:documentation>Type for storing corpus information</xs:documentation> + </xs:annotation> + <xs:sequence> + <xs:element name="DBNode" minOccurs="0"> + <xs:annotation> + <xs:documentation>link to the node containing the corpus</xs:documentation> + </xs:annotation> + </xs:element> + <xs:element name="Description" type="DescriptionType" minOccurs="0"> + <xs:annotation> + <xs:documentation>description of the (sub)corpus</xs:documentation> + </xs:annotation> + </xs:element> + <xs:element name="Mirrors" type="DescriptionType" minOccurs="0" maxOccurs="unbounded"> + <xs:annotation> + <xs:documentation>description can hold links to mirrored versions of the + corpus</xs:documentation> + </xs:annotation> + </xs:element> + <xs:choice minOccurs="0"> + <xs:element name="CorpusData" type="CorpusData"/> + <xs:element name="Corpus" type="CorpusType" maxOccurs="unbounded"/> + </xs:choice> + <xs:element name="AsocFile" type="AsocFileType" minOccurs="0" maxOccurs="unbounded"/> + </xs:sequence> + <xs:attribute name="Name" type="non-empty-string" use="required"/> + <xs:attribute name="Id" type="xs:ID" use="required"/> + <xs:attribute name="Parent" type="xs:IDREF" use="optional"/> + <xs:attribute name="uniqueSpeakerDistinction" type="xs:string" use="optional"> </xs:attribute> + 
<xs:attribute name="schemaVersion" type="xs:string" use="required"/> + </xs:complexType> + <xs:complexType name="RecordingType"> + <xs:annotation> + <xs:documentation>type for storing information about recordings</xs:documentation> + </xs:annotation> + <xs:sequence> + <xs:element name="Name" type="xs:string" minOccurs="0"/> + <xs:element name="Description" type="DescriptionType" minOccurs="0"/> + <xs:element name="Media" type="MediaType" minOccurs="0" maxOccurs="unbounded"> + <xs:annotation> + <xs:documentation>deprecated: use File Element instead</xs:documentation> + </xs:annotation> + </xs:element> + <xs:element name="File" type="FileType" minOccurs="0" maxOccurs="unbounded"/> + <xs:element name="RecordingDateTime" type="xs:dateTime" minOccurs="0"/> + <xs:element name="RecordingDuration" type="xs:long" minOccurs="0"/> + <xs:element name="Availablilty" type="AvailabilityType" minOccurs="0"> + <xs:annotation> + <xs:documentation>deprecated: exists inside File</xs:documentation> + </xs:annotation> + </xs:element> + </xs:sequence> + <xs:attribute name="Id" type="xs:ID" use="required"/> + </xs:complexType> + <xs:complexType name="AvailabilityType"> + <xs:annotation> + <xs:documentation>type for storing copyright/obtaining data</xs:documentation> + </xs:annotation> + <xs:sequence> + <xs:element name="Available" type="xs:boolean"/> + <xs:element name="URL" type="xs:anyURI" minOccurs="0" maxOccurs="unbounded"/> + <xs:element name="Copyright" type="xs:string" minOccurs="0"/> + <xs:element name="ObtainingInformation" type="DescriptionType" minOccurs="0"/> + </xs:sequence> + </xs:complexType> + <xs:complexType name="DescriptionType"> + <xs:annotation> + <xs:documentation>type for storing descriptions for various elements</xs:documentation> + </xs:annotation> + <xs:sequence> + <xs:element name="Key" type="KeyType" minOccurs="0" maxOccurs="unbounded"/> + </xs:sequence> + </xs:complexType> + <xs:complexType name="MediaType"> + <xs:annotation> + <xs:documentation>deprecated: 
use File type instead</xs:documentation> + </xs:annotation> + <xs:all> + <xs:element name="Description" type="DescriptionType" minOccurs="0"/> + <xs:element name="FileStore" type="xs:string" minOccurs="0"/> + <xs:element name="Filename" minOccurs="0"/> + <xs:element name="NSLink" type="xs:anyURI" minOccurs="0"/> + <xs:element name="LastBackup" type="xs:date" minOccurs="0"/> + <xs:element name="Availability" type="AvailabilityType" minOccurs="0"/> + </xs:all> + <xs:attribute name="Id" type="xs:ID" use="required"/> + </xs:complexType> + <xs:complexType name="FileType"> + <xs:annotation> + <xs:documentation>a file in the filesystem</xs:documentation> + </xs:annotation> + <xs:all> + <xs:element name="Availability" type="AvailabilityType"/> + <xs:element name="Description" type="DescriptionType" minOccurs="0"/> + <xs:element name="filename" type="xs:string" minOccurs="0"/> + <xs:element name="mimetype" type="xs:string" minOccurs="0"/> + <xs:element name="relPath" type="xs:string" minOccurs="0"/> + <xs:element name="absPath" type="xs:anyURI" minOccurs="0"/> + <xs:element name="URL" type="xs:anyURI" minOccurs="0"/> + </xs:all> + <xs:attribute name="Id" type="xs:ID" use="required"> + <xs:annotation> + <xs:documentation>hanna wants to get rid of this...</xs:documentation> + </xs:annotation> + </xs:attribute> + </xs:complexType> + <xs:complexType name="PersonType"> + <xs:annotation> + <xs:documentation>type for storing speaker descriptions</xs:documentation> + </xs:annotation> + <xs:sequence> + <xs:choice maxOccurs="unbounded"> + <xs:element name="Sigle" type="xs:string" minOccurs="1" maxOccurs="1"/> + <xs:element name="KnownHuman" type="xs:boolean" default="true" minOccurs="0" maxOccurs="1"> + <xs:annotation> + <xs:documentation>deprecated: didn't turn out to be + useful</xs:documentation> + </xs:annotation> + </xs:element> + <xs:element name="Pseudo" type="xs:string" minOccurs="0" maxOccurs="1"> + <xs:annotation> + <xs:documentation>Can hold names, but names should always 
be + pseudos</xs:documentation> + </xs:annotation> + </xs:element> + <xs:element name="Sex" type="xs:anySimpleType" minOccurs="0" maxOccurs="1"/> + <xs:element name="Location" type="LocationType" minOccurs="0" maxOccurs="unbounded"/> + <xs:element name="Description" type="DescriptionType" minOccurs="0" maxOccurs="1"/> + <xs:element name="Language" type="LanguageType" minOccurs="0" maxOccurs="unbounded"/> + <xs:element name="role" type="roleType" minOccurs="0" maxOccurs="unbounded"/> + <xs:element name="AsocFile" type="AsocFileType" minOccurs="0" maxOccurs="unbounded"/> + </xs:choice> + </xs:sequence> + <xs:attribute name="Id" type="xs:ID" use="required"/> + </xs:complexType> + + <xs:complexType name="AsocFileType"> + <xs:annotation> + <xs:documentation>type for storing any associated file</xs:documentation> + </xs:annotation> + <xs:sequence> + <xs:element name="Name"/> + <xs:element name="File" type="FileType"/> + <xs:element name="Description" type="DescriptionType" minOccurs="0"/> + </xs:sequence> + <xs:attribute name="Id" type="xs:ID" use="required"/> + </xs:complexType> + + <xs:complexType name="TranscriptionType"> + <xs:annotation> + <xs:documentation>type for storing transcript-information</xs:documentation> + </xs:annotation> + <xs:sequence> + <xs:element name="Name"/> + <xs:element name="File" type="FileType"/> + <xs:element name="FileStore" type="xs:string" minOccurs="0"> + <xs:annotation> + <xs:documentation>deprecated: forgot it's purpose</xs:documentation> + </xs:annotation> + </xs:element> + <xs:element name="Filename"> + <xs:annotation> + <xs:documentation>deprecated: now uses File type</xs:documentation> + </xs:annotation> + </xs:element> + <xs:element name="NSLink" type="xs:anyURI"> + <xs:annotation> + <xs:documentation>deprecated: now uses File type</xs:documentation> + </xs:annotation> + </xs:element> + <xs:element name="Description" type="DescriptionType" minOccurs="0"/> + <xs:element name="Availability" type="AvailabilityType" minOccurs="0"> + 
<xs:annotation> + <xs:documentation>deprecated: now uses File type</xs:documentation> + </xs:annotation> + </xs:element> + <xs:element name="Annotation" type="AnnotationType" minOccurs="0"/> + </xs:sequence> + <xs:attribute name="Id" type="xs:ID" use="required"/> + </xs:complexType> + <xs:complexType name="AnnotationType"> + <xs:annotation> + <xs:documentation>type for linking to annotation files</xs:documentation> + </xs:annotation> + <xs:all> + <xs:element name="Description" type="DescriptionType"/> + <xs:element name="File" type="FileType"/> + </xs:all> + <xs:attribute name="Id" type="xs:ID" use="required"/> + <xs:attribute name="Name" type="xs:string"/> + </xs:complexType> + <xs:complexType name="LanguageType"> + <xs:annotation> + <xs:documentation>type for storing languages</xs:documentation> + </xs:annotation> + <xs:all> + <xs:element name="LanguageCode" type="xs:string"/> + <xs:element name="Description" type="DescriptionType" minOccurs="0"/> + </xs:all> + <xs:attribute name="Type" type="xs:string" use="optional"/> + </xs:complexType> + <xs:complexType name="CommunicationType"> + <xs:annotation> + <xs:documentation>type for storing Session Data. 
[06.10.03] Media deleted, Object + added</xs:documentation> + </xs:annotation> + <xs:sequence maxOccurs="unbounded"> + <xs:choice> + <xs:element name="Description" type="DescriptionType" minOccurs="0"/> + <xs:element name="Setting"> + <xs:complexType> + <xs:complexContent> + <xs:extension base="SettingType"/> + </xs:complexContent> + </xs:complexType> + </xs:element> + <xs:element name="Recording" type="RecordingType" minOccurs="0" maxOccurs="unbounded"/> + <xs:element name="Transcription" type="TranscriptionType" minOccurs="0" maxOccurs="unbounded"/> + <xs:element name="Location" type="LocationType" minOccurs="0"/> + <xs:element name="Language" type="LanguageType" minOccurs="0" maxOccurs="unbounded"/> + <xs:element name="File" type="FileType" minOccurs="0" maxOccurs="unbounded"> + <xs:annotation> + <xs:documentation>deprecated: now uses AsocFile</xs:documentation> + </xs:annotation> + </xs:element> + <xs:element name="AsocFile" type="AsocFileType" minOccurs="0" maxOccurs="unbounded"/> + </xs:choice> + </xs:sequence> + <xs:attribute name="Id" type="xs:ID" use="required"/> + <xs:attribute name="Name" type="xs:string" use="required"/> + </xs:complexType> + <xs:complexType name="CorpusData"> + <xs:annotation> + <xs:documentation>actual corpus data</xs:documentation> + </xs:annotation> + <xs:sequence> + <xs:choice maxOccurs="unbounded"> + <xs:element name="Communication" type="CommunicationType" minOccurs="0" maxOccurs="unbounded"> + <xs:annotation> + <xs:documentation>=Session=Discourse (im Moment am ehesten IMDI, im Grunde + nur Location Data) </xs:documentation> + </xs:annotation> + </xs:element> + <xs:element name="Speaker" type="PersonType" minOccurs="0" maxOccurs="unbounded"/> + </xs:choice> + </xs:sequence> + </xs:complexType> + <xs:complexType name="KeyType"> + <xs:annotation> + <xs:documentation>key/value pairs fpr storing all kinds of + information</xs:documentation> + </xs:annotation> + <xs:simpleContent> + <xs:extension base="xs:string"> + <xs:attribute 
name="Name" type="xs:string" use="required"/> + </xs:extension> + </xs:simpleContent> + </xs:complexType> + <xs:complexType name="ObjectType"> + <xs:annotation> + <xs:documentation>Objects used/present in a communication</xs:documentation> + </xs:annotation> + <xs:all> + <xs:element name="Name" type="xs:string"/> + <xs:element name="Description" type="DescriptionType" minOccurs="0"/> + <xs:element name="Availability" type="AvailabilityType" minOccurs="0"/> + </xs:all> + <xs:attribute name="Id" type="xs:ID" use="required"/> + </xs:complexType> + <xs:complexType name="SettingType"> + <xs:annotation> + <xs:documentation>Setting of a recording (communication?)</xs:documentation> + </xs:annotation> + <xs:sequence> + <xs:choice maxOccurs="unbounded"> + <xs:element name="Person" type="xs:IDREF" minOccurs="0" maxOccurs="unbounded"/> + <xs:element name="Description" type="DescriptionType" minOccurs="0"/> + <xs:element name="Object" type="ObjectType" minOccurs="0" maxOccurs="unbounded"/> + </xs:choice> + </xs:sequence> + </xs:complexType> + <xs:complexType name="PeriodType"> + <xs:annotation> + <xs:documentation>marks a period of time</xs:documentation> + </xs:annotation> + <xs:all minOccurs="0"> + <xs:element name="PeriodStart" type="xs:dateTime" minOccurs="0"/> + <xs:element name="PeriodExact" type="xs:boolean" minOccurs="0"/> + <xs:element name="PeriodDuration" type="xs:long" minOccurs="0"/> + </xs:all> + </xs:complexType> + <xs:complexType name="roleType"> + <!-- + role types starting with a '#' are used coma-internal + + pre-defined role types: + #participant = speaker participating in a communication; replaces linking in settings + --> + <xs:annotation> + <xs:documentation>Role of Speakers (and potentially other datatypes)</xs:documentation> + </xs:annotation> + <xs:sequence> + <xs:choice maxOccurs="unbounded"> + <xs:element name="Description" type="DescriptionType" minOccurs="0"/> + </xs:choice> + </xs:sequence> + <xs:attribute name="Type" type="xs:string" 
use="optional"/> + <xs:attribute name="target" type="xs:IDREF" use="required"/> + </xs:complexType> +</xs:schema> \ No newline at end of file diff --git a/src/main/java/de/uni_hamburg/corpora/validation/quest/resources/xsd/eaf.xsd b/src/main/java/de/uni_hamburg/corpora/validation/quest/resources/xsd/eaf.xsd new file mode 100644 index 0000000000000000000000000000000000000000..fdc05e23729a50ed20343acb1527a57c50a0cbb7 --- /dev/null +++ b/src/main/java/de/uni_hamburg/corpora/validation/quest/resources/xsd/eaf.xsd @@ -0,0 +1,816 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + VERSION 3.0 + DATE December 2016 + - added two referential link elements, CROSS_REF_LINK and GROUP_REF_LINK, contained in + sets of such elements, REF_LINK_SET. + + VERSION 2.8 + DATE April 2014 + - changes that add support for multilingual controlled vocabularies and for associating tiers + and annotations with a specific language + - added new element LANGUAGE + - changed the structure of CONTROLLED_VOCABULARY and CV_ENTRY elements + - a cv entry can now have multiple CVE_VALUE child nodes with a language reference + - introduction of a LICENSE element + - added an EXT_REF attribute on the TIER level, so that e.g. a data category reference can be specified on + the tier level (overriding the one specified on the TYPE level) + + VERSION 2.7 + DATE December 2010 + - new elements and attributes where added in relation to + - support for externally defined controlled vocabularies. A new possible root element CV_RESOURCE + has been added for such vocabularies in an eaf like xml file. Annotations can hold a reference + to the id of ean entry in an external CV. + - a new element for storing information about a lexicon and about a link to an entry or a field + in a lexicon has been added. 
A linguistic type can be associated with a lexicon or a field / + data category in a lexicon + + VERSION 2.6 + DATE May 2008 + - added elements and attributes for references to concepts defined in the ISO Data Category Registry + and possibly/eventually other external resources. + - attribute EXT_REF added to type annotationAttribute, to elements CV_ENTRY and LINGUISTIC_TYPE + - element EXTERNAL_REF with attributes EXT_REF_ID, TYPE and VALUE + + DATE November 2007 + - added optional attributes: RELATIVE_MEDIA_URL to MEDIA_DESCRIPTOR and RELATIVE_LINK_URL to + LINKED_FILE_DESCRIPTOR for storage of relative url's + - changed the FORMAT from fixed to default, and from 2.4. to 2.5 + + DATE December 2006 + - added attribute: ANNOTATOR to element TIER + - added element: PROPERTY to element HEADER + - changed the type of attribute SVG_REF of ALIGNABLE_ANNOTATION to xsd:string since + it does not refer to an ID in the same file + - changed the type of the TIME_ALIGNABLE and GRAPHIC_REFERENCES attributes of the LINGUISTIC_TYPE + element to type="xsd:boolean" (was xsd:string) + - changed the ID/IDREF mechanism for the combinations of: + - TIER/TIER_ID and TIER/PARENT_REF + - LINGUISTIC_TYPE/LINGUISTIC_TYPE_ID and TIER/LINGUISTIC_TYPE_REF + - CONTROLLED_VOCABULARY/CV_ID and LINGUISTIC_TYPE/CONTROLLED_VOCABULARY_REF + into pairs of xsd:key and xsd:keyref elements. + The advantage is that the ID's only have to be unique per element type (e.g. TIER_ID's + should be unique within the TIER elements but can be the same as a LINGUISTIC_TYPE_ID) + and that there are no constraints on characters that can be used in id's/names. 
+--> +<xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema"> + <xsd:annotation> + <xsd:documentation xml:lang="en"> + ELAN Annotation Format + version 3.0 + December 2016 + Schema by Alexander Klassmann 17/01/03 + Adapted by Hennie Brugman, Han Sloetjes, Micha Hulsbosch + </xsd:documentation> + </xsd:annotation> + + <xsd:element name="ANNOTATION_DOCUMENT"> + <xsd:complexType> + <xsd:sequence> + <xsd:element name="LICENSE" type="licenseType" minOccurs="0" maxOccurs="unbounded"/> + <xsd:element name="HEADER" type="headType"/> + <xsd:element name="TIME_ORDER" type="timeType"/> + <xsd:element name="TIER" type="tierType" minOccurs="0" maxOccurs="unbounded"/> + <xsd:element name="LINGUISTIC_TYPE" type="lingType" minOccurs="0" maxOccurs="unbounded"/> + <xsd:element name="LOCALE" type="localeType" minOccurs="0" maxOccurs="unbounded"/> + <xsd:element name="LANGUAGE" type="langType" minOccurs="0" maxOccurs="unbounded"/> + <xsd:element name="CONSTRAINT" type="constraintType" minOccurs="0" maxOccurs="unbounded"/> + <xsd:element name="CONTROLLED_VOCABULARY" type="convocType" minOccurs="0" maxOccurs="unbounded"> + <xsd:key name="cvEntryKey"> + <xsd:annotation> + <xsd:documentation xml:lang="en"> + The entry id should be unique within the collection of entry elements + </xsd:documentation> + </xsd:annotation> + <xsd:selector xpath="CV_ENTRY_ML"/> + <xsd:field xpath="@CVE_ID"/> + </xsd:key> + </xsd:element> + <xsd:element name="LEXICON_REF" type="lexRefType" minOccurs="0" maxOccurs="unbounded"/> + <xsd:element name="REF_LINK_SET" type="refLinksType" minOccurs="0" maxOccurs="unbounded"/> + <xsd:element name="EXTERNAL_REF" type="extRefType" minOccurs="0" maxOccurs="unbounded"/> + </xsd:sequence> + <xsd:attribute name="DATE" type="xsd:dateTime" use="required"/> + <xsd:attribute name="AUTHOR" type="xsd:string" use="required"/> + <xsd:attribute name="VERSION" type="xsd:string" use="required"/> + <xsd:attribute name="FORMAT" type="xsd:string" use="optional" default="3.0"/> + 
</xsd:complexType> + + <!-- define key - keyref pairs --> + <xsd:key name="tierNameKey"> + <xsd:annotation> + <xsd:documentation xml:lang="en"> + The Tier name/id should be unique within the collection + of Tier elements + </xsd:documentation> + </xsd:annotation> + <xsd:selector xpath="TIER"/> + <xsd:field xpath="@TIER_ID"/> + </xsd:key> + <xsd:keyref name="tierNameRef" refer="tierNameKey"> + <xsd:annotation> + <xsd:documentation xml:lang="en"> + A Tier can be associated with a parent Tier by referring to an existing Tier id. + </xsd:documentation> + </xsd:annotation> + <xsd:selector xpath="TIER"/> + <xsd:field xpath="@PARENT_REF"/> + </xsd:keyref> + + <xsd:key name="linTypeNameKey"> + <xsd:annotation> + <xsd:documentation xml:lang="en"> + The Linguistic Type name/id should be unique within the collection + of Linguistic Type elements + </xsd:documentation> + </xsd:annotation> + <xsd:selector xpath="LINGUISTIC_TYPE"/> + <xsd:field xpath="@LINGUISTIC_TYPE_ID"/> + </xsd:key> + <xsd:keyref name="linTypeNameRef" refer="linTypeNameKey"> + <xsd:annotation> + <xsd:documentation xml:lang="en"> + A Tier must refer to an existing Linguistic Type id. + </xsd:documentation> + </xsd:annotation> + <xsd:selector xpath="TIER"/> + <xsd:field xpath="@LINGUISTIC_TYPE_REF"/> + </xsd:keyref> + + <xsd:key name="cvNameKey"> + <xsd:annotation> + <xsd:documentation xml:lang="en"> + The Controlled Vocabulary name/id should be unique within the + collection of Controlled Vocabulary elements + </xsd:documentation> + </xsd:annotation> + <xsd:selector xpath="CONTROLLED_VOCABULARY"/> + <xsd:field xpath="@CV_ID"/> + </xsd:key> + <xsd:keyref name="cvNameRef" refer="cvNameKey"> + <xsd:annotation> + <xsd:documentation xml:lang="en"> + A Linguistic Type can be associated with a Controlled Vocabulary by + referring to an existing Controlled Vocabulary id. 
+ </xsd:documentation> + </xsd:annotation> + <xsd:selector xpath="LINGUISTIC_TYPE"/> + <xsd:field xpath="@CONTROLLED_VOCABULARY_REF"/> + </xsd:keyref> + + <xsd:key name="lexNameKey"> + <xsd:annotation> + <xsd:documentation xml:lang="en"> + The Lexicon Service name/id should be unique within the + collection of Lexicon Service elements + </xsd:documentation> + </xsd:annotation> + <xsd:selector xpath="LEXICON_REF"/> + <xsd:field xpath="@LEX_REF_ID"/> + </xsd:key> + <xsd:keyref name="lexNameRef" refer="lexNameKey"> + <xsd:annotation> + <xsd:documentation xml:lang="en"> + A Linguistic Type can be associated with a Lexicon Service by + referring to an existing Lexicon Service id. + </xsd:documentation> + </xsd:annotation> + <xsd:selector xpath="LINGUISTIC_TYPE"/> + <xsd:field xpath="@LEXICON_REF"/> + </xsd:keyref> + + <!-- added in 2.8 but unrelated to the introduction of new elements and attributes --> + <!-- previous annotation reference --> + <xsd:key name="prevAnnoKey"> + <xsd:annotation> + <xsd:documentation> + A key and keyref pair to enforce that a previous annotation idref at least refers + to an annotation id of a reference annotation. + </xsd:documentation> + </xsd:annotation> + <xsd:selector xpath="TIER/ANNOTATION/REF_ANNOTATION"/> + <xsd:field xpath="@ANNOTATION_ID"/> + </xsd:key> + <xsd:keyref name="prevAnnoRef" refer="prevAnnoKey"> + <xsd:selector xpath="TIER/ANNOTATION/REF_ANNOTATION"/> + <xsd:field xpath="@PREVIOUS_ANNOTATION"/> + </xsd:keyref> + <!-- time slot references --> + <xsd:key name="timeSlotKey"> + <xsd:annotation> + <xsd:documentation> + Two key-keyref pairs to enforce that time slot references refer to the id of a time slot. 
+ </xsd:documentation> + </xsd:annotation> + <xsd:selector xpath="TIME_ORDER/TIME_SLOT"/> + <xsd:field xpath="@TIME_SLOT_ID"/> + </xsd:key> + <xsd:keyref name="timeSlotRef1" refer="timeSlotKey"> + <xsd:selector xpath="TIER/ANNOTATION/ALIGNABLE_ANNOTATION"/> + <xsd:field xpath="@TIME_SLOT_REF1"/> + </xsd:keyref> + <xsd:keyref name="timeSlotRef2" refer="timeSlotKey"> + <xsd:selector xpath="TIER/ANNOTATION/ALIGNABLE_ANNOTATION"/> + <xsd:field xpath="@TIME_SLOT_REF2"/> + </xsd:keyref> + + <!-- introduced in 2.8 --> + <xsd:key name="langIdKey"> + <xsd:annotation> + <xsd:documentation> + The ID of a language identifier, can be referred to by any element that + needs a reference to a language identifier. + </xsd:documentation> + </xsd:annotation> + <xsd:selector xpath="LANGUAGE"/> + <xsd:field xpath="@LANG_ID"/> + </xsd:key> + <xsd:keyref name="cvValueLangRef" refer="langIdKey"> + <xsd:annotation> + <xsd:documentation> + Reference from a value in a multilingual CV to a language identifier. + </xsd:documentation> + </xsd:annotation> + <xsd:selector xpath="CONTROLLED_VOCABULARY/CV_ENTRY_ML/CVE_VALUE"/> + <xsd:field xpath="@LANG_REF"/> + </xsd:keyref> + <xsd:keyref name="cvDescLangRef" refer="langIdKey"> + <xsd:annotation> + <xsd:documentation> + Reference from a description in a multilingual CV to a language identifier. + </xsd:documentation> + </xsd:annotation> + <xsd:selector xpath="CONTROLLED_VOCABULARY/DESCRIPTION"/> + <xsd:field xpath="@LANG_REF"/> + </xsd:keyref> + <xsd:keyref name="tierLangRef" refer="langIdKey"> + <xsd:annotation> + <xsd:documentation> + Reference from a tier to a language identifier, to indicate the (main) language recorded + on that tier. + </xsd:documentation> + </xsd:annotation> + <xsd:selector xpath="TIER"/> + <xsd:field xpath="@LANG_REF"/> + </xsd:keyref> + <xsd:keyref name="annoAlignLangRef" refer="langIdKey"> + <xsd:annotation> + <xsd:documentation> + Reference from an individual alignable annotation to a language identifier. 
+ </xsd:documentation> + </xsd:annotation> + <xsd:selector xpath="TIER/ANNOTATION/ALIGNABLE_ANNOTATION"/> + <xsd:field xpath="@LANG_REF"/> + </xsd:keyref> + <xsd:keyref name="annoRefLangRef" refer="langIdKey"> + <xsd:annotation> + <xsd:documentation> + Reference from an individual reference annotation to a language identifier. + </xsd:documentation> + </xsd:annotation> + <xsd:selector xpath="TIER/ANNOTATION/REF_ANNOTATION"/> + <xsd:field xpath="@LANG_REF"/> + </xsd:keyref> + <!-- + Since we try to describe that the @CVE_IDs are unique within the CONTROLLED_VOCABULARY, + the xsd:key element must be located just inside the CONTROLLED_VOCABULARY. + <xsd:key name="cvEntryKey"> + <xsd:annotation> + <xsd:documentation xml:lang="en"> + The entry id should be unique within the collection of entry elements + </xsd:documentation> + </xsd:annotation> + <xsd:selector xpath="CV_ENTRY_ML"/> + <xsd:field xpath="@CVE_ID"/> + </xsd:key> + --> + <!-- + Getting from the CVE_REF to the appropriately matching CVE_ID isn't so simple! + It probably can't be done in XPath, never mind the more restricted version that is + allowed here. + http://www.w3.org/TR/2004/PER-xmlschema-1-20040318/structures.html#coss-identity-constraint + + TIER/ALIGNABLE_ANNOTATION/@CVE_REF/../../@LINGUISTIC_TYPE_REF => call this value x + search for a value equal to x in + LINGUISTIC_TYPE/@LINGUISTIC_TYPE_ID . When found, take (relative to that) + ../@CONTROLLED_VOCABULARY_REF => call this value y + search for a value equal to y in + CONTROLLED_VOCABULARY/@CV_ID and this is the CONTROLLED_VOCABULARY which should + contain (in CVE_ENTRY_ML/@CVE_ID) the value from @CVE_REF. + + + A weaker check could just try to find any matching CONTROLLED_VOCABULARY/CVE_ENTRY_ML/@CVE_ID, + without checking if this is in the correct CONTROLLED_VOCABULARY. 
+ + According to http://docstore.mik.ua/orelly/xml/schema/ch09_02.htm, putting a keyref + in a parent node of some key definition creates an extra uniqueness constraint on + the key values. That is not desired here. + (The validator that we use doesn't seem to check that but gives other, strange, error + messages) + <xsd:keyref name="cvEntryAlignRef" refer="cvEntryKey"> + <xsd:selector xpath="TIER/ANNOTATION/ALIGNABLE_ANNOTATION"/> + <xsd:field xpath="@CVE_REF"/> + </xsd:keyref> + <xsd:keyref name="cvEntryRefRef" refer="cvEntryKey"> + <xsd:selector xpath="TIER/ANNOTATION/REF_ANNOTATION"/> + <xsd:field xpath="@CVE_REF"/> + </xsd:keyref> + --> + <xsd:key name="alignAnnotationIdKey"> + <xsd:selector xpath="TIER/ANNOTATION/ALIGNABLE_ANNOTATION"/> + <xsd:field xpath="@ANNOTATION_ID"/> + </xsd:key> + <xsd:key name="refAnnotationIdKey"> + <xsd:selector xpath="TIER/ANNOTATION/REF_ANNOTATION"/> + <xsd:field xpath="@ANNOTATION_ID"/> + </xsd:key> + <!-- set of key and key refs for referential links + 4 keys for links to refer to: alignable and reference annotation id's and cross link and group link id. + 2 x 4 keyrefs for the cross link ref1 and ref2 idrefs to one of the 4 keys and + 1 x 4 keyrefs for the group link refs idrefs to one of the 4 keys. 
+ --> + <xsd:key name="crossRefLinkIdKey"> + <xsd:selector xpath="REF_LINK_SET/CROSS_REF_LINK"/> + <xsd:field xpath="@REF_LINK_ID"/> + </xsd:key> + <xsd:key name="groupRefLinkIdKey"> + <xsd:selector xpath="REF_LINK_SET/GROUP_REF_LINK"/> + <xsd:field xpath="@REF_LINK_ID"/> + </xsd:key> + + <xsd:keyref name="crossLinkRef1AlignAnnoKeyRef" refer="alignAnnotationIdKey"> + <xsd:selector xpath="REF_LINK_SET/CROSS_REF_LINK"/> + <xsd:field xpath="REF1"/> + </xsd:keyref> + <xsd:keyref name="crossLinkRef1RefAnnoKeyRef" refer="refAnnotationIdKey"> + <xsd:selector xpath="REF_LINK_SET/CROSS_REF_LINK"/> + <xsd:field xpath="REF1"/> + </xsd:keyref> + <xsd:keyref name="crossLinkRef1CrossLinkKeyRef" refer="crossRefLinkIdKey"> + <xsd:selector xpath="REF_LINK_SET/CROSS_REF_LINK"/> + <xsd:field xpath="REF1"/> + </xsd:keyref> + <xsd:keyref name="crossLinkRef1GroupLinkKeyRef" refer="groupRefLinkIdKey"> + <xsd:selector xpath="REF_LINK_SET/CROSS_REF_LINK"/> + <xsd:field xpath="REF1"/> + </xsd:keyref> + + <xsd:keyref name="crossLinkRef2AlignAnnoKeyRef" refer="alignAnnotationIdKey"> + <xsd:selector xpath="REF_LINK_SET/CROSS_REF_LINK"/> + <xsd:field xpath="REF2"/> + </xsd:keyref> + <xsd:keyref name="crossLinkRef2RefAnnoKeyRef" refer="refAnnotationIdKey"> + <xsd:selector xpath="REF_LINK_SET/CROSS_REF_LINK"/> + <xsd:field xpath="REF2"/> + </xsd:keyref> + <xsd:keyref name="crossLinkRef2CrossLinkKeyRef" refer="crossRefLinkIdKey"> + <xsd:selector xpath="REF_LINK_SET/CROSS_REF_LINK"/> + <xsd:field xpath="REF2"/> + </xsd:keyref> + <xsd:keyref name="crossLinkRef2GroupLinkKeyRef" refer="groupRefLinkIdKey"> + <xsd:selector xpath="REF_LINK_SET/CROSS_REF_LINK"/> + <xsd:field xpath="REF2"/> + </xsd:keyref> + + + <xsd:keyref name="groupLinkRefsAlignAnnoKeyRef" refer="alignAnnotationIdKey"> + <xsd:selector xpath="REF_LINK_SET/GROUP_REF_LINK"/> + <xsd:field xpath="REFS"/> + </xsd:keyref> + <xsd:keyref name="groupLinkRefsRefAnnoKeyRef" refer="refAnnotationIdKey"> + <xsd:selector 
xpath="REF_LINK_SET/GROUP_REF_LINK"/> + <xsd:field xpath="REFS"/> + </xsd:keyref> + <xsd:keyref name="groupLinkRefsCrossLinkKeyRef" refer="crossRefLinkIdKey"> + <xsd:selector xpath="REF_LINK_SET/GROUP_REF_LINK"/> + <xsd:field xpath="REFS"/> + </xsd:keyref> + <xsd:keyref name="groupLinkRefsGroupLinkKeyRef" refer="groupRefLinkIdKey"> + <xsd:selector xpath="REF_LINK_SET/GROUP_REF_LINK"/> + <xsd:field xpath="REFS"/> + </xsd:keyref> + <!-- end of key - keyref pairs --> + </xsd:element> + + <xsd:complexType name="headType"> + <xsd:sequence> + <xsd:element name="MEDIA_DESCRIPTOR" minOccurs="0" maxOccurs="unbounded"> + <xsd:complexType> + <xsd:attribute name="MEDIA_URL" type="xsd:anyURI" use="required"/> + <xsd:attribute name="RELATIVE_MEDIA_URL" type="xsd:anyURI" use="optional"/> + <xsd:attribute name="MIME_TYPE" type="xsd:string" use="required"/> + <xsd:attribute name="TIME_ORIGIN" type="xsd:long" use="optional"/> + <xsd:attribute name="EXTRACTED_FROM" type="xsd:anyURI" use="optional"/> + </xsd:complexType> + </xsd:element> + <xsd:element name="LINKED_FILE_DESCRIPTOR" minOccurs="0" maxOccurs="unbounded"> + <xsd:complexType> + <xsd:attribute name="LINK_URL" type="xsd:anyURI" use="required"/> + <xsd:attribute name="RELATIVE_LINK_URL" type="xsd:anyURI" use="optional"/> + <xsd:attribute name="MIME_TYPE" type="xsd:string" use="required"/> + <xsd:attribute name="TIME_ORIGIN" type="xsd:long" use="optional"/> + <xsd:attribute name="ASSOCIATED_WITH" type="xsd:anyURI" use="optional"/> + </xsd:complexType> + </xsd:element> + <xsd:element name="PROPERTY" type="propType" minOccurs="0" maxOccurs="unbounded"/> + </xsd:sequence> + <xsd:attribute name="MEDIA_FILE" use="optional" type="xsd:string"> + <xsd:annotation> + <xsd:documentation xml:lang="en"> + This attribute is deprecated. Use MEDIA_DESCRIPTOR elements instead. 
+ </xsd:documentation> + <xsd:appinfo>Ignore</xsd:appinfo> + </xsd:annotation> + </xsd:attribute> + <xsd:attribute name="TIME_UNITS" use="optional" default="milliseconds"> + <xsd:simpleType> + <xsd:restriction base="xsd:string"> + <xsd:enumeration value="NTSC-frames"/> + <xsd:enumeration value="PAL-frames"/> + <xsd:enumeration value="milliseconds"/> + </xsd:restriction> + </xsd:simpleType> + </xsd:attribute> + </xsd:complexType> + + <xsd:complexType name="timeType"> + <xsd:sequence> + <xsd:element name="TIME_SLOT" minOccurs="0" maxOccurs="unbounded"> + <xsd:complexType> + <xsd:attribute name="TIME_SLOT_ID" type="xsd:ID" use="required"/> + <xsd:attribute name="TIME_VALUE" type="xsd:unsignedInt" use="optional"/> + </xsd:complexType> + </xsd:element> + </xsd:sequence> + </xsd:complexType> + + <xsd:complexType name="tierType"> + <xsd:sequence> + <xsd:element name="ANNOTATION" type="annotationType" minOccurs="0" maxOccurs="unbounded"/> + </xsd:sequence> + <xsd:attribute name="TIER_ID" type="xsd:string" use="required"/> + <xsd:attribute name="PARTICIPANT" type="xsd:string" use="optional"/> + <xsd:attribute name="ANNOTATOR" type="xsd:string" use="optional"/> + <xsd:attribute name="LINGUISTIC_TYPE_REF" type="xsd:string" use="required"/> + <xsd:attribute name="DEFAULT_LOCALE" type="xsd:IDREF" use="optional"/> + <xsd:attribute name="PARENT_REF" type="xsd:string" use="optional"/> + <!-- since 2.8, to overrule an EXT_REF on the type level --> + <xsd:attribute name="EXT_REF" type="xsd:IDREF" use="optional"/> + <!-- since 2.8 --> + <xsd:attribute name="LANG_REF" type="xsd:IDREF" use="optional"/> + </xsd:complexType> + + <xsd:complexType name="annotationType"> + <xsd:choice> + <xsd:element name="ALIGNABLE_ANNOTATION" type="alignableType"/> + <xsd:element name="REF_ANNOTATION" type="refAnnoType"/> + </xsd:choice> + </xsd:complexType> + + <xsd:complexType name="alignableType"> + <xsd:sequence> + <xsd:element name="ANNOTATION_VALUE" type="xsd:string"/> + </xsd:sequence> + 
<xsd:attributeGroup ref="annotationAttribute"/> + <xsd:attribute name="TIME_SLOT_REF1" type="xsd:IDREF" use="required"/> + <xsd:attribute name="TIME_SLOT_REF2" type="xsd:IDREF" use="required"/> + <xsd:attribute name="SVG_REF" type="xsd:string" use="optional"/> + </xsd:complexType> + + <xsd:complexType name="refAnnoType"> + <xsd:sequence> + <xsd:element name="ANNOTATION_VALUE" type="xsd:string"/> + </xsd:sequence> + <xsd:attributeGroup ref="annotationAttribute"/> + <xsd:attribute name="ANNOTATION_REF" type="xsd:IDREF" use="required"> + <xsd:annotation> + <xsd:documentation> + This is in fact a reference to the parent annotation. + </xsd:documentation> + </xsd:annotation> + </xsd:attribute> + <xsd:attribute name="PREVIOUS_ANNOTATION" type="xsd:IDREF" use="optional"/> + </xsd:complexType> + + <xsd:complexType name="lingType"> + <xsd:attribute name="LINGUISTIC_TYPE_ID" type="xsd:string" use="required"/> + <xsd:attribute name="TIME_ALIGNABLE" type="xsd:boolean" use="optional"/> + <xsd:attribute name="CONSTRAINTS" type="xsd:IDREF" use="optional"/> + <xsd:attribute name="GRAPHIC_REFERENCES" type="xsd:boolean" use="optional"/> + <xsd:attribute name="CONTROLLED_VOCABULARY_REF" type="xsd:string" use="optional"/> + <xsd:attribute name="EXT_REF" type="xsd:IDREF" use="optional"/> + <xsd:attribute name="LEXICON_REF" type="xsd:IDREF" use="optional"/> + </xsd:complexType> + + <xsd:complexType name="localeType"> + <xsd:attribute name="LANGUAGE_CODE" type="xsd:ID" use="required"/> + <xsd:attribute name="COUNTRY_CODE" type="xsd:string" use="optional"/> + <xsd:attribute name="VARIANT" type="xsd:string" use="optional"/> + </xsd:complexType> + + <xsd:complexType name="constraintType"> + <xsd:attribute name="STEREOTYPE" type="xsd:ID" use="required"/> + <xsd:attribute name="DESCRIPTION" type="xsd:string" use="optional"/> + </xsd:complexType> + + <xsd:complexType name="convocType"> + <!-- change in 2.8, now it contains + a list of multilingual entries plus possible multiple description 
elements --> + <xsd:sequence> + <xsd:element name="DESCRIPTION" type="descMultiLangType" minOccurs="0" maxOccurs="unbounded"/> + <xsd:element name="CV_ENTRY_ML" type="cventryType" minOccurs="0" maxOccurs="unbounded"/> + </xsd:sequence> + + <xsd:attribute name="CV_ID" type="xsd:string" use="required"/> + <xsd:attribute name="EXT_REF" type="xsd:IDREF" use="optional"> + <xsd:annotation> + <xsd:documentation> + A reference to an url of an external Controlled Vocabulary. + Is intended to be mutually exclusive with a sequence of CV_ENTRY_ML elements. + </xsd:documentation> + </xsd:annotation> + </xsd:attribute> + </xsd:complexType> + + <!-- introduced in 2.8, modification that breaks compatibility with previous version --> + <xsd:complexType name="cventryType"> + <xsd:annotation> + <xsd:documentation> + An entry in a multilingual controlled vocabulary, containing the values and the descriptions + in multiple languages. + </xsd:documentation> + </xsd:annotation> + <xsd:sequence> + <xsd:element name="CVE_VALUE" type="cveValueType" maxOccurs="unbounded"/> + </xsd:sequence> + <xsd:attribute name="CVE_ID" type="xsd:string" use="required"/><!-- in 2.8 moved from ecventry to cv entry --> + <xsd:attribute name="EXT_REF" type="xsd:IDREF" use="optional"/> + </xsd:complexType> + + <!-- introduced in 2.8 --> + <xsd:complexType name="cveValueType"> + <xsd:annotation> + <xsd:documentation> + A controlled vocabulary entry value with a language attribute. + This allows multilingual controlled vocabularies. It adds a language reference attribute + compared to the mono-lingual cv entry element. 
+ </xsd:documentation> + </xsd:annotation> + <xsd:simpleContent> + <xsd:extension base="xsd:string"> + <xsd:attribute name="LANG_REF" type="xsd:IDREF" use="required"/> + <xsd:attribute name="DESCRIPTION" type="xsd:string" use="optional"/> + </xsd:extension> + </xsd:simpleContent> + </xsd:complexType> + + <!-- introduced in 2.8 --> + <xsd:complexType name="descMultiLangType"> + <xsd:annotation> + <xsd:documentation> + A description element with a language reference attribute. + </xsd:documentation> + </xsd:annotation> + <xsd:simpleContent> + <xsd:extension base="xsd:string"> + <xsd:attribute name="LANG_REF" type="xsd:IDREF" use="required"/> + </xsd:extension> + </xsd:simpleContent> + </xsd:complexType> + + <xsd:complexType name="propType"> + <xsd:simpleContent> + <xsd:extension base="xsd:string"> + <xsd:attribute name="NAME" type="xsd:string" use="optional"/> + </xsd:extension> + </xsd:simpleContent> + </xsd:complexType> + + <xsd:complexType name="extRefType"> + <xsd:attribute name="EXT_REF_ID" type="xsd:ID" use="required"/> + <xsd:attribute name="TYPE" use="required"> + <xsd:simpleType> + <xsd:restriction base="xsd:string"> + <xsd:enumeration value="iso12620"> + <xsd:annotation> + <xsd:documentation> + A reference to the id of an ISO Data Category (url including id). + </xsd:documentation> + </xsd:annotation> + </xsd:enumeration> + <xsd:enumeration value="ecv"> + <xsd:annotation> + <xsd:documentation> + A reference to an external (closed) Controlled Vocabulary (url). + </xsd:documentation> + </xsd:annotation> + </xsd:enumeration> + <xsd:enumeration value="cve_id"> + <xsd:annotation> + <xsd:documentation> + A reference to the id of an Entry in an external Controlled Vocabulary (id). 
+ </xsd:documentation> + </xsd:annotation> + </xsd:enumeration> + <xsd:enumeration value="lexen_id"> + <xsd:annotation> + <xsd:documentation> + A reference to the id of an entry in a lexicon (url, url+id or id) + </xsd:documentation> + </xsd:annotation> + </xsd:enumeration> + <xsd:enumeration value="resource_url"> + <xsd:annotation> + <xsd:documentation> + A reference or hyperlink to any type document (url) + </xsd:documentation> + </xsd:annotation> + </xsd:enumeration> + <!-- other external reference types can be added later --> + </xsd:restriction> + </xsd:simpleType> + </xsd:attribute> + <xsd:attribute name="VALUE" type="xsd:string" use="required"/> + </xsd:complexType> + + <xsd:complexType name="lexRefType"> + <xsd:attribute name="LEX_REF_ID" type="xsd:ID" use="required"/> + <xsd:attribute name="NAME" type="xsd:string" use="required"/> + <xsd:attribute name="TYPE" type="xsd:string" use="required"/> + <xsd:attribute name="URL" type="xsd:string" use="required"/> + <xsd:attribute name="LEXICON_ID" type="xsd:string" use="required"/> + <xsd:attribute name="LEXICON_NAME" type="xsd:string" use="required"/> + <xsd:attribute name="DATCAT_ID" type="xsd:string" use="optional"/> + <xsd:attribute name="DATCAT_NAME" type="xsd:string" use="optional"/> + </xsd:complexType> + + <xsd:complexType name="langType"> + <xsd:annotation> + <xsd:documentation xml:lang="en"> + The Language element containing a reference to a language name or (if possible persistent) definition. + </xsd:documentation> + </xsd:annotation> + <xsd:attribute name="LANG_ID" type="xsd:ID" use="required"/> + <!-- definition is optional so that user defined languages are easy to add --> + <xsd:attribute name="LANG_DEF" type="xsd:string" use="optional"> + <xsd:annotation><xsd:documentation> + ISO-639-3 still seems to be the best choice for language codes and closest to persistent language ID's + seem to be the http://cdb.iso.org/lg/... 
identifiers also used by the iso-language-639-3 component in + the CLARIN ComponentRegistry? + </xsd:documentation></xsd:annotation> + </xsd:attribute> + <xsd:attribute name="LANG_LABEL" type="xsd:string" use="optional"/> + </xsd:complexType> + <!-- since 2.8 --> + <xsd:complexType name="licenseType"> + <xsd:annotation> + <xsd:documentation xml:lang="en"> + The license element can be used to include license information in the eaf file itself. + </xsd:documentation> + </xsd:annotation> + <xsd:simpleContent> + <xsd:extension base="xsd:string"> + <xsd:attribute name="LICENSE_URL" type="xsd:anyURI" use="optional"/> + </xsd:extension> + </xsd:simpleContent> + </xsd:complexType> + + <!-- introduced in 3.0 --> + <xsd:complexType name="refLinksType"> + <xsd:annotation> + <xsd:documentation> + A set containing referential links. + A set can contain both cross-references and grouping referential links. + Apart from an ID the set can have a meaningful, "friendly" name. + A set can have an external reference, a language and a CV reference. + </xsd:documentation> + </xsd:annotation> + <xsd:sequence> + <xsd:choice minOccurs="0" maxOccurs="unbounded"> + <xsd:element name="CROSS_REF_LINK" type="crossLinkType"/> + <xsd:element name="GROUP_REF_LINK" type="groupLinkType"/> + </xsd:choice> + </xsd:sequence> + <xsd:attribute name="LINK_SET_ID" type="xsd:ID" use="required"/> + <xsd:attribute name="LINK_SET_NAME" type="xsd:string" use="optional"/> + <xsd:attribute name="EXT_REF" type="xsd:IDREFS" use="optional"/> + <xsd:attribute name="LANG_REF" type="xsd:IDREF" use="optional"/> + <xsd:attribute name="CV_REF" type="xsd:string" use="optional"/> + </xsd:complexType> + <!-- introduced in 3.0 --> + <!-- a cross reference element --> + <xsd:complexType name="crossLinkType"> + <xsd:annotation> + <xsd:documentation> + A cross reference is a referential link between two existing elements (REF1 and REF2). + Each of these elements can be either an annotation or a referential link. 
+ Optionally the direction of the link can be specified. + </xsd:documentation> + </xsd:annotation> + <xsd:simpleContent> + <xsd:extension base="xsd:string"> + <!-- refers to the ID of an annotation or a reference link --> + <xsd:attribute name="REF1" type="xsd:IDREF" use="required"/> + <xsd:attribute name="REF2" type="xsd:IDREF" use="required"/> + <xsd:attribute name="DIRECTIONALITY" use="optional"> + <xsd:simpleType> + <xsd:restriction base="xsd:string"> + <xsd:enumeration value="undirected"/> + <xsd:enumeration value="unidirectional"/> + <xsd:enumeration value="bidirectional"/> + </xsd:restriction> + </xsd:simpleType> + </xsd:attribute> + + <xsd:attributeGroup ref="refLinkAttribute"/> + </xsd:extension> + </xsd:simpleContent> + </xsd:complexType> + <!-- a grouping reference element --> + <xsd:complexType name="groupLinkType"> + <xsd:annotation> + <xsd:documentation> + A referential element for grouping any number of existing elements (the REFS). + Each element can be an annotation or a referential link. + </xsd:documentation> + </xsd:annotation> + <xsd:simpleContent> + <xsd:extension base="xsd:string"> + <xsd:attribute name="REFS" type="xsd:IDREFS" use="required"/> + <xsd:attributeGroup ref="refLinkAttribute"/> + </xsd:extension> + </xsd:simpleContent> + </xsd:complexType> + <!-- attributes shared by reference link elements --> + <xsd:attributeGroup name="refLinkAttribute"> + <xsd:annotation> + <xsd:documentation> + Attributes common for both cross- and group references. + Apart from an ID it is possible to associate a meaningful, "friendly" + name to the link. Furthermore a link can have an external reference, a language and a + CV entry reference and a type attribute. 
+ </xsd:documentation> + </xsd:annotation> + <xsd:attribute name="REF_LINK_ID" type="xsd:ID" use="required"/> + <xsd:attribute name="REF_LINK_NAME" type="xsd:string" use="optional"/> + <xsd:attribute name="EXT_REF" type="xsd:IDREFS" use="optional"/> + <xsd:attribute name="LANG_REF" type="xsd:IDREF" use="optional"/> + <xsd:attribute name="CVE_REF" type="xsd:string" use="optional"/> + <xsd:attribute name="REF_TYPE" type="xsd:string" use="optional"> + <xsd:annotation> + <xsd:documentation> + An attribute that allows to specify the type of the cross- or group reference/link. + </xsd:documentation> + </xsd:annotation> + </xsd:attribute> + </xsd:attributeGroup> + <!-- end of new in 3.0 --> + + <xsd:attributeGroup name="annotationAttribute"> + <xsd:attribute name="ANNOTATION_ID" type="xsd:ID" use="required"/> + <xsd:attribute name="EXT_REF" type="xsd:IDREFS" use="optional"/> + <xsd:attribute name="LANG_REF" type="xsd:IDREF" use="optional"/><!-- since 2.8 --> + <xsd:attribute name="CVE_REF" type="xsd:string" use="optional"/><!-- since 2.8 --> + </xsd:attributeGroup> + + + <!-- Start of CV_RESOURCE part, an alternative root element --> + <xsd:element name="CV_RESOURCE"> + <xsd:complexType> + <xsd:sequence> + <xsd:element name="LANGUAGE" type="langType" minOccurs="0" maxOccurs="unbounded"/> + <xsd:element name="CONTROLLED_VOCABULARY" type="convocType" minOccurs="1" maxOccurs="unbounded"> + <xsd:key name="cvEntryKey2"> + <xsd:annotation> + <xsd:documentation xml:lang="en"> + The entry id should be unique within the + collection of entry elements + </xsd:documentation> + </xsd:annotation> + <xsd:selector xpath="CV_ENTRY_ML"/> + <xsd:field xpath="@CVE_ID"/> + </xsd:key> + + </xsd:element> + <xsd:element name="EXTERNAL_REF" type="extRefType" minOccurs="0" maxOccurs="unbounded"/> + </xsd:sequence> + <xsd:attribute name="DATE" type="xsd:dateTime" use="optional"/> + <xsd:attribute name="AUTHOR" type="xsd:string" use="optional"/> + <xsd:attribute name="VERSION" type="xsd:string" 
use="optional"/> + </xsd:complexType> + <!-- define key - keyref pairs --> + <!-- If not commented this is considered a double global definition of cvNameKey --> + <!-- <xsd:key name="cvNameKey"> + <xsd:annotation> + <xsd:documentation xml:lang="en"> + The Controlled Vocabulary name/id should be unique within the + collection of Controlled Vocabulary elements + </xsd:documentation> + </xsd:annotation> + <xsd:selector xpath="CONTROLLED_VOCABULARY"/> + <xsd:field xpath="@CV_ID"/> + </xsd:key>--> + </xsd:element> + +</xsd:schema> \ No newline at end of file diff --git a/src/main/java/de/uni_hamburg/corpora/validation/quest/resources/xsd/exmaralda_exb.xsd b/src/main/java/de/uni_hamburg/corpora/validation/quest/resources/xsd/exmaralda_exb.xsd new file mode 100644 index 0000000000000000000000000000000000000000..ae380f4661c409158aba48a52c7a8792da08d722 --- /dev/null +++ b/src/main/java/de/uni_hamburg/corpora/validation/quest/resources/xsd/exmaralda_exb.xsd @@ -0,0 +1,418 @@ +<?xml version="1.0" encoding="UTF-8"?> +<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema"> + + <xs:element name="basic-transcription"> + <xs:complexType> + <xs:sequence> + <xs:element ref="head"/> + <xs:element ref="basic-body"/> + <xs:sequence minOccurs="0"> + <xs:element ref="tierformat-table"/> + </xs:sequence> + </xs:sequence> + <xs:attribute name="Id" type="xs:string"/> + </xs:complexType> + </xs:element> + + <xs:element name="head"> + <xs:complexType> + <xs:sequence> + <xs:element ref="meta-information"/> + <xs:element ref="speakertable"/> + </xs:sequence> + </xs:complexType> + </xs:element> + + <xs:element name="basic-body"> + <xs:complexType> + <xs:sequence> + <xs:element ref="common-timeline"/> + <xs:sequence minOccurs="0" maxOccurs="unbounded"> + <xs:element ref="tier"/> + </xs:sequence> + </xs:sequence> + </xs:complexType> + </xs:element> + + <xs:element name="tierformat-table"> + <xs:complexType> + <xs:sequence> + <xs:sequence minOccurs="0" maxOccurs="unbounded"> + <xs:element 
ref="referenced-file"/> + </xs:sequence> + <xs:sequence minOccurs="0" maxOccurs="unbounded"> + <xs:element ref="timeline-item-format"/> + </xs:sequence> + <xs:sequence minOccurs="0" maxOccurs="unbounded"> + <xs:element ref="tier-format"/> + </xs:sequence> + </xs:sequence> + </xs:complexType> + </xs:element> + + <xs:element name="meta-information"> + <xs:complexType> + <xs:sequence> + <xs:element ref="project-name"/> + <xs:element ref="transcription-name"/> + <xs:sequence maxOccurs="unbounded"> + <xs:element ref="referenced-file"/> + </xs:sequence> + <xs:element ref="ud-meta-information"/> + <xs:element ref="comment"/> + <xs:element ref="transcription-convention"/> + </xs:sequence> + </xs:complexType> + </xs:element> + + <xs:element name="speakertable"> + <xs:complexType> + <xs:sequence minOccurs="0" maxOccurs="unbounded"> + <xs:element ref="speaker"/> + </xs:sequence> + </xs:complexType> + </xs:element> + + <xs:element name="project-name" type="xs:string"/> + + <xs:element name="transcription-name" type="xs:string"/> + + <xs:element name="referenced-file"> + <xs:complexType> + <xs:attribute name="url" type="xs:string" use="required"/> + </xs:complexType> + </xs:element> + + <xs:element name="ud-meta-information"> + <xs:complexType> + <xs:sequence minOccurs="0" maxOccurs="unbounded"> + <xs:element ref="ud-information"/> + </xs:sequence> + </xs:complexType> + </xs:element> + + <xs:element name="comment" type="xs:string"/> + + <xs:element name="transcription-convention" type="xs:string"/> + + <xs:element name="ud-information"> + <xs:complexType> + <xs:attribute name="attribute-name" type="xs:string" use="required"/> + </xs:complexType> + </xs:element> + + <xs:element name="speaker"> + <xs:complexType> + <xs:sequence> + <xs:element ref="abbreviation"/> + <xs:element ref="sex"/> + <xs:element ref="languages-used"/> + <xs:element ref="l1"/> + <xs:element ref="l2"/> + <xs:element ref="ud-speaker-information"/> + <xs:element ref="comment"/> + </xs:sequence> + <xs:attribute 
name="id" type="xs:ID" use="required"/> + </xs:complexType> + </xs:element> + + <xs:element name="abbreviation" type="xs:string"/> + + <xs:element name="sex"> + <xs:complexType> + <xs:attribute name="value" use="required"> + <xs:simpleType> + <xs:restriction base="xs:string"> + <xs:enumeration value="m"/> + <xs:enumeration value="f"/> + <xs:enumeration value="u"/> + </xs:restriction> + </xs:simpleType> + </xs:attribute> + </xs:complexType> + </xs:element> + + <xs:element name="languages-used"> + <xs:complexType> + <xs:sequence minOccurs="0" maxOccurs="unbounded"> + <xs:element ref="language"/> + </xs:sequence> + </xs:complexType> + </xs:element> + + <xs:element name="l1"> + <xs:complexType> + <xs:sequence minOccurs="0" maxOccurs="unbounded"> + <xs:element ref="language"/> + </xs:sequence> + </xs:complexType> + </xs:element> + + <xs:element name="l2"> + <xs:complexType> + <xs:sequence minOccurs="0" maxOccurs="unbounded"> + <xs:element ref="language"/> + </xs:sequence> + </xs:complexType> + </xs:element> + + <xs:element name="ud-speaker-information"> + <xs:complexType> + <xs:sequence minOccurs="0" maxOccurs="unbounded"> + <xs:element ref="ud-information"/> + </xs:sequence> + </xs:complexType> + </xs:element> + + <xs:element name="language"> + <xs:complexType> + <xs:attribute name="lang" type="xs:NMTOKEN" use="required"/> + </xs:complexType> + </xs:element> + + <xs:element name="common-timeline"> + <xs:complexType> + <xs:sequence minOccurs="0" maxOccurs="unbounded"> + <xs:element ref="tli"/> + </xs:sequence> + </xs:complexType> + </xs:element> + + <xs:element name="tier"> + <xs:complexType> + <xs:sequence> + <xs:sequence minOccurs="0"> + <xs:element ref="ud-tier-information"/> + </xs:sequence> + <xs:sequence minOccurs="0" maxOccurs="unbounded"> + <xs:element ref="event"/> + </xs:sequence> + </xs:sequence> + <xs:attribute name="id" type="xs:ID" use="required"/> + <xs:attribute name="speaker" type="xs:IDREF"/> + <xs:attribute name="category" type="xs:string" 
use="required"/> + <xs:attribute name="display-name" type="xs:string"/> + <xs:attribute name="type" use="required"> + <xs:simpleType> + <xs:restriction base="xs:string"> + <xs:enumeration value="t"/> + <xs:enumeration value="d"/> + <xs:enumeration value="a"/> + <xs:enumeration value="l"/> + <xs:enumeration value="u"/> + </xs:restriction> + </xs:simpleType> + </xs:attribute> + </xs:complexType> + </xs:element> + + <xs:element name="tli"> + <xs:complexType> + <xs:attribute name="id" type="xs:ID" use="required"/> + <xs:attribute name="time" type="xs:string"/> + <xs:attribute name="type"> + <xs:simpleType> + <xs:restriction base="xs:string"> + <xs:enumeration value="unsp"/> + <xs:enumeration value="user"/> + <xs:enumeration value="appl"/> + <xs:enumeration value="intp"/> + <xs:enumeration value="othr"/> + </xs:restriction> + </xs:simpleType> + </xs:attribute> + <xs:attribute name="bookmark" type="xs:string"/> + </xs:complexType> + </xs:element> + + <xs:element name="ud-tier-information"> + <xs:complexType> + <xs:sequence minOccurs="0" maxOccurs="unbounded"> + <xs:element ref="ud-information"/> + </xs:sequence> + </xs:complexType> + </xs:element> + + <xs:element name="event"> + <xs:complexType> + <xs:sequence minOccurs="0" maxOccurs="unbounded"> + <xs:choice> + <xs:element ref="ud-information"/> + </xs:choice> + </xs:sequence> + <xs:attribute name="start" type="xs:IDREF" use="required"/> + <xs:attribute name="end" type="xs:IDREF" use="required"/> + <xs:attribute name="medium"> + <xs:simpleType> + <xs:restriction base="xs:string"> + <xs:enumeration value="aud"/> + <xs:enumeration value="vid"/> + <xs:enumeration value="img"/> + <xs:enumeration value="txt"/> + <xs:enumeration value="oth"/> + </xs:restriction> + </xs:simpleType> + </xs:attribute> + <xs:attribute name="url" type="xs:string"/> + </xs:complexType> + </xs:element> + + <xs:element name="timeline-item-format"> + <xs:complexType> + <xs:attribute name="show-every-nth-numbering" type="xs:string"/> + <xs:attribute 
name="show-every-nth-absolute" type="xs:string"/> + <xs:attribute name="absolute-time-format"> + <xs:simpleType> + <xs:restriction base="xs:string"> + <xs:enumeration value="decimal"/> + <xs:enumeration value="time"/> + </xs:restriction> + </xs:simpleType> + </xs:attribute> + <xs:attribute name="miliseconds-digits" type="xs:string"/> + </xs:complexType> + </xs:element> + + <xs:element name="tier-format"> + <xs:complexType> + <xs:sequence minOccurs="0" maxOccurs="unbounded"> + <xs:choice> + <xs:element ref="property"/> + </xs:choice> + </xs:sequence> + <xs:attribute name="tierref" type="xs:string" use="required"/> + <xs:attribute name="style-name"> + <xs:simpleType> + <xs:restriction base="xs:string"> + <xs:enumeration value="Plain"/> + <xs:enumeration value="Bold"/> + <xs:enumeration value="Italic"/> + </xs:restriction> + </xs:simpleType> + </xs:attribute> + <xs:attribute name="size"> + <xs:simpleType> + <xs:restriction base="xs:string"> + <xs:enumeration value="1"/> + <xs:enumeration value="2"/> + <xs:enumeration value="3"/> + <xs:enumeration value="4"/> + <xs:enumeration value="5"/> + <xs:enumeration value="6"/> + <xs:enumeration value="7"/> + <xs:enumeration value="8"/> + <xs:enumeration value="9"/> + <xs:enumeration value="10"/> + <xs:enumeration value="11"/> + <xs:enumeration value="12"/> + <xs:enumeration value="13"/> + <xs:enumeration value="14"/> + <xs:enumeration value="15"/> + <xs:enumeration value="16"/> + <xs:enumeration value="17"/> + <xs:enumeration value="18"/> + <xs:enumeration value="19"/> + <xs:enumeration value="20"/> + <xs:enumeration value="21"/> + <xs:enumeration value="22"/> + <xs:enumeration value="23"/> + <xs:enumeration value="24"/> + <xs:enumeration value="25"/> + <xs:enumeration value="26"/> + <xs:enumeration value="27"/> + <xs:enumeration value="28"/> + <xs:enumeration value="29"/> + <xs:enumeration value="30"/> + <xs:enumeration value="31"/> + <xs:enumeration value="32"/> + <xs:enumeration value="33"/> + <xs:enumeration value="34"/> + 
<xs:enumeration value="35"/> + <xs:enumeration value="36"/> + <xs:enumeration value="37"/> + <xs:enumeration value="38"/> + <xs:enumeration value="39"/> + <xs:enumeration value="40"/> + <xs:enumeration value="41"/> + <xs:enumeration value="42"/> + <xs:enumeration value="43"/> + <xs:enumeration value="44"/> + <xs:enumeration value="45"/> + <xs:enumeration value="46"/> + <xs:enumeration value="47"/> + <xs:enumeration value="48"/> + <xs:enumeration value="72"/> + </xs:restriction> + </xs:simpleType> + </xs:attribute> + <xs:attribute name="alignment-name"> + <xs:simpleType> + <xs:restriction base="xs:string"> + <xs:enumeration value="Left"/> + <xs:enumeration value="Right"/> + <xs:enumeration value="Center"/> + </xs:restriction> + </xs:simpleType> + </xs:attribute> + <xs:attribute name="textcolor-name"> + <xs:simpleType> + <xs:restriction base="xs:string"> + <xs:enumeration value="white"/> + <xs:enumeration value="lightGray"/> + <xs:enumeration value="darkGray"/> + <xs:enumeration value="black"/> + <xs:enumeration value="red"/> + <xs:enumeration value="pink"/> + <xs:enumeration value="orange"/> + <xs:enumeration value="yellow"/> + <xs:enumeration value="green"/> + <xs:enumeration value="magenta"/> + <xs:enumeration value="cyan"/> + <xs:enumeration value="blue"/> + </xs:restriction> + </xs:simpleType> + </xs:attribute> + <xs:attribute name="bgcolor-name"> + <xs:simpleType> + <xs:restriction base="xs:string"> + <xs:enumeration value="white"/> + <xs:enumeration value="lightGray"/> + <xs:enumeration value="darkGray"/> + <xs:enumeration value="black"/> + <xs:enumeration value="red"/> + <xs:enumeration value="pink"/> + <xs:enumeration value="orange"/> + <xs:enumeration value="yellow"/> + <xs:enumeration value="green"/> + <xs:enumeration value="magenta"/> + <xs:enumeration value="cyan"/> + <xs:enumeration value="blue"/> + </xs:restriction> + </xs:simpleType> + </xs:attribute> + </xs:complexType> + </xs:element> + + <xs:element name="property"> + <xs:complexType mixed="true"> <!-- mixed: the property value is the element's text content; the name attribute only selects which property it is --> +
<xs:attribute name="name" use="required"> + <xs:simpleType> + <xs:restriction base="xs:string"> + <xs:enumeration value="font-color"/> + <xs:enumeration value="bg-color"/> + <xs:enumeration value="font-size"/> + <xs:enumeration value="font-name"/> + <xs:enumeration value="font-face"/> + <xs:enumeration value="chunk-border"/> + <xs:enumeration value="chunk-border-color"/> + <xs:enumeration value="chunk-border-style"/> + <xs:enumeration value="text-alignment"/> + <xs:enumeration value="row-height-calculation"/> + <xs:enumeration value="fixed-row-height"/> + </xs:restriction> + </xs:simpleType> + </xs:attribute> + </xs:complexType> + </xs:element> + +</xs:schema> diff --git a/src/main/java/de/uni_hamburg/corpora/validation/quest/resources/xsd/exmaralda_exs.xsd b/src/main/java/de/uni_hamburg/corpora/validation/quest/resources/xsd/exmaralda_exs.xsd new file mode 100644 index 0000000000000000000000000000000000000000..812d7bd7bbd54a1c3e11c34f3c75776a3aad3fc6 --- /dev/null +++ b/src/main/java/de/uni_hamburg/corpora/validation/quest/resources/xsd/exmaralda_exs.xsd @@ -0,0 +1,365 @@ +<?xml version="1.0" encoding="UTF-8"?> +<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema"> + + <xs:element name="segmented-transcription"> + <xs:complexType> + <xs:sequence> + <xs:element ref="head"/> + <xs:element ref="segmented-body"/> + <xs:sequence minOccurs="0"> + <xs:element ref="conversion-info"/> + </xs:sequence> + </xs:sequence> + </xs:complexType> + </xs:element> + + <xs:element name="head"> + <xs:complexType> + <xs:sequence> + <xs:element ref="meta-information"/> + <xs:element ref="speakertable"/> + </xs:sequence> + </xs:complexType> + </xs:element> + + <xs:element name="segmented-body"> + <xs:complexType> + <xs:sequence> + <xs:element ref="common-timeline"/> + <xs:sequence minOccurs="0" maxOccurs="unbounded"> + <xs:element ref="segmented-tier"/> + </xs:sequence> + </xs:sequence> + </xs:complexType> + </xs:element> + + <xs:element name="conversion-info"> + <xs:complexType> + 
<xs:sequence minOccurs="0"> + <xs:element ref="basic-transcription-conversion-info"/> + </xs:sequence> + </xs:complexType> + </xs:element> + + <xs:element name="meta-information"> + <xs:complexType> + <xs:sequence> + <xs:element ref="project-name"/> + <xs:element ref="transcription-name"/> + <xs:sequence maxOccurs="unbounded"> + <xs:element ref="referenced-file"/> + </xs:sequence> + <xs:element ref="ud-meta-information"/> + <xs:element ref="comment"/> + <xs:element ref="transcription-convention"/> + </xs:sequence> + </xs:complexType> + </xs:element> + + <xs:element name="speakertable"> + <xs:complexType> + <xs:sequence minOccurs="0" maxOccurs="unbounded"> + <xs:element ref="speaker"/> + </xs:sequence> + </xs:complexType> + </xs:element> + + <xs:element name="project-name" type="xs:string"/> + + <xs:element name="transcription-name" type="xs:string"/> + + <xs:element name="referenced-file"> + <xs:complexType> + <xs:attribute name="url" type="xs:string" use="required"/> + </xs:complexType> + </xs:element> + + <xs:element name="ud-meta-information"> + <xs:complexType> + <xs:sequence minOccurs="0" maxOccurs="unbounded"> + <xs:element ref="ud-information"/> + </xs:sequence> + </xs:complexType> + </xs:element> + + <xs:element name="comment" type="xs:string"/> + + <xs:element name="transcription-convention" type="xs:string"/> + + <xs:element name="ud-information"> + <xs:complexType mixed="true"> <!-- mixed: the user-defined value is the element's text content; attribute-name carries only its key --> + <xs:attribute name="attribute-name" type="xs:string" use="required"/> + </xs:complexType> + </xs:element> + + <xs:element name="speaker"> + <xs:complexType> + <xs:sequence> + <xs:element ref="abbreviation"/> + <xs:element ref="sex"/> + <xs:element ref="languages-used"/> + <xs:element ref="l1"/> + <xs:element ref="l2"/> + <xs:element ref="ud-speaker-information"/> + <xs:element ref="comment"/> + </xs:sequence> + <xs:attribute name="id" type="xs:ID" use="required"/> + </xs:complexType> + </xs:element> + + <xs:element name="abbreviation" type="xs:string"/> + + <xs:element name="sex"> +
<xs:complexType> + <xs:attribute name="value" use="required"> + <xs:simpleType> + <xs:restriction base="xs:string"> + <xs:enumeration value="m"/> + <xs:enumeration value="f"/> + <xs:enumeration value="u"/> + </xs:restriction> + </xs:simpleType> + </xs:attribute> + </xs:complexType> + </xs:element> + + <xs:element name="languages-used"> + <xs:complexType> + <xs:sequence minOccurs="0" maxOccurs="unbounded"> + <xs:element ref="language"/> + </xs:sequence> + </xs:complexType> + </xs:element> + + <xs:element name="l1"> + <xs:complexType> + <xs:sequence minOccurs="0" maxOccurs="unbounded"> + <xs:element ref="language"/> + </xs:sequence> + </xs:complexType> + </xs:element> + + <xs:element name="l2"> + <xs:complexType> + <xs:sequence minOccurs="0" maxOccurs="unbounded"> + <xs:element ref="language"/> + </xs:sequence> + </xs:complexType> + </xs:element> + + <xs:element name="ud-speaker-information"> + <xs:complexType> + <xs:sequence minOccurs="0" maxOccurs="unbounded"> + <xs:element ref="ud-information"/> + </xs:sequence> + </xs:complexType> + </xs:element> + + <xs:element name="language"> + <xs:complexType> + <xs:attribute name="lang" type="xs:NMTOKEN" use="required"/> + </xs:complexType> + </xs:element> + + <xs:element name="common-timeline"> + <xs:complexType> + <xs:sequence minOccurs="0" maxOccurs="unbounded"> + <xs:element ref="tli"/> + </xs:sequence> + </xs:complexType> + </xs:element> + + <xs:element name="segmented-tier"> + <xs:complexType> + <xs:sequence> + <xs:sequence minOccurs="0" maxOccurs="unbounded"> + <xs:element ref="timeline-fork"/> + </xs:sequence> + <xs:sequence maxOccurs="unbounded"> + <xs:element ref="segmentation"/> + </xs:sequence> + <xs:sequence minOccurs="0" maxOccurs="unbounded"> + <xs:element ref="annotation"/> + </xs:sequence> + </xs:sequence> + <xs:attribute name="id" type="xs:ID" use="required"/> + <xs:attribute name="speaker" type="xs:IDREF"/> + <xs:attribute name="category" type="xs:string" use="required"/> + <xs:attribute 
name="display-name" type="xs:string"/> + <xs:attribute name="type" use="required"> + <xs:simpleType> + <xs:restriction base="xs:string"> + <xs:enumeration value="t"/> + <xs:enumeration value="d"/> + <xs:enumeration value="a"/> + <xs:enumeration value="l"/> + <xs:enumeration value="u"/> + </xs:restriction> + </xs:simpleType> + </xs:attribute> + </xs:complexType> + </xs:element> + + <xs:element name="tli"> + <xs:complexType> + <xs:attribute name="id" type="xs:ID" use="required"/> + <xs:attribute name="time" type="xs:string"/> + <xs:attribute name="type"> + <xs:simpleType> + <xs:restriction base="xs:string"> + <xs:enumeration value="unsp"/> + <xs:enumeration value="user"/> + <xs:enumeration value="appl"/> + <xs:enumeration value="intp"/> + <xs:enumeration value="othr"/> + </xs:restriction> + </xs:simpleType> + </xs:attribute> + <xs:attribute name="bookmark" type="xs:string"/> + </xs:complexType> + </xs:element> + + <xs:element name="timeline-fork"> + <xs:complexType> + <xs:sequence maxOccurs="unbounded"> + <xs:element ref="tli"/> + </xs:sequence> + <xs:attribute name="start" type="xs:IDREF" use="required"/> + <xs:attribute name="end" type="xs:IDREF" use="required"/> + </xs:complexType> + </xs:element> + + <xs:element name="segmentation"> + <xs:complexType> + <xs:sequence minOccurs="0" maxOccurs="unbounded"> + <xs:choice> + <xs:element ref="ts"/> + <xs:element ref="ats"/> + </xs:choice> + </xs:sequence> + <xs:attribute name="name" type="xs:string" use="required"/> + <xs:attribute name="tierref" type="xs:string"/> + </xs:complexType> + </xs:element> + + <xs:element name="annotation"> + <xs:complexType> + <xs:sequence minOccurs="0" maxOccurs="unbounded"> + <xs:element ref="ta"/> + </xs:sequence> + <xs:attribute name="name" type="xs:string" use="required"/> + <xs:attribute name="tierref" type="xs:string"/> + </xs:complexType> + </xs:element> + + <xs:element name="ts"> + <xs:complexType mixed="true"> <!-- mixed: segment text is character data that may be interleaved with nested ts/ats/nts segments --> + <xs:sequence minOccurs="0" maxOccurs="unbounded"> + <xs:choice> + <xs:element
ref="ts"/> + <xs:element ref="ats"/> + <xs:element ref="nts"/> + </xs:choice> + </xs:sequence> + <xs:attribute name="n" type="xs:string" use="required"/> + <xs:attribute name="id" type="xs:ID"/> + <xs:attribute name="s" type="xs:IDREF" use="required"/> + <xs:attribute name="e" type="xs:IDREF" use="required"/> + <xs:attribute name="medium"> + <xs:simpleType> + <xs:restriction base="xs:string"> + <xs:enumeration value="aud"/> + <xs:enumeration value="vid"/> + <xs:enumeration value="img"/> + <xs:enumeration value="txt"/> + <xs:enumeration value="oth"/> + </xs:restriction> + </xs:simpleType> + </xs:attribute> + <xs:attribute name="url" type="xs:string"/> + </xs:complexType> + </xs:element> + + <xs:element name="ats"> + <xs:complexType mixed="true"> <!-- mixed: text-only content, no child elements are declared --> + <xs:attribute name="n" type="xs:string" use="required"/> + <xs:attribute name="id" type="xs:ID"/> + <xs:attribute name="s" type="xs:IDREF" use="required"/> + <xs:attribute name="e" type="xs:IDREF" use="required"/> + <xs:attribute name="medium"> + <xs:simpleType> + <xs:restriction base="xs:string"> + <xs:enumeration value="aud"/> + <xs:enumeration value="vid"/> + <xs:enumeration value="img"/> + <xs:enumeration value="txt"/> + <xs:enumeration value="oth"/> + </xs:restriction> + </xs:simpleType> + </xs:attribute> + <xs:attribute name="url" type="xs:string"/> + </xs:complexType> + </xs:element> + + <xs:element name="nts"> + <xs:complexType mixed="true"> <!-- mixed: text-only content, no child elements are declared --> + <xs:attribute name="n" type="xs:string" use="required"/> + <xs:attribute name="id" type="xs:ID"/> + </xs:complexType> + </xs:element> + + <xs:element name="ta"> + <xs:complexType mixed="true"> <!-- mixed: the annotation value is the element's text content --> + <xs:attribute name="n" type="xs:string"/> + <xs:attribute name="id" type="xs:ID"/> + <xs:attribute name="s" type="xs:IDREF" use="required"/> + <xs:attribute name="e" type="xs:IDREF" use="required"/> + </xs:complexType> + </xs:element> + + <xs:element name="basic-transcription-conversion-info"> + <xs:complexType> + <xs:sequence> + <xs:element ref="conversion-timeline"/> + <xs:sequence minOccurs="0" maxOccurs="unbounded"> +
<xs:element ref="conversion-tier"/> + </xs:sequence> + </xs:sequence> + </xs:complexType> + </xs:element> + + <xs:element name="conversion-timeline"> + <xs:complexType> + <xs:sequence minOccurs="0" maxOccurs="unbounded"> + <xs:element ref="conversion-tli"/> + </xs:sequence> + </xs:complexType> + </xs:element> + + <xs:element name="conversion-tier"> + <xs:complexType> + <xs:attribute name="segmented-tier-id" type="xs:IDREF" use="required"/> + <xs:attribute name="name" type="xs:string" use="required"/> + <xs:attribute name="category" type="xs:string" use="required"/> + <xs:attribute name="display-name" type="xs:string" use="required"/> + <xs:attribute name="type" use="required"> + <xs:simpleType> + <xs:restriction base="xs:string"> + <xs:enumeration value="t"/> + <xs:enumeration value="a"/> + <xs:enumeration value="d"/> + <xs:enumeration value="l"/> + <xs:enumeration value="u"/> + </xs:restriction> + </xs:simpleType> + </xs:attribute> + </xs:complexType> + </xs:element> + + <xs:element name="conversion-tli"> + <xs:complexType> + <xs:attribute name="id" type="xs:IDREF" use="required"/> + </xs:complexType> + </xs:element> + +</xs:schema> diff --git a/src/main/scala/de/uni-hamburg/corpora/validation/quest/ScalaCorpusFunctionTest.scala b/src/main/scala/de/uni-hamburg/corpora/validation/quest/ScalaCorpusFunctionTest.scala new file mode 100644 index 0000000000000000000000000000000000000000..c96c7a6cfdf52ba5dc69cf072ac602746306f5dc --- /dev/null +++ b/src/main/scala/de/uni-hamburg/corpora/validation/quest/ScalaCorpusFunctionTest.scala @@ -0,0 +1,29 @@ +package de.uni_hamburg.corpora.validation.quest + +import de.uni_hamburg.corpora.validation.Checker +import de.uni_hamburg.corpora.{Corpus, CorpusData, CorpusFunction, EXMARaLDATranscriptionData, Report} + +import java.{lang, util} +import java.util.{Collections, Properties} + +/** Placeholder corpus function written in Scala: both entry points run no checks and return a new, untouched Report. NOTE(review): the `false` passed to Checker presumably disables the fix option - confirm against Checker's constructor. */ +class ScalaCorpusFunctionTest(properties: Properties) extends Checker (false,properties) with CorpusFunction { + + /** Returns the fixed description string of this function. */ + override def getDescription: String
= "Test corpus function written in Scala" + + /** Declares EXMARaLDATranscriptionData as the only data class this function is usable for. */ + override def getIsUsableFor: util.Collection[Class[_ <: CorpusData]] = { + Collections.singleton(classOf[EXMARaLDATranscriptionData]) + } + + /** Corpus-level entry point; ignores both arguments and returns a new Report. */ + override def function(c: Corpus, fix: lang.Boolean): Report = { + new Report + } + + /** Single-file entry point; ignores both arguments and returns a new Report. */ + override def function(cd: CorpusData, fix: lang.Boolean): Report = { + new Report + } +}