Skip to content
Snippets Groups Projects
Commit ab573582 authored by Lange, Dr. Herbert
Browse files

add support for childes metadata and a first basic check for generic metadata

parent b52df305
No related branches found
No related tags found
No related merge requests found
Pipeline #9174 failed
package de.uni_hamburg.corpora;
import com.helger.collection.pair.Pair;
import org.apache.commons.io.FilenameUtils;
import org.xml.sax.SAXException;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.xpath.XPathExpressionException;
import java.io.*;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.*;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
 * Metadata of a CHILDES corpus, read from a plain-text file in which each
 * line has the form {@code key: value}.
 *
 * <p>All key/value pairs are collected in a set (keys may occur more than
 * once), and every key seen is additionally stored in a separate set for
 * fast membership tests. The URL-related setters are unsupported because
 * changing them would require re-reading the file.
 */
public class ChildesMetadata implements Metadata {

    // URL of the metadata file
    private URL url;
    // Result of resolving "." (or ".." if the path ends with "/") against the file URI
    private URL parentUrl;
    // File name taken from the URL
    private String fileName;
    // File name without its extension
    private String baseName;
    // Only ever set through updateUnformattedString; stays null until then
    private String unformatedString;
    // Use a set of pairs because we cannot assume that keys are unique
    private final HashSet<Pair<String, String>> metadata = new HashSet<>();
    // Separate set for efficient lookup of keys
    private final HashSet<String> metadataKeys = new HashSet<>();

    /**
     * @return all key/value pairs read from the file (the live internal set)
     */
    public HashSet<Pair<String, String>> getMetadata() {
        return metadata;
    }

    /**
     * @return all keys read from the file (the live internal set)
     */
    public HashSet<String> getMetadataKeys() {
        return metadataKeys;
    }

    /**
     * Creates an empty instance without any URL or metadata.
     */
    public ChildesMetadata() {
        super();
    }

    /**
     * Reads the metadata file behind {@code url}.
     *
     * <p>Any failure while resolving the URL or reading the file is logged at
     * SEVERE level and not rethrown; the instance then keeps whatever state
     * was set before the failure.
     *
     * @param url location of the metadata file; must be convertible to a local {@link File}
     */
    public ChildesMetadata(URL url) {
        super();
        this.url = url;
        try {
            // Copied from BasicTranscriptionData
            URI uri = url.toURI();
            URI parentURI = uri.getPath().endsWith("/") ? uri.resolve("..") : uri.resolve(".");
            parentUrl = parentURI.toURL();
            // up to here
            File file = new File(uri);
            fileName = file.getName();
            baseName = FilenameUtils.getBaseName(fileName);
            // try-with-resources so the file handle is released even if reading fails.
            // Decode explicitly instead of relying on the platform default charset.
            // NOTE(review): assumes the metadata files are UTF-8 encoded - confirm.
            try (BufferedReader input = new BufferedReader(new InputStreamReader(
                    new FileInputStream(file), java.nio.charset.StandardCharsets.UTF_8))) {
                String line;
                while ((line = input.readLine()) != null) {
                    // Split only at the FIRST ": " so that values which themselves
                    // contain a colon followed by whitespace are kept intact
                    String[] pair = line.split(":\\s+", 2);
                    // A key with an empty value is recorded as key only, not as a pair
                    if (pair.length == 2 && !pair[1].isEmpty()) {
                        metadata.add(new Pair<>(pair[0], pair[1]));
                    }
                    metadataKeys.add(pair[0]);
                }
            }
        } catch (Exception ex) {
            Logger.getLogger(ChildesMetadata.class.getName()).log(Level.SEVERE, null, ex);
        }
    }

    @Override
    public URL getURL() {
        return url;
    }

    @Override
    public void setURL(URL url) {
        // Setters are not supported because they don't make sense without reloading the data
        throw new UnsupportedOperationException();
    }

    @Override
    public URL getParentURL() {
        return parentUrl;
    }

    @Override
    public void setParentURL(URL url) {
        // Setters are not supported because they don't make sense without reloading the data
        throw new UnsupportedOperationException();
    }

    @Override
    public String getFilename() {
        return fileName;
    }

    @Override
    public void setFilename(String s) {
        // Setters are not supported because they don't make sense without reloading the data
        throw new UnsupportedOperationException();
    }

    @Override
    public String getFilenameWithoutFileEnding() {
        return baseName;
    }

    @Override
    public void setFilenameWithoutFileEnding(String s) {
        // Setters are not supported because they don't make sense without reloading the data
        throw new UnsupportedOperationException();
    }

    /**
     * @return the string last passed to {@link #updateUnformattedString(String)}, or
     *         {@code null} if it was never called
     */
    @Override
    public String toSaveableString() throws TransformerException, ParserConfigurationException, SAXException, IOException, XPathExpressionException {
        return unformatedString;
    }

    /**
     * @return the string last passed to {@link #updateUnformattedString(String)}, or
     *         {@code null} if it was never called
     */
    @Override
    public String toUnformattedString() {
        return unformatedString;
    }

    @Override
    public void updateUnformattedString(String newUnformattedString) {
        this.unformatedString = newUnformattedString;
    }

    @Override
    public Collection<URL> getReferencedCorpusDataURLs() throws MalformedURLException, URISyntaxException {
        // Return empty list assuming that there are no corpus urls referenced in CHILDES meta-data
        return new ArrayList<>();
    }
}
...@@ -50,6 +50,7 @@ public class CorpusIO { ...@@ -50,6 +50,7 @@ public class CorpusIO {
CmdiData cmdidata = new CmdiData(); CmdiData cmdidata = new CmdiData();
UnspecifiedXMLData usdata = new UnspecifiedXMLData(); UnspecifiedXMLData usdata = new UnspecifiedXMLData();
SegmentedTranscriptionData segdata = new SegmentedTranscriptionData(); SegmentedTranscriptionData segdata = new SegmentedTranscriptionData();
ChildesMetadata childesmetadata = new ChildesMetadata();
public CorpusIO() { public CorpusIO() {
allCorpusDataTypes.add(bt.getClass()); allCorpusDataTypes.add(bt.getClass());
...@@ -58,6 +59,7 @@ public class CorpusIO { ...@@ -58,6 +59,7 @@ public class CorpusIO {
allCorpusDataTypes.add(cmdidata.getClass()); allCorpusDataTypes.add(cmdidata.getClass());
allCorpusDataTypes.add(usdata.getClass()); allCorpusDataTypes.add(usdata.getClass());
allCorpusDataTypes.add(segdata.getClass()); allCorpusDataTypes.add(segdata.getClass());
allCorpusDataTypes.add(childesmetadata.getClass());
} }
public String CorpusData2String(CorpusData cd) throws TransformerException, ParserConfigurationException, SAXException, IOException, XPathExpressionException { public String CorpusData2String(CorpusData cd) throws TransformerException, ParserConfigurationException, SAXException, IOException, XPathExpressionException {
...@@ -150,6 +152,10 @@ public class CorpusIO { ...@@ -150,6 +152,10 @@ public class CorpusIO {
SegmentedTranscriptionData seg = new SegmentedTranscriptionData(url); SegmentedTranscriptionData seg = new SegmentedTranscriptionData(url);
System.out.println(seg.getFilename() + " read"); System.out.println(seg.getFilename() + " read");
return seg; return seg;
} else if (url.getPath().toLowerCase().endsWith("cdc") && clcds.contains(childesmetadata.getClass())) {
ChildesMetadata cmd = new ChildesMetadata(url);
System.out.println(cmd.getFilename() + " read");
return cmd;
} else { } else {
System.out.println(url + " will not be read"); System.out.println(url + " will not be read");
return null; return null;
...@@ -273,7 +279,8 @@ public class CorpusIO { ...@@ -273,7 +279,8 @@ public class CorpusIO {
listFiles(entry); listFiles(entry);
} }
String sentry = entry.getFileName().toString().toLowerCase(); String sentry = entry.getFileName().toString().toLowerCase();
if (sentry.endsWith(".exb") || sentry.endsWith(".exs") || sentry.endsWith(".coma") || sentry.endsWith(".xml") || sentry.endsWith(".cmdi") || sentry.endsWith(".eaf") || sentry.endsWith(".flextext") || sentry.endsWith(".esa") || sentry.endsWith(".tei") || sentry.endsWith(".xsl")) { // TODO put the list of suffixes somewhere more central
if (sentry.endsWith(".exb") || sentry.endsWith(".exs") || sentry.endsWith(".coma") || sentry.endsWith(".xml") || sentry.endsWith(".cmdi") || sentry.endsWith(".eaf") || sentry.endsWith(".flextext") || sentry.endsWith(".esa") || sentry.endsWith(".tei") || sentry.endsWith(".xsl") || sentry.endsWith(".cdc")) {
recursed.add(entry.toUri().toURL()); recursed.add(entry.toUri().toURL());
} }
} }
......
package de.uni_hamburg.corpora.validation;
import de.uni_hamburg.corpora.*;
import org.exmaralda.partitureditor.fsm.FSMException;
import org.exmaralda.partitureditor.jexmaralda.JexmaraldaException;
import org.jdom.JDOMException;
import org.xml.sax.SAXException;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.xpath.XPathExpressionException;
import java.io.IOException;
import java.net.URISyntaxException;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
/**
 * Checker that verifies that a CHILDES metadata file contains the keys
 * corresponding to the generic metadata properties listed below. Every
 * missing key is reported as a warning.
 */
public class ChildesGenericMetadata extends Checker implements CorpusFunction {

    public ChildesGenericMetadata(boolean hasfixingoption) {
        super(hasfixingoption);
    }

    @Override
    public String getDescription() {
        return "Checks the generic metadata in a Childes corpus";
    }

    /**
     * Checks one corpus data object for the presence of all expected metadata keys.
     *
     * @param cd  the data to check; only {@link ChildesMetadata} is supported
     * @param fix unused, this check has no fixing behaviour
     * @return a report with one warning per distinct missing key, or a single
     *         warning if {@code cd} is not CHILDES metadata
     */
    @Override
    public Report function(CorpusData cd, Boolean fix) throws NoSuchAlgorithmException, ClassNotFoundException, FSMException, URISyntaxException, SAXException, IOException, ParserConfigurationException, JexmaraldaException, TransformerException, XPathExpressionException, JDOMException {
        Report stats = new Report();
        // Guard the cast: anything other than ChildesMetadata would otherwise
        // fail with a ClassCastException
        if (!(cd instanceof ChildesMetadata)) {
            stats.addWarning(this.getClass().getName(), "Not CHILDES metadata, skipping generic metadata check");
            return stats;
        }
        HashSet<String> keys = ((ChildesMetadata) cd).getMetadataKeys();
        String[] requiredKeys = {
                // Class generic basic
                "CMDI_PID", "Identifier",      // Property: identifier - REQUIRED - 1..
                "Title",                       // Property: title - REQUIRED - 1..
                "Description",                 // Property: description - REQUIRED - 1..
                // Not applicable              // Property: version - OPTIONAL - 0..1
                // Not applicable              // Property: keywords - RECOMMENDED - 0..1
                "Rights",                      // Property: license - REQUIRED - 1
                // Not applicable              // Property: rightsHolder - RECOMMENDED - 0..1
                "IMDI_AccessAvailability",     // Property: accessRights - OPTIONAL - 0..1
                "Date",                        // Property: publicationYear - REQUIRED - 1
                "Publisher",                   // Property: publisher - REQUIRED - 1
                "Creator",                     // Property: creator - REQUIRED - 1..
                // Class person
                // Not applicable              // Property: name - OPTIONAL - 0..1
                // Not applicable              // Property: familyName - REQUIRED - 1..1
                // Not applicable              // Property: givenName - RECOMMENDED - 0..1
                // Not applicable              // Property: identifier - REQUIRED - 1..
                // Not applicable              // Property: affiliation - REQUIRED/RECOMMENDED/OPTIONAL - 0..1/1..
                // Not applicable              // Property: email - RECOMMENDED (REQUIRED for rightsholder) - 0..
                // Class organization
                // Not applicable              // Property: name - REQUIRED - 1
                // Not applicable              // Property: identifier - RECOMMENDED - 0..
                // Not applicable              // Property: url - RECOMMENDED - 0..1
                // Not applicable              // Property: email - RECOMMENDED (REQUIRED for rightsholder) - 0../1..
                // Class generic extended
                "Relation",                    // Property: sameAs - OPTIONAL - 0..
                "Relation",                    // Property: isPartOf - OPTIONAL - 0..
                "Relation",                    // Property: hasPart - OPTIONAL - 0..
                "Relation",                    // Property: isBasedOn - OPTIONAL - 0..
                // Class language data
                "Subject.olac:language", "Language", // Property: objectLanguage - REQUIRED - 1..
                "Type.olac:linguistic-type",   // Property: linguisticDataType - RECOMMENDED - 0..
                "IMDI_Modalities",             // Property: modality - RECOMMENDED - 0..
                // Class language
                "Language",                    // Property: name - REQUIRED - 1
                // Not applicable              // Property: preferredLabel - RECOMMENDED - 0..1
                "Subject.olac:language"        // Property: identifier - REQUIRED 1..
                // Class AV data
                // Not yet specified
        };
        // Several properties map to the same key, so the list contains duplicates;
        // remember what was already reported to emit each missing key only once
        HashSet<String> reported = new HashSet<>();
        for (String key : requiredKeys) {
            if (!keys.contains(key) && reported.add(key)) {
                stats.addWarning(this.getClass().getName(), "Missing metadata: " + key);
            }
        }
        return stats;
    }

    /**
     * Runs the check on a whole corpus and merges the individual reports.
     *
     * @param c   the corpus to check
     * @param fix unused, this check has no fixing behaviour
     * @return the merged report of all checked CHILDES metadata objects
     */
    @Override
    public Report function(Corpus c, Boolean fix) throws NoSuchAlgorithmException, ClassNotFoundException, FSMException, URISyntaxException, SAXException, IOException, ParserConfigurationException, JexmaraldaException, TransformerException, XPathExpressionException, JDOMException {
        Report stats = new Report();
        // NOTE(review): this iterates the basic transcription data, which is
        // presumably never CHILDES metadata - confirm which Corpus accessor
        // should be used here. Until then, skip unsupported data instead of
        // failing on the cast.
        for (CorpusData cdata : c.getBasicTranscriptionData()) {
            if (cdata instanceof ChildesMetadata) {
                stats.merge(execute(cdata));
            }
        }
        return stats;
    }

    /**
     * @return the corpus data classes this checker can be applied to, i.e.
     *         only {@code ChildesMetadata}
     */
    @Override
    public Collection<Class<? extends CorpusData>> getIsUsableFor() {
        Collection<Class<? extends CorpusData>> classes = new ArrayList<>();
        try {
            Class<? extends CorpusData> cl = (Class<? extends CorpusData>) Class.forName("de.uni_hamburg.corpora.ChildesMetadata");
            classes.add(cl);
        } catch (ClassNotFoundException ex) {
            report.addException(ex, "class not found error");
        }
        return classes;
    }
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment