Skip to content
Snippets Groups Projects
Commit a5edc6a4 authored by Jettka, Daniel
Browse files

added XSLT for extracting utterances and inserting USAS annotations (from...

added XSLT for extracting utterances and inserting USAS annotations (from Wmatrix) into TEI files + samples of processed files
parent 9ea9bd42
No related branches found
No related tags found
No related merge requests found
Source diff could not be displayed: it is too large. Options to address this: view the blob.
This diff is collapsed.
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
xmlns:string="https://inel.corpora.uni-hamburg.de/xmlns/string"
exclude-result-prefixes="xs string"
version="2.0">
<xsl:output method="xml" media-type="text/xml" encoding="UTF-8"/>
<!-- Global parameters (stylesheet parameters, can be overridden by the caller) -->
<!-- Base URI that is prepended to each corpus directory name to locate the corpus files. -->
<xsl:param name="base-directory" select="'file:/E:/emnlp2020/corpora/'" as="xs:string"/>
<!-- Names of the corpus directories below $base-directory; one "<name>-utterances.xml" result document is written per entry. -->
<xsl:param name="corpus-directory" select="'selkup-0.1', 'dolgan-1.0', 'kamas-1.0'" as="xs:string+"/>
<!-- File name pattern passed as the "select" option of the collection URI; directories are searched recursively. -->
<xsl:param name="file-pattern" select="'*_tei.xml'" as="xs:string"/>
<!-- Value of spanGrp/@type that identifies the tier whose spans are extracted as utterances. -->
<xsl:param name="utterance-tier-category" select="'fe'" as="xs:string"/>
<!-- Templates -->
<!--
  Entry point: for every corpus named in $corpus-directory, writes one result document
  "<corpus>-utterances.xml". It contains one u element per span of the utterance tier
  (spanGrp/@type = $utterance-tier-category) found in the files matching $file-pattern.
  Each u records the source file name, the tier category, the speaker (@who of the
  element above the tier) and the from/to anchors of the span.
-->
<xsl:template match="/">
  <xsl:for-each select="$corpus-directory">
    <xsl:variable name="corpus" select="." as="xs:string"/>
    <xsl:result-document href="{$corpus}-utterances.xml">
      <corpus name="{$corpus}" dir="{concat($base-directory, $corpus)}">
        <xsl:for-each select="collection(concat($base-directory, $corpus, '/', '?select=', $file-pattern, ';recurse=yes'))//*:spanGrp[@type=$utterance-tier-category]/*:span">
          <!-- the tier category is read from the tier itself; the predicate above guarantees it equals $utterance-tier-category -->
          <u file-ref="{tokenize(base-uri(), '/')[last()]}" tier-category="{../@type}" speaker="{../../@who}" from="{@from}" to="{@to}">
            <!-- string-join() keeps replace() from failing with a type error when the span holds more than one text node;
                 the regex removes parenthesised passages and the characters [ and ] -->
            <xsl:value-of select="replace(string-join(text(), ''), '(\([^\(]*\)|\[|\])', '')"/>
          </u>
        </xsl:for-each>
      </corpus>
    </xsl:result-document>
  </xsl:for-each>
</xsl:template>
</xsl:stylesheet>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
exclude-result-prefixes="#all"
xmlns:string="https://inel.corpora.uni-hamburg.de/xmlns/string"
version="2.0">
<xsl:output method="xml" media-type="text/xml" encoding="UTF-8"/>
<!-- Global parameters (stylesheet parameters, can be overridden by the caller) -->
<!-- Base URI that is prepended to each corpus directory name to locate the corpus files. -->
<xsl:param name="base-directory" select="'file:/E:/emnlp2020/corpora/'" as="xs:string"/>
<!-- Names of the corpus directories below $base-directory that are processed. -->
<xsl:param name="corpus-directory" select="'selkup-0.1', 'dolgan-1.0', 'kamas-1.0'" as="xs:string+"/>
<!-- File name pattern passed as the "select" option of the collection URI; directories are searched recursively. -->
<xsl:param name="file-pattern" select="'*_tei.xml'" as="xs:string"/>
<!-- The path to the Wmatrix output is derived from $base-directory and $corpus-directory: <base-directory><corpus>/<corpus>-wmatrix.xml, e.g. selkup-0.1/selkup-0.1-wmatrix.xml -->
<!-- Global variables -->
<!--
  Temporary tree holding a copy of the Wmatrix output of every corpus in $corpus-directory:
  docs/doc[@corpus][@path], with the loaded document as content of each doc element.
  Every corpus is loaded exactly once, so that selecting doc[@corpus = ...] yields a single
  element (key() accepts only one node as its third argument).
-->
<xsl:variable name="wmatrix-docs">
  <docs>
    <xsl:for-each select="$corpus-directory">
      <xsl:variable name="path" select="concat($base-directory, ., '/', ., '-wmatrix.xml')" as="xs:string"/>
      <doc corpus="{.}" path="{$path}">
        <xsl:copy-of select="document($path)"/>
      </doc>
    </xsl:for-each>
  </docs>
</xsl:variable>
<!-- Keys (looked up in the Wmatrix trees stored in $wmatrix-docs) -->
<!-- w elements, indexed by file, tier category, speaker and from/to anchors of their parent element -->
<xsl:key name="words"
         match="w"
         use="concat(../@file-ref, '#', ../@tier-category, '#', ../@speaker, '#', ../@from, '#', ../@to)"/>
<!-- u elements, indexed by file, tier category and speaker -->
<xsl:key name="utterance-by-file-and-category-and-speaker"
         match="u"
         use="concat(@file-ref, '#', @tier-category, '#', @speaker)"/>
<!-- Templates -->
<!--
  Entry point: visits every file matching $file-pattern in each corpus directory,
  pushes its content through the templates of this stylesheet and writes the result
  to the URI the file was read from, i.e. the source file is overwritten.
  The name of the current corpus is handed on as template parameter "corpus".
-->
<xsl:template match="/">
  <xsl:for-each select="$corpus-directory">
    <xsl:variable name="current-corpus" select="." as="xs:string"/>
    <xsl:variable name="collection-uri" select="concat($base-directory, $current-corpus, '/', '?select=', $file-pattern, ';recurse=yes')"/>
    <xsl:for-each select="collection($collection-uri)">
      <!-- overwrite the document -->
      <xsl:result-document href="{base-uri(.)}">
        <xsl:apply-templates select="node()">
          <xsl:with-param name="corpus" select="$current-corpus" as="xs:string"/>
        </xsl:apply-templates>
      </xsl:result-document>
    </xsl:for-each>
  </xsl:for-each>
</xsl:template>
<!--
  Inserts new tiers with USAS annotation after a tier for which Wmatrix output exists.
  Matches every spanGrp for which $wmatrix-docs holds at least one utterance with the same
  file name, tier category (@type) and speaker (@who of the parent element).
  The matched tier is copied unchanged, followed by three new spanGrp elements with the types
  "<type>_usas", "<type>-N_usas" and "<type>-V_usas". Their spans carry the space-separated
  @sem values of all words, of the words whose @pos starts with N, and of the words whose
  @pos starts with V; spans without any value are left out.
  Parameter corpus: name of the corpus the current file belongs to; selects the Wmatrix
  document in which the words are looked up.
  Terminates the transformation if a tier with one of the new types already exists beside
  the base tier.
  NOTE(review): spanGrp and span are matched and created in no namespace; confirm that the
  input files do not declare a default namespace.
-->
<xsl:template match="spanGrp[exists(key('utterance-by-file-and-category-and-speaker', concat(tokenize(base-uri(), '/')[last()], '#', @type, '#', ../@who), $wmatrix-docs))]">
  <xsl:param name="corpus" as="xs:string"/>
  <xsl:variable name="base-tier" select="." as="element()"/>
  <xsl:variable name="speaker" select="../@who" as="xs:string"/>
  <xsl:variable name="all-tier-categories" select="../spanGrp/@type" as="xs:string+"/>
  <!-- computed once here, while the context item is still the tier -->
  <xsl:variable name="file-name" select="tokenize(base-uri(), '/')[last()]" as="xs:string"/>
  <!-- key() accepts only a single node as third argument, hence [1] in case the corpus is listed more than once in $wmatrix-docs -->
  <xsl:variable name="wmatrix-doc" select="($wmatrix-docs//*:doc[@corpus=$corpus])[1]" as="element()?"/>
  <!-- first copy base tier -->
  <xsl:copy-of select="$base-tier"/>
  <!-- insert new tiers ..._usas after the base tier -->
  <xsl:for-each select="'.*', 'N.*', 'V.*'">
    <xsl:variable name="pos-pattern" select="." as="xs:string"/>
    <xsl:variable name="pos-regex" select="concat('^', $pos-pattern)" as="xs:string"/>
    <!-- '.*' yields no infix, 'N.*' yields '-N', 'V.*' yields '-V' -->
    <xsl:variable name="new-tier-category" select="concat($base-tier/@type, concat('-', replace($pos-pattern, '\.\*', ''))[not(.='-')], '_usas')" as="xs:string"/>
    <!-- check if the tier already exists -->
    <xsl:if test="$new-tier-category = $all-tier-categories">
      <xsl:message select="concat('***ERROR: tier with category ', $new-tier-category, ' exists already.')" terminate="yes"/>
    </xsl:if>
    <spanGrp type="{$new-tier-category}">
      <xsl:for-each select="$base-tier/span">
        <xsl:variable name="value" select="string-join(key('words', concat($file-name, '#', $base-tier/@type, '#', $speaker, '#', @from, '#', @to), $wmatrix-doc)[matches(@pos, $pos-regex)]/@sem, ' ')" as="xs:string?"/>
        <xsl:if test="not(matches($value, '^\s*$'))">
          <span from="{@from}" to="{@to}">
            <xsl:value-of select="$value"/>
          </span>
        </xsl:if>
      </xsl:for-each>
    </spanGrp>
  </xsl:for-each>
  <xsl:message select="concat('*** Inserted USAS tier/s for speaker ', $speaker, ' in file ', $file-name)"/>
</xsl:template>
<!-- recursive copy: rebuilds every other element with its attributes and hands the "corpus" parameter down to its children -->
<xsl:template match="*">
  <xsl:param name="corpus" as="xs:string"/>
  <xsl:copy>
    <xsl:sequence select="@*"/>
    <xsl:apply-templates select="node()">
      <xsl:with-param name="corpus" select="$corpus" as="xs:string"/>
    </xsl:apply-templates>
  </xsl:copy>
</xsl:template>
<!-- text, comments and processing instructions are carried over unchanged -->
<xsl:template match="text() | comment() | processing-instruction()">
  <xsl:copy/>
</xsl:template>
</xsl:stylesheet>
\ No newline at end of file
Source diff could not be displayed: it is too large. Options to address this: view the blob.
This diff is collapsed.
Source diff could not be displayed: it is too large. Options to address this: view the blob.
This diff is collapsed.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment