Select Git revision
Code owners
Assign users and groups as approvers for specific file changes. Learn more.
insert-usas-annotations.xsl 7.23 KiB
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
exclude-result-prefixes="#all"
xmlns:string="https://inel.corpora.uni-hamburg.de/xmlns/string"
version="2.0">
<!-- This stylesheet inserts USAS annotations into transcripts from which utterances
have previously been extracted for USAS annotation in Wmatrix.
It relies on the result files from Wmatrix and inserts the annotations into the
files referenced by the @file-ref attribute of the u elements.
Currently supported file types:
(a) ISO/TEI-Spoken
(b) EXB
-->
<xsl:output method="xml" media-type="text/xml" encoding="UTF-8"/>
<!-- Global parameters -->
<!-- URIs of the Wmatrix result files whose u elements carry the USAS annotations.
     The defaults are absolute file: URIs; pass this parameter to use other files. -->
<xsl:param name="wmatrix-output-files"
           as="xs:string+"
           select="(
               'file:/E:/emnlp2020/usas-annotation/dolgan-1.0-wmatrix.xml',
               'file:/E:/emnlp2020/usas-annotation/kamas-1.0-wmatrix.xml',
               'file:/E:/emnlp2020/usas-annotation/selkup-0.1-wmatrix.xml'
           )"/>
<!-- The Wmatrix output files (e.g. selkup-0.1-wmatrix.xml) are listed explicitly in $wmatrix-output-files; this stylesheet defines no $corpus-directory parameter. -->
<!-- Global variables -->
<!-- One new USAS tier is generated per pattern. Each pattern is a regular expression that is
     anchored with '^' and matched against the @pos of the Wmatrix words; the pattern with its
     trailing '.*' removed becomes part of the new tier's category name. -->
<xsl:variable name="pos-patterns"
              as="xs:string+"
              select="('.*', 'N.*', 'V.*')"/>
<!-- Temporary tree holding copies of all u elements (in any namespace) found in the
     Wmatrix output files, in the order in which the files are listed in the parameter. -->
<xsl:variable name="wmatrix-utterances">
    <xsl:copy-of select="for $wmatrix-file in $wmatrix-output-files
                         return document($wmatrix-file)//*:u"/>
</xsl:variable>
<!-- Keys -->
<!-- Indexes w elements by the file reference, tier category, speaker, start and end of their
     parent element, joined with '#'. The pattern only matches w elements in no namespace. -->
<xsl:key name="words"
         match="w"
         use="concat(parent::*/@file-ref, '#', parent::*/@tier-category, '#', parent::*/@speaker, '#', parent::*/@start, '#', parent::*/@end)"/>
<!-- Indexes u elements by their own file reference, tier category and speaker, joined with '#'.
     The pattern only matches u elements in no namespace. -->
<xsl:key name="utterance-by-file-and-category-and-speaker"
         match="u"
         use="concat(@file-ref, '#', @tier-category, '#', @speaker)"/>
<!-- Templates -->
<!-- Entry point. For every distinct @file-ref found on the Wmatrix utterances, the referenced
     transcript is loaded, transformed by the templates of this stylesheet and written back
     to the same URI. The principal input document itself is not processed. -->
<xsl:template match="/">
    <xsl:for-each select="distinct-values($wmatrix-utterances//*:u/@file-ref)">
        <!-- Keep the URI in a variable: inside the inner for-each the context item is the
             loaded document node, whose string value is its text content, not its URI. -->
        <xsl:variable name="file-ref" select="string(.)" as="xs:string"/>
        <xsl:for-each select="document($file-ref)">
            <!-- overwrite the document -->
            <!-- NOTE(review): the target is the resource that was just read with document().
                 XSLT 2.0 classifies reading and writing the same resource as the recoverable
                 error XTRE1500; confirm that the processor in use permits it. -->
            <xsl:result-document href="{$file-ref}">
                <xsl:apply-templates/>
            </xsl:result-document>
        </xsl:for-each>
    </xsl:for-each>
</xsl:template>
<!-- ISO/TEI-Spoken: insert new tiers with USAS annotation.
     Matches every spanGrp for which the Wmatrix data contains at least one utterance with the
     same file URI, tier category (@type) and speaker (@who of the parent element).
     Output: an unchanged copy of the matched spanGrp, followed by one new spanGrp per entry
     in $pos-patterns. The transformation is terminated if a spanGrp with the category of a
     new tier already exists among the siblings. -->
<xsl:template match="*:spanGrp[exists(key('utterance-by-file-and-category-and-speaker', concat(base-uri(), '#', @type, '#', ../@who), $wmatrix-utterances))]">
    <xsl:variable name="base-tier" select="." as="element()"/>
    <xsl:variable name="speaker" select="../@who" as="xs:string"/>
    <xsl:variable name="all-tier-categories" select="../*:spanGrp/@type" as="xs:string+"/>
    <!-- first copy base tier -->
    <xsl:copy-of select="$base-tier"/>
    <!-- insert new tiers ..._usas after the base tier -->
    <xsl:for-each select="$pos-patterns">
        <xsl:variable name="pos-pattern" select="." as="xs:string"/>
        <!-- The pattern without its trailing '.*' is appended as '-X'; for the catch-all
             pattern nothing remains, so the bare '-' is filtered out. -->
        <xsl:variable name="new-tier-category" select="concat($base-tier/@type, concat('-', replace($pos-pattern, '\.\*', ''))[not(.='-')], '_usas')" as="xs:string"/>
        <xsl:variable name="new-tier-id" select="$new-tier-category" as="xs:string"/>
        <!-- check if the tier already exists -->
        <xsl:if test="$new-tier-category = $all-tier-categories">
            <xsl:message select="concat('***ERROR: tier with category ', $new-tier-category, ' exists already.')" terminate="yes"/>
        </xsl:if>
        <!-- Create the new tier with the name and namespace of the base tier. A literal
             spanGrp element would be in no namespace, because this stylesheet declares no
             default namespace, and would not belong to the vocabulary of the input document. -->
        <xsl:element name="{name($base-tier)}" namespace="{namespace-uri($base-tier)}">
            <xsl:attribute name="type" select="$new-tier-id"/>
            <xsl:for-each select="$base-tier/*:span">
                <!-- @sem values of all Wmatrix words that belong to this span and whose @pos
                     matches the current pattern, separated by single spaces -->
                <xsl:variable name="value" select="string-join(key('words', concat(base-uri(), '#', $base-tier/@type, '#', $speaker, '#', @from, '#', @to), $wmatrix-utterances)[matches(@pos, concat('^', $pos-pattern))]/@sem, ' ')" as="xs:string?"/>
                <!-- spans without any annotation value are left out of the new tier -->
                <xsl:if test="not(matches($value, '^\s*$'))">
                    <xsl:copy>
                        <xsl:copy-of select="@*"/>
                        <xsl:value-of select="$value"/>
                    </xsl:copy>
                </xsl:if>
            </xsl:for-each>
        </xsl:element>
    </xsl:for-each>
    <xsl:message select="concat('*** Inserted USAS tier/s for speaker ', $speaker, ' in file ', base-uri())"/>
</xsl:template>
<!-- EXB: insert new tiers with USAS annotation.
     Matches every tier for which the Wmatrix data contains at least one utterance with the
     same file URI, @category and @speaker. The tier is copied unchanged and followed by one
     annotation tier (type "a") per entry in $pos-patterns. The transformation is terminated
     if a tier with the category of a new tier already exists among the siblings. -->
<xsl:template match="*:tier[exists(key('utterance-by-file-and-category-and-speaker', concat(base-uri(), '#', @category, '#', @speaker), $wmatrix-utterances))]">
    <xsl:variable name="source-tier" select="." as="element()"/>
    <xsl:variable name="existing-categories" select="../*:tier/@category" as="xs:string+"/>
    <!-- the speaker is only appended to the tier id if the sibling tiers name several speakers -->
    <xsl:variable name="multiple-speakers" select="count(distinct-values(../*:tier/@speaker)) gt 1" as="xs:boolean"/>
    <!-- the original tier is kept as it is -->
    <xsl:copy-of select="$source-tier"/>
    <!-- the generated ..._usas tiers follow directly after it -->
    <xsl:for-each select="$pos-patterns">
        <xsl:variable name="pattern" select="." as="xs:string"/>
        <!-- '-X' for a pattern 'X.*'; empty for the catch-all pattern '.*' -->
        <xsl:variable name="pos-suffix" select="concat('-', replace($pattern, '\.\*', ''))[not(. = '-')]" as="xs:string?"/>
        <xsl:variable name="usas-category" select="concat($source-tier/@category, $pos-suffix, '_usas')" as="xs:string"/>
        <xsl:variable name="speaker-suffix" select="if ($multiple-speakers) then concat('-', $source-tier/@speaker) else ()" as="xs:string?"/>
        <xsl:variable name="usas-tier-id" select="concat($usas-category, $speaker-suffix)" as="xs:string"/>
        <!-- stop if a tier of this category is already present -->
        <xsl:if test="$usas-category = $existing-categories">
            <xsl:message terminate="yes" select="concat('***ERROR: tier with category ', $usas-category, ' exists already.')"/>
        </xsl:if>
        <tier id="{$usas-tier-id}" speaker="{$source-tier/@speaker}" category="{$usas-category}" type="a" display-name="{$usas-tier-id}">
            <xsl:for-each select="$source-tier/*:event">
                <!-- key of the Wmatrix words that belong to this event -->
                <xsl:variable name="lookup-key" select="concat(base-uri(), '#', $source-tier/@category, '#', $source-tier/@speaker, '#', @start, '#', @end)" as="xs:string"/>
                <!-- semantic tags of those words whose @pos matches the current pattern -->
                <xsl:variable name="sem-tags" select="key('words', $lookup-key, $wmatrix-utterances)[matches(@pos, concat('^', $pattern))]/@sem" as="attribute()*"/>
                <xsl:variable name="value" select="string-join($sem-tags, ' ')" as="xs:string"/>
                <!-- events without any annotation value are left out of the new tier -->
                <xsl:if test="not(matches($value, '^\s*$'))">
                    <xsl:copy>
                        <xsl:sequence select="@*"/>
                        <xsl:value-of select="$value"/>
                    </xsl:copy>
                </xsl:if>
            </xsl:for-each>
        </tier>
    </xsl:for-each>
    <xsl:message select="concat('*** Inserted USAS tier/s for speaker ', $source-tier/@speaker, ' in file ', base-uri())"/>
</xsl:template>
<!-- Fallback for every element without a more specific template: the element is copied
     together with its attributes, and its child nodes are processed in turn. -->
<xsl:template match="*">
    <xsl:copy>
        <xsl:sequence select="@*"/>
        <xsl:apply-templates select="node()"/>
    </xsl:copy>
</xsl:template>
<!-- Text nodes, comments and processing instructions are copied unchanged. -->
<xsl:template match="text()|comment()|processing-instruction()">
    <xsl:copy/>
</xsl:template>
</xsl:stylesheet>