Skip to content
Snippets Groups Projects
Select Git revision
  • 2abc4c8211946824d397b2387ff789c05eeaff76
  • main default
2 results

insert-usas-annotations.xsl

Blame
  • Code owners
    Assign users and groups as approvers for specific file changes. Learn more.
    insert-usas-annotations.xsl 7.23 KiB
    <?xml version="1.0" encoding="UTF-8"?>
    <xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
        xmlns:xs="http://www.w3.org/2001/XMLSchema"
        exclude-result-prefixes="#all"
        xmlns:string="https://inel.corpora.uni-hamburg.de/xmlns/string"
        version="2.0">
        
        <!-- This stylesheet inserts USAS annotations into transcripts from which utterances 
            have previously been extracted for USAS annotation in Wmatrix.
            
            It relies on the result files from Wmatrix and inserts the annotations into the 
            files referenced by the @file-ref attribute of the u elements.
            
            Currently supported file types:
            ISO/TEI-Spoken and EXB.
        -->
        
        <!-- Serialisation settings of the unnamed output definition: XML, UTF-8 encoded, media type text/xml -->
        <xsl:output encoding="UTF-8"
            media-type="text/xml"
            method="xml"/>
        
        <!-- Global parameters -->
        <!-- URIs of the Wmatrix result files to read (one or more strings); 
            the list below is only the default and can be overridden by the caller -->
        <xsl:param name="wmatrix-output-files" as="xs:string+"
            select="(
                'file:/E:/emnlp2020/usas-annotation/dolgan-1.0-wmatrix.xml',
                'file:/E:/emnlp2020/usas-annotation/kamas-1.0-wmatrix.xml',
                'file:/E:/emnlp2020/usas-annotation/selkup-0.1-wmatrix.xml'
            )"/>
        
        <!-- path to Wmatrix output is derived from $corpus-directory, e.g. selkup-0.1-wmatrix.xml -->
        
        <!-- Global variables -->
        <!-- new tiers -->
        <!-- One pattern per USAS tier to be generated: the spanGrp template iterates over this list 
            and strips the trailing '.*' of each pattern to build the tier-category suffix 
            (presumably the patterns are matched against part-of-speech tags; confirm in the rest of the file) -->
        <xsl:variable name="pos-patterns" as="xs:string+" select="('.*', 'N.*', 'V.*')"/>
        <!-- the documents containing USAS-enriched utterances -->
        <xsl:variable name="wmatrix-utterances">
            <!-- Copy the u elements (in any namespace) of every Wmatrix result file, in parameter order.
                Without an 'as' attribute the value is a temporary tree rooted at a document node, 
                so it can be passed as the third argument of key(). -->
            <xsl:copy-of select="for $wmatrix-file in $wmatrix-output-files return document($wmatrix-file)//*:u"/>
        </xsl:variable>
        
        <!-- Keys -->
        <!-- w elements (no namespace), indexed by the '#'-joined file-ref, tier-category, speaker, 
            start and end attributes of their parent element; a missing attribute contributes an empty string -->
        <xsl:key name="words"
            match="w"
            use="concat(../@file-ref, '#', ../@tier-category, '#', ../@speaker, '#', ../@start, '#', ../@end)"/>
        <!-- u elements (no namespace), indexed by their own file-ref, tier-category and speaker 
            attributes joined with '#'; a missing attribute contributes an empty string -->
        <xsl:key name="utterance-by-file-and-category-and-speaker"
            match="u"
            use="concat(@file-ref, '#', @tier-category, '#', @speaker)"/>
        
        
        <!-- Templates -->
        <!-- Entry point: for every distinct file referenced by a Wmatrix utterance (@file-ref), 
            load that file, run it through the templates of this stylesheet and write the 
            result back to the same URI. -->
        <xsl:template match="/">
            <xsl:for-each select="distinct-values($wmatrix-utterances//*:u/@file-ref)">
                <!-- Keep the URI in a variable: inside the inner for-each the context item is the 
                    loaded document node, so '.' there would yield the document's text content, 
                    not its URI. -->
                <xsl:variable name="file-ref" select="string(.)" as="xs:string"/>
                <!-- switch the context to the referenced document -->
                <xsl:for-each select="document($file-ref)">
                    <!-- overwrite the document -->
                    <!-- NOTE(review): the same resource is read with document() and written here; 
                        the XSLT 2.0 specification classifies this as recoverable error XTRE1500, 
                        so confirm that the processor in use permits it. -->
                    <xsl:result-document href="{$file-ref}">
                        <xsl:apply-templates/>
                    </xsl:result-document>
                </xsl:for-each>
            </xsl:for-each>
        </xsl:template>
        
        <!-- ISO/TEI-Spoken: insert a new tier with USAS annotation -->
        <xsl:template match="*:spanGrp[exists(key('utterance-by-file-and-category-and-speaker', concat(base-uri(), '#', @type, '#', ../@who), $wmatrix-utterances))]">
            
            <xsl:variable name="base-tier" select="." as="element()"/>
            <xsl:variable name="speaker" select="../@who" as="xs:string"/>
            <xsl:variable name="all-tier-categories" select="../*:spanGrp/@type" as="xs:string+"/>
            
            <!-- first copy base tier -->
            <xsl:copy-of select="$base-tier"/>
            
            <!-- insert new tiers ..._usas after the base tier -->
            <xsl:for-each select="$pos-patterns">
                <xsl:variable name="pos-pattern" select="." as="xs:string"/>
                <xsl:variable name="new-tier-category" select="concat($base-tier/@type, concat('-', replace($pos-pattern, '\.\*', ''))[not(.='-')], '_usas')" as="xs:string"/>
                <xsl:variable name="new-tier-id" select="$new-tier-category" as="xs:string"/>