<?xml version="1.0" encoding="UTF-8"?>
<!-- TEI2JSONAudioWordcloudOutput.xsl (committed by Ferger, Anne) -->
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    xmlns:xs="http://www.w3.org/2001/XMLSchema" exclude-result-prefixes="xs" version="2.0">

    <!-- The result is serialised as plain text; the JSON is assembled by hand in the templates. -->
    <xsl:output method="text" omit-xml-declaration="yes"/>

    <!-- Required stylesheet parameter: the location handed to collection(), from which all
         *.xml documents are read recursively. -->
    <xsl:param name="tei-base-path" required="yes" as="xs:string"/>

    <!-- Indexes <x> elements by "text#category".
         NOTE(review): no key() call in this stylesheet uses this key; confirm before removing. -->
    <xsl:key name="x-by-name-and-category" match="x" use="concat(text(), '#', @category)"/>

    <!-- A single line feed character, used to break lines in the text output. -->
    <xsl:variable name="NEWLINE" as="xs:string">
        <xsl:text>&#xa;</xsl:text>
    </xsl:variable>

    <!-- Entry point: writes the opening and closing bracket of the JSON array, each on a line
         of its own, and lets getmostfrequentwordsfromcorpus produce the array members. -->
    <xsl:template match="/">
        <xsl:text>[&#xa;</xsl:text>
        <xsl:call-template name="getmostfrequentwordsfromcorpus"/>
        <xsl:text>&#xa;]</xsl:text>
    </xsl:template>

    <!--
        getmostfrequentwordsfromcorpus

        Reads every *.xml document below $tei-base-path (recursively), collects the morpheme
        spans of the 'mb' tier that pass the gloss and BOR filters, counts how often each
        (morpheme text, 'ps' category) pair occurs and writes the most frequent pairs as
        comma-separated JSON objects. The enclosing brackets are written by the caller.

        Parameters (both optional, defaults reproduce the previous hard-coded values):
          corpus-version-name  prefix used when building the audio URL
          max-entries          maximum number of JSON objects written

        Each object has the string members x, category, eng_transl, audio_file,
        audio_filename, audio_start, audio_end and value (the occurrence count).
    -->
    <xsl:template name="getmostfrequentwordsfromcorpus">
        <xsl:param name="corpus-version-name" as="xs:string" select="'dolgan-1.0'"/>
        <xsl:param name="max-entries" as="xs:integer" select="99"/>

        <!-- @type values of the spanGrp tiers that are consulted -->
        <xsl:variable name="tier1" select="'ps'"/>
        <xsl:variable name="tier2" select="'ge'"/>
        <xsl:variable name="tier3" select="'mb'"/>
        <xsl:variable name="tier4" select="'BOR'"/>

        <!-- Step 1: one <x> element per accepted morpheme occurrence -->
        <xsl:variable name="allwords">
            <xsl:for-each
                select="collection(concat($tei-base-path, '?select=*.xml;recurse=yes;on-error=warning'))">
                <xsl:for-each select=".//*:spanGrp[@type = $tier3]/*:span/*:span">
                    <xsl:variable name="mid" select="@xml:id"/>
                    <!-- id of the word this morpheme belongs to (taken from the parent span) -->
                    <xsl:variable name="wid" select="../@from"/>
                    <!-- common parent of the u element and of all spanGrp tiers -->
                    <xsl:variable name="block" select="../../.."/>
                    <xsl:variable name="word" select="$block/*:u/*:seg/*:w[@xml:id = $wid]"/>
                    <!-- timeline ids of the anchors directly before and after the word -->
                    <xsl:variable name="TstartTime"
                        select="$word/preceding-sibling::*:anchor[1]/@synch"/>
                    <xsl:variable name="TendTime"
                        select="$word/following-sibling::*:anchor[1]/@synch"/>
                    <xsl:variable name="gloss-span"
                        select="$block/*:spanGrp[@type = $tier2]/*:span/*:span[@from = $mid]"/>
                    <!-- true unless the 'ge' gloss contains two adjacent characters out of
                         upper-case letters, digits, '.', ':' and backslash -->
                    <xsl:variable name="gloss"
                        select="not($gloss-span/matches(text(), '[A-Z0-9\.:\\]{2}'))"/>
                    <!-- true if the word has no span on the BOR tier -->
                    <xsl:variable name="BOR"
                        select="empty($block/*:spanGrp[@type = $tier4]/*:span[@from = $wid])"/>
                    <xsl:if test="$BOR and $gloss">
                        <!-- @url of the first .mp3 media entry in the header; taking only the
                             first one avoids a space-joined list if several are present -->
                        <xsl:variable name="audio-filename" as="xs:string"
                            select="string(($block/../../../*:teiHeader/*:fileDesc/*:sourceDesc/*:recordingStmt/*:recording/*:media[ends-with(lower-case(@url), '.mp3')]/@url)[1])"/>
                        <x>
                            <xsl:attribute name="category"
                                select="$block/*:spanGrp[@type = $tier1]/*:span[@from = $wid]"/>
                            <xsl:attribute name="eng_transl" select="$gloss-span"/>
                            <!-- Example of the URL being built:
                                 https://corpora.uni-hamburg.de/hzsk/de/islandora/object/recording:dolgan-1.0_PoNA_200X_GirlFromTundra_nar/datastream/MP3/PoNA_200X_GirlFromTundra_nar.mp3
                                 The extension is removed with an anchored, escaped pattern: an
                                 unescaped '.' would also match e.g. "emp3" inside a file name. -->
                            <xsl:attribute name="audio_file"
                                select="concat('https://corpora.uni-hamburg.de/hzsk/de/islandora/object/recording:', $corpus-version-name, '_', replace($audio-filename, '\.mp3$', '', 'i'), '/datastream/MP3/', $audio-filename)"/>
                            <xsl:attribute name="audio_filename" select="$audio-filename"/>
                            <xsl:attribute name="audio_start"
                                select="$block/../../*:timeline/*:when[@xml:id = $TstartTime]/@interval"/>
                            <xsl:attribute name="audio_end"
                                select="$block/../../*:timeline/*:when[@xml:id = $TendTime]/@interval"/>
                            <xsl:value-of select="text()"/>
                        </x>
                    </xsl:if>
                </xsl:for-each>
            </xsl:for-each>
        </xsl:variable>

        <!-- Step 2: one <x> per (text, category) pair, carrying the number of occurrences in
             @value. The representative is the first occurrence whose audio file name, start
             and end are all non-empty; pairs without such an occurrence are dropped. -->
        <xsl:variable name="allwordsvalues">
            <xsl:for-each-group select="$allwords/x" group-by="text()">
                <xsl:for-each-group select="current-group()" group-by="@category">
                    <!-- the inner group is exactly the set of occurrences of this pair -->
                    <xsl:variable name="occurrences" select="count(current-group())"/>
                    <xsl:for-each
                        select="current-group()[not(@audio_filename = '')][not(@audio_start = '')][not(@audio_end = '')][1]">
                        <x>
                            <xsl:copy-of select="@*"/>
                            <xsl:attribute name="value" select="$occurrences"/>
                            <xsl:value-of select="."/>
                        </x>
                    </xsl:for-each>
                </xsl:for-each-group>
            </xsl:for-each-group>
        </xsl:variable>

        <!-- Step 3: write the $max-entries most frequent pairs as JSON objects -->
        <xsl:for-each select="$allwordsvalues/x">
            <xsl:sort select="@value" data-type="number" order="descending"/>
            <xsl:if test="position() le $max-entries">
                <xsl:variable name="entry" select="."/>
                <!-- The separator goes in front of every object but the first, so the list
                     never ends with a comma, however many objects are written. -->
                <xsl:if test="position() gt 1">
                    <xsl:text>,</xsl:text>
                </xsl:if>
                <xsl:value-of select="$NEWLINE"/>
                <xsl:text>     {</xsl:text>
                <xsl:for-each
                    select="('x', 'category', 'eng_transl', 'audio_file', 'audio_filename', 'audio_start', 'audio_end', 'value')">
                    <xsl:variable name="field" select="."/>
                    <!-- 'x' is the element content, every other member is an attribute -->
                    <xsl:variable name="raw" as="xs:string"
                        select="if ($field eq 'x') then string($entry) else string($entry/@*[name() eq $field])"/>
                    <!-- JSON string escaping: backslash first, then quote, LF, CR and tab -->
                    <xsl:variable name="escaped"
                        select="replace(replace(replace(replace(replace($raw, '\\', '\\\\'), '&quot;', '\\&quot;'), '&#xa;', '\\n'), '&#xd;', '\\r'), '&#x9;', '\\t')"/>
                    <xsl:value-of select="$NEWLINE"/>
                    <xsl:value-of
                        select="concat('        &quot;', $field, '&quot;: &quot;', $escaped, '&quot;')"/>
                    <xsl:if test="position() ne last()">
                        <xsl:text>,</xsl:text>
                    </xsl:if>
                </xsl:for-each>
                <xsl:value-of select="$NEWLINE"/>
                <xsl:text>    }</xsl:text>
            </xsl:if>
        </xsl:for-each>
    </xsl:template>

</xsl:stylesheet>