<?xml version="1.0" encoding="UTF-8"?>
<!--
    Emits, as plain text, a JSON array describing the most frequent morpheme
    forms found in a directory of TEI transcripts.

    Processing steps:
      1. Every *.xml file below $tei-base-path is loaded with collection().
      2. For each inner span of the 'mb' spanGrp an <x> record is built, unless
         the word has a 'BOR' span or its 'ge' gloss matches the exclusion regex.
      3. Records are grouped by text and by category; one representative with
         complete audio information is kept per group together with the group
         size (attribute 'value').
      4. The groups are sorted by size (descending) and at most $max-entries of
         them are serialised as JSON objects.

    Parameters:
      tei-base-path        (required) base URI of the directory to scan.
      corpus-version-name  prefix used when building the recording URL.
      max-entries          maximum number of JSON objects to emit.
-->
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
                xmlns:xs="http://www.w3.org/2001/XMLSchema"
                xmlns:local="urn:x-local:most-frequent-words"
                exclude-result-prefixes="xs local"
                version="2.0">

    <xsl:output method="text" omit-xml-declaration="yes"/>

    <xsl:param name="tei-base-path" required="yes" as="xs:string"/>

    <!-- Corpus identifier inserted into the recording URL; the default is the value that used to be hard-coded. -->
    <xsl:param name="corpus-version-name" as="xs:string" select="'dolgan-1.0'"/>

    <!-- Upper bound on the number of emitted entries; the default keeps the previous limit of 99. -->
    <xsl:param name="max-entries" as="xs:integer" select="99"/>

    <!-- NOTE(review): this key is not referenced anywhere in this stylesheet; kept in case an importing stylesheet uses it. -->
    <xsl:key name="x-by-name-and-category" match="x" use="concat(text(), '#', @category)"/>

    <xsl:variable name="NEWLINE" as="xs:string">
        <xsl:text>&#10;</xsl:text>
    </xsl:variable>

    <!--
        Escapes a value for use inside a double-quoted JSON string.
        Backslash is handled first so that the backslashes introduced by the
        later replacements are not escaped a second time. Tab, line feed and
        carriage return are the only control characters XML 1.0 allows, so
        they are the only ones that need an escape here.
        An empty sequence yields the empty string.
    -->
    <xsl:function name="local:json-escape" as="xs:string">
        <xsl:param name="raw" as="xs:string?"/>
        <xsl:variable name="step1" select="replace(string($raw), '\\', '\\\\')"/>
        <xsl:variable name="step2" select="replace($step1, '&quot;', '\\&quot;')"/>
        <xsl:variable name="step3" select="replace($step2, '\n', '\\n')"/>
        <xsl:variable name="step4" select="replace($step3, '\r', '\\r')"/>
        <xsl:sequence select="replace($step4, '\t', '\\t')"/>
    </xsl:function>

    <!-- Entry point: wraps the generated entries in the JSON array brackets. -->
    <xsl:template match="/">
        <xsl:text>[</xsl:text>
        <xsl:value-of select="$NEWLINE"/>
        <xsl:call-template name="getmostfrequentwordsfromcorpus"/>
        <xsl:value-of select="$NEWLINE"/>
        <xsl:text>]</xsl:text>
    </xsl:template>

    <!--
        Collects the morpheme records from the whole collection, counts them per
        (text, category) pair and writes the most frequent ones as JSON objects.
    -->
    <xsl:template name="getmostfrequentwordsfromcorpus">
        <!-- @type values of the spanGrp tiers that are read below -->
        <xsl:variable name="tier1" select="'ps'"/>
        <xsl:variable name="tier2" select="'ge'"/>
        <xsl:variable name="tier3" select="'mb'"/>
        <xsl:variable name="tier4" select="'BOR'"/>

        <!-- Step 1 and 2: one <x> per accepted morpheme span, across all documents -->
        <xsl:variable name="allwords">
            <xsl:for-each select="collection(concat($tei-base-path, '?select=*.xml;recurse=yes;on-error=warning'))">
                <xsl:for-each select=".//*:spanGrp[@type = $tier3]/*:span/*:span">
                    <xsl:variable name="mid" select="@xml:id"/>
                    <xsl:variable name="wid" select="../@from"/>
                    <!-- great-grandparent of the morpheme span; presumably the TEI annotationBlock (confirm) -->
                    <xsl:variable name="block" select="../../.."/>
                    <xsl:variable name="word" select="$block/*:u/*:seg/*:w[@xml:id = $wid]"/>
                    <!-- timeline references of the anchors directly before and after the word -->
                    <xsl:variable name="TstartTime" select="$word/preceding-sibling::*:anchor[1]/@synch"/>
                    <xsl:variable name="TendTime" select="$word/following-sibling::*:anchor[1]/@synch"/>
                    <xsl:variable name="gloss-spans" select="$block/*:spanGrp[@type = $tier2]/*:span/*:span[@from = $mid]"/>
                    <!-- true when the gloss text does not contain two consecutive characters out of A-Z, 0-9, '.', ':' and backslash;
                         apparently meant to skip grammatical (all-caps) glosses -->
                    <xsl:variable name="lexical-gloss" select="not($gloss-spans/matches(text(), '[A-Z0-9\.:\\]{2}'))"/>
                    <!-- true when no BOR span points at the word -->
                    <xsl:variable name="no-bor" select="empty($block/*:spanGrp[@type = $tier4]/*:span[@from = $wid])"/>
                    <!-- URL(s) of the media entries ending in .mp3 (case-insensitive), space-separated if there are several -->
                    <xsl:variable name="audio-filename" as="xs:string"
                                  select="string-join($block/../../../*:teiHeader/*:fileDesc/*:sourceDesc/*:recordingStmt/*:recording/*:media[ends-with(lower-case(@url), '.mp3')]/@url, ' ')"/>

                    <xsl:if test="$no-bor and $lexical-gloss">
                        <x>
                            <!-- Shape of the resulting JSON entry:
                                 "category": "v",
                                 "eng_transl": "like",
                                 "audio_file": "audio_snippets/AkEE_19900810_GirlAnys_flk.mp3",
                                 "audio_filename" : "AkEE_19900810_GirlAnys_flk",
                                 "audio_start": "1.275",
                                 "audio_end": "2.55"
                            -->
                            <xsl:attribute name="category" select="$block/*:spanGrp[@type = $tier1]/*:span[@from = $wid]"/>
                            <xsl:attribute name="eng_transl" select="$gloss-spans"/>
                            <!-- Example target:
                                 https://corpora.uni-hamburg.de/hzsk/de/islandora/object/recording:dolgan-1.0_PoNA_200X_GirlFromTundra_nar/datastream/MP3/PoNA_200X_GirlFromTundra_nar.mp3
                                 The dot of the extension is escaped and matched case-insensitively so that it agrees with the media filter above. -->
                            <xsl:attribute name="audio_file"
                                           select="concat('https://corpora.uni-hamburg.de/hzsk/de/islandora/object/recording:', $corpus-version-name, '_', tokenize($audio-filename, '\.mp3', 'i')[1], '/datastream/MP3/', $audio-filename)"/>
                            <xsl:attribute name="audio_filename" select="$audio-filename"/>
                            <xsl:attribute name="audio_start" select="$block/../../*:timeline/*:when[@xml:id = $TstartTime]/@interval"/>
                            <xsl:attribute name="audio_end" select="$block/../../*:timeline/*:when[@xml:id = $TendTime]/@interval"/>
                            <xsl:value-of select="text()"/>
                        </x>
                    </xsl:if>
                </xsl:for-each>
            </xsl:for-each>
        </xsl:variable>

        <!-- Step 3: one <x> per (text, category) pair, carrying the number of occurrences in @value -->
        <xsl:variable name="allwordsvalues">
            <xsl:for-each-group select="$allwords/x" group-by="text()">
                <xsl:for-each-group select="current-group()" group-by="@category">
                    <!-- the inner group is exactly the set of records sharing text and category -->
                    <xsl:variable name="occurrences" select="count(current-group())"/>
                    <!-- first record of the group with non-empty audio information; groups without one are dropped -->
                    <xsl:for-each select="current-group()[not(@audio_filename = '')][not(@audio_start = '')][not(@audio_end = '')][1]">
                        <x>
                            <xsl:copy-of select="@*"/>
                            <xsl:attribute name="value" select="$occurrences"/>
                            <xsl:value-of select="."/>
                        </x>
                    </xsl:for-each>
                </xsl:for-each-group>
            </xsl:for-each-group>
        </xsl:variable>

        <!-- Step 4: sort by frequency, keep the first $max-entries groups -->
        <xsl:variable name="ranked" as="element(x)*">
            <xsl:perform-sort select="$allwordsvalues/x">
                <xsl:sort select="@value" data-type="number" order="descending"/>
            </xsl:perform-sort>
        </xsl:variable>

        <xsl:for-each select="subsequence($ranked, 1, $max-entries)">
            <xsl:value-of select="$NEWLINE"/>
            <xsl:text> {</xsl:text>
            <xsl:value-of select="$NEWLINE"/>
            <xsl:text> "x": "</xsl:text>
            <xsl:value-of select="local:json-escape(string(.))"/>
            <xsl:text>",</xsl:text>
            <xsl:value-of select="$NEWLINE"/>
            <xsl:text> "category": "</xsl:text>
            <xsl:value-of select="local:json-escape(@category)"/>
            <xsl:text>",</xsl:text>
            <xsl:value-of select="$NEWLINE"/>
            <xsl:text> "eng_transl": "</xsl:text>
            <xsl:value-of select="local:json-escape(@eng_transl)"/>
            <xsl:text>",</xsl:text>
            <xsl:value-of select="$NEWLINE"/>
            <xsl:text> "audio_file": "</xsl:text>
            <xsl:value-of select="local:json-escape(@audio_file)"/>
            <xsl:text>",</xsl:text>
            <xsl:value-of select="$NEWLINE"/>
            <xsl:text> "audio_filename": "</xsl:text>
            <xsl:value-of select="local:json-escape(@audio_filename)"/>
            <xsl:text>",</xsl:text>
            <xsl:value-of select="$NEWLINE"/>
            <xsl:text> "audio_start": "</xsl:text>
            <xsl:value-of select="local:json-escape(@audio_start)"/>
            <xsl:text>",</xsl:text>
            <xsl:value-of select="$NEWLINE"/>
            <xsl:text> "audio_end": "</xsl:text>
            <xsl:value-of select="local:json-escape(@audio_end)"/>
            <xsl:text>",</xsl:text>
            <xsl:value-of select="$NEWLINE"/>
            <xsl:text> "value": "</xsl:text>
            <xsl:value-of select="local:json-escape(@value)"/>
            <xsl:text>"</xsl:text>
            <xsl:value-of select="$NEWLINE"/>
            <xsl:text> }</xsl:text>
            <!-- separator after every entry except the last one actually emitted -->
            <xsl:if test="position() != last()">
                <xsl:text>,</xsl:text>
            </xsl:if>
        </xsl:for-each>
    </xsl:template>
</xsl:stylesheet>