From 8e4aefeea5ca3ba563f30cf057f4fea404c69f05 Mon Sep 17 00:00:00 2001
From: Timofey Arkhangelskiy <timofey.arkhangelskiy@uni-hamburg.de>
Date: Wed, 21 Dec 2022 22:16:53 +0100
Subject: [PATCH] Enable advanced data view for Tsakorpus

---
 common/search_retrieve.py           | 18 ++++--
 common/tsakorpus_response_parser.py | 98 ++++++++++++++++++++++++-----
 main.py                             |  4 +-
 notes.txt                           |  2 +-
 static/dataview_adv.xml             |  8 +--
 static/search_retrieve_response.xml | 30 ++++-----
 6 files changed, 117 insertions(+), 43 deletions(-)

diff --git a/common/search_retrieve.py b/common/search_retrieve.py
index e25928d..32ba113 100644
--- a/common/search_retrieve.py
+++ b/common/search_retrieve.py
@@ -15,26 +15,34 @@ class Record:
         self.dataView = dataView
         # For simple search:
         self.text = ''
+        self.textNoHighlight = ''   # no <hits:Hit> elements, just text
         # For advanced search:
         self.segments = []
-        self.layers = []
+        self.layers = {}            # ID -> content
 
     def as_dict(self):
         """
         Returns a dictionary for insertion into the XML template.
         """
+        # In generic SRU, records, resources and resource fragments
+        # are all distinct entities. In FCS, it's the same thing. Still,
+        # I model the templates in a generic way, in case that changes
+        # in FCS in the future
         record = {
-            'resources': {
+            'resources': [{
                 'resource_fragments': [{
                     'dv_hits': [{
                         'text': self.text
                     }],
-                    'dv_adv': []
+                    'dv_adv': [{
+                        'segments': self.segments,
+                        'layers': self.layers
+                    }]
                 }]
-            }
+            }]
         }
         if self.dataView == DataView.adv:
-            record['resources']['resource_fragments'][0]['dv_adv'].append({
+            record['resources'][0]['resource_fragments'][0]['dv_adv'].append({
                 'segments': self.segments,
                 'layers': self.layers
             })
diff --git a/common/tsakorpus_response_parser.py b/common/tsakorpus_response_parser.py
index 683c004..2af8482 100644
--- a/common/tsakorpus_response_parser.py
+++ b/common/tsakorpus_response_parser.py
@@ -1,7 +1,7 @@
 from urllib.parse import quote
 import re
 import json
-import urllib.request
+import html
 from lxml.html import fragment_fromstring
 from .enums import *
 from .config import ResourceConfig
@@ -16,6 +16,83 @@ class TsakorpusResponseParser:
     def __init__(self):
         pass
 
+    def parse_annotation(self, anno, segID, record):
+        """
+        Parse the HTML annotation of one word taken from a hit and add
+        the data to the layers in the record object.
+
+        anno is the HTML string to parse, segID is the ID of the segment
+        (word) it describes, and record is the object whose layers dict
+        is updated in place. One {'ref': segID, 'value': ...} span is
+        appended to each of record.layers['pos'] and
+        record.layers['lemma']. Distinct values are sorted and joined
+        with '|'; '_' is used when no value was found.
+        """
+        annoTree = fragment_fromstring(anno, create_parent='div')
+        anaPath = 'div[@class="popup_word"]/div[@class="popup_ana"]'
+
+        lexNodes = annoTree.xpath(anaPath + '/span[@class="popup_lex"]')
+        lemmas = {node.text for node in lexNodes if node.text is not None}
+        lemmasStr = '|'.join(sorted(lemmas)) if lemmas else '_'
+
+        posNodes = annoTree.xpath(anaPath + '/span[@class="popup_pos"]')
+        # The no-break space is written as the \xa0 escape rather than as
+        # an invisible literal character, so the set of stripped
+        # characters is readable and survives whitespace normalization.
+        pos = {
+            re.sub('&nbsp;|[ \xa0\t\ufeff]+', '', node.text)
+            for node in posNodes if node.text is not None
+        }
+        posStr = '|'.join(sorted(pos)) if pos else '_'
+
+        record.layers.setdefault('pos', []).append({
+            'ref': segID,
+            'value': posStr
+        })
+
+        record.layers.setdefault('lemma', []).append({
+            'ref': segID,
+            'value': lemmasStr
+        })
+
+
+    def parse_span(self, el, record):
+        """
+        Parse one element of the HTML representation of a hit returned by
+        a Tsakorpus instance. Append its text to record.text, wrapping
+        matched words in <hits:Hit>, and add a segment for each word.
+        """
+        if el.tag == 'span' and 'class' in el.attrib and 'sentence_meta' in el.attrib['class']:
+            # Introductory span holding only the header (title, author
+            # etc.): drop its text, keep only the trimmed tail in the text.
+            if el.tail is not None:
+                record.text += el.tail.strip('\n\t ')
+            return
+
+        if el.text is not None:
+            bMatch = False
+            if 'class' in el.attrib and re.search('\\bword\\b', el.attrib['class']) is not None:
+                if re.search('\\bwmatch\\b', el.attrib['class']) is not None:
+                    bMatch = True
+                segID = 's' + str(len(record.segments))
+                segment = {
+                    'id': segID,
+                    'start': len(record.textNoHighlight) + 1,
+                    'end': len(record.textNoHighlight) + len(el.text)
+                }
+                record.segments.append(segment)
+                record.textNoHighlight += el.text
+                if 'data-ana' in el.attrib:
+                    self.parse_annotation(el.attrib['data-ana'], segID, record)
+            if bMatch:
+                record.text += '<hits:Hit>' + el.text + '</hits:Hit>'
+            else:
+                record.text += el.text
+        if el.tail is not None:
+            record.text += el.tail
+            record.textNoHighlight += el.tail
+
+
     def parse_context(self, hit, config: ResourceConfig, lang=''):
         """
         Parse one hit. Return it as a Record object.
@@ -27,23 +104,12 @@ class TsakorpusResponseParser:
                 or lang not in hit['languages']
                 or 'text' not in hit['languages'][lang]):
             return record
-        content = fragment_fromstring(hit['languages'][lang]['text'],
+        contentTxt = re.sub('[\r\n\t\ufeff]+', '', hit['languages'][lang]['text'], flags=re.DOTALL)
+        print(contentTxt)
+        content = fragment_fromstring(contentTxt,
                                       create_parent='div')
-        text = ''
         for el in content:
-            if el.tag == 'span' and 'class' in el.attrib and 'sentence_meta' in el.attrib['class']:
-                if el.tail is not None:
-                    text += el.tail.strip('\n\t ')
-                continue
-            if el.text is not None:
-                if 'class' in el.attrib and re.search('\\bwmatch\\b', el.attrib['class']) is not None:
-                    text += '<hits:Hit>' + el.text + '</hits:Hit>'
-                else:
-                    text += el.text
-            if el.tail is not None:
-                text += el.tail
-        print(text)
-        record.text = text
+            self.parse_span(el, record)
         return record
 
 
diff --git a/main.py b/main.py
index 8ef74a4..d43431c 100644
--- a/main.py
+++ b/main.py
@@ -83,8 +83,8 @@ def endpoint(
                                                   'request': request,
                                                   'n_hits': nHits,
                                                   'records': records
-                                              })
-                                              # media_type='application/xml')
+                                              },
+                                              media_type='application/xml')
             # return str(res)
 
     return {'operation': operation, 'version': version}
diff --git a/notes.txt b/notes.txt
index fdef6aa..4f5f782 100644
--- a/notes.txt
+++ b/notes.txt
@@ -12,4 +12,4 @@ p. 14: x-cmd-resource-info parameter present in the query example, but never exp
 
 p. 12 and elsewhere: http://explain.z3950.org/dtd/2.0/ mentioned as the URL of the record schema, but its actual URL is http://zeerex.z3950.org/dtd/zeerex-2.0.dtd .
 
-p. 14: What is the sruResponse prefix? It is never mentioned. No namespace is provided here, so the example is actually invalid XML. (Who doesn't validate their examples before showing them to a wide audience -- in an official specification, no less??) Maybe sru was actually meant (see p. 2 and an SRU v. 1.2 example above)?
\ No newline at end of file
+p. 14: What is the sruResponse prefix? It is never mentioned. No namespace is provided here, so the example is actually invalid XML. (Who doesn't validate their examples before showing them to a wide audience -- in an official specification, no less??) Maybe sru was actually meant (see p. 2 and an SRU v. 1.2 example above)? I'm putting sru there for now.
\ No newline at end of file
diff --git a/static/dataview_adv.xml b/static/dataview_adv.xml
index 6b1d3a1..203f02c 100644
--- a/static/dataview_adv.xml
+++ b/static/dataview_adv.xml
@@ -1,10 +1,10 @@
 <fcs:DataView type="application/x-clarin-fcs-adv+xml">
 {% for hit in rf.dv_adv %}
-    <adv:Advanced unit="item">
+    <adv:Advanced unit="item" xmlns:adv="http://clarin.eu/fcs/dataview/advanced">
     	<adv:Segments>{% for seg in hit.segments %}
-    		<adv:Segment id="{{ seg.id }}" start="{{ seg.start }}" end="{{ seg.end }}"{% if seg.ref %} ref="{{ seg.ref }}"{% endif %}/>
-    	</adv:Segments>{% for layer in hit.layers %}
-    	<Layer id="{{ layer.id }}">{% for span in layer.spans %}
+    		<adv:Segment id="{{ seg.id }}" start="{{ seg.start }}" end="{{ seg.end }}"{% if seg.ref %} ref="{{ seg.ref }}"{% endif %}/>{% endfor %}
+    	</adv:Segments>{% for layer_id in hit.layers %}
+    	<Layer id="{{ layer_id }}">{% for span in hit.layers[layer_id] %}
     		<Span ref="{{ span.ref }}"{% if span.alt_value %} alt-value="{{ span.alt_value }}"{% endif %}{% if span.highlight %} highlight="{{ span.highlight }}"{% endif %}>{{ span.value }}</Span>
     	    {% endfor %}
     	</Layer>
diff --git a/static/search_retrieve_response.xml b/static/search_retrieve_response.xml
index b356b31..c9664a7 100644
--- a/static/search_retrieve_response.xml
+++ b/static/search_retrieve_response.xml
@@ -1,19 +1,19 @@
 <?xml version='1.0' encoding='utf-8'?>
-<sruResponse:searchRetrieveResponse>
-<sruResponse:version>2.0</sruResponse:version>
-<sruResponse:numberOfRecords>{{ n_hits }}</sruResponse:numberOfRecords>
-<sruResponse:records>{% for record in records %}
-	<sruResponse:record>
-		<sruResponse:recordSchema>http://clarin.eu/fcs/resource</sruResponse:recordSchema>
-		<sruResponse:recordXMLEscaping>xml</sruResponse:recordXMLEscaping>
-		<sruResponse:recordData>{% for resource in record.resources %}
+<sru:searchRetrieveResponse xmlns:sru="http://docs.oasis-open.org/ns/search-ws/sruResponse">
+<sru:version>2.0</sru:version>
+<sru:numberOfRecords>{{ n_hits }}</sru:numberOfRecords>
+<sru:records>{% for record in records %}
+	<sru:record>
+		<sru:recordSchema>http://clarin.eu/fcs/resource</sru:recordSchema>
+		<sru:recordXMLEscaping>xml</sru:recordXMLEscaping>
+		<sru:recordData>{% for resource in record.resources %}
 		{% include 'resource.xml' %}
 		{% endfor %}
-		</sruResponse:recordData>
-		<sruResponse:recordPosition>{{ loop.index }}</sruResponse:recordPosition>
-	</sruResponse:record>
+		</sru:recordData>
+		<sru:recordPosition>{{ loop.index }}</sru:recordPosition>
+	</sru:record>
 {% endfor %}
-</sruResponse:records>{% if n_hits > records|length %}
-<sruResponse:nextRecordPosition>{{ records|length + 1 }}</sruResponse:nextRecordPosition>{% endif %}
-<sruResponse:resultCountPrecision>info:srw/vocabulary/resultCountPrecision/1/exact</sruResponse:resultCountPrecision>
-</sruResponse:searchRetrieveResponse>
\ No newline at end of file
+</sru:records>{% if n_hits > records|length %}
+<sru:nextRecordPosition>{{ records|length + 1 }}</sru:nextRecordPosition>{% endif %}
+<sru:resultCountPrecision>info:srw/vocabulary/resultCountPrecision/1/exact</sru:resultCountPrecision>
+</sru:searchRetrieveResponse>
\ No newline at end of file
-- 
GitLab