From efe4527e4892cff9dacbd165f3326e7fbe79721f Mon Sep 17 00:00:00 2001 From: Timofey Arkhangelskiy <timarkh@gmail.com> Date: Mon, 26 Jun 2023 18:30:18 +0200 Subject: [PATCH] ANNIS response parser works now --- common/annis_response_parser.py | 179 +++++++++++++++++++++++----- common/config.py | 1 + common/tsakorpus_response_parser.py | 2 - common/views_logic.py | 24 ++-- config/annis_test.json | 6 +- docs/configuration.rst | 10 ++ test_queries.txt | 3 +- 7 files changed, 180 insertions(+), 45 deletions(-) diff --git a/common/annis_response_parser.py b/common/annis_response_parser.py index 02ab151..85f966b 100644 --- a/common/annis_response_parser.py +++ b/common/annis_response_parser.py @@ -14,10 +14,152 @@ class AnnisResponseParser: Parses responses from an ANNIS instance. """ + rxNodeIDPfx = re.compile('^[^/]*::') + def __init__(self): self.pc = None # POS convertor, rebuilt with each parse call - def process_hits(self, hits, config: ResourceConfig, searchOptions: dict, + def node_anno_value(self, node): + """ + Return annotation value for a node, represented by + a node data dictionary. + """ + try: + return node['annis::tok'] + except KeyError: + pass + return '' + + def node_tier_value(self, node, config): + """ + Return tier name for a node, represented by + a node data dictionary. + """ + for k in node: + if k.startswith('annis::'): + continue + k = self.rxNodeIDPfx.sub('', k) + if k in config.tier_convert: + return config.tier_convert[k] + return None + + def process_token_sequences(self, seqs, hit, nodeData, highlightNodes, + config: ResourceConfig, searchOptions: dict, + diagnostics: list[Diagnostic], advancedHits=False): + """ + Extract information from several sequences of token nodes + and their descendants, which represent one search hit. + Return a Record object with all the data. 
+ """ + record = Record(advancedHits=advancedHits) + # print(highlightNodes) + for iSeq in range(len(seqs)): + seq = seqs[iSeq] + for tokenNodeID in seq: + node = nodeData[tokenNodeID] + # print(tokenNodeID, node) + token = self.node_anno_value(node) + if advancedHits: + segID = 's' + str(len(record.segments)) + segment = { + 'id': segID, + 'start': len(record.textNoHighlight) + 1, + 'end': len(record.textNoHighlight) + len(token) + } + record.segments.append(segment) + # Now extract data from all relevant tiers. + # This is tricky: nodes are linked not only to the + # respective annotation nodes, but also to the next + # node on the same level. So we use depth-first search + # and break once we see another node from a layer + # we have already seen. + usedLayers = set() + # First, the token + tierName = self.node_tier_value(node, config) + if tierName is not None: + if tierName not in record.layers: + record.layers[tierName] = [] + record.layers[tierName].append({ + 'ref': segID, + 'value': token + }) + + # Then all its annotations + for e in nx.dfs_edges(hit, tokenNodeID): + descendantNode = nodeData[e[1]] + annoTierName = self.node_tier_value(descendantNode, config) + annoValue = self.node_anno_value(descendantNode) + if annoTierName is not None and annoTierName in usedLayers: + break + usedLayers.add(annoTierName) + if annoTierName is not None and len(annoValue) > 0: + if annoTierName not in record.layers: + record.layers[annoTierName] = [] + record.layers[annoTierName].append({ + 'ref': segID, + 'value': annoValue + }) + record.textNoHighlight += token + ' ' + if tokenNodeID in highlightNodes: + record.text += '<hits:Hit>' + token + '</hits:Hit> ' + else: + record.text += token + ' ' + if iSeq < len(seqs) - 1: + record.textNoHighlight += '... ' + record.text += '... 
' + return record + + def process_subgraph(self, hit, highlightNodes, + config: ResourceConfig, searchOptions: dict, + diagnostics: list[Diagnostic], advancedHits=False): + """ + Process one hit returned by ANNIS, stored as a networkx graph. + Return a Record object. + """ + highlightNodes = [self.rxNodeIDPfx.sub('', nodeID) for nodeID in highlightNodes] + # An annotated text segment in ANNIS is a subgraph, where some + # nodes are descendant of another nodes, e.g. tokens are descendants + # of a text. This is too complicated for an FCS output, where you + # have to have tokens, each of which can have some annotation at + # different layers, such as lemma or pos. So we only look at the + # ANNIS subgraph nodes that are labeled as belonging to tokenLayer, + # count them as tokens, and look for their annotation in their + # descendant nodes. + tokenLayer = 'tok' + if 'text' in config.tier_convert_reverse: + tokenLayer = config.tier_convert_reverse['text'] + + # ANNIS can find several disjoint segments that belong to the same + # text, each of which contains at least one search term. If this is + # the case, there will be several disconnected subgraphs in the response, + # so there can be multiple roots. 
+ roots = [n for n, d in hit.in_degree() if d == 0] + tokenSequences = [] + nodeData = { + node[0]: node[1] for node in hit.nodes(data=True) + } + for root in roots: + usedNodes = set() + tokenSequences.append([]) + for e in nx.bfs_edges(hit, root): + # print(hit.get_edge_data(e[0], e[1])) + for side in (0, 1): + if e[side] in usedNodes: + continue + usedNodes.add(e[side]) + node = nodeData[e[side]] + for k in node.keys(): + if not k.startswith('annis::') and self.rxNodeIDPfx.sub('', k) == tokenLayer: + tokenSequences[-1].append(e[side]) + # if k == 'annis::tok': + # tok = node[k] + record = self.process_token_sequences(tokenSequences, hit, nodeData, highlightNodes, + config, searchOptions, + diagnostics, advancedHits) + return record + + def process_hits(self, hits, highlightNodes, + config: ResourceConfig, searchOptions: dict, diagnostics: list[Diagnostic], advancedHits=False): """ Process hits returned by ANNIS, which take the form of @@ -33,35 +175,13 @@ class AnnisResponseParser: raise Diagnostic(DiagnosticTypes.sru, 1, details='Could not parse graphML data returned by the ANNIS instance.') - for hit in hits: + for iHit in range(len(hits)): # for node, data in hit.nodes(data=True): # print(node, data) - roots = [n for n, d in hit.in_degree() if d == 0] - textSegments = [] - nodeData = { - node[0]: node[1] for node in hit.nodes(data=True) - } - for root in roots: - usedNodes = set() - textSegment = '' - for e in nx.bfs_edges(hit, root): - for side in (0, 1): - if e[side] in usedNodes: - continue - usedNodes.add(e[side]) - node = nodeData[e[side]] - tok = '' - tier = '' - for k in node.keys(): - if k == 'annis::tok': - tok = node[k] - elif k.endswith('Gloss'): - tier = 'Gloss' - if tier == 'Gloss': - textSegment += tok + ' ' - if len(textSegment) > 0: - textSegments.append(textSegment.strip()) - records.append(' <...> '.join(textSegments)) + record = self.process_subgraph(hits[iHit], highlightNodes[iHit], config, + searchOptions, diagnostics, advancedHits) + if 
record is not None: + records.append(record) return records def parse(self, responseData, config: ResourceConfig, searchOptions: dict): @@ -88,7 +208,8 @@ class AnnisResponseParser: nRecords = 0 else: try: - records = self.process_hits(responseData['hits'], config, searchOptions, + records = self.process_hits(responseData['hits'], responseData['hit_ids'], + config, searchOptions, diagnostics, advancedHits=advancedHits) except Diagnostic as d: diagnostics.append(d) diff --git a/common/config.py b/common/config.py index 69270e6..1ea39b8 100644 --- a/common/config.py +++ b/common/config.py @@ -45,6 +45,7 @@ class ResourceConfig: self.search_lang_id = '' self.pos_convert = [] # corpus-specific to UD (regexes) self.pos_convert_reverse = {} # UD to corpus-specific + self.tier_convert = {} # corpus-specific tier IDs to Advanced view layer labels self.tier_convert_reverse = {} # FCS to corpus-specific tier IDs self.query_timeout = 60 diff --git a/common/tsakorpus_response_parser.py b/common/tsakorpus_response_parser.py index be5ca79..1000e24 100644 --- a/common/tsakorpus_response_parser.py +++ b/common/tsakorpus_response_parser.py @@ -114,7 +114,6 @@ class TsakorpusResponseParser: record.text += el.tail record.textNoHighlight += el.tail - def parse_context(self, hit, config: ResourceConfig, lang='', advancedHits=False): """ Parse one hit. Return it as a Record object. 
@@ -134,7 +133,6 @@ class TsakorpusResponseParser: self.parse_span(el, record, advancedHits) return record - def parse(self, response, config: ResourceConfig, searchOptions: dict, lang=''): """ Read a dictionary with the first N hits returned by a Tsakorpus diff --git a/common/views_logic.py b/common/views_logic.py index 793c014..f68a143 100644 --- a/common/views_logic.py +++ b/common/views_logic.py @@ -209,18 +209,18 @@ def process_search_retrieve(version: SRUVersion, records, nHits, diagnostics = app.rp_annis.parse(res, config, searchOptions) if any(diag.is_fatal() for diag in diagnostics): return fatal_response(Operation.searchRetrieve, version, config, diagnostics, request, templates) - return records - # records = [r.as_dict() for r in records] - # diagnostics = [str(d) for d in diagnostics] - # return templates.TemplateResponse('search_retrieve_response.xml', - # { - # 'request': request, - # 'n_hits': nHits, - # 'records': records, - # 'version': templateVersion, - # 'diagnostics': diagnostics - # }, - # media_type='application/xml') + # return records + records = [r.as_dict() for r in records] + diagnostics = [str(d) for d in diagnostics] + return templates.TemplateResponse('search_retrieve_response.xml', + { + 'request': request, + 'n_hits': nHits, + 'records': records, + 'version': templateVersion, + 'diagnostics': diagnostics + }, + media_type='application/xml') if config.platform == CorpPlatform.tsakorpus: try: if queryType == QueryType.cql: diff --git a/config/annis_test.json b/config/annis_test.json index 401b2c1..40be8c6 100644 --- a/config/annis_test.json +++ b/config/annis_test.json @@ -1,5 +1,5 @@ { - "host": "https://www.sign-lang.uni-hamburg.de/dgs-korpus/ (replace later)", + "host": "https://dock.fdm.uni-hamburg.de/meinedgs/", "port": "80", "transport_protocol": "https", "max_hits": 17, @@ -9,6 +9,10 @@ "resource_base_url": "http://adwhh1.server.uni-hamburg.de:17101", "annis_corpus_id": "DGS-Corpus-r3-en", "annis_context_size": 5, + 
"tier_convert": {
+        "Gloss": "word",
+        "GlossType": "lemma"
+    },
     "tier_convert_reverse": {
         "text": "Gloss",
         "lemma": "GlossType"
diff --git a/docs/configuration.rst b/docs/configuration.rst
index a025556..fdaee34 100644
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -73,3 +73,13 @@ POS tags are required to be in the UD_ standard, per FCS specifications. If a co
 - ``pos_convert_reverse`` (dictionary) -- rules that convert UD tags from a query to corpus-specific tags or expressions. Keys are UD tags, values are expressions they have to be replaced with.
 
 .. _UD: https://universaldependencies.org/u/pos/
+
+ANNIS tier configuration
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+There are two parameters that define how tier/layer names in the search query should map to the layer names in ANNIS annotations, and how those should map to what is returned to the client.
+
+- ``tier_convert_reverse`` (dictionary) -- tells the endpoint which tier names in the query should be mapped to differently named tiers in ANNIS. For example, if it contains a key-value pair ``"lemma": "GlossType"`` and the query is ``lemma="CAN2B"``, then the value ``CAN2B`` will be searched in the tiers named ``GlossType`` (possibly with a ``::`` prefix, e.g. ``PersonA::GlossType``). By default, ``text`` is mapped to ``tok`` and all the rest is left as is. An ANNIS tier indicated as an equivalent of ``text`` here is treated as a token-level tier.
+
+- ``tier_convert`` (dictionary) -- tells the endpoint which tiers from ANNIS should end up in the response, and (possibly) how they should be called in the output XML (``<Layer id="layer_name">``). Tiers not listed in this dictionary, apart from the token tier, will be disregarded. If you want to have a tier in the output, but do not want to rename it, just use identical key and value for it.
+ diff --git a/test_queries.txt b/test_queries.txt index 74a81ea..afd9716 100644 --- a/test_queries.txt +++ b/test_queries.txt @@ -15,4 +15,5 @@ http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=[lemma=%2 http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=[pos=%22NOUN%22]&x-fcs-dataviews=adv&queryType=fcs Tsakorpus -- Advanced search, term only / POS (NOUN, 22639 hits) http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=[lemma%3D%22m.*%22%20%26%20(pos=%22NOUN%22|pos=%22VERB%22)]&x-fcs-dataviews=adv&queryType=fcs Tsakorpus -- Advanced search, Boolean operators inside term query (words that start with "m" and are either nouns or verbs, 3878 hits) http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=[text=%22ud%22]%20[]{1,2}%20[text=%22no%22]&x-fcs-dataviews=adv&queryType=fcs Tsakorpus -- Advanced search, sequence of terms ("ud" followed by "no" at the distance between 2 and 3, 7 hits) -http://127.0.0.1:5000/fcs-endpoint/annis_test?operation=searchRetrieve&query=CAN2B%20AND%20WHY1* ANNIS -- Simple search with boolean operator ("CAN2B" AND "WHY1*" within one text, 140 hits) \ No newline at end of file +http://127.0.0.1:5000/fcs-endpoint/annis_test?operation=searchRetrieve&query=CAN2B%20AND%20WHY1* ANNIS -- Simple search with boolean operator ("CAN2B" AND "WHY1*" within one text, 140 hits) +http://127.0.0.1:5000/fcs-endpoint/annis_test?operation=searchRetrieve&query=BOSS1B AND CAN2B AND WHY1*&x-fcs-dataviews=adv ANNIS -- Simple search with boolean operators, advanced view (5 hits) \ No newline at end of file -- GitLab