From efe4527e4892cff9dacbd165f3326e7fbe79721f Mon Sep 17 00:00:00 2001 From: Timofey Arkhangelskiy <timarkh@gmail.com> Date: Mon, 26 Jun 2023 18:30:18 +0200 Subject: [PATCH] ANNIS response parser works now --- common/annis_response_parser.py | 179 +++++++++++++++++++++++----- common/config.py | 1 + common/tsakorpus_response_parser.py | 2 - common/views_logic.py | 24 ++-- config/annis_test.json | 6 +- docs/configuration.rst | 10 ++ test_queries.txt | 3 +- 7 files changed, 180 insertions(+), 45 deletions(-) diff --git a/common/annis_response_parser.py b/common/annis_response_parser.py index 02ab151..85f966b 100644 --- a/common/annis_response_parser.py +++ b/common/annis_response_parser.py @@ -14,10 +14,152 @@ class AnnisResponseParser: Parses responses from an ANNIS instance. """ + rxNodeIDPfx = re.compile('^[^/]*::') + def __init__(self): self.pc = None # POS convertor, rebuilt with each parse call - def process_hits(self, hits, config: ResourceConfig, searchOptions: dict, + def node_anno_value(self, node): + """ + Return annotation value for a node, represented by + a node data dictionary. + """ + try: + return node['annis::tok'] + except KeyError: + pass + return '' + + def node_tier_value(self, node, config): + """ + Return tier name for a node, represented by + a node data dictionary. + """ + for k in node: + if k.startswith('annis::'): + continue + k = self.rxNodeIDPfx.sub('', k) + if k in config.tier_convert: + return config.tier_convert[k] + return None + + def process_token_sequences(self, seqs, hit, nodeData, highlightNodes, + config: ResourceConfig, searchOptions: dict, + diagnostics: list[Diagnostic], advancedHits=False): + """ + Extract information from several sequences of token nodes + and their descendants, which represent one search hit. + Return a Record object with all the data. 
+ """ + record = Record(advancedHits=advancedHits) + # print(highlightNodes) + for iSeq in range(len(seqs)): + seq = seqs[iSeq] + for tokenNodeID in seq: + node = nodeData[tokenNodeID] + # print(tokenNodeID, node) + token = self.node_anno_value(node) + if advancedHits: + segID = 's' + str(len(record.segments)) + segment = { + 'id': segID, + 'start': len(record.textNoHighlight) + 1, + 'end': len(record.textNoHighlight) + len(token) + } + record.segments.append(segment) + # Now extract data from all relevant tiers. + # This is tricky: nodes are linked not only to the + # respective annotation nodes, but also to the next + # node on the same level. So we use depth-first search + # and break once we see another node from a layer + # we have already seen. + usedLayers = set() + # First, the token + tierName = self.node_tier_value(node, config) + if tierName is not None: + if tierName not in record.layers: + record.layers[tierName] = [] + record.layers[tierName].append({ + 'ref': segID, + 'value': token + }) + + # Then all its annotations + for e in nx.dfs_edges(hit, tokenNodeID): + descendantNode = nodeData[e[1]] + annoTierName = self.node_tier_value(descendantNode, config) + annoValue = self.node_anno_value(descendantNode) + if annoTierName is not None and annoTierName in usedLayers: + break + usedLayers.add(annoTierName) + if annoTierName is not None and len(annoValue) > 0: + if annoTierName not in record.layers: + record.layers[annoTierName] = [] + record.layers[annoTierName].append({ + 'ref': segID, + 'value': annoValue + }) + record.textNoHighlight += token + ' ' + if tokenNodeID in highlightNodes: + record.text += '<hits:Hit>' + token + '</hits:Hit> ' + else: + record.text += token + ' ' + if iSeq < len(seqs) - 1: + record.textNoHighlight += '... ' + record.text += '... 
' + return record + + def process_subgraph(self, hit, highlightNodes, + config: ResourceConfig, searchOptions: dict, + diagnostics: list[Diagnostic], advancedHits=False): + """ + Process one hit returned by ANNIS, stored as a networkx graph. + Return a Record object. + """ + highlightNodes = [self.rxNodeIDPfx.sub('', nodeID) for nodeID in highlightNodes] + # An annotated text segment in ANNIS is a subgraph, where some + # nodes are descendant of another nodes, e.g. tokens are descendants + # of a text. This is too complicated for an FCS output, where you + # have to have tokens, each of which can have some annotation at + # different layers, such as lemma or pos. So we only look at the + # ANNIS subgraph nodes that are labeled as belonging to tokenLayer, + # count them as tokens, and look for their annotation in their + # descendant nodes. + tokenLayer = 'tok' + if 'text' in config.tier_convert_reverse: + tokenLayer = config.tier_convert_reverse['text'] + + # ANNIS can find several disjoint segments that belong to the same + # text, each of which contains at least one search term. If this is + # the case, there will be several disconnected subgraphs in the response, + # so there can be multiple roots. 
+ roots = [n for n, d in hit.in_degree() if d == 0] + tokenSequences = [] + nodeData = { + node[0]: node[1] for node in hit.nodes(data=True) + } + for root in roots: + usedNodes = set() + tokenSequences.append([]) + for e in nx.bfs_edges(hit, root): + # print(hit.get_edge_data(e[0], e[1])) + for side in (0, 1): + if e[side] in usedNodes: + continue + usedNodes.add(e[side]) + node = nodeData[e[side]] + for k in node.keys(): + if not k.startswith('annis::') and self.rxNodeIDPfx.sub('', k) == tokenLayer: + tokenSequences[-1].append(e[side]) + # if k == 'annis::tok': + # tok = node[k] + record = self.process_token_sequences(tokenSequences, hit, nodeData, highlightNodes, + config, searchOptions, + diagnostics, advancedHits) + return record + + def process_hits(self, hits, highlightNodes, + config: ResourceConfig, searchOptions: dict, diagnostics: list[Diagnostic], advancedHits=False): """ Process hits returned by ANNIS, which take the form of @@ -33,35 +175,13 @@ class AnnisResponseParser: raise Diagnostic(DiagnosticTypes.sru, 1, details='Could not parse graphML data returned by the ANNIS instance.') - for hit in hits: + for iHit in range(len(hits)): # for node, data in hit.nodes(data=True): # print(node, data) - roots = [n for n, d in hit.in_degree() if d == 0] - textSegments = [] - nodeData = { - node[0]: node[1] for node in hit.nodes(data=True) - } - for root in roots: - usedNodes = set() - textSegment = '' - for e in nx.bfs_edges(hit, root): - for side in (0, 1): - if e[side] in usedNodes: - continue - usedNodes.add(e[side]) - node = nodeData[e[side]] - tok = '' - tier = '' - for k in node.keys(): - if k == 'annis::tok': - tok = node[k] - elif k.endswith('Gloss'): - tier = 'Gloss' - if tier == 'Gloss': - textSegment += tok + ' ' - if len(textSegment) > 0: - textSegments.append(textSegment.strip()) - records.append(' <...> '.join(textSegments)) + record = self.process_subgraph(hits[iHit], highlightNodes[iHit], config, + searchOptions, diagnostics, advancedHits) + if 
record is not None: + records.append(record) return records def parse(self, responseData, config: ResourceConfig, searchOptions: dict): @@ -88,7 +208,8 @@ class AnnisResponseParser: nRecords = 0 else: try: - records = self.process_hits(responseData['hits'], config, searchOptions, + records = self.process_hits(responseData['hits'], responseData['hit_ids'], + config, searchOptions, diagnostics, advancedHits=advancedHits) except Diagnostic as d: diagnostics.append(d) diff --git a/common/config.py b/common/config.py index 69270e6..1ea39b8 100644 --- a/common/config.py +++ b/common/config.py @@ -45,6 +45,7 @@ class ResourceConfig: self.search_lang_id = '' self.pos_convert = [] # corpus-specific to UD (regexes) self.pos_convert_reverse = {} # UD to corpus-specific + self.tier_convert = {} # corpus-specific tier IDs to Advanced view layer labels self.tier_convert_reverse = {} # FCS to corpus-specific tier IDs self.query_timeout = 60 diff --git a/common/tsakorpus_response_parser.py b/common/tsakorpus_response_parser.py index be5ca79..1000e24 100644 --- a/common/tsakorpus_response_parser.py +++ b/common/tsakorpus_response_parser.py @@ -114,7 +114,6 @@ class TsakorpusResponseParser: record.text += el.tail record.textNoHighlight += el.tail - def parse_context(self, hit, config: ResourceConfig, lang='', advancedHits=False): """ Parse one hit. Return it as a Record object. 
@@ -134,7 +133,6 @@ class TsakorpusResponseParser: self.parse_span(el, record, advancedHits) return record - def parse(self, response, config: ResourceConfig, searchOptions: dict, lang=''): """ Read a dictionary with the first N hits returned by a Tsakorpus diff --git a/common/views_logic.py b/common/views_logic.py index 793c014..f68a143 100644 --- a/common/views_logic.py +++ b/common/views_logic.py @@ -209,18 +209,18 @@ def process_search_retrieve(version: SRUVersion, records, nHits, diagnostics = app.rp_annis.parse(res, config, searchOptions) if any(diag.is_fatal() for diag in diagnostics): return fatal_response(Operation.searchRetrieve, version, config, diagnostics, request, templates) - return records - # records = [r.as_dict() for r in records] - # diagnostics = [str(d) for d in diagnostics] - # return templates.TemplateResponse('search_retrieve_response.xml', - # { - # 'request': request, - # 'n_hits': nHits, - # 'records': records, - # 'version': templateVersion, - # 'diagnostics': diagnostics - # }, - # media_type='application/xml') + # return records + records = [r.as_dict() for r in records] + diagnostics = [str(d) for d in diagnostics] + return templates.TemplateResponse('search_retrieve_response.xml', + { + 'request': request, + 'n_hits': nHits, + 'records': records, + 'version': templateVersion, + 'diagnostics': diagnostics + }, + media_type='application/xml') if config.platform == CorpPlatform.tsakorpus: try: if queryType == QueryType.cql: diff --git a/config/annis_test.json b/config/annis_test.json index 401b2c1..40be8c6 100644 --- a/config/annis_test.json +++ b/config/annis_test.json @@ -1,5 +1,5 @@ { - "host": "https://www.sign-lang.uni-hamburg.de/dgs-korpus/ (replace later)", + "host": "https://dock.fdm.uni-hamburg.de/meinedgs/", "port": "80", "transport_protocol": "https", "max_hits": 17, @@ -9,6 +9,10 @@ "resource_base_url": "http://adwhh1.server.uni-hamburg.de:17101", "annis_corpus_id": "DGS-Corpus-r3-en", "annis_context_size": 5, + 
"tier_convert": {
+        "Gloss": "word",
+        "GlossType": "lemma"
+    },
     "tier_convert_reverse": {
         "text": "Gloss",
         "lemma": "GlossType"
diff --git a/docs/configuration.rst b/docs/configuration.rst
index a025556..fdaee34 100644
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -73,3 +73,13 @@ POS tags are required to be in the UD_ standard, per FCS specifications. If a co
 - ``pos_convert_reverse`` (dictionary) -- rules that convert UD tags from a query to corpus-specific tags or expressions. Keys are UD tags, values are expressions they have to be replaced with.
 
 .. _UD: https://universaldependencies.org/u/pos/
+
+ANNIS tier configuration
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+There are two parameters that define how tier/layer names in the search query should map to the layer names in ANNIS annotations, and how those should map to what is returned to the client.
+
+- ``tier_convert_reverse`` (dictionary) -- tells the endpoint which tier names in the query should be mapped to differently named tiers in ANNIS. For example, if it contains a key-value pair ``"lemma": "GlossType"`` and the query is ``lemma="CAN2B"``, then the value ``CAN2B`` will be searched in the tiers named ``GlossType`` (possibly with a ``::`` prefix, e.g. ``PersonA::GlossType``). By default, ``text`` is mapped to ``tok`` and all the rest is left as is. An ANNIS tier indicated as an equivalent of ``text`` here is treated as a token-level tier.
+
+- ``tier_convert`` (dictionary) -- tells the endpoint which tiers from ANNIS should end up in the response, and (possibly) how they should be called in the output XML (``<Layer id="layer_name">``). Tiers not listed in this dictionary, apart from the token tier, will be disregarded. If you want to have a tier in the output, but do not want to rename it, just use identical key and value for it.
+ diff --git a/test_queries.txt b/test_queries.txt index 74a81ea..afd9716 100644 --- a/test_queries.txt +++ b/test_queries.txt @@ -15,4 +15,5 @@ http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=[lemma=%2 http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=[pos=%22NOUN%22]&x-fcs-dataviews=adv&queryType=fcs Tsakorpus -- Advanced search, term only / POS (NOUN, 22639 hits) http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=[lemma%3D%22m.*%22%20%26%20(pos=%22NOUN%22|pos=%22VERB%22)]&x-fcs-dataviews=adv&queryType=fcs Tsakorpus -- Advanced search, Boolean operators inside term query (words that start with "m" and are either nouns or verbs, 3878 hits) http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=[text=%22ud%22]%20[]{1,2}%20[text=%22no%22]&x-fcs-dataviews=adv&queryType=fcs Tsakorpus -- Advanced search, sequence of terms ("ud" followed by "no" at the distance between 2 and 3, 7 hits) -http://127.0.0.1:5000/fcs-endpoint/annis_test?operation=searchRetrieve&query=CAN2B%20AND%20WHY1* ANNIS -- Simple search with boolean operator ("CAN2B" AND "WHY1*" within one text, 140 hits) \ No newline at end of file +http://127.0.0.1:5000/fcs-endpoint/annis_test?operation=searchRetrieve&query=CAN2B%20AND%20WHY1* ANNIS -- Simple search with boolean operator ("CAN2B" AND "WHY1*" within one text, 140 hits) +http://127.0.0.1:5000/fcs-endpoint/annis_test?operation=searchRetrieve&query=BOSS1B AND CAN2B AND WHY1*&x-fcs-dataviews=adv ANNIS -- Simple search with boolean operators, advanced view (5 hits) \ No newline at end of file -- GitLab