Skip to content
Snippets Groups Projects
Commit efe4527e authored by Timofey Arkhangelskiy's avatar Timofey Arkhangelskiy
Browse files

ANNIS response parser works now

parent 7065e47b
Branches
No related tags found
No related merge requests found
...@@ -14,54 +14,174 @@ class AnnisResponseParser: ...@@ -14,54 +14,174 @@ class AnnisResponseParser:
Parses responses from an ANNIS instance. Parses responses from an ANNIS instance.
""" """
rxNodeIDPfx = re.compile('^[^/]*::')
def __init__(self): def __init__(self):
self.pc = None # POS convertor, rebuilt with each parse call self.pc = None # POS convertor, rebuilt with each parse call
def process_hits(self, hits, config: ResourceConfig, searchOptions: dict, def node_anno_value(self, node):
diagnostics: list[Diagnostic], advancedHits=False):
""" """
Process hits returned by ANNIS, which take the form of Return annotation value for a node, represented by
graphML strings. a node data dictionary.
If anything goes wrong, add Diagnostic objects to diagnostics list.
Return a list of Record objects.
""" """
records = []
for iHit in range(len(hits)):
try: try:
hits[iHit] = nx.parse_graphml(hits[iHit].decode('utf-8')) return node['annis::tok']
except: except KeyError:
raise Diagnostic(DiagnosticTypes.sru, 1, pass
details='Could not parse graphML data returned by the ANNIS instance.') return ''
for hit in hits: def node_tier_value(self, node, config):
# for node, data in hit.nodes(data=True): """
# print(node, data) Return tier name for a node, represented by
a node data dictionary.
"""
for k in node:
if k.startswith('annis::'):
continue
k = self.rxNodeIDPfx.sub('', k)
if k in config.tier_convert:
return config.tier_convert[k]
return None
def process_token_sequences(self, seqs, hit, nodeData, highlightNodes,
                            config: ResourceConfig, searchOptions: dict,
                            diagnostics: list[Diagnostic], advancedHits=False):
    """
    Extract information from several sequences of token nodes
    and their descendants, which represent one search hit.

    seqs is a list of token-node-ID sequences (one per disjoint text
    segment found by ANNIS); nodeData maps node IDs to their attribute
    dictionaries; highlightNodes lists node IDs that matched the query
    and therefore have to be wrapped in <hits:Hit> in the output.
    Return a Record object with all the data.
    """
    record = Record(advancedHits=advancedHits)
    for iSeq in range(len(seqs)):
        seq = seqs[iSeq]
        for tokenNodeID in seq:
            node = nodeData[tokenNodeID]
            token = self.node_anno_value(node)
            # Segments and annotation layers are only part of the
            # Advanced Data View. Guarding the whole extraction with
            # advancedHits also fixes a bug: previously segID was only
            # assigned inside the advancedHits branch but referenced
            # unconditionally below, raising NameError for basic hits.
            if advancedHits:
                segID = 's' + str(len(record.segments))
                segment = {
                    'id': segID,
                    'start': len(record.textNoHighlight) + 1,
                    'end': len(record.textNoHighlight) + len(token)
                }
                record.segments.append(segment)
                # Now extract data from all relevant tiers.
                # This is tricky: nodes are linked not only to the
                # respective annotation nodes, but also to the next
                # node on the same level. So we use depth-first search
                # and break once we see another node from a layer
                # we have already seen.
                usedLayers = set()
                # First, the token tier itself (if configured).
                tierName = self.node_tier_value(node, config)
                if tierName is not None:
                    if tierName not in record.layers:
                        record.layers[tierName] = []
                    record.layers[tierName].append({
                        'ref': segID,
                        'value': token
                    })
                # Then all its annotations, found in descendant nodes.
                for e in nx.dfs_edges(hit, tokenNodeID):
                    descendantNode = nodeData[e[1]]
                    annoTierName = self.node_tier_value(descendantNode, config)
                    annoValue = self.node_anno_value(descendantNode)
                    if annoTierName is not None and annoTierName in usedLayers:
                        # A second node from an already-seen layer means we
                        # have crossed into the next token's annotations.
                        break
                    usedLayers.add(annoTierName)
                    if annoTierName is not None and len(annoValue) > 0:
                        if annoTierName not in record.layers:
                            record.layers[annoTierName] = []
                        record.layers[annoTierName].append({
                            'ref': segID,
                            'value': annoValue
                        })
            record.textNoHighlight += token + ' '
            if tokenNodeID in highlightNodes:
                record.text += '<hits:Hit>' + token + '</hits:Hit> '
            else:
                record.text += token + ' '
        if iSeq < len(seqs) - 1:
            # Separate disjoint text segments of the same hit.
            record.textNoHighlight += '... '
            record.text += '... '
    return record
def process_subgraph(self, hit, highlightNodes,
config: ResourceConfig, searchOptions: dict,
diagnostics: list[Diagnostic], advancedHits=False):
"""
Process one hit returned by ANNIS, stored as a networkx graph.
Return a Record object.
"""
highlightNodes = [self.rxNodeIDPfx.sub('', nodeID) for nodeID in highlightNodes]
# An annotated text segment in ANNIS is a subgraph, where some
# nodes are descendant of another nodes, e.g. tokens are descendants
# of a text. This is too complicated for an FCS output, where you
# have to have tokens, each of which can have some annotation at
# different layers, such as lemma or pos. So we only look at the
# ANNIS subgraph nodes that are labeled as belonging to tokenLayer,
# count them as tokens, and look for their annotation in their
# descendant nodes.
tokenLayer = 'tok'
if 'text' in config.tier_convert_reverse:
tokenLayer = config.tier_convert_reverse['text']
# ANNIS can find several disjoint segments that belong to the same
# text, each of which contains at least one search term. If this is
# the case, there will be several disconnected subgraphs in the response,
# so there can be multiple roots.
roots = [n for n, d in hit.in_degree() if d == 0] roots = [n for n, d in hit.in_degree() if d == 0]
textSegments = [] tokenSequences = []
nodeData = { nodeData = {
node[0]: node[1] for node in hit.nodes(data=True) node[0]: node[1] for node in hit.nodes(data=True)
} }
for root in roots: for root in roots:
usedNodes = set() usedNodes = set()
textSegment = '' tokenSequences.append([])
for e in nx.bfs_edges(hit, root): for e in nx.bfs_edges(hit, root):
# print(hit.get_edge_data(e[0], e[1]))
for side in (0, 1): for side in (0, 1):
if e[side] in usedNodes: if e[side] in usedNodes:
continue continue
usedNodes.add(e[side]) usedNodes.add(e[side])
node = nodeData[e[side]] node = nodeData[e[side]]
tok = ''
tier = ''
for k in node.keys(): for k in node.keys():
if k == 'annis::tok': if not k.startswith('annis::') and self.rxNodeIDPfx.sub('', k) == tokenLayer:
tok = node[k] tokenSequences[-1].append(e[side])
elif k.endswith('Gloss'): # if k == 'annis::tok':
tier = 'Gloss' # tok = node[k]
if tier == 'Gloss': record = self.process_token_sequences(tokenSequences, hit, nodeData, highlightNodes,
textSegment += tok + ' ' config, searchOptions,
if len(textSegment) > 0: diagnostics, advancedHits)
textSegments.append(textSegment.strip()) return record
records.append(' <...> '.join(textSegments))
def process_hits(self, hits, highlightNodes,
config: ResourceConfig, searchOptions: dict,
diagnostics: list[Diagnostic], advancedHits=False):
"""
Process hits returned by ANNIS, which take the form of
graphML strings.
If anything goes wrong, add Diagnostic objects to diagnostics list.
Return a list of Record objects.
"""
records = []
for iHit in range(len(hits)):
try:
hits[iHit] = nx.parse_graphml(hits[iHit].decode('utf-8'))
except:
raise Diagnostic(DiagnosticTypes.sru, 1,
details='Could not parse graphML data returned by the ANNIS instance.')
for iHit in range(len(hits)):
# for node, data in hit.nodes(data=True):
# print(node, data)
record = self.process_subgraph(hits[iHit], highlightNodes[iHit], config,
searchOptions, diagnostics, advancedHits)
if record is not None:
records.append(record)
return records return records
def parse(self, responseData, config: ResourceConfig, searchOptions: dict): def parse(self, responseData, config: ResourceConfig, searchOptions: dict):
...@@ -88,7 +208,8 @@ class AnnisResponseParser: ...@@ -88,7 +208,8 @@ class AnnisResponseParser:
nRecords = 0 nRecords = 0
else: else:
try: try:
records = self.process_hits(responseData['hits'], config, searchOptions, records = self.process_hits(responseData['hits'], responseData['hit_ids'],
config, searchOptions,
diagnostics, advancedHits=advancedHits) diagnostics, advancedHits=advancedHits)
except Diagnostic as d: except Diagnostic as d:
diagnostics.append(d) diagnostics.append(d)
......
...@@ -45,6 +45,7 @@ class ResourceConfig: ...@@ -45,6 +45,7 @@ class ResourceConfig:
self.search_lang_id = '' self.search_lang_id = ''
self.pos_convert = [] # corpus-specific to UD (regexes) self.pos_convert = [] # corpus-specific to UD (regexes)
self.pos_convert_reverse = {} # UD to corpus-specific self.pos_convert_reverse = {} # UD to corpus-specific
self.tier_convert = {} # corpus-specific tier IDs to Advanced view layer labels
self.tier_convert_reverse = {} # FCS to corpus-specific tier IDs self.tier_convert_reverse = {} # FCS to corpus-specific tier IDs
self.query_timeout = 60 self.query_timeout = 60
......
...@@ -114,7 +114,6 @@ class TsakorpusResponseParser: ...@@ -114,7 +114,6 @@ class TsakorpusResponseParser:
record.text += el.tail record.text += el.tail
record.textNoHighlight += el.tail record.textNoHighlight += el.tail
def parse_context(self, hit, config: ResourceConfig, lang='', advancedHits=False): def parse_context(self, hit, config: ResourceConfig, lang='', advancedHits=False):
""" """
Parse one hit. Return it as a Record object. Parse one hit. Return it as a Record object.
...@@ -134,7 +133,6 @@ class TsakorpusResponseParser: ...@@ -134,7 +133,6 @@ class TsakorpusResponseParser:
self.parse_span(el, record, advancedHits) self.parse_span(el, record, advancedHits)
return record return record
def parse(self, response, config: ResourceConfig, searchOptions: dict, lang=''): def parse(self, response, config: ResourceConfig, searchOptions: dict, lang=''):
""" """
Read a dictionary with the first N hits returned by a Tsakorpus Read a dictionary with the first N hits returned by a Tsakorpus
......
...@@ -209,18 +209,18 @@ def process_search_retrieve(version: SRUVersion, ...@@ -209,18 +209,18 @@ def process_search_retrieve(version: SRUVersion,
records, nHits, diagnostics = app.rp_annis.parse(res, config, searchOptions) records, nHits, diagnostics = app.rp_annis.parse(res, config, searchOptions)
if any(diag.is_fatal() for diag in diagnostics): if any(diag.is_fatal() for diag in diagnostics):
return fatal_response(Operation.searchRetrieve, version, config, diagnostics, request, templates) return fatal_response(Operation.searchRetrieve, version, config, diagnostics, request, templates)
return records # return records
# records = [r.as_dict() for r in records] records = [r.as_dict() for r in records]
# diagnostics = [str(d) for d in diagnostics] diagnostics = [str(d) for d in diagnostics]
# return templates.TemplateResponse('search_retrieve_response.xml', return templates.TemplateResponse('search_retrieve_response.xml',
# { {
# 'request': request, 'request': request,
# 'n_hits': nHits, 'n_hits': nHits,
# 'records': records, 'records': records,
# 'version': templateVersion, 'version': templateVersion,
# 'diagnostics': diagnostics 'diagnostics': diagnostics
# }, },
# media_type='application/xml') media_type='application/xml')
if config.platform == CorpPlatform.tsakorpus: if config.platform == CorpPlatform.tsakorpus:
try: try:
if queryType == QueryType.cql: if queryType == QueryType.cql:
......
{ {
"host": "https://www.sign-lang.uni-hamburg.de/dgs-korpus/ (replace later)", "host": "https://dock.fdm.uni-hamburg.de/meinedgs/",
"port": "80", "port": "80",
"transport_protocol": "https", "transport_protocol": "https",
"max_hits": 17, "max_hits": 17,
...@@ -9,6 +9,10 @@ ...@@ -9,6 +9,10 @@
"resource_base_url": "http://adwhh1.server.uni-hamburg.de:17101", "resource_base_url": "http://adwhh1.server.uni-hamburg.de:17101",
"annis_corpus_id": "DGS-Corpus-r3-en", "annis_corpus_id": "DGS-Corpus-r3-en",
"annis_context_size": 5, "annis_context_size": 5,
"tier_convert": {
"Gloss": "word",
"GlossType": "lemma"
},
"tier_convert_reverse": { "tier_convert_reverse": {
"text": "Gloss", "text": "Gloss",
"lemma": "GlossType" "lemma": "GlossType"
......
...@@ -73,3 +73,13 @@ POS tags are required to be in the UD_ standard, per FCS specifications. If a co ...@@ -73,3 +73,13 @@ POS tags are required to be in the UD_ standard, per FCS specifications. If a co
- ``pos_convert_reverse`` (dictionary) -- rules that convert UD tags from a query to corpus-specific tags or expressions. Keys are UD tags, values are expressions they have to be replaced with. - ``pos_convert_reverse`` (dictionary) -- rules that convert UD tags from a query to corpus-specific tags or expressions. Keys are UD tags, values are expressions they have to be replaced with.
.. _UD: https://universaldependencies.org/u/pos/ .. _UD: https://universaldependencies.org/u/pos/
ANNIS tier configuration
~~~~~~~~~~~~~~~~~~~~~~~~
There are two parameters that define how tier/layer names in the search query should map to the layer names in ANNIS annotations, and how those should map to what is returned to the client.
- ``tier_convert_reverse`` (dictionary) -- tells the endpoint which tier names in the query should be mapped to differently named tiers in ANNIS. For example, if it contains a key-value pair ``"lemma": "GlossType"`` and the query is ``lemma="CAN2B"``, then the value ``CAN2B`` will be searched in the tiers named ``GlossType`` (possibly with a ``::`` prefix, e.g. ``PersonA::GlossType``). By default, ``text`` is mapped to ``tok`` and all the rest is left as is. An ANNIS tier indicated as an equivalent of ``text`` here is treated as a token-level tier.
- ``tier_convert`` (dictionary) -- tells the endpoint which tiers from ANNIS should end up in the response, and (possibly) what they should be named in the output XML (``<Layer id="layer_name">``). Tiers not listed in this dictionary, apart from the token tier, will be disregarded. If you want to have a tier in the output, but do not want to rename it, just use identical key and value for it.
...@@ -16,3 +16,4 @@ http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=[pos=%22N ...@@ -16,3 +16,4 @@ http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=[pos=%22N
http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=[lemma%3D%22m.*%22%20%26%20(pos=%22NOUN%22|pos=%22VERB%22)]&x-fcs-dataviews=adv&queryType=fcs Tsakorpus -- Advanced search, Boolean operators inside term query (words that start with "m" and are either nouns or verbs, 3878 hits) http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=[lemma%3D%22m.*%22%20%26%20(pos=%22NOUN%22|pos=%22VERB%22)]&x-fcs-dataviews=adv&queryType=fcs Tsakorpus -- Advanced search, Boolean operators inside term query (words that start with "m" and are either nouns or verbs, 3878 hits)
http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=[text=%22ud%22]%20[]{1,2}%20[text=%22no%22]&x-fcs-dataviews=adv&queryType=fcs Tsakorpus -- Advanced search, sequence of terms ("ud" followed by "no" at the distance between 2 and 3, 7 hits) http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=[text=%22ud%22]%20[]{1,2}%20[text=%22no%22]&x-fcs-dataviews=adv&queryType=fcs Tsakorpus -- Advanced search, sequence of terms ("ud" followed by "no" at the distance between 2 and 3, 7 hits)
http://127.0.0.1:5000/fcs-endpoint/annis_test?operation=searchRetrieve&query=CAN2B%20AND%20WHY1* ANNIS -- Simple search with boolean operator ("CAN2B" AND "WHY1*" within one text, 140 hits) http://127.0.0.1:5000/fcs-endpoint/annis_test?operation=searchRetrieve&query=CAN2B%20AND%20WHY1* ANNIS -- Simple search with boolean operator ("CAN2B" AND "WHY1*" within one text, 140 hits)
http://127.0.0.1:5000/fcs-endpoint/annis_test?operation=searchRetrieve&query=BOSS1B AND CAN2B AND WHY1*&x-fcs-dataviews=adv ANNIS -- Simple search with boolean operators, advanced view (5 hits)
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment