Skip to content
Snippets Groups Projects
Commit 7065e47b authored by Timofey Arkhangelskiy's avatar Timofey Arkhangelskiy
Browse files

Start response processing for ANNIS (not complete yet)

parent 3e45b565
Branches
No related tags found
No related merge requests found
......@@ -12,9 +12,9 @@ class AnnisQueryParser(QueryParser):
Parses search queries for ANNIS-based corpora.
"""
rxTsakorpusBool = re.compile('[()|,]')
rxRelOps = re.compile('^(?:\\^\\*|\\||\\.[*,0-9]*)|_=_$') # Operators for setting relations between query words
rxFramingQuotes = re.compile('^[/"]|(?<!\\\\)[/"]$')
rxNodeIDPfx = re.compile('^[^/]*::')
def build_get_string(self, params, config: ResourceConfig,
searchOptions: dict, withinClause=''):
......@@ -40,7 +40,6 @@ class AnnisQueryParser(QueryParser):
queryTail = ''
params = self.rename_params(params, config)
for param in sorted(params):
print(param)
# For query words:
# param = [annotation_layer, query_word_number, value, operator]
# For relations between query words:
......@@ -230,21 +229,34 @@ class AnnisQueryParser(QueryParser):
"""
res = {
'n_hits': -1,
'hit_ids': ''
'hit_ids': [],
'hits': []
}
urlCount = config.resource_base_url.strip('/') + '/v1/search/count'
print(urlCount)
response = requests.post(urlCount, json=query, timeout=60)
response = requests.post(urlCount, json=query, timeout=config.query_timeout)
try:
res['n_hits'] = response.json()['match_count']
except:
pass
if res['n_hits'] > 0:
# First, find IDs for the matches
urlFind = config.resource_base_url.strip('/') + '/v1/search/find'
print(urlFind)
response = requests.post(urlFind, json=query, timeout=60)
res['hit_ids'] = response.content.decode('utf-8').strip('\n').split('\n')
print(res)
response = requests.post(urlFind, json=query, timeout=config.query_timeout)
res['hit_ids'] = [list(tokenIDs.split(' '))
for tokenIDs in response.content.decode('utf-8').strip('\n').split('\n')
if len(tokenIDs) > 0]
# Second, find subgraphs including those matches and some context
urlSubgraph = config.resource_base_url.strip('/') + '/v1/corpora/' \
+ config.annis_corpus_id + '/subgraph'
for hitIDs in res['hit_ids']:
subgraphQuery = {
'node_ids': [self.rxNodeIDPfx.sub('', hitID)
for hitID in hitIDs],
'left': config.annis_context_size,
'right': config.annis_context_size
}
response = requests.post(urlSubgraph, json=subgraphQuery, timeout=config.query_timeout)
res['hits'].append(response.content)
return res
......
from urllib.parse import quote
import re
import networkx as nx
import matplotlib.pyplot as plt
from lxml.html import fromstring, tostring
from .enums import *
from .config import ResourceConfig
from .search_retrieve import Record
from .diagnostics import Diagnostic, DiagnosticTypes
class AnnisResponseParser:
    """
    Parses responses from an ANNIS instance.
    """

    def __init__(self):
        # POS convertor slot. It is only initialised here; none of the
        # methods of this class assigns or reads it yet.
        self.pc = None

    def _hit_to_text(self, hit):
        """
        Turn one parsed hit graph into a plain-text string.

        Starting from every node without incoming edges, the graph is
        walked breadth-first along its edges. Each node is visited at most
        once per root; if any of its attribute names ends in 'Gloss', the
        value of its 'annis::tok' attribute (or '' if absent) is taken as
        a token. Tokens found under one root are joined with spaces, and
        the resulting segments are joined with ' <...> '.
        """
        nodeData = {nodeID: attrs for nodeID, attrs in hit.nodes(data=True)}
        roots = [nodeID for nodeID, degree in hit.in_degree() if degree == 0]
        textSegments = []
        for root in roots:
            usedNodes = set()
            tokens = []
            # NOTE: only nodes that are an endpoint of some edge reachable
            # from the root are looked at; a root without outgoing edges
            # contributes nothing.
            for edge in nx.bfs_edges(hit, root):
                for nodeID in edge:
                    if nodeID in usedNodes:
                        continue
                    usedNodes.add(nodeID)
                    attrs = nodeData[nodeID]
                    if any(key.endswith('Gloss') for key in attrs):
                        tokens.append(attrs.get('annis::tok', ''))
            if tokens:
                textSegments.append(' '.join(tokens).strip())
        return ' <...> '.join(textSegments)

    def process_hits(self, hits, config: ResourceConfig, searchOptions: dict,
                     diagnostics: list[Diagnostic], advancedHits=False):
        """
        Process hits returned by ANNIS, which take the form of
        UTF-8 encoded graphML byte strings.

        config, searchOptions, diagnostics and advancedHits are accepted
        but not used by the current (incomplete) implementation.

        Raise a Diagnostic (sru, 1) if any of the hits cannot be parsed
        as graphML; in that case no hit is converted.
        Return a list with one plain-text string per hit (not Record
        objects yet).
        """
        # Parse everything into a separate list: the caller's list
        # (responseData['hits']) must keep its raw payloads untouched.
        graphs = []
        for hit in hits:
            try:
                graphs.append(nx.parse_graphml(hit.decode('utf-8')))
            except Exception as err:
                raise Diagnostic(DiagnosticTypes.sru, 1,
                                 details='Could not parse graphML data returned by the ANNIS instance.') from err
        return [self._hit_to_text(graph) for graph in graphs]

    def parse(self, responseData, config: ResourceConfig, searchOptions: dict):
        """
        Read graphML response with the first N hits returned by an ANNIS
        instance. The hits to process are stored in responseData['hits'],
        the total number of hits in responseData['n_hits'].

        searchOptions must contain 'x-fcs-dataviews' (comma-separated
        string) and 'startRecord' (number).

        Return a tuple (records, nRecords, diagnostics): the processed
        hits, the total number of records found (reset to 0 if the list
        of hits is empty) and the list of Diagnostic objects collected
        on the way.
        """
        diagnostics = []
        dataViewsRequested = {
            view.strip()
            for view in searchOptions['x-fcs-dataviews'].split(',')
            if view.strip()
        }
        advancedHits = 'adv' in dataViewsRequested
        nRecords = responseData['n_hits']
        records = []
        startRecord = searchOptions['startRecord']
        if startRecord > 1 and nRecords < startRecord:
            # We don't actually care about startRecord, but we should
            # return a fatal diagnostic if it is larger than the number
            # of hits.
            diagnostics.append(Diagnostic(DiagnosticTypes.sru, 61))
            return records, nRecords, diagnostics
        if not responseData['hits']:
            nRecords = 0
            return records, nRecords, diagnostics
        try:
            records = self.process_hits(responseData['hits'], config, searchOptions,
                                        diagnostics, advancedHits=advancedHits)
        except Diagnostic as d:
            # A hit that cannot be parsed is reported, not propagated.
            diagnostics.append(d)
        return records, nRecords, diagnostics
# Script entry point: intentionally a no-op, nothing is executed when this
# module is run directly.
if __name__ == '__main__':
    pass
......@@ -26,7 +26,8 @@ class ResourceConfig:
self.port = '5000'
self.url_path = '127.0.0.1'
self.resource_base_url = 'http://127.0.0.1'
self.annis_corpus_id = '' # ANNIS-internal ID of the corpus to search in
self.annis_corpus_id = '' # ANNIS-internal ID of the corpus to search in
self.annis_context_size = 5 # Context size for hits rendering (ANNIS only)
self.titles = []
self.descriptions = []
self.authors = []
......
......@@ -16,6 +16,7 @@ class Diagnostic(Exception):
stdMessages = {
(DiagnosticTypes.fcs, 4): 'Requested Data View not valid for this resource.',
(DiagnosticTypes.sru, 1): 'General system error.',
(DiagnosticTypes.sru, 4): 'Unsupported operation. Supported operation: explain, searchRetrieve, scan.',
(DiagnosticTypes.sru, 5): 'Unsupported version. Supported SRU versions: 1.2 and 2.0.',
(DiagnosticTypes.sru, 8): 'Unsupported parameter.',
......
......@@ -202,15 +202,14 @@ def process_search_retrieve(version: SRUVersion,
query = app.qp_annis.translate_simple(query, config, searchOptions)
else:
query = app.qp_annis.translate_advanced(query, config, searchOptions)
print(query)
# print(query)
res = app.qp_annis.send_query(query, config)
except Diagnostic as diag:
return fatal_response(Operation.searchRetrieve, version, config, diagnostics + [diag], request, templates)
# return query['query']
return res
# records, nHits, diagnostics = app.rp_annis.parse(res, config, searchOptions['x-fcs-dataviews'])
# if any(diag.is_fatal() for diag in diagnostics):
# return fatal_response(Operation.searchRetrieve, version, config, diagnostics, request, templates)
records, nHits, diagnostics = app.rp_annis.parse(res, config, searchOptions)
if any(diag.is_fatal() for diag in diagnostics):
return fatal_response(Operation.searchRetrieve, version, config, diagnostics, request, templates)
return records
# records = [r.as_dict() for r in records]
# diagnostics = [str(d) for d in diagnostics]
# return templates.TemplateResponse('search_retrieve_response.xml',
......@@ -228,7 +227,6 @@ def process_search_retrieve(version: SRUVersion,
strGetParams = app.qp_tsakorpus.translate_simple(query, config, searchOptions)
else:
strGetParams = app.qp_tsakorpus.translate_advanced(query, config, searchOptions)
print(strGetParams)
res = app.qp_tsakorpus.send_query(strGetParams, config)
except Diagnostic as diag:
return fatal_response(Operation.searchRetrieve, version, config, diagnostics + [diag], request, templates)
......@@ -253,9 +251,7 @@ def process_search_retrieve(version: SRUVersion,
else:
# No advanced search for Litterae
strGetParams = app.qp_litterae.translate_simple(query, config, searchOptions)
# print(strGetParams)
res = app.qp_litterae.send_query(strGetParams, config)
print(res)
except Diagnostic as diag:
return fatal_response(Operation.searchRetrieve, version, config, diagnostics + [diag], request, templates)
for dv in searchOptions['x-fcs-dataviews'].split(','):
......@@ -297,7 +293,7 @@ def process_request(operation: Operation,
:param diagnostics: List of diagnostics produced by the validation
function.
"""
print(query)
# print(query)
# If something is clearly wrong with the query, return
# a response with the list of diagnostics
if config is None or any(d.is_fatal() for d in diagnostics):
......
......@@ -8,6 +8,7 @@
"adv_supported": true,
"resource_base_url": "http://adwhh1.server.uni-hamburg.de:17101",
"annis_corpus_id": "DGS-Corpus-r3-en",
"annis_context_size": 5,
"tier_convert_reverse": {
"text": "Gloss",
"lemma": "GlossType"
......
......@@ -9,6 +9,7 @@ from common.litterae_response_parser import LitteraeResponseParser
from common.tsakorpus_query_parser import TsakorpusQueryParser
from common.tsakorpus_response_parser import TsakorpusResponseParser
from common.annis_query_parser import AnnisQueryParser
from common.annis_response_parser import AnnisResponseParser
from common.enums import *
from common.diagnostics import Diagnostic
from common.config import ResourceConfig, read_configs
......@@ -28,6 +29,7 @@ app.rp_litterae = LitteraeResponseParser()
app.qp_tsakorpus = TsakorpusQueryParser()
app.rp_tsakorpus = TsakorpusResponseParser()
app.qp_annis = AnnisQueryParser()
app.rp_annis = AnnisResponseParser()
app.configs = read_configs()
app.logging = True
......
......@@ -3,4 +3,5 @@ uvicorn>=0.20.0
lxml
Jinja2>=3.0.3
requests
a2wsgi
\ No newline at end of file
a2wsgi
networkx
\ No newline at end of file
......@@ -14,4 +14,5 @@ http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=[text=%22
http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=[lemma=%22mon%22]&x-fcs-dataviews=adv&queryType=fcs Tsakorpus -- Advanced search, term only / lemma ("mon", 2284 hits)
http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=[pos=%22NOUN%22]&x-fcs-dataviews=adv&queryType=fcs Tsakorpus -- Advanced search, term only / POS (NOUN, 22639 hits)
http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=[lemma%3D%22m.*%22%20%26%20(pos=%22NOUN%22|pos=%22VERB%22)]&x-fcs-dataviews=adv&queryType=fcs Tsakorpus -- Advanced search, Boolean operators inside term query (words that start with "m" and are either nouns or verbs, 3878 hits)
http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=[text=%22ud%22]%20[]{1,2}%20[text=%22no%22]&x-fcs-dataviews=adv&queryType=fcs Tsakorpus -- Advanced search, sequence of terms ("ud" followed by "no" at the distance between 2 and 3, 7 hits)
\ No newline at end of file
http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=[text=%22ud%22]%20[]{1,2}%20[text=%22no%22]&x-fcs-dataviews=adv&queryType=fcs Tsakorpus -- Advanced search, sequence of terms ("ud" followed by "no" at the distance between 2 and 3, 7 hits)
http://127.0.0.1:5000/fcs-endpoint/annis_test?operation=searchRetrieve&query=CAN2B%20AND%20WHY1* ANNIS -- Simple search with boolean operator ("CAN2B" AND "WHY1*" within one text, 140 hits)
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment