Skip to content
Snippets Groups Projects
Commit 7065e47b authored by Timofey Arkhangelskiy
Browse files

Start response processing for ANNIS (not complete yet)

parent 3e45b565
No related branches found
No related tags found
No related merge requests found
...@@ -12,9 +12,9 @@ class AnnisQueryParser(QueryParser): ...@@ -12,9 +12,9 @@ class AnnisQueryParser(QueryParser):
Parses search queries for ANNIS-based corpora. Parses search queries for ANNIS-based corpora.
""" """
rxTsakorpusBool = re.compile('[()|,]')
rxRelOps = re.compile('^(?:\\^\\*|\\||\\.[*,0-9]*)|_=_$') # Operators for setting relations between query words rxRelOps = re.compile('^(?:\\^\\*|\\||\\.[*,0-9]*)|_=_$') # Operators for setting relations between query words
rxFramingQuotes = re.compile('^[/"]|(?<!\\\\)[/"]$') rxFramingQuotes = re.compile('^[/"]|(?<!\\\\)[/"]$')
rxNodeIDPfx = re.compile('^[^/]*::')
def build_get_string(self, params, config: ResourceConfig, def build_get_string(self, params, config: ResourceConfig,
searchOptions: dict, withinClause=''): searchOptions: dict, withinClause=''):
...@@ -40,7 +40,6 @@ class AnnisQueryParser(QueryParser): ...@@ -40,7 +40,6 @@ class AnnisQueryParser(QueryParser):
queryTail = '' queryTail = ''
params = self.rename_params(params, config) params = self.rename_params(params, config)
for param in sorted(params): for param in sorted(params):
print(param)
# For query words: # For query words:
# param = [annotation_layer, query_word_number, value, operator] # param = [annotation_layer, query_word_number, value, operator]
# For relations between query words: # For relations between query words:
...@@ -230,21 +229,34 @@ class AnnisQueryParser(QueryParser): ...@@ -230,21 +229,34 @@ class AnnisQueryParser(QueryParser):
""" """
res = { res = {
'n_hits': -1, 'n_hits': -1,
'hit_ids': '' 'hit_ids': [],
'hits': []
} }
urlCount = config.resource_base_url.strip('/') + '/v1/search/count' urlCount = config.resource_base_url.strip('/') + '/v1/search/count'
print(urlCount) response = requests.post(urlCount, json=query, timeout=config.query_timeout)
response = requests.post(urlCount, json=query, timeout=60)
try: try:
res['n_hits'] = response.json()['match_count'] res['n_hits'] = response.json()['match_count']
except: except:
pass pass
if res['n_hits'] > 0: if res['n_hits'] > 0:
# First, find IDs for the matches
urlFind = config.resource_base_url.strip('/') + '/v1/search/find' urlFind = config.resource_base_url.strip('/') + '/v1/search/find'
print(urlFind) response = requests.post(urlFind, json=query, timeout=config.query_timeout)
response = requests.post(urlFind, json=query, timeout=60) res['hit_ids'] = [list(tokenIDs.split(' '))
res['hit_ids'] = response.content.decode('utf-8').strip('\n').split('\n') for tokenIDs in response.content.decode('utf-8').strip('\n').split('\n')
print(res) if len(tokenIDs) > 0]
# Second, find subgraphs including those matches and some context
urlSubgraph = config.resource_base_url.strip('/') + '/v1/corpora/' \
+ config.annis_corpus_id + '/subgraph'
for hitIDs in res['hit_ids']:
subgraphQuery = {
'node_ids': [self.rxNodeIDPfx.sub('', hitID)
for hitID in hitIDs],
'left': config.annis_context_size,
'right': config.annis_context_size
}
response = requests.post(urlSubgraph, json=subgraphQuery, timeout=config.query_timeout)
res['hits'].append(response.content)
return res return res
......
from urllib.parse import quote
import re
import networkx as nx
import matplotlib.pyplot as plt
from lxml.html import fromstring, tostring
from .enums import *
from .config import ResourceConfig
from .search_retrieve import Record
from .diagnostics import Diagnostic, DiagnosticTypes
class AnnisResponseParser:
    """
    Parses responses from an ANNIS instance.
    """

    def __init__(self):
        # POS convertor; meant to be rebuilt with each parse call
        # (not used by any method of this class yet).
        self.pc = None

    @staticmethod
    def _gloss_segments(graph):
        """
        Collect the gloss text of one hit graph.
        For every root of the graph (a node without incoming edges),
        walk the edges reachable from it breadth-first and gather the
        'annis::tok' values of all nodes that carry an attribute whose
        name ends in 'Gloss'. Each root that yields at least one such
        node contributes one space-separated text segment.
        Return the list of segments.
        """
        segments = []
        roots = [node for node, degree in graph.in_degree() if degree == 0]
        for root in roots:
            # Nodes are tracked per root, so a node reachable from two
            # roots is included in both segments.
            usedNodes = set()
            tokens = []
            for edge in nx.bfs_edges(graph, root):
                for nodeID in edge:
                    if nodeID in usedNodes:
                        continue
                    usedNodes.add(nodeID)
                    attrs = graph.nodes[nodeID]
                    if any(k.endswith('Gloss') for k in attrs):
                        # Gloss nodes without a token value still count
                        # and contribute an empty token.
                        tokens.append(attrs.get('annis::tok', ''))
            if tokens:
                segments.append(' '.join(tokens).strip())
        return segments

    def process_hits(self, hits, config: ResourceConfig, searchOptions: dict,
                     diagnostics: list[Diagnostic], advancedHits=False):
        """
        Process hits returned by ANNIS, which take the form of
        graphML strings encoded as UTF-8 bytes.
        The hits list itself is left untouched, so it can safely be
        processed more than once.
        If the graphML of any hit cannot be parsed, raise a Diagnostic
        (SRU diagnostic 1) chained to the original error.
        Return a list with one entry per hit. At the moment each entry
        is a plain string: the gloss segments of the hit joined
        by ' <...> '. The config, searchOptions, diagnostics and
        advancedHits arguments are accepted but not used yet.
        """
        graphs = []
        for hit in hits:
            try:
                graphs.append(nx.parse_graphml(hit.decode('utf-8')))
            except Exception as err:
                # Only ordinary errors are converted; KeyboardInterrupt
                # and SystemExit are allowed to propagate.
                raise Diagnostic(DiagnosticTypes.sru, 1,
                                 details='Could not parse graphML data returned by the ANNIS instance.') from err
        return [' <...> '.join(self._gloss_segments(graph)) for graph in graphs]

    def parse(self, responseData, config: ResourceConfig, searchOptions: dict):
        """
        Read graphML response with the first N hits returned by an ANNIS
        instance. The hits to process are stored in responseData['hits'],
        the total number of matches in responseData['n_hits'].
        Return a tuple (records, number of records found, diagnostics).
        The number of records is reported as 0 if there are no hits
        to process.
        """
        diagnostics = []
        dataViewsRequested = {v.strip() for v in searchOptions['x-fcs-dataviews'].split(',')
                              if v.strip()}
        advancedHits = 'adv' in dataViewsRequested
        nRecords = responseData['n_hits']
        records = []
        if searchOptions['startRecord'] > 1 and nRecords < searchOptions['startRecord']:
            # We don't actually care about startRecord, but we should
            # return a fatal diagnostic if it is larger than the number
            # of hits.
            diagnostics.append(Diagnostic(DiagnosticTypes.sru, 61))
            return records, nRecords, diagnostics
        if not responseData['hits']:
            nRecords = 0
        else:
            try:
                records = self.process_hits(responseData['hits'], config, searchOptions,
                                            diagnostics, advancedHits=advancedHits)
            except Diagnostic as d:
                # Report the failure as a diagnostic instead of crashing;
                # records stays empty in that case.
                diagnostics.append(d)
        return records, nRecords, diagnostics
if __name__ == '__main__':
    # No standalone behaviour: running this module directly does nothing.
    pass
...@@ -27,6 +27,7 @@ class ResourceConfig: ...@@ -27,6 +27,7 @@ class ResourceConfig:
self.url_path = '127.0.0.1' self.url_path = '127.0.0.1'
self.resource_base_url = 'http://127.0.0.1' self.resource_base_url = 'http://127.0.0.1'
self.annis_corpus_id = '' # ANNIS-internal ID of the corpus to search in self.annis_corpus_id = '' # ANNIS-internal ID of the corpus to search in
self.annis_context_size = 5 # Context size for hits rendering (ANNIS only)
self.titles = [] self.titles = []
self.descriptions = [] self.descriptions = []
self.authors = [] self.authors = []
......
...@@ -16,6 +16,7 @@ class Diagnostic(Exception): ...@@ -16,6 +16,7 @@ class Diagnostic(Exception):
stdMessages = { stdMessages = {
(DiagnosticTypes.fcs, 4): 'Requested Data View not valid for this resource.', (DiagnosticTypes.fcs, 4): 'Requested Data View not valid for this resource.',
(DiagnosticTypes.sru, 1): 'General system error.',
(DiagnosticTypes.sru, 4): 'Unsupported operation. Supported operation: explain, searchRetrieve, scan.', (DiagnosticTypes.sru, 4): 'Unsupported operation. Supported operation: explain, searchRetrieve, scan.',
(DiagnosticTypes.sru, 5): 'Unsupported version. Supported SRU versions: 1.2 and 2.0.', (DiagnosticTypes.sru, 5): 'Unsupported version. Supported SRU versions: 1.2 and 2.0.',
(DiagnosticTypes.sru, 8): 'Unsupported parameter.', (DiagnosticTypes.sru, 8): 'Unsupported parameter.',
......
...@@ -202,15 +202,14 @@ def process_search_retrieve(version: SRUVersion, ...@@ -202,15 +202,14 @@ def process_search_retrieve(version: SRUVersion,
query = app.qp_annis.translate_simple(query, config, searchOptions) query = app.qp_annis.translate_simple(query, config, searchOptions)
else: else:
query = app.qp_annis.translate_advanced(query, config, searchOptions) query = app.qp_annis.translate_advanced(query, config, searchOptions)
print(query) # print(query)
res = app.qp_annis.send_query(query, config) res = app.qp_annis.send_query(query, config)
except Diagnostic as diag: except Diagnostic as diag:
return fatal_response(Operation.searchRetrieve, version, config, diagnostics + [diag], request, templates) return fatal_response(Operation.searchRetrieve, version, config, diagnostics + [diag], request, templates)
# return query['query'] records, nHits, diagnostics = app.rp_annis.parse(res, config, searchOptions)
return res if any(diag.is_fatal() for diag in diagnostics):
# records, nHits, diagnostics = app.rp_annis.parse(res, config, searchOptions['x-fcs-dataviews']) return fatal_response(Operation.searchRetrieve, version, config, diagnostics, request, templates)
# if any(diag.is_fatal() for diag in diagnostics): return records
# return fatal_response(Operation.searchRetrieve, version, config, diagnostics, request, templates)
# records = [r.as_dict() for r in records] # records = [r.as_dict() for r in records]
# diagnostics = [str(d) for d in diagnostics] # diagnostics = [str(d) for d in diagnostics]
# return templates.TemplateResponse('search_retrieve_response.xml', # return templates.TemplateResponse('search_retrieve_response.xml',
...@@ -228,7 +227,6 @@ def process_search_retrieve(version: SRUVersion, ...@@ -228,7 +227,6 @@ def process_search_retrieve(version: SRUVersion,
strGetParams = app.qp_tsakorpus.translate_simple(query, config, searchOptions) strGetParams = app.qp_tsakorpus.translate_simple(query, config, searchOptions)
else: else:
strGetParams = app.qp_tsakorpus.translate_advanced(query, config, searchOptions) strGetParams = app.qp_tsakorpus.translate_advanced(query, config, searchOptions)
print(strGetParams)
res = app.qp_tsakorpus.send_query(strGetParams, config) res = app.qp_tsakorpus.send_query(strGetParams, config)
except Diagnostic as diag: except Diagnostic as diag:
return fatal_response(Operation.searchRetrieve, version, config, diagnostics + [diag], request, templates) return fatal_response(Operation.searchRetrieve, version, config, diagnostics + [diag], request, templates)
...@@ -253,9 +251,7 @@ def process_search_retrieve(version: SRUVersion, ...@@ -253,9 +251,7 @@ def process_search_retrieve(version: SRUVersion,
else: else:
# No advanced search for Litterae # No advanced search for Litterae
strGetParams = app.qp_litterae.translate_simple(query, config, searchOptions) strGetParams = app.qp_litterae.translate_simple(query, config, searchOptions)
# print(strGetParams)
res = app.qp_litterae.send_query(strGetParams, config) res = app.qp_litterae.send_query(strGetParams, config)
print(res)
except Diagnostic as diag: except Diagnostic as diag:
return fatal_response(Operation.searchRetrieve, version, config, diagnostics + [diag], request, templates) return fatal_response(Operation.searchRetrieve, version, config, diagnostics + [diag], request, templates)
for dv in searchOptions['x-fcs-dataviews'].split(','): for dv in searchOptions['x-fcs-dataviews'].split(','):
...@@ -297,7 +293,7 @@ def process_request(operation: Operation, ...@@ -297,7 +293,7 @@ def process_request(operation: Operation,
:param diagnostics: List of diagnostics produced by the validation :param diagnostics: List of diagnostics produced by the validation
function. function.
""" """
print(query) # print(query)
# If something is clearly wrong with the query, return # If something is clearly wrong with the query, return
# a response with the list of diagnostics # a response with the list of diagnostics
if config is None or any(d.is_fatal() for d in diagnostics): if config is None or any(d.is_fatal() for d in diagnostics):
......
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
"adv_supported": true, "adv_supported": true,
"resource_base_url": "http://adwhh1.server.uni-hamburg.de:17101", "resource_base_url": "http://adwhh1.server.uni-hamburg.de:17101",
"annis_corpus_id": "DGS-Corpus-r3-en", "annis_corpus_id": "DGS-Corpus-r3-en",
"annis_context_size": 5,
"tier_convert_reverse": { "tier_convert_reverse": {
"text": "Gloss", "text": "Gloss",
"lemma": "GlossType" "lemma": "GlossType"
......
...@@ -9,6 +9,7 @@ from common.litterae_response_parser import LitteraeResponseParser ...@@ -9,6 +9,7 @@ from common.litterae_response_parser import LitteraeResponseParser
from common.tsakorpus_query_parser import TsakorpusQueryParser from common.tsakorpus_query_parser import TsakorpusQueryParser
from common.tsakorpus_response_parser import TsakorpusResponseParser from common.tsakorpus_response_parser import TsakorpusResponseParser
from common.annis_query_parser import AnnisQueryParser from common.annis_query_parser import AnnisQueryParser
from common.annis_response_parser import AnnisResponseParser
from common.enums import * from common.enums import *
from common.diagnostics import Diagnostic from common.diagnostics import Diagnostic
from common.config import ResourceConfig, read_configs from common.config import ResourceConfig, read_configs
...@@ -28,6 +29,7 @@ app.rp_litterae = LitteraeResponseParser() ...@@ -28,6 +29,7 @@ app.rp_litterae = LitteraeResponseParser()
app.qp_tsakorpus = TsakorpusQueryParser() app.qp_tsakorpus = TsakorpusQueryParser()
app.rp_tsakorpus = TsakorpusResponseParser() app.rp_tsakorpus = TsakorpusResponseParser()
app.qp_annis = AnnisQueryParser() app.qp_annis = AnnisQueryParser()
app.rp_annis = AnnisResponseParser()
app.configs = read_configs() app.configs = read_configs()
app.logging = True app.logging = True
......
...@@ -4,3 +4,4 @@ lxml ...@@ -4,3 +4,4 @@ lxml
Jinja2>=3.0.3 Jinja2>=3.0.3
requests requests
a2wsgi a2wsgi
networkx
\ No newline at end of file
...@@ -15,3 +15,4 @@ http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=[lemma=%2 ...@@ -15,3 +15,4 @@ http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=[lemma=%2
http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=[pos=%22NOUN%22]&x-fcs-dataviews=adv&queryType=fcs Tsakorpus -- Advanced search, term only / POS (NOUN, 22639 hits) http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=[pos=%22NOUN%22]&x-fcs-dataviews=adv&queryType=fcs Tsakorpus -- Advanced search, term only / POS (NOUN, 22639 hits)
http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=[lemma%3D%22m.*%22%20%26%20(pos=%22NOUN%22|pos=%22VERB%22)]&x-fcs-dataviews=adv&queryType=fcs Tsakorpus -- Advanced search, Boolean operators inside term query (words that start with "m" and are either nouns or verbs, 3878 hits) http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=[lemma%3D%22m.*%22%20%26%20(pos=%22NOUN%22|pos=%22VERB%22)]&x-fcs-dataviews=adv&queryType=fcs Tsakorpus -- Advanced search, Boolean operators inside term query (words that start with "m" and are either nouns or verbs, 3878 hits)
http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=[text=%22ud%22]%20[]{1,2}%20[text=%22no%22]&x-fcs-dataviews=adv&queryType=fcs Tsakorpus -- Advanced search, sequence of terms ("ud" followed by "no" at the distance between 2 and 3, 7 hits) http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=[text=%22ud%22]%20[]{1,2}%20[text=%22no%22]&x-fcs-dataviews=adv&queryType=fcs Tsakorpus -- Advanced search, sequence of terms ("ud" followed by "no" at the distance between 2 and 3, 7 hits)
http://127.0.0.1:5000/fcs-endpoint/annis_test?operation=searchRetrieve&query=CAN2B%20AND%20WHY1* ANNIS -- Simple search with boolean operator ("CAN2B" AND "WHY1*" within one text, 140 hits)
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment