diff --git a/common/annis_query_parser.py b/common/annis_query_parser.py index 92ecb20015e087d722a18aabae68d371e9249246..b9edca58a193285ce167ddd00586c0bdec65f322 100644 --- a/common/annis_query_parser.py +++ b/common/annis_query_parser.py @@ -1,7 +1,7 @@ from urllib.parse import quote import re import json -import urllib.request +import requests from .query_parser import QueryParser from .config import ResourceConfig from .diagnostics import Diagnostic, DiagnosticTypes @@ -13,7 +13,8 @@ class AnnisQueryParser(QueryParser): """ rxTsakorpusBool = re.compile('[()|,]') - rxRelOps = re.compile('^(?:\\^\\*|\\||\\.[*,0-9]*)$') # Operators for setting relations between query words + rxRelOps = re.compile('^(?:\\^\\*|\\||\\.[*,0-9]*)|_=_$') # Operators for setting relations between query words + rxFramingQuotes = re.compile('^[/"]|(?<!\\\\)[/"]$') def build_get_string(self, params, config: ResourceConfig, withinClause=''): """ @@ -37,6 +38,7 @@ class AnnisQueryParser(QueryParser): queryFront = '' queryTail = '' for param in sorted(params): + print(param) # For query words: # param = [annotation_layer, query_word_number, value, operator] # For relations between query words: @@ -48,7 +50,7 @@ class AnnisQueryParser(QueryParser): queryTail += '#' + str(param[1]) + ' ' + param[0] + ' #' + str(param[2]) + ' & ' else: queryFront += param[0] + param[3] + param[2] + ' & ' - q['query'] = queryFront.strip(' ') + ' ' + queryTail.strip(' &') + q['query'] = (queryFront.strip(' ') + ' ' + queryTail).strip(' &') return q def term_query(self, query: str, config: ResourceConfig): @@ -66,7 +68,7 @@ class AnnisQueryParser(QueryParser): iTerm += 1 getParams.append(['tok', iTerm, '"' + term.replace('"', '') + '"', '=']) if iTerm >= 2: - getParams.append(['.', iTerm, iTerm-1]) + getParams.append(['.', iTerm-1, iTerm]) return getParams def binary_bool(self, strOp: str, operandL, operandR, config): @@ -88,14 +90,18 @@ class AnnisQueryParser(QueryParser): raise Diagnostic(DiagnosticTypes.sru, 48, message=message) return operandL + operandR + [['^*', max(termsL), min(termsR)]] elif strOp == 'OR': - if ((len(termsL) > 1 or len(termsR) > 1) - and (any(op[0] not in ('tok', '|') for op in operandR) - or any(op[0] not in ('tok', '|') for op in operandL))): + if ((len(operandL) > 1 or len(operandR) > 1) + or operandL[0][0] != 'tok' or operandR[0][0] != 'tok' + or operandL[0][3] != operandR[0][3]): message = 'ANNIS does not support queries that combine several ' \ 'multi-word sequences with boolean operators or multiple ' \ 'boolean operators.' raise Diagnostic(DiagnosticTypes.sru, 48, message=message) - return operandL + operandR + [['|', max(termsL), min(termsR)]] + paramNew = [operandL[0][0], operandL[0][1], + '/(' + self.rxFramingQuotes.sub('', operandL[0][2]) + + ')|(' + self.rxFramingQuotes.sub('', operandR[0][2]) + ')/', + operandL[0][3]] + return [paramNew] raise Diagnostic(DiagnosticTypes.sru, 37, details=strOp) def not_bool(self, operand, config): @@ -163,7 +169,7 @@ class AnnisQueryParser(QueryParser): if minDist > 1 or maxDist != 50: op = '.' + str(minDist) + ',' + str(maxDist) getParams = [ - [op, getParams[0][1], getParams[0][1] - 1] + [op, getParams[0][1] - 1, getParams[0][1]] ] return getParams @@ -176,57 +182,55 @@ class AnnisQueryParser(QueryParser): termsR = self.term_indexes(operandR) # Find out if there is already a distance constraint wordRelPresent = (any(param[0].startswith('.') for param in operandL) - or any(param[0].startswith('.') and param[2] == max(termsL) + or any(param[0].startswith('.') and param[1] == max(termsL) for param in operandR)) if not wordRelPresent: wordRelParams = [ - ['.', min(termsR), max(termsL)] + ['.', max(termsL), min(termsR)] ] operandR += wordRelParams return operandL + operandR - # TODO: continue here def adv_binary_bool(self, strOp: str, operandL, operandR, config: ResourceConfig): # Join multiple constraints on one word in an advanced query - print('ADVANCED INTERNAL BOOL', strOp, str(operandL), str(operandR)) - getParams = [] - if strOp == '&': - strOp = ',' - paramsR = {paramR[0] for paramR in operandR} - for paramR in operandR: - paramExists = False - for paramL in operandL: - if paramL[0] == paramR[0]: - if strOp == ',' and paramL[0] != 'gr': - raise Diagnostic(DiagnosticTypes.sru, 48, - message='Tsakorpus endpoint does not support conjunctions ' - 'of multiple constraints for the same layer ' - 'within the same word.') - paramExists = True - getParams.append([paramL[0], paramL[1], '(' + paramL[2] + ')' + strOp + '(' + paramR[2] + ')']) - if not paramExists: - getParams.append(paramR[:]) - for paramL in operandL: - if paramL[0] not in paramsR: - if strOp == '|': - raise Diagnostic(DiagnosticTypes.sru, 48, - message='Tsakorpus does not support disjunctions ' - 'of constraints for multiple layers ' - 'within the same word.') - getParams.append(paramL[:]) - return getParams + # print('ADVANCED INTERNAL BOOL', strOp, str(operandL), str(operandR)) + if strOp == '|': + if (len(operandL) == 1 and len(operandR) == 1 + and operandL[0][0] == operandR[0][0] + and self.rxRelOps.search(operandL[0][0]) is None + and operandL[0][3] == operandR[0][3]): + # Disjunction of two values of the same layer: join as regex + paramNew = [operandL[0][0], operandL[0][1], + '/(' + self.rxFramingQuotes.sub('', operandL[0][2]) + + ')|(' + self.rxFramingQuotes.sub('', operandR[0][2]) + ')/', + operandL[0][3]] + return [paramNew] + raise Diagnostic(DiagnosticTypes.sru, 48, + message='ANNIS endpoint does not support disjunctions ' + 'of constraints on different layers ' + 'within the same word.') + # If we are here, strOp == '&' + # Operands are either single parameter queries or conjunctions thereof + # (arbitrary disjunctions raise an exception, see above) + termsL = self.term_indexes(operandL) + operandR = self.shift_term_indexes(operandR, max(termsL)) + termsR = self.term_indexes(operandR) + wordRelParams = [ + ['_=_', min(termsR), max(termsL)] + ] + operandR += wordRelParams + return operandL + operandR - def send_query(self, strGetParams: str, config: ResourceConfig): + # TODO: add real API links and test it + def send_query(self, query, config: ResourceConfig): """ - Send the translated query to the Tsakorpus instance. Return JSON results + Send the translated query to the ANNIS API. Return JSON results returned by the corpus. """ - url = config.resource_base_url.strip('/') + '/search_sent?' + strGetParams + url = config.resource_base_url.strip('/') + '/v1/' print(url) - response = urllib.request.urlopen(url) - data = response.read() - encoding = response.info().get_content_charset('utf-8') - responseJSON = json.loads(data.decode(encoding)) + response = requests.post(url, json=query, timeout=60) + responseJSON = response.json() return responseJSON diff --git a/common/query_parser.py b/common/query_parser.py index 8ad88014fc58be0f92e5f7b67025a3c00e001818..736f3bc7b7eb069ae4839909187b739880360b13 100644 --- a/common/query_parser.py +++ b/common/query_parser.py @@ -24,6 +24,8 @@ class QueryParser: rxQuantifierInterval = re.compile('^\\{(|0|[1-9][0-9]*),(|0|[1-9][0-9]*)\\}$') acceptableIdentifiers = {'text', 'lemma', 'pos', 'orth', 'norm', 'phonetic'} + rxRelOps = re.compile('^(?:word_rel_|word_dist_from_|word_dist_to_)$') + def __init__(self): pass @@ -163,26 +165,29 @@ class QueryParser: """ getParamsShifted = [] for param in getParams: - if type(param[2]) is int: + if param[1] <= 0 and all(type(param[i]) == str for i in range(2, len(param))): + getParamsShifted.append(copy.deepcopy(param)) + continue + newParam = [param[0]] + for i in range(1, len(param)): + # Shift all integer elemnts of the param list # int: refers to query word number - # str: refers to a distance constraint - newParam = [param[0], param[1] + shift, param[2] + shift] - elif param[1] >= 0: - newParam = [param[0], param[1] + shift, param[2]] - else: - newParam = copy.deepcopy(param) + # str: refers to a layer ID, an operator or a distance constraint + if type(param[i]) is int: + newParam.append(param[i] + shift) + else: + newParam.append(param[i]) getParamsShifted.append(newParam) return getParamsShifted - @staticmethod - def term_indexes(getParams): + def term_indexes(self, getParams): """ Find all search term indexes used in the GET parameters specified by getParams list. Return list of integers (1-based). """ terms = set() for param in getParams: - if param[0] in ('word_rel_', 'word_dist_from_', 'word_dist_to_'): + if self.rxRelOps.search(param[0]) is not None: continue elif type(param[2]) is int: terms.add(param[2]) @@ -310,7 +315,7 @@ class QueryParser: if len(query) <= 0: query = 'text=".*"' iOpPos, strOp = self.find_operator_adv_expression(query) - print('ADVANED EXPRESSION QUERY', iOpPos, strOp) + print('ADVANCED EXPRESSION QUERY', iOpPos, strOp) if iOpPos == -1: if query[0] == '(' and query[-1] == ')': return self.adv_expression_query(query[1:len(query)-1], quantifier, config) diff --git a/common/views_logic.py b/common/views_logic.py index 23d5612338390ab88423456d1bd0970af6cc4484..ca27bd2a2d95e71b129722a671d70632c863e5d1 100644 --- a/common/views_logic.py +++ b/common/views_logic.py @@ -78,6 +78,28 @@ def process_search_retrieve(version: SRUVersion, Process a searchRetrieve request. Return a rendered XML response. """ + if config.platform == CorpPlatform.annis: + try: + if queryType == QueryType.cql: + query = app.qp_annis.translate_simple(query, config) + else: + query = app.qp_annis.translate_advanced(query, config) + print(query) + # res = app.qp_annis.send_query(query, config) + except Diagnostic as diag: + return fatal_response(Operation.searchRetrieve, version, diagnostics + [diag], request, templates) + return query['query'] + # records, nHits, diagnostics = app.rp_annis.parse(res, config, searchOptions['x-fcs-dataviews']) + # records = [r.as_dict() for r in records] + # diagnostics = [str(d) for d in diagnostics] + # return templates.TemplateResponse('search_retrieve_response.xml', + # { + # 'request': request, + # 'n_hits': nHits, + # 'records': records, + # 'diagnostics': diagnostics + # }, + # media_type='application/xml') if config.platform == CorpPlatform.tsakorpus: try: if queryType == QueryType.cql: diff --git a/config/annis_test.json b/config/annis_test.json new file mode 100644 index 0000000000000000000000000000000000000000..b9a15cc756e010b054f0ead5495d6105473684d8 --- /dev/null +++ b/config/annis_test.json @@ -0,0 +1,10 @@ +{ + "host": "https://www.sign-lang.uni-hamburg.de/dgs-korpus/ (replace later)", + "port": "80", + "transport_protocol": "https", + "max_hits": 15, + "platform": "annis", + "advanced_search_capability": true, + "adv_supported": true, + "resource_base_url": "https://www.sign-lang.uni-hamburg.de/dgs-korpus/ (replace later)" +} \ No newline at end of file diff --git a/main.py b/main.py index 1796659cc93104305d3c2fa195a6c85f9c7e3ee1..53717ded4421d7f2faff5e8c76587614bf986b51 100644 --- a/main.py +++ b/main.py @@ -8,6 +8,7 @@ from common.litterae_query_parser import LitteraeQueryParser from common.litterae_response_parser import LitteraeResponseParser from common.tsakorpus_query_parser import TsakorpusQueryParser from common.tsakorpus_response_parser import TsakorpusResponseParser +from common.annis_query_parser import AnnisQueryParser from common.enums import * from common.diagnostics import Diagnostic from common.config import ResourceConfig, read_configs @@ -28,6 +29,7 @@ app.qp_litterae = LitteraeQueryParser() app.rp_litterae = LitteraeResponseParser() app.qp_tsakorpus = TsakorpusQueryParser() app.rp_tsakorpus = TsakorpusResponseParser() +app.qp_annis = AnnisQueryParser() app.configs = read_configs() diff --git a/requirements.txt b/requirements.txt index 418bc53014f9e10cb7f338f34490928e0f0ab4c3..3e16ef4296a761d48c87bac4e501619c412ea10a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ fastapi>=0.88.0 uvicorn>=0.20.0 lxml -Jinja2>=3.0.3 \ No newline at end of file +Jinja2>=3.0.3 +requests \ No newline at end of file