diff --git a/common/annis_query_parser.py b/common/annis_query_parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..a2429274d07c4ee6c1349879cb8c126564c26d98
--- /dev/null
+++ b/common/annis_query_parser.py
@@ -0,0 +1,221 @@
+from urllib.parse import quote
+import re
+import json
+import urllib.request
+from .query_parser import QueryParser
+from .config import ResourceConfig
+from .diagnostics import Diagnostic, DiagnosticTypes
+
+
+class AnnisQueryParser(QueryParser):
+    """
+    Parses search queries for ANNIS-based corpora.
+    """
+
+    rxTsakorpusBool = re.compile('[()|,]')
+
+    def build_get_string(self, params, config: ResourceConfig, withinClause=''):
+        """
+        Build a payload for an ANNIS search request.
+        ANNIS uses POST with JSON payload rather than GET, but the
+        function name is the same as in the other classes for
+        compatibility.
+        """
+        if len(withinClause) > 0 and withinClause not in ('text', 'session'):
+            raise Diagnostic(DiagnosticTypes.sru, 48, message='ANNIS only supports multi-word search within '
+                                                             'a text (with a default maximum distance of '
+                                                             '50 tokens).')
+        q = {
+            'query': '',
+            'query_language': 'AQL',
+            'corpora': config.annis_corpus_list,
+            'limit': config.max_hits,
+            'order': 'Randomized'
+        }
+        termIndexes = self.term_indexes(params)
+        queryFront = ''
+        queryTail = ''
+        for param in sorted(params):
+            if param[0] == 'wf':
+                queryFront += param[2].replace('"', '') + ' & '
+            else:
+                queryTail += '#' + str(param[1]) + ' ' + param[0] + ' #' + str(param[2]) + ' & '
+        q['query'] = queryFront.strip(' ') + queryTail.strip(' &')
+        return q
+
+    def term_query(self, query: str, config: ResourceConfig):
+        """
+        Return list of query parameters for one term or sequence of terms.
+        """
+        if len(query) >= 2 and query.startswith('"') and query.endswith('"'):
+            query = query[1:len(query)-1]
+        if len(query) <= 0:
+            raise Diagnostic(DiagnosticTypes.sru, 10)
+        getParams = []
+        iTerm = 0
+        for term in query.split(' '):
+            if len(term) > 0:
+                iTerm += 1
+                getParams.append(['wf', iTerm, '"' + term.replace('"', '') + '"'])
+                if iTerm >= 2:
+                    getParams.append(['.', iTerm, iTerm-1])
+        return getParams
+
+    def binary_bool(self, strOp: str, operandL, operandR, config):
+        if len(operandL) <= 0 or len(operandR) <= 0:
+            raise Diagnostic(DiagnosticTypes.sru, 10)
+        termsL = self.term_indexes(operandL)
+        operandR = self.shift_term_indexes(operandR, max(termsL))
+        termsR = self.term_indexes(operandR)
+
+        if operandL[0][0] != 'wf' or operandR[0][0] != 'wf':
+            raise Diagnostic(DiagnosticTypes.sru, 47)
+        if strOp == 'AND':
+            if ((len(termsL) > 1 or len(termsR) > 1)
+                    and (any(op[0] not in ('wf', '^*') for op in operandR)
+                         or any(op[0] not in ('wf', '^*') for op in operandL))):
+                message = 'ANNIS does not support queries that combine several ' \
+                          'multi-word sequences with boolean operators or multiple ' \
+                          'boolean operators.'
+                raise Diagnostic(DiagnosticTypes.sru, 48, message=message)
+            return operandL + operandR + [['^*', max(termsL), min(termsR)]]
+        elif strOp == 'OR':
+            if ((len(termsL) > 1 or len(termsR) > 1)
+                    and (any(op[0] not in ('wf', '|') for op in operandR)
+                         or any(op[0] not in ('wf', '|') for op in operandL))):
+                message = 'ANNIS does not support queries that combine several ' \
+                          'multi-word sequences with boolean operators or multiple ' \
+                          'boolean operators.'
+                raise Diagnostic(DiagnosticTypes.sru, 48, message=message)
+            return operandL + operandR + [['|', max(termsL), min(termsR)]]
+        raise Diagnostic(DiagnosticTypes.sru, 37, details=strOp)
+
+    def not_bool(self, operand, config):
+        # TODO: implement
+        raise NotImplementedError()
+
+    def adv_term_query_proper(self, identifier: str, op: str, value: str, flags: str, config: ResourceConfig):
+        """
+        Return list of query parameters for one term in an advanced query.
+        """
+        flags = flags.strip('/')
+        if len(value) <= 0:
+            raise Diagnostic(DiagnosticTypes.sru, 10)
+        if flags not in ('', 'I', 'C'):
+            raise Diagnostic(DiagnosticTypes.sru, 48, message='ANNIS does not support regex flags.')
+        if op != '=':
+            raise Diagnostic(DiagnosticTypes.sru, 10,
+                             message='In token queries, only = is allowed as an operator.')
+        getParams = []
+        if identifier == 'text':
+            getParams.append(['wf', 1, '/' + value.replace('/', '\\/') + '/'])
+        elif identifier == 'lemma':
+            getParams.append(['lemma', 1, '/' + value.replace('/', '\\/') + '/'])
+        elif identifier == 'pos':
+            if value in config.pos_convert_reverse:
+                # UD to corpus-specific POS tags
+                value = config.pos_convert_reverse[value]
+            getParams.append(['pos', 1, '/' + value.replace('/', '\\/') + '/'])
+        else:
+            getParams.append([identifier, 1, '/' + value.replace('/', '\\/') + '/'])
+            # raise Diagnostic(DiagnosticTypes.sru, 10,
+            #                  message='The identifier ' + identifier + ' is not supported in ANNIS.')
+        return getParams
+
+    # TODO: continue here
+    def adv_quantify_segment(self, getParams, quantifier: str, config: ResourceConfig):
+        if len(getParams) != 1 or getParams[0][0] != 'wf' or getParams[0][2] != '.*':
+            raise Diagnostic(DiagnosticTypes.sru, 48,
+                             message='Token quantifiers are only allowed with empty token queries '
+                                     'in Tsakorpus (for setting distance constraints).')
+        minDist = 1
+        maxDist = 100
+        if quantifier == '?':
+            maxDist = 2
+        elif quantifier == '+':
+            minDist = 2
+        elif self.rxQuantifierExact.search(quantifier) is not None:
+            minDist = maxDist = int(quantifier[1:len(quantifier)-1])
+        else:
+            m = self.rxQuantifierInterval.search(quantifier)
+            if m is None:
+                raise Diagnostic(DiagnosticTypes.sru, 10,
+                                 message='Something is wrong with a token quantifier.')
+            if len(m.group(1)) > 0:
+                minDist = int(m.group(1)) + 1
+            if len(m.group(2)) > 0:
+                maxDist = int(m.group(2)) + 1
+        getParams = [
+            ['word_rel_', getParams[0][1], getParams[0][1] - 1],
+            ['word_dist_from_', getParams[0][1], str(minDist)],
+            ['word_dist_to_', getParams[0][1], str(maxDist)]
+        ]
+        return getParams
+
+    def adv_main_sequence(self, operandL, operandR, config: ResourceConfig):
+        # print('SEQUENCE JOIN', str(operandL), str(operandR))
+        if len(operandL) <= 0 or len(operandR) <= 0:
+            raise Diagnostic(DiagnosticTypes.sru, 10)
+        termsL = self.term_indexes(operandL)
+        operandR = self.shift_term_indexes(operandR, max(termsL))
+        termsR = self.term_indexes(operandR)
+        # Find out if there is already a distance constraint
+        wordRelPresent = (any(param[0] == 'word_rel_' for param in operandL)
+                          or any(param[0] == 'word_rel_' and param[2] == max(termsL)
+                                 for param in operandR))
+        if not wordRelPresent:
+            wordRelParams = [
+                ['word_rel_', min(termsR), max(termsL)],
+                ['word_dist_from_', min(termsR), '1'],
+                ['word_dist_to_', min(termsR), '1']
+            ]
+            operandR += wordRelParams
+        return operandL + operandR
+
+    def adv_binary_bool(self, strOp: str, operandL, operandR, config: ResourceConfig):
+        # Join multiple constraints on one word in an advanced query
+        print('ADVANCED INTERNAL BOOL', strOp, str(operandL), str(operandR))
+        getParams = []
+        if strOp == '&':
+            strOp = ','
+        paramsR = {paramR[0] for paramR in operandR}
+        for paramR in operandR:
+            paramExists = False
+            for paramL in operandL:
+                if paramL[0] == paramR[0]:
+                    if strOp == ',' and paramL[0] != 'gr':
+                        raise Diagnostic(DiagnosticTypes.sru, 48,
+                                         message='Tsakorpus endpoint does not support conjunctions '
+                                                 'of multiple constraints for the same layer '
+                                                 'within the same word.')
+                    paramExists = True
+                    getParams.append([paramL[0], paramL[1], '(' + paramL[2] + ')' + strOp + '(' + paramR[2] + ')'])
+            if not paramExists:
+                getParams.append(paramR[:])
+        for paramL in operandL:
+            if paramL[0] not in paramsR:
+                if strOp == '|':
+                    raise Diagnostic(DiagnosticTypes.sru, 48,
+                                     message='Tsakorpus does not support disjunctions '
+                                             'of constraints for multiple layers '
+                                             'within the same word.')
+                getParams.append(paramL[:])
+        return getParams
+
+    def send_query(self, strGetParams: str, config: ResourceConfig):
+        """
+        Send the translated query to the Tsakorpus instance. Return JSON results
+        returned by the corpus.
+        """
+        url = config.resource_base_url.strip('/') + '/search_sent?' + strGetParams
+        print(url)
+        response = urllib.request.urlopen(url)
+        data = response.read()
+        encoding = response.info().get_content_charset('utf-8')
+        responseJSON = json.loads(data.decode(encoding))
+        return responseJSON
+
+
+if __name__ == '__main__':
+    pass
+
diff --git a/common/config.py b/common/config.py
index 031c82b6976f1eca274bd9954d9270f7d8e7a540..eb22b998af226c8c7d2bd4cad6bb61dbcb65bd42 100644
--- a/common/config.py
+++ b/common/config.py
@@ -40,6 +40,7 @@ class ResourceConfig:
         self.adv_supported = False
         self.supported_layers = []
         self.resources = []
+        self.annis_corpus_list = []
         self.search_lang_id = ''
         self.pos_convert = []  # corpus-specific to UD (regexes)
         self.pos_convert_reverse = {}  # UD to corpus-specific
diff --git a/common/litterae_query_parser.py b/common/litterae_query_parser.py
index 22dd3b402e84f62ae3f850fdccedf7cda69eb2b4..2d0ca1ff8f3d89db0fbe8d35bcc6338995ebc7ae 100644
--- a/common/litterae_query_parser.py
+++ b/common/litterae_query_parser.py
@@ -81,6 +81,14 @@ class LitteraeQueryParser(QueryParser):
         raise NotImplementedError()
 
     def adv_quantify_segment(self, getParams, quantifier: str, config: ResourceConfig):
+        """
+        This function is not used as of now.
+        It would only be used in an advanced search, which is switched off for FLC
+        for now, and implements the only non-trivial advanced capability that
+        exists in FLC, namely distance constraints (set with the 'slop_' parameters
+        in the API). If advanced search is enabled for FLC at a future point,
+        do not forget to take 'slop_' values into account in build_get_string().
+        """
         if len(getParams) != 1 or getParams[0][0] != 'q_' or getParams[0][2] != '.*':
             raise Diagnostic(DiagnosticTypes.sru, 48,
                              message='Token quantifiers are only allowed with empty token queries '