diff --git a/common/annis_query_parser.py b/common/annis_query_parser.py index a2429274d07c4ee6c1349879cb8c126564c26d98..92ecb20015e087d722a18aabae68d371e9249246 100644 --- a/common/annis_query_parser.py +++ b/common/annis_query_parser.py @@ -13,6 +13,7 @@ class AnnisQueryParser(QueryParser): """ rxTsakorpusBool = re.compile('[()|,]') + rxRelOps = re.compile('^(?:\\^\\*|\\||\\.[*,0-9]*)$') # Operators for setting relations between query words def build_get_string(self, params, config: ResourceConfig, withinClause=''): """ @@ -36,11 +37,18 @@ class AnnisQueryParser(QueryParser): queryFront = '' queryTail = '' for param in sorted(params): - if param[0] == 'wf': + # For query words: + # param = [annotation_layer, query_word_number, value, operator] + # For relations between query words: + # param = [relation, query_word_number_1, query_word_number_2] + if param[0] == 'tok' and param[3] == '=': + # Simplified form for token search queryFront += param[2].replace('"', '') + ' & ' - else: + elif self.rxRelOps.search(param[0]) is not None: queryTail += '#' + str(param[1]) + ' ' + param[0] + ' #' + str(param[2]) + ' & ' - q['query'] = queryFront.strip(' ') + queryTail.strip(' &') + else: + queryFront += param[0] + param[3] + param[2] + ' & ' + q['query'] = queryFront.strip(' ') + ' ' + queryTail.strip(' &') return q def term_query(self, query: str, config: ResourceConfig): @@ -56,7 +64,7 @@ class AnnisQueryParser(QueryParser): for term in query.split(' '): if len(term) > 0: iTerm += 1 - getParams.append(['wf', iTerm, '"' + term.replace('"', '') + '"']) + getParams.append(['tok', iTerm, '"' + term.replace('"', '') + '"', '=']) if iTerm >= 2: getParams.append(['.', iTerm, iTerm-1]) return getParams @@ -68,12 +76,12 @@ class AnnisQueryParser(QueryParser): operandR = self.shift_term_indexes(operandR, max(termsL)) termsR = self.term_indexes(operandR) - if operandL[0][0] != 'wf' or operandR[0][0] != 'wf': + if operandL[0][0] != 'tok' or operandR[0][0] != 'tok': raise Diagnostic(DiagnosticTypes.sru, 47) if strOp == 'AND': if ((len(termsL) > 1 or len(termsR) > 1) - and (any(op[0] not in ('wf', '^*') for op in operandR) - or any(op[0] not in ('wf', '^*') for op in operandL))): + and (any(op[0] not in ('tok', '^*') for op in operandR) + or any(op[0] not in ('tok', '^*') for op in operandL))): message = 'ANNIS does not support queries that combine several ' \ 'multi-word sequences with boolean operators or multiple ' \ 'boolean operators.' @@ -81,8 +89,8 @@ class AnnisQueryParser(QueryParser): return operandL + operandR + [['^*', max(termsL), min(termsR)]] elif strOp == 'OR': if ((len(termsL) > 1 or len(termsR) > 1) - and (any(op[0] not in ('wf', '|') for op in operandR) - or any(op[0] not in ('wf', '|') for op in operandL))): + and (any(op[0] not in ('tok', '|') for op in operandR) + or any(op[0] not in ('tok', '|') for op in operandL))): message = 'ANNIS does not support queries that combine several ' \ 'multi-word sequences with boolean operators or multiple ' \ 'boolean operators.' @@ -91,8 +99,18 @@ class AnnisQueryParser(QueryParser): raise Diagnostic(DiagnosticTypes.sru, 37, details=strOp) def not_bool(self, operand, config): - # TODO: implement - raise NotImplementedError() + if len(operand) <= 0: + raise Diagnostic(DiagnosticTypes.sru, 10) + if not (len(operand) == 1 and operand[0] == 'tok'): + message = 'ANNIS does not support queries that negate anything ' \ + 'other than a single-token subquery.' + raise Diagnostic(DiagnosticTypes.sru, 48, message=message) + result = operand[:] + if result[3] == '=': + result[3] = '!=' + else: + result[3] = '=' + return result def adv_term_query_proper(self, identifier: str, op: str, value: str, flags: str, config: ResourceConfig): """ @@ -103,33 +121,29 @@ class AnnisQueryParser(QueryParser): raise Diagnostic(DiagnosticTypes.sru, 10) if flags not in ('', 'I', 'C'): raise Diagnostic(DiagnosticTypes.sru, 48, message='ANNIS does not support regex flags.') - if op != '=': - raise Diagnostic(DiagnosticTypes.sru, 10, - message='In token queries, only = is allowed as operators.') getParams = [] if identifier == 'text': - getParams.append(['wf', 1, '/' + value.replace('/', '\\/') + '/']) + getParams.append(['tok', 1, '/' + value.replace('/', '\\/') + '/', op]) elif identifier == 'lemma': - getParams.append(['lemma', 1, '/' + value.replace('/', '\\/') + '/']) + getParams.append(['lemma', 1, '/' + value.replace('/', '\\/') + '/', op]) elif identifier == 'pos': if value in config.pos_convert_reverse: # UD to corpus-specific POS tags value = config.pos_convert_reverse[value] - getParams.append(['pos', 1, '/' + value.replace('/', '\\/') + '/']) + getParams.append(['pos', 1, '/' + value.replace('/', '\\/') + '/', op]) else: - getParams.append([identifier, 1, '/' + value.replace('/', '\\/') + '/']) + getParams.append([identifier, 1, '/' + value.replace('/', '\\/') + '/', op]) # raise Diagnostic(DiagnosticTypes.sru, 10, # message='The identifier ' + identifier + ' is not supported in ANNIS.') return getParams - # TODO: continue here def adv_quantify_segment(self, getParams, quantifier: str, config: ResourceConfig): - if len(getParams) != 1 or getParams[0][0] != 'wf' or getParams[0][2] != '.*': + if len(getParams) != 1 or getParams[0][0] != 'tok' or getParams[0][2] != '/.*/' or getParams[0][3] != '=': raise Diagnostic(DiagnosticTypes.sru, 48, message='Token quantifiers are only allowed with empty token queries ' - 'in Tsakoprus (for setting distance constraints).') + 'in ANNIS (for setting distance constraints).') minDist = 1 - maxDist = 100 + maxDist = 50 if quantifier == '?': maxDist = 2 elif quantifier == '+': @@ -145,10 +159,11 @@ class AnnisQueryParser(QueryParser): minDist = int(m.group(1)) + 1 if len(m.group(2)) > 0: maxDist = int(m.group(2)) + 1 + op = '.*' + if minDist > 1 or maxDist != 50: + op = '.' + str(minDist) + ',' + str(maxDist) getParams = [ - ['word_rel_', getParams[0][1], getParams[0][1] - 1], - ['word_dist_from_', getParams[0][1], str(minDist)], - ['word_dist_to_', getParams[0][1], str(maxDist)] + [op, getParams[0][1], getParams[0][1] - 1] ] return getParams @@ -160,18 +175,17 @@ class AnnisQueryParser(QueryParser): operandR = self.shift_term_indexes(operandR, max(termsL)) termsR = self.term_indexes(operandR) # Find out if there is already a distance constraint - wordRelPresent = (any(param[0] == 'word_rel_' for param in operandL) - or any(param[0] == 'word_rel_' and param[2] == max(termsL) + wordRelPresent = (any(param[0].startswith('.') for param in operandL) + or any(param[0].startswith('.') and param[2] == max(termsL) for param in operandR)) if not wordRelPresent: wordRelParams = [ - ['word_rel_', min(termsR), max(termsL)], - ['word_dist_from_', min(termsR), '1'], - ['word_dist_to_', min(termsR), '1'] + ['.', min(termsR), max(termsL)] ] operandR += wordRelParams return operandL + operandR + # TODO: continue here def adv_binary_bool(self, strOp: str, operandL, operandR, config: ResourceConfig): # Join multiple constraints on one word in an advanced query print('ADVANCED INTERNAL BOOL', strOp, str(operandL), str(operandR))