Skip to content
Snippets Groups Projects
Commit 332e02ff authored by Arkhangelskiy, Timofey
Browse files

Continue working on AnnisQueryParser (not tested yet)

parent ffb2fcd7
No related branches found
No related tags found
No related merge requests found
...@@ -13,6 +13,7 @@ class AnnisQueryParser(QueryParser): ...@@ -13,6 +13,7 @@ class AnnisQueryParser(QueryParser):
""" """
rxTsakorpusBool = re.compile('[()|,]') rxTsakorpusBool = re.compile('[()|,]')
rxRelOps = re.compile('^(?:\\^\\*|\\||\\.[*,0-9]*)$') # Operators for setting relations between query words
def build_get_string(self, params, config: ResourceConfig, withinClause=''): def build_get_string(self, params, config: ResourceConfig, withinClause=''):
""" """
...@@ -36,11 +37,18 @@ class AnnisQueryParser(QueryParser): ...@@ -36,11 +37,18 @@ class AnnisQueryParser(QueryParser):
queryFront = '' queryFront = ''
queryTail = '' queryTail = ''
for param in sorted(params): for param in sorted(params):
if param[0] == 'wf': # For query words:
# param = [annotation_layer, query_word_number, value, operator]
# For relations between query words:
# param = [relation, query_word_number_1, query_word_number_2]
if param[0] == 'tok' and param[3] == '=':
# Simplified form for token search
queryFront += param[2].replace('"', '') + ' & ' queryFront += param[2].replace('"', '') + ' & '
else: elif self.rxRelOps.search(param[0]) is not None:
queryTail += '#' + str(param[1]) + ' ' + param[0] + ' #' + str(param[2]) + ' & ' queryTail += '#' + str(param[1]) + ' ' + param[0] + ' #' + str(param[2]) + ' & '
q['query'] = queryFront.strip(' ') + queryTail.strip(' &') else:
queryFront += param[0] + param[3] + param[2] + ' & '
q['query'] = queryFront.strip(' ') + ' ' + queryTail.strip(' &')
return q return q
def term_query(self, query: str, config: ResourceConfig): def term_query(self, query: str, config: ResourceConfig):
...@@ -56,7 +64,7 @@ class AnnisQueryParser(QueryParser): ...@@ -56,7 +64,7 @@ class AnnisQueryParser(QueryParser):
for term in query.split(' '): for term in query.split(' '):
if len(term) > 0: if len(term) > 0:
iTerm += 1 iTerm += 1
getParams.append(['wf', iTerm, '"' + term.replace('"', '') + '"']) getParams.append(['tok', iTerm, '"' + term.replace('"', '') + '"', '='])
if iTerm >= 2: if iTerm >= 2:
getParams.append(['.', iTerm, iTerm-1]) getParams.append(['.', iTerm, iTerm-1])
return getParams return getParams
...@@ -68,12 +76,12 @@ class AnnisQueryParser(QueryParser): ...@@ -68,12 +76,12 @@ class AnnisQueryParser(QueryParser):
operandR = self.shift_term_indexes(operandR, max(termsL)) operandR = self.shift_term_indexes(operandR, max(termsL))
termsR = self.term_indexes(operandR) termsR = self.term_indexes(operandR)
if operandL[0][0] != 'wf' or operandR[0][0] != 'wf': if operandL[0][0] != 'tok' or operandR[0][0] != 'tok':
raise Diagnostic(DiagnosticTypes.sru, 47) raise Diagnostic(DiagnosticTypes.sru, 47)
if strOp == 'AND': if strOp == 'AND':
if ((len(termsL) > 1 or len(termsR) > 1) if ((len(termsL) > 1 or len(termsR) > 1)
and (any(op[0] not in ('wf', '^*') for op in operandR) and (any(op[0] not in ('tok', '^*') for op in operandR)
or any(op[0] not in ('wf', '^*') for op in operandL))): or any(op[0] not in ('tok', '^*') for op in operandL))):
message = 'ANNIS does not support queries that combine several ' \ message = 'ANNIS does not support queries that combine several ' \
'multi-word sequences with boolean operators or multiple ' \ 'multi-word sequences with boolean operators or multiple ' \
'boolean operators.' 'boolean operators.'
...@@ -81,8 +89,8 @@ class AnnisQueryParser(QueryParser): ...@@ -81,8 +89,8 @@ class AnnisQueryParser(QueryParser):
return operandL + operandR + [['^*', max(termsL), min(termsR)]] return operandL + operandR + [['^*', max(termsL), min(termsR)]]
elif strOp == 'OR': elif strOp == 'OR':
if ((len(termsL) > 1 or len(termsR) > 1) if ((len(termsL) > 1 or len(termsR) > 1)
and (any(op[0] not in ('wf', '|') for op in operandR) and (any(op[0] not in ('tok', '|') for op in operandR)
or any(op[0] not in ('wf', '|') for op in operandL))): or any(op[0] not in ('tok', '|') for op in operandL))):
message = 'ANNIS does not support queries that combine several ' \ message = 'ANNIS does not support queries that combine several ' \
'multi-word sequences with boolean operators or multiple ' \ 'multi-word sequences with boolean operators or multiple ' \
'boolean operators.' 'boolean operators.'
...@@ -91,8 +99,18 @@ class AnnisQueryParser(QueryParser): ...@@ -91,8 +99,18 @@ class AnnisQueryParser(QueryParser):
raise Diagnostic(DiagnosticTypes.sru, 37, details=strOp) raise Diagnostic(DiagnosticTypes.sru, 37, details=strOp)
def not_bool(self, operand, config):
    """
    Negate a single-token subquery by flipping its comparison operator.

    ``operand`` is a list of query parameters. Each query-word parameter
    has the form ``[annotation_layer, query_word_number, value, operator]``.
    Only an operand that consists of exactly one ``tok`` parameter can be
    negated.

    Returns a new one-element list whose parameter has ``=`` replaced by
    ``!=`` (any other operator is replaced by ``=``). The input operand
    and its inner parameter list are left unmodified.

    Raises Diagnostic (SRU 10) for an empty operand and Diagnostic
    (SRU 48) for anything other than a single-token subquery.
    """
    if not operand:
        raise Diagnostic(DiagnosticTypes.sru, 10)
    # operand is a list of parameters, so the layer name is operand[0][0],
    # not operand[0] (which is the whole parameter list).
    if len(operand) != 1 or operand[0][0] != 'tok':
        message = 'ANNIS does not support queries that negate anything ' \
                  'other than a single-token subquery.'
        raise Diagnostic(DiagnosticTypes.sru, 48, message=message)
    # Copy the inner parameter so that the caller's data is not mutated.
    negated = list(operand[0])
    negated[3] = '!=' if negated[3] == '=' else '='
    return [negated]
def adv_term_query_proper(self, identifier: str, op: str, value: str, flags: str, config: ResourceConfig): def adv_term_query_proper(self, identifier: str, op: str, value: str, flags: str, config: ResourceConfig):
""" """
...@@ -103,33 +121,29 @@ class AnnisQueryParser(QueryParser): ...@@ -103,33 +121,29 @@ class AnnisQueryParser(QueryParser):
raise Diagnostic(DiagnosticTypes.sru, 10) raise Diagnostic(DiagnosticTypes.sru, 10)
if flags not in ('', 'I', 'C'): if flags not in ('', 'I', 'C'):
raise Diagnostic(DiagnosticTypes.sru, 48, message='ANNIS does not support regex flags.') raise Diagnostic(DiagnosticTypes.sru, 48, message='ANNIS does not support regex flags.')
if op != '=':
raise Diagnostic(DiagnosticTypes.sru, 10,
message='In token queries, only = is allowed as operators.')
getParams = [] getParams = []
if identifier == 'text': if identifier == 'text':
getParams.append(['wf', 1, '/' + value.replace('/', '\\/') + '/']) getParams.append(['tok', 1, '/' + value.replace('/', '\\/') + '/', op])
elif identifier == 'lemma': elif identifier == 'lemma':
getParams.append(['lemma', 1, '/' + value.replace('/', '\\/') + '/']) getParams.append(['lemma', 1, '/' + value.replace('/', '\\/') + '/', op])
elif identifier == 'pos': elif identifier == 'pos':
if value in config.pos_convert_reverse: if value in config.pos_convert_reverse:
# UD to corpus-specific POS tags # UD to corpus-specific POS tags
value = config.pos_convert_reverse[value] value = config.pos_convert_reverse[value]
getParams.append(['pos', 1, '/' + value.replace('/', '\\/') + '/']) getParams.append(['pos', 1, '/' + value.replace('/', '\\/') + '/', op])
else: else:
getParams.append([identifier, 1, '/' + value.replace('/', '\\/') + '/']) getParams.append([identifier, 1, '/' + value.replace('/', '\\/') + '/', op])
# raise Diagnostic(DiagnosticTypes.sru, 10, # raise Diagnostic(DiagnosticTypes.sru, 10,
# message='The identifier ' + identifier + ' is not supported in ANNIS.') # message='The identifier ' + identifier + ' is not supported in ANNIS.')
return getParams return getParams
# TODO: continue here
def adv_quantify_segment(self, getParams, quantifier: str, config: ResourceConfig): def adv_quantify_segment(self, getParams, quantifier: str, config: ResourceConfig):
if len(getParams) != 1 or getParams[0][0] != 'wf' or getParams[0][2] != '.*': if len(getParams) != 1 or getParams[0][0] != 'tok' or getParams[0][2] != '/.*/' or getParams[0][3] != '=':
raise Diagnostic(DiagnosticTypes.sru, 48, raise Diagnostic(DiagnosticTypes.sru, 48,
message='Token quantifiers are only allowed with empty token queries ' message='Token quantifiers are only allowed with empty token queries '
'in Tsakoprus (for setting distance constraints).') 'in ANNIS (for setting distance constraints).')
minDist = 1 minDist = 1
maxDist = 100 maxDist = 50
if quantifier == '?': if quantifier == '?':
maxDist = 2 maxDist = 2
elif quantifier == '+': elif quantifier == '+':
...@@ -145,10 +159,11 @@ class AnnisQueryParser(QueryParser): ...@@ -145,10 +159,11 @@ class AnnisQueryParser(QueryParser):
minDist = int(m.group(1)) + 1 minDist = int(m.group(1)) + 1
if len(m.group(2)) > 0: if len(m.group(2)) > 0:
maxDist = int(m.group(2)) + 1 maxDist = int(m.group(2)) + 1
op = '.*'
if minDist > 1 or maxDist != 50:
op = '.' + str(minDist) + ',' + str(maxDist)
getParams = [ getParams = [
['word_rel_', getParams[0][1], getParams[0][1] - 1], [op, getParams[0][1], getParams[0][1] - 1]
['word_dist_from_', getParams[0][1], str(minDist)],
['word_dist_to_', getParams[0][1], str(maxDist)]
] ]
return getParams return getParams
...@@ -160,18 +175,17 @@ class AnnisQueryParser(QueryParser): ...@@ -160,18 +175,17 @@ class AnnisQueryParser(QueryParser):
operandR = self.shift_term_indexes(operandR, max(termsL)) operandR = self.shift_term_indexes(operandR, max(termsL))
termsR = self.term_indexes(operandR) termsR = self.term_indexes(operandR)
# Find out if there is already a distance constraint # Find out if there is already a distance constraint
wordRelPresent = (any(param[0] == 'word_rel_' for param in operandL) wordRelPresent = (any(param[0].startswith('.') for param in operandL)
or any(param[0] == 'word_rel_' and param[2] == max(termsL) or any(param[0].startswith('.') and param[2] == max(termsL)
for param in operandR)) for param in operandR))
if not wordRelPresent: if not wordRelPresent:
wordRelParams = [ wordRelParams = [
['word_rel_', min(termsR), max(termsL)], ['.', min(termsR), max(termsL)]
['word_dist_from_', min(termsR), '1'],
['word_dist_to_', min(termsR), '1']
] ]
operandR += wordRelParams operandR += wordRelParams
return operandL + operandR return operandL + operandR
# TODO: continue here
def adv_binary_bool(self, strOp: str, operandL, operandR, config: ResourceConfig): def adv_binary_bool(self, strOp: str, operandL, operandR, config: ResourceConfig):
# Join multiple constraints on one word in an advanced query # Join multiple constraints on one word in an advanced query
print('ADVANCED INTERNAL BOOL', strOp, str(operandL), str(operandR)) print('ADVANCED INTERNAL BOOL', strOp, str(operandL), str(operandR))
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment