Skip to content
Snippets Groups Projects
Commit 332e02ff authored by Arkhangelskiy, Timofey's avatar Arkhangelskiy, Timofey
Browse files

Continue working on AnnisQueryParser (not tested yet)

parent ffb2fcd7
No related branches found
No related tags found
No related merge requests found
......@@ -13,6 +13,7 @@ class AnnisQueryParser(QueryParser):
"""
rxTsakorpusBool = re.compile('[()|,]')
rxRelOps = re.compile('^(?:\\^\\*|\\||\\.[*,0-9]*)$') # Operators for setting relations between query words
def build_get_string(self, params, config: ResourceConfig, withinClause=''):
"""
......@@ -36,11 +37,18 @@ class AnnisQueryParser(QueryParser):
queryFront = ''
queryTail = ''
for param in sorted(params):
if param[0] == 'wf':
# For query words:
# param = [annotation_layer, query_word_number, value, operator]
# For relations between query words:
# param = [relation, query_word_number_1, query_word_number_2]
if param[0] == 'tok' and param[3] == '=':
# Simplified form for token search
queryFront += param[2].replace('"', '') + ' & '
else:
elif self.rxRelOps.search(param[0]) is not None:
queryTail += '#' + str(param[1]) + ' ' + param[0] + ' #' + str(param[2]) + ' & '
q['query'] = queryFront.strip(' ') + queryTail.strip(' &')
else:
queryFront += param[0] + param[3] + param[2] + ' & '
q['query'] = queryFront.strip(' ') + ' ' + queryTail.strip(' &')
return q
def term_query(self, query: str, config: ResourceConfig):
......@@ -56,7 +64,7 @@ class AnnisQueryParser(QueryParser):
for term in query.split(' '):
if len(term) > 0:
iTerm += 1
getParams.append(['wf', iTerm, '"' + term.replace('"', '') + '"'])
getParams.append(['tok', iTerm, '"' + term.replace('"', '') + '"', '='])
if iTerm >= 2:
getParams.append(['.', iTerm, iTerm-1])
return getParams
......@@ -68,12 +76,12 @@ class AnnisQueryParser(QueryParser):
operandR = self.shift_term_indexes(operandR, max(termsL))
termsR = self.term_indexes(operandR)
if operandL[0][0] != 'wf' or operandR[0][0] != 'wf':
if operandL[0][0] != 'tok' or operandR[0][0] != 'tok':
raise Diagnostic(DiagnosticTypes.sru, 47)
if strOp == 'AND':
if ((len(termsL) > 1 or len(termsR) > 1)
and (any(op[0] not in ('wf', '^*') for op in operandR)
or any(op[0] not in ('wf', '^*') for op in operandL))):
and (any(op[0] not in ('tok', '^*') for op in operandR)
or any(op[0] not in ('tok', '^*') for op in operandL))):
message = 'ANNIS does not support queries that combine several ' \
'multi-word sequences with boolean operators or multiple ' \
'boolean operators.'
......@@ -81,8 +89,8 @@ class AnnisQueryParser(QueryParser):
return operandL + operandR + [['^*', max(termsL), min(termsR)]]
elif strOp == 'OR':
if ((len(termsL) > 1 or len(termsR) > 1)
and (any(op[0] not in ('wf', '|') for op in operandR)
or any(op[0] not in ('wf', '|') for op in operandL))):
and (any(op[0] not in ('tok', '|') for op in operandR)
or any(op[0] not in ('tok', '|') for op in operandL))):
message = 'ANNIS does not support queries that combine several ' \
'multi-word sequences with boolean operators or multiple ' \
'boolean operators.'
......@@ -91,8 +99,18 @@ class AnnisQueryParser(QueryParser):
raise Diagnostic(DiagnosticTypes.sru, 37, details=strOp)
def not_bool(self, operand, config):
    """
    Negate a single-token subquery.

    ``operand`` is a list of query parameters, each of the form
    ``[annotation_layer, query_word_number, value, operator]``. Only an
    operand that consists of exactly one ``tok`` parameter can be
    negated: its operator is flipped from ``=`` to ``!=`` (or back to
    ``=`` otherwise).

    Returns a new one-element parameter list; ``operand`` itself is not
    modified.

    Raises Diagnostic (SRU 10) if the operand is empty and Diagnostic
    (SRU 48) if it is anything other than a single-token subquery.
    """
    if not operand:
        raise Diagnostic(DiagnosticTypes.sru, 10)
    # operand is a list of parameter lists, so the annotation layer is
    # operand[0][0], not operand[0].
    if len(operand) != 1 or operand[0][0] != 'tok':
        message = 'ANNIS does not support queries that negate anything ' \
                  'other than a single-token subquery.'
        raise Diagnostic(DiagnosticTypes.sru, 48, message=message)
    # Copy the inner parameter so the caller's list is left untouched.
    negated = list(operand[0])
    negated[3] = '!=' if negated[3] == '=' else '='
    return [negated]
def adv_term_query_proper(self, identifier: str, op: str, value: str, flags: str, config: ResourceConfig):
"""
......@@ -103,33 +121,29 @@ class AnnisQueryParser(QueryParser):
raise Diagnostic(DiagnosticTypes.sru, 10)
if flags not in ('', 'I', 'C'):
raise Diagnostic(DiagnosticTypes.sru, 48, message='ANNIS does not support regex flags.')
if op != '=':
raise Diagnostic(DiagnosticTypes.sru, 10,
message='In token queries, only = is allowed as operators.')
getParams = []
if identifier == 'text':
getParams.append(['wf', 1, '/' + value.replace('/', '\\/') + '/'])
getParams.append(['tok', 1, '/' + value.replace('/', '\\/') + '/', op])
elif identifier == 'lemma':
getParams.append(['lemma', 1, '/' + value.replace('/', '\\/') + '/'])
getParams.append(['lemma', 1, '/' + value.replace('/', '\\/') + '/', op])
elif identifier == 'pos':
if value in config.pos_convert_reverse:
# UD to corpus-specific POS tags
value = config.pos_convert_reverse[value]
getParams.append(['pos', 1, '/' + value.replace('/', '\\/') + '/'])
getParams.append(['pos', 1, '/' + value.replace('/', '\\/') + '/', op])
else:
getParams.append([identifier, 1, '/' + value.replace('/', '\\/') + '/'])
getParams.append([identifier, 1, '/' + value.replace('/', '\\/') + '/', op])
# raise Diagnostic(DiagnosticTypes.sru, 10,
# message='The identifier ' + identifier + ' is not supported in ANNIS.')
return getParams
# TODO: continue here
def adv_quantify_segment(self, getParams, quantifier: str, config: ResourceConfig):
if len(getParams) != 1 or getParams[0][0] != 'wf' or getParams[0][2] != '.*':
if len(getParams) != 1 or getParams[0][0] != 'tok' or getParams[0][2] != '/.*/' or getParams[0][3] != '=':
raise Diagnostic(DiagnosticTypes.sru, 48,
message='Token quantifiers are only allowed with empty token queries '
'in Tsakoprus (for setting distance constraints).')
'in ANNIS (for setting distance constraints).')
minDist = 1
maxDist = 100
maxDist = 50
if quantifier == '?':
maxDist = 2
elif quantifier == '+':
......@@ -145,10 +159,11 @@ class AnnisQueryParser(QueryParser):
minDist = int(m.group(1)) + 1
if len(m.group(2)) > 0:
maxDist = int(m.group(2)) + 1
op = '.*'
if minDist > 1 or maxDist != 50:
op = '.' + str(minDist) + ',' + str(maxDist)
getParams = [
['word_rel_', getParams[0][1], getParams[0][1] - 1],
['word_dist_from_', getParams[0][1], str(minDist)],
['word_dist_to_', getParams[0][1], str(maxDist)]
[op, getParams[0][1], getParams[0][1] - 1]
]
return getParams
......@@ -160,18 +175,17 @@ class AnnisQueryParser(QueryParser):
operandR = self.shift_term_indexes(operandR, max(termsL))
termsR = self.term_indexes(operandR)
# Find out if there is already a distance constraint
wordRelPresent = (any(param[0] == 'word_rel_' for param in operandL)
or any(param[0] == 'word_rel_' and param[2] == max(termsL)
wordRelPresent = (any(param[0].startswith('.') for param in operandL)
or any(param[0].startswith('.') and param[2] == max(termsL)
for param in operandR))
if not wordRelPresent:
wordRelParams = [
['word_rel_', min(termsR), max(termsL)],
['word_dist_from_', min(termsR), '1'],
['word_dist_to_', min(termsR), '1']
['.', min(termsR), max(termsL)]
]
operandR += wordRelParams
return operandL + operandR
# TODO: continue here
def adv_binary_bool(self, strOp: str, operandL, operandR, config: ResourceConfig):
# Join multiple constraints on one word in an advanced query
print('ADVANCED INTERNAL BOOL', strOp, str(operandL), str(operandR))
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment