Skip to content
Snippets Groups Projects
Commit 03669606 authored by Timofey Arkhangelskiy's avatar Timofey Arkhangelskiy
Browse files

Sequences in Tsakorpus advanced queries now work

parent 717f43b9
No related branches found
No related tags found
No related merge requests found
......@@ -80,6 +80,34 @@ class LitteraeQueryParser(QueryParser):
# TODO: implement
raise NotImplementedError()
def adv_quantify_segment(self, getParams, quantifier: str, config: ResourceConfig):
    """
    Turn a quantified empty token query (e.g. ``[]{0,3}``) into a
    Litterae distance constraint.

    getParams -- translated contents of the segment; it must consist of
        exactly one 'q_' parameter whose value is '.*', otherwise the
        quantifier is rejected.
    quantifier -- non-empty quantifier text that followed the segment:
        '?', '*', '+', '{n}' or '{m,n}'.
    config -- resource configuration (not used here).

    Return ['slop_', <query word number>, <maximal distance as a string>].
    Raise Diagnostic (sru, 48) for quantifiers that cannot be expressed
    as an upper bound only ('+', '{n}', positive lower bounds), and
    Diagnostic (sru, 10) if the quantifier cannot be parsed at all.
    """
    if len(getParams) != 1 or getParams[0][0] != 'q_' or getParams[0][2] != '.*':
        raise Diagnostic(DiagnosticTypes.sru, 48,
                         message='Token quantifiers are only allowed with empty token queries '
                                 'in Litterae (for setting distance constraints).')
    # Upper bound used whenever the quantifier does not name one.
    maxDist = 100
    if quantifier == '?':
        maxDist = 2
    elif quantifier == '*':
        # Zero or more tokens: same as '{0,}', i.e. no lower bound and
        # the default upper bound. Previously this fell through to the
        # interval regex and was rejected as a syntax error.
        pass
    elif quantifier == '+':
        raise Diagnostic(DiagnosticTypes.sru, 48,
                         message='Litterae does not accept "+" as a token quantifier.')
    elif self.rxQuantifierExact.search(quantifier) is not None:
        raise Diagnostic(DiagnosticTypes.sru, 48,
                         message='Litterae does not accept single numbers as token quantifiers.')
    else:
        m = self.rxQuantifierInterval.search(quantifier)
        if m is None:
            raise Diagnostic(DiagnosticTypes.sru, 10,
                             message='Something is wrong with a token quantifier.')
        lowerBound, upperBound = m.group(1), m.group(2)
        # Only an absent or zero lower bound can be expressed.
        if lowerBound not in ('', '0'):
            raise Diagnostic(DiagnosticTypes.sru, 48,
                             message='Litterae does not accept token quantifiers with '
                                     'positive lower bounds.')
        if upperBound:
            maxDist = int(upperBound)
    # NOTE(review): this is a single flat parameter, whereas the incoming
    # getParams is a list of parameters -- confirm that the code consuming
    # the result expects this shape.
    getParams = ['slop_', getParams[0][1], str(maxDist)]
    return getParams
def send_query(self, strGetParams: str, config: ResourceConfig):
"""
Send the translated query to the Litterae instance. Return JSON results
......
......@@ -17,9 +17,11 @@ class QueryParser:
rxWithinClause = re.compile(' +within +(s|sentence|u|utterance|p|paragraph|'
't|turn|text|session) *$')
rxNonemptyQueryPart = re.compile('[^ \t\r\n]')
rxSegmentQuery = re.compile('^\\[(.*)\\](\\{[0-9,]\\}|[?*+]|)$')
rxSegmentQuery = re.compile('^\\[(.*)\\](\\{[0-9,]+\\}|[?*+]|)$')
rxAdvTermQuery = re.compile('^ *([a-zA-Z][a-zA-Z0-9\\-]*(?::[a-zA-Z][a-zA-Z0-9\\-]*)?) *'
'(!?=) *(["\'].*["\']) *(/[iIcCld])? *$')
rxQuantifierExact = re.compile('^\\{[1-9][0-9]*\\}$')
rxQuantifierInterval = re.compile('^\\{(|0|[1-9][0-9]*),(|0|[1-9][0-9]*)\\}$')
acceptableIdentifiers = {'text', 'lemma', 'pos', 'orth', 'norm', 'phonetic'}
def __init__(self):
......@@ -122,6 +124,8 @@ class QueryParser:
start += 1
while end > 0 and strQuery[end - 1] in ' \t\n':
end -= 1
if start >= end:
return -1, ''
if strQuery[start] == '!':
return start, '!'
parenthBalance = 0
......@@ -160,9 +164,11 @@ class QueryParser:
getParamsShifted = []
for param in getParams:
if type(param[2]) is int:
newParam = (param[0], param[1] + shift, param[2] + shift)
# int: refers to query word number
# str: refers to a distance constraint
newParam = [param[0], param[1] + shift, param[2] + shift]
elif param[1] >= 0:
newParam = (param[0], param[1] + shift, param[2])
newParam = [param[0], param[1] + shift, param[2]]
else:
newParam = copy.deepcopy(param)
getParamsShifted.append(newParam)
......@@ -176,35 +182,51 @@ class QueryParser:
"""
terms = set()
for param in getParams:
if type(param[1]) is int:
if param[0] in ('word_rel_', 'word_dist_from_', 'word_dist_to_'):
continue
elif type(param[2]) is int:
terms.add(param[2])
elif type(param[1]) is int:
terms.add(param[1])
elif type(param[1]) is list:
for t in param[1]:
terms.add(t)
if len(terms) <= 0:
return [0]
return [t for t in sorted(terms)]
def build_get_string(self, getParams, config, withinClause=''):
def build_get_string(self, getParams, config: ResourceConfig, withinClause=''):
# Abstract function
raise NotImplementedError()
def term_query(self, query, config):
def term_query(self, query, config: ResourceConfig):
# Abstract function
raise NotImplementedError()
def binary_bool(self, strOp, operandL, operandR, config):
def binary_bool(self, strOp, operandL, operandR, config: ResourceConfig):
# Abstract function
raise NotImplementedError()
def not_bool(self, operand, config):
def not_bool(self, operand, config: ResourceConfig):
# Abstract function
raise NotImplementedError()
def adv_term_query_proper(self, identifier: str, op: str, value: str, flags: str, config):
def adv_term_query_proper(self, identifier: str, op: str, value: str, flags: str, config: ResourceConfig):
# Abstract function
raise NotImplementedError()
def adv_quantify_segment(self, query, quantifier: str, config: ResourceConfig):
    """
    Apply a token quantifier to an already translated segment query.

    query -- translated contents of the segment, as returned by
        adv_term_query(), adv_binary_bool() or not_bool().
    quantifier -- non-empty quantifier text that followed the segment.
    config -- configuration of the resource being queried.

    The base class provides no implementation and always raises
    NotImplementedError; corpus-specific parsers override this method.
    """
    # Abstract function
    raise NotImplementedError()
def adv_main_sequence(self, resultLeft, resultRight, config: ResourceConfig):
    """
    Join the translations of two parts of an advanced query that are
    connected by the SEQUENCE operator.

    resultLeft -- translation of the part to the left of the operator.
    resultRight -- translation of the part to the right of the operator.
    config -- configuration of the resource being queried.

    The base class provides no implementation and always raises
    NotImplementedError; corpus-specific parsers override this method.
    """
    # Abstract function
    raise NotImplementedError()
def adv_term_query(self, query, config: ResourceConfig):
print('ADVANCED TERM QUERY', query)
if len(query) <= 0:
query = 'text=".*"'
m = self.rxAdvTermQuery.search(query)
if m is None:
raise Diagnostic(DiagnosticTypes.sru, 10)
......@@ -283,24 +305,38 @@ class QueryParser:
return self.not_bool(resultRight, config)
raise Diagnostic(DiagnosticTypes.sru, 10)
def adv_expression_query(self, query: str, config: ResourceConfig):
def adv_expression_query(self, query: str, quantifier: str, config: ResourceConfig):
query = query.strip()
if len(query) <= 0:
query = 'text=".*"'
iOpPos, strOp = self.find_operator_adv_expression(query)
print('ADVANED EXPRESSION QUERY', iOpPos, strOp)
if iOpPos == -1:
if query[0] == '(' and query[-1] == ')':
return self.adv_expression_query(query[1:len(query)-1], config)
return self.adv_expression_query(query[1:len(query)-1], quantifier, config)
else:
return self.adv_term_query(query, config)
if len(quantifier) <= 0:
return self.adv_term_query(query, config)
else:
return self.adv_quantify_segment(self.adv_term_query(query, config),
quantifier, config)
if strOp in ('&', '|'):
resultLeft = self.adv_expression_query(query[:iOpPos], config)
resultRight = self.adv_expression_query(query[iOpPos+1:], config)
resultLeft = self.adv_expression_query(query[:iOpPos], '', config)
resultRight = self.adv_expression_query(query[iOpPos+1:], '', config)
if len(resultLeft) <= 0 or len(resultRight) <= 0:
raise Diagnostic(DiagnosticTypes.sru, 10)
return self.adv_binary_bool(strOp, resultLeft, resultRight, config)
if len(quantifier) <= 0:
return self.adv_binary_bool(strOp, resultLeft, resultRight, config)
else:
return self.adv_quantify_segment(self.adv_binary_bool(strOp, resultLeft, resultRight, config),
quantifier, config)
elif strOp == '!':
resultRight = self.adv_expression_query(query[iOpPos+1:], config)
return self.not_bool(resultRight, config)
resultRight = self.adv_expression_query(query[iOpPos+1:], '', config)
if len(quantifier) <= 0:
return self.not_bool(resultRight, config)
else:
return self.adv_quantify_segment(self.not_bool(resultRight, config),
quantifier, config)
raise Diagnostic(DiagnosticTypes.sru, 10)
def adv_segment_query(self, query: str, config: ResourceConfig):
......@@ -310,8 +346,7 @@ class QueryParser:
raise Diagnostic(DiagnosticTypes.sru, 27)
expression = m.group(1).strip()
quantifier = m.group(2)
# TODO: quantifier
return self.adv_expression_query(expression, config)
return self.adv_expression_query(expression, quantifier, config)
def adv_simple_query(self, query: str, config: ResourceConfig, start=0, end=-1):
if len(query) <= 0:
......@@ -348,14 +383,14 @@ class QueryParser:
if iOpPos == -1:
return self.adv_simple_query(query, config, start=start, end=end)
resultLeft = self.adv_simple_query(query, config, start=start, end=iOpPos)
resultRight = self.adv_main_query(query, config, start=start, end=iOpPos + 1)
resultRight = self.adv_main_query(query, config, start=iOpPos + 1, end=end)
if strOp == 'SEQUENCE':
if len(resultLeft) <= 0 or len(resultRight) <= 0:
raise Diagnostic(DiagnosticTypes.sru, 10)
return self.adv_main_sequence(resultLeft, resultRight, config)
elif strOp == 'OR':
resultLeft = self.adv_simple_query(query, config, start=start, end=iOpPos)
resultRight = self.adv_main_query(query, config, start=start, end=iOpPos + 1)
resultRight = self.adv_main_query(query, config, start=iOpPos + 1, end=end)
if len(resultLeft) <= 0 or len(resultRight) <= 0:
raise Diagnostic(DiagnosticTypes.sru, 10)
return self.adv_main_or(resultLeft, resultRight, config)
......
......@@ -121,6 +121,55 @@ class TsakorpusQueryParser(QueryParser):
message='The identifier ' + identifier + ' is not supported in Tsakoprus.')
return getParams
def adv_quantify_segment(self, getParams, quantifier: str, config: ResourceConfig):
    """
    Turn a quantified empty token query (e.g. ``[]{1,2}``) into a
    Tsakorpus distance constraint between two neighbouring query words.

    getParams -- translated contents of the segment; it must consist of
        exactly one 'wf' parameter whose value is '.*', otherwise the
        quantifier is rejected.
    quantifier -- non-empty quantifier text that followed the segment:
        '?', '*', '+', '{n}' or '{m,n}'.
    config -- resource configuration (not used here).

    Return three parameters ('word_rel_', 'word_dist_from_',
    'word_dist_to_') that relate the query word with the number taken
    from getParams to the preceding one. k skipped tokens correspond to
    a distance of k + 1, so that zero skipped tokens means adjacency.
    Raise Diagnostic (sru, 48) if the segment is not an empty token
    query, and Diagnostic (sru, 10) if the quantifier cannot be parsed.
    """
    if len(getParams) != 1 or getParams[0][0] != 'wf' or getParams[0][2] != '.*':
        raise Diagnostic(DiagnosticTypes.sru, 48,
                         message='Token quantifiers are only allowed with empty token queries '
                                 'in Tsakorpus (for setting distance constraints).')
    # Defaults: adjacency as the lower bound, 100 as the upper bound
    # whenever the quantifier does not name one.
    minDist = 1
    maxDist = 100
    if quantifier == '?':
        maxDist = 2
    elif quantifier == '*':
        # Zero or more tokens: exactly the defaults. Previously this fell
        # through to the interval regex and was rejected as a syntax error.
        pass
    elif quantifier == '+':
        minDist = 2
    elif self.rxQuantifierExact.search(quantifier) is not None:
        # '{n}' must behave like '{n,n}': n skipped tokens, distance n + 1.
        minDist = maxDist = int(quantifier[1:-1]) + 1
    else:
        m = self.rxQuantifierInterval.search(quantifier)
        if m is None:
            raise Diagnostic(DiagnosticTypes.sru, 10,
                             message='Something is wrong with a token quantifier.')
        if len(m.group(1)) > 0:
            minDist = int(m.group(1)) + 1
        if len(m.group(2)) > 0:
            maxDist = int(m.group(2)) + 1
    wordNum = getParams[0][1]
    getParams = [
        # int third element: number of the query word the distance is
        # measured from; str third element: the distance bound itself.
        ['word_rel_', wordNum, wordNum - 1],
        ['word_dist_from_', wordNum, str(minDist)],
        ['word_dist_to_', wordNum, str(maxDist)]
    ]
    return getParams
def adv_main_sequence(self, operandL, operandR, config: ResourceConfig):
    """
    Concatenate the translations of two query parts joined by SEQUENCE.

    The query word numbers of the right part are shifted past the last
    word of the left part. Unless a distance constraint is already
    there, one is appended that puts the first word of the right part
    immediately after the last word of the left part.

    Raise Diagnostic (sru, 10) if either operand is empty.
    """
    if not (len(operandL) > 0 and len(operandR) > 0):
        raise Diagnostic(DiagnosticTypes.sru, 10)

    lastLeftWord = max(self.term_indexes(operandL))
    shiftedRight = self.shift_term_indexes(operandR, lastLeftWord)
    rightWords = self.term_indexes(shiftedRight)

    # A constraint counts as present if the left part contains any
    # 'word_rel_' parameter, or the right part has one that points at
    # the last word of the left part.
    constraintInLeft = False
    for param in operandL:
        if param[0] == 'word_rel_':
            constraintInLeft = True
            break
    constraintInRight = False
    if not constraintInLeft:
        for param in shiftedRight:
            if param[0] == 'word_rel_' and param[2] == lastLeftWord:
                constraintInRight = True
                break

    if not (constraintInLeft or constraintInRight):
        firstRightWord = min(rightWords)
        # Distance of exactly 1: the two words must be adjacent.
        shiftedRight.extend([
            ['word_rel_', firstRightWord, lastLeftWord],
            ['word_dist_from_', firstRightWord, '1'],
            ['word_dist_to_', firstRightWord, '1']
        ])
    return operandL + shiftedRight
def adv_binary_bool(self, strOp: str, operandL, operandR, config: ResourceConfig):
# Join multiple constraints on one word in an advanced query
print('ADVANCED INTERNAL BOOL', strOp, str(operandL), str(operandR))
......
......@@ -6,10 +6,12 @@ http://127.0.0.1:5000/fcs-endpoint/flc?operation=searchRetrieve&query=regnum%20O
http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=ke%20OR%20no Tsakorpus -- Simple search with boolean operator ("ke" OR "no")
http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=ke%20AND%20no Tsakorpus -- Simple search with boolean operator ("ke" AND "no", 103 hits)
http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=ke%20no Tsakorpus -- Simple search with multiple words ("ke no", 18 hits)
http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=ke%20no Tsakorpus -- Simple search with multiple words ("ke no", 18 hits)
http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=[text=%22ke%22]%20[text=%22no%22]&x-fcs-dataviews=adv&queryType=fcs Tsakorpus -- Advanced search with multiple words ("ke no", 18 hits)
http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=[text=%22ke%22]%20[]{0,0}%20[text=%22no%22]&x-fcs-dataviews=adv&queryType=fcs Tsakorpus -- Advanced search with multiple words and fake distance constraint ("ke no", 18 hits)
http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=ke%20AND%20no&x-fcs-dataviews=adv Tsakorpus -- Simple search with boolean operator, advanced view ("ke" AND "no", 103 hits)
http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=%22ka%22&x-fcs-dataviews=adv&queryType=fcs Tsakorpus -- Advanced search, term only ("ka", 112 hits)
http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=[text=%22mon%22]&x-fcs-dataviews=adv&queryType=fcs Tsakorpus -- Advanced search, term only / word ("mon", 1465 hits)
http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=[lemma=%22mon%22]&x-fcs-dataviews=adv&queryType=fcs Tsakorpus -- Advanced search, term only / lemma ("mon", 2284 hits)
http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=[pos=%22NOUN%22]&x-fcs-dataviews=adv&queryType=fcs Tsakorpus -- Advanced search, term only / POS (NOUN, 22639 hits)
http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=[lemma%3D%22m.*%22%20%26%20(pos=%22NOUN%22|pos=%22VERB%22)]&x-fcs-dataviews=adv&queryType=fcs Tsakorpus -- Advanced search, Boolean operators inside term query (words that start with "m" and are either nouns or verbs, 3878 hits)
\ No newline at end of file
http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=[lemma%3D%22m.*%22%20%26%20(pos=%22NOUN%22|pos=%22VERB%22)]&x-fcs-dataviews=adv&queryType=fcs Tsakorpus -- Advanced search, Boolean operators inside term query (words that start with "m" and are either nouns or verbs, 3878 hits)
http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=[text=%22ud%22]%20[]{1,2}%20[text=%22no%22]&x-fcs-dataviews=adv&queryType=fcs Tsakorpus -- Advanced search, sequence of terms ("ud" followed by "no" at the distance between 2 and 3, 7 hits)
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment