diff --git a/common/litterae_query_parser.py b/common/litterae_query_parser.py index ebaf73b9bc7798f13e1c58b331cdb26e1f45697f..22dd3b402e84f62ae3f850fdccedf7cda69eb2b4 100644 --- a/common/litterae_query_parser.py +++ b/common/litterae_query_parser.py @@ -80,6 +80,34 @@ class LitteraeQueryParser(QueryParser): # TODO: implement raise NotImplementedError() + def adv_quantify_segment(self, getParams, quantifier: str, config: ResourceConfig): + if len(getParams) != 1 or getParams[0][0] != 'q_' or getParams[0][2] != '.*': + raise Diagnostic(DiagnosticTypes.sru, 48, + message='Token quantifiers are only allowed with empty token queries ' + 'in Litterae (for setting distance constraints).') + maxDist = 100 + if quantifier == '?': + maxDist = 2 + elif quantifier == '+': + raise Diagnostic(DiagnosticTypes.sru, 48, + message='Litterae does not accept "+" as a token quantifier.') + elif self.rxQuantifierExact.search(quantifier) is not None: + raise Diagnostic(DiagnosticTypes.sru, 48, + message='Litterae does not accept single numbers as token quantifiers.') + else: + m = self.rxQuantifierInterval.search(quantifier) + if m is None: + raise Diagnostic(DiagnosticTypes.sru, 10, + message='Something is wrong with a token quantifier.') + if len(m.group(1)) > 0 and m.group(1) != '0': + raise Diagnostic(DiagnosticTypes.sru, 48, + message='Litterae does not accept token quantifiers with' + 'positive lower bounds.') + if len(m.group(2)) > 0: + maxDist = int(m.group(2)) + getParams = ['slop_', getParams[0][1], str(maxDist)] + return getParams + def send_query(self, strGetParams: str, config: ResourceConfig): """ Send the translated query to the Litterae instance. Return JSON results diff --git a/common/query_parser.py b/common/query_parser.py index 46b1fd558c02e20c48a5f8da6da672203cf1ea65..8ad88014fc58be0f92e5f7b67025a3c00e001818 100644 --- a/common/query_parser.py +++ b/common/query_parser.py @@ -17,9 +17,11 @@ class QueryParser: rxWithinClause = re.compile(' +within +(s|sentence|u|utterance|p|paragraph|' 't|turn|text|session) *$') rxNonemptyQueryPart = re.compile('[^ \t\r\n]') - rxSegmentQuery = re.compile('^\\[(.*)\\](\\{[0-9,]\\}|[?*+]|)$') + rxSegmentQuery = re.compile('^\\[(.*)\\](\\{[0-9,]+\\}|[?*+]|)$') rxAdvTermQuery = re.compile('^ *([a-zA-Z][a-zA-Z0-9\\-]*(?::[a-zA-Z][a-zA-Z0-9\\-]*)?) *' '(!?=) *(["\'].*["\']) *(/[iIcCld])? *$') + rxQuantifierExact = re.compile('^\\{[1-9][0-9]*\\}$') + rxQuantifierInterval = re.compile('^\\{(|0|[1-9][0-9]*),(|0|[1-9][0-9]*)\\}$') acceptableIdentifiers = {'text', 'lemma', 'pos', 'orth', 'norm', 'phonetic'} def __init__(self): @@ -122,6 +124,8 @@ class QueryParser: start += 1 while end > 0 and strQuery[end - 1] in ' \t\n': end -= 1 + if start >= end: + return -1, '' if strQuery[start] == '!': return start, '!' parenthBalance = 0 @@ -160,9 +164,11 @@ class QueryParser: getParamsShifted = [] for param in getParams: if type(param[2]) is int: - newParam = (param[0], param[1] + shift, param[2] + shift) + # int: refers to query word number + # str: refers to a distance constraint + newParam = [param[0], param[1] + shift, param[2] + shift] elif param[1] >= 0: - newParam = (param[0], param[1] + shift, param[2]) + newParam = [param[0], param[1] + shift, param[2]] else: newParam = copy.deepcopy(param) getParamsShifted.append(newParam) @@ -176,35 +182,51 @@ class QueryParser: """ terms = set() for param in getParams: - if type(param[1]) is int: + if param[0] in ('word_rel_', 'word_dist_from_', 'word_dist_to_'): + continue + elif type(param[2]) is int: + terms.add(param[2]) + elif type(param[1]) is int: terms.add(param[1]) elif type(param[1]) is list: for t in param[1]: terms.add(t) + if len(terms) <= 0: + return [0] return [t for t in sorted(terms)] - def build_get_string(self, getParams, config, withinClause=''): + def build_get_string(self, getParams, config: ResourceConfig, withinClause=''): # Abstract function raise NotImplementedError() - def term_query(self, query, config): + def term_query(self, query, config: ResourceConfig): # Abstract function raise NotImplementedError() - def binary_bool(self, strOp, operandL, operandR, config): + def binary_bool(self, strOp, operandL, operandR, config: ResourceConfig): # Abstract function raise NotImplementedError() - def not_bool(self, operand, config): + def not_bool(self, operand, config: ResourceConfig): # Abstract function raise NotImplementedError() - def adv_term_query_proper(self, identifier: str, op: str, value: str, flags: str, config): + def adv_term_query_proper(self, identifier: str, op: str, value: str, flags: str, config: ResourceConfig): + # Abstract function + raise NotImplementedError() + + def adv_quantify_segment(self, query, quantifier: str, config: ResourceConfig): + # Abstract function + raise NotImplementedError() + + def adv_main_sequence(self, resultLeft, resultRight, config: ResourceConfig): # Abstract function raise NotImplementedError() def adv_term_query(self, query, config: ResourceConfig): print('ADVANCED TERM QUERY', query) + if len(query) <= 0: + query = 'text=".*"' m = self.rxAdvTermQuery.search(query) if m is None: raise Diagnostic(DiagnosticTypes.sru, 10) @@ -283,24 +305,38 @@ class QueryParser: return self.not_bool(resultRight, config) raise Diagnostic(DiagnosticTypes.sru, 10) - def adv_expression_query(self, query: str, config: ResourceConfig): + def adv_expression_query(self, query: str, quantifier: str, config: ResourceConfig): query = query.strip() + if len(query) <= 0: + query = 'text=".*"' iOpPos, strOp = self.find_operator_adv_expression(query) print('ADVANED EXPRESSION QUERY', iOpPos, strOp) if iOpPos == -1: if query[0] == '(' and query[-1] == ')': - return self.adv_expression_query(query[1:len(query)-1], config) + return self.adv_expression_query(query[1:len(query)-1], quantifier, config) else: - return self.adv_term_query(query, config) + if len(quantifier) <= 0: + return self.adv_term_query(query, config) + else: + return self.adv_quantify_segment(self.adv_term_query(query, config), + quantifier, config) if strOp in ('&', '|'): - resultLeft = self.adv_expression_query(query[:iOpPos], config) - resultRight = self.adv_expression_query(query[iOpPos+1:], config) + resultLeft = self.adv_expression_query(query[:iOpPos], '', config) + resultRight = self.adv_expression_query(query[iOpPos+1:], '', config) if len(resultLeft) <= 0 or len(resultRight) <= 0: raise Diagnostic(DiagnosticTypes.sru, 10) - return self.adv_binary_bool(strOp, resultLeft, resultRight, config) + if len(quantifier) <= 0: + return self.adv_binary_bool(strOp, resultLeft, resultRight, config) + else: + return self.adv_quantify_segment(self.adv_binary_bool(strOp, resultLeft, resultRight, config), + quantifier, config) elif strOp == '!': - resultRight = self.adv_expression_query(query[iOpPos+1:], config) - return self.not_bool(resultRight, config) + resultRight = self.adv_expression_query(query[iOpPos+1:], '', config) + if len(quantifier) <= 0: + return self.not_bool(resultRight, config) + else: + return self.adv_quantify_segment(self.not_bool(resultRight, config), + quantifier, config) raise Diagnostic(DiagnosticTypes.sru, 10) def adv_segment_query(self, query: str, config: ResourceConfig): @@ -310,8 +346,7 @@ class QueryParser: raise Diagnostic(DiagnosticTypes.sru, 27) expression = m.group(1).strip() quantifier = m.group(2) - # TODO: quantifier - return self.adv_expression_query(expression, config) + return self.adv_expression_query(expression, quantifier, config) def adv_simple_query(self, query: str, config: ResourceConfig, start=0, end=-1): if len(query) <= 0: @@ -348,14 +383,14 @@ class QueryParser: if iOpPos == -1: return self.adv_simple_query(query, config, start=start, end=end) resultLeft = self.adv_simple_query(query, config, start=start, end=iOpPos) - resultRight = self.adv_main_query(query, config, start=start, end=iOpPos + 1) + resultRight = self.adv_main_query(query, config, start=iOpPos + 1, end=end) if strOp == 'SEQUENCE': if len(resultLeft) <= 0 or len(resultRight) <= 0: raise Diagnostic(DiagnosticTypes.sru, 10) return self.adv_main_sequence(resultLeft, resultRight, config) elif strOp == 'OR': resultLeft = self.adv_simple_query(query, config, start=start, end=iOpPos) - resultRight = self.adv_main_query(query, config, start=start, end=iOpPos + 1) + resultRight = self.adv_main_query(query, config, start=iOpPos + 1, end=end) if len(resultLeft) <= 0 or len(resultRight) <= 0: raise Diagnostic(DiagnosticTypes.sru, 10) return self.adv_main_or(resultLeft, resultRight, config) diff --git a/common/tsakorpus_query_parser.py b/common/tsakorpus_query_parser.py index 077b9e68fc71387c55df274bb8f100a6148b9914..f6a61e4e6b558e5367191c6d0f9b7ba0dcc12b3e 100644 --- a/common/tsakorpus_query_parser.py +++ b/common/tsakorpus_query_parser.py @@ -121,6 +121,55 @@ class TsakorpusQueryParser(QueryParser): message='The identifier ' + identifier + ' is not supported in Tsakoprus.') return getParams + def adv_quantify_segment(self, getParams, quantifier: str, config: ResourceConfig): + if len(getParams) != 1 or getParams[0][0] != 'wf' or getParams[0][2] != '.*': + raise Diagnostic(DiagnosticTypes.sru, 48, + message='Token quantifiers are only allowed with empty token queries ' + 'in Tsakoprus (for setting distance constraints).') + minDist = 1 + maxDist = 100 + if quantifier == '?': + maxDist = 2 + elif quantifier == '+': + minDist = 2 + elif self.rxQuantifierExact.search(quantifier) is not None: + minDist = maxDist = int(quantifier[1:len(quantifier)-1]) + else: + m = self.rxQuantifierInterval.search(quantifier) + if m is None: + raise Diagnostic(DiagnosticTypes.sru, 10, + message='Something is wrong with a token quantifier.') + if len(m.group(1)) > 0: + minDist = int(m.group(1)) + 1 + if len(m.group(2)) > 0: + maxDist = int(m.group(2)) + 1 + getParams = [ + ['word_rel_', getParams[0][1], getParams[0][1] - 1], + ['word_dist_from_', getParams[0][1], str(minDist)], + ['word_dist_to_', getParams[0][1], str(maxDist)] + ] + return getParams + + def adv_main_sequence(self, operandL, operandR, config: ResourceConfig): + # print('SEQUENCE JOIN', str(operandL), str(operandR)) + if len(operandL) <= 0 or len(operandR) <= 0: + raise Diagnostic(DiagnosticTypes.sru, 10) + termsL = self.term_indexes(operandL) + operandR = self.shift_term_indexes(operandR, max(termsL)) + termsR = self.term_indexes(operandR) + # Find out if there is already a distance constraint + wordRelPresent = (any(param[0] == 'word_rel_' for param in operandL) + or any(param[0] == 'word_rel_' and param[2] == max(termsL) + for param in operandR)) + if not wordRelPresent: + wordRelParams = [ + ['word_rel_', min(termsR), max(termsL)], + ['word_dist_from_', min(termsR), '1'], + ['word_dist_to_', min(termsR), '1'] + ] + operandR += wordRelParams + return operandL + operandR + def adv_binary_bool(self, strOp: str, operandL, operandR, config: ResourceConfig): # Join multiple constraints on one word in an advanced query print('ADVANCED INTERNAL BOOL', strOp, str(operandL), str(operandR)) diff --git a/test_queries.txt b/test_queries.txt index db607020a7c85206f5349029665d21f372060856..b9735cc919a1017269abceb28ec59d09dc3c994b 100644 --- a/test_queries.txt +++ b/test_queries.txt @@ -6,10 +6,12 @@ http://127.0.0.1:5000/fcs-endpoint/flc?operation=searchRetrieve&query=regnum%20O http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=ke%20OR%20no Tsakorpus -- Simple search with boolean operator ("ke" OR "no") http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=ke%20AND%20no Tsakorpus -- Simple search with boolean operator ("ke" AND "no", 103 hits) http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=ke%20no Tsakorpus -- Simple search with multiple words ("ke no", 18 hits) -http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=ke%20no Tsakorpus -- Simple search with multiple words ("ke no", 18 hits) +http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=[text=%22ke%22]%20[text=%22no%22]&x-fcs-dataviews=adv&queryType=fcs Tsakorpus -- Advanced search with multiple words ("ke no", 18 hits) +http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=[text=%22ke%22]%20[]{0,0}%20[text=%22no%22]&x-fcs-dataviews=adv&queryType=fcs Tsakorpus -- Advanced search with multiple words and fake distance constraint ("ke no", 18 hits) http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=ke%20AND%20no&x-fcs-dataviews=adv Tsakorpus -- Simple search with boolean operator, advanced view ("ke" AND "no", 103 hits) http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=%22ka%22&x-fcs-dataviews=adv&queryType=fcs Tsakorpus -- Advanced search, term only ("ka", 112 hits) http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=[text=%22mon%22]&x-fcs-dataviews=adv&queryType=fcs Tsakorpus -- Advanced search, term only / word ("mon", 1465 hits) http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=[lemma=%22mon%22]&x-fcs-dataviews=adv&queryType=fcs Tsakorpus -- Advanced search, term only / lemma ("mon", 2284 hits) http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=[pos=%22NOUN%22]&x-fcs-dataviews=adv&queryType=fcs Tsakorpus -- Advanced search, term only / POS (NOUN, 22639 hits) -http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=[lemma%3D%22m.*%22%20%26%20(pos=%22NOUN%22|pos=%22VERB%22)]&x-fcs-dataviews=adv&queryType=fcs Tsakorpus -- Advanced search, Boolean operators inside term query (words that start with "m" and are either nouns or verbs, 3878 hits) \ No newline at end of file +http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=[lemma%3D%22m.*%22%20%26%20(pos=%22NOUN%22|pos=%22VERB%22)]&x-fcs-dataviews=adv&queryType=fcs Tsakorpus -- Advanced search, Boolean operators inside term query (words that start with "m" and are either nouns or verbs, 3878 hits) +http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=[text=%22ud%22]%20[]{1,2}%20[text=%22no%22]&x-fcs-dataviews=adv&queryType=fcs Tsakorpus -- Advanced search, sequence of terms ("ud" followed by "no" at the distance between 2 and 3, 7 hits) \ No newline at end of file