diff --git a/common/query_parser.py b/common/query_parser.py
index ffb00f3cdfbc5af01ba65c4e170a5cdd1fdc8711..9f3d6c9076d96619ab816eff0599d16265fdf06e 100644
--- a/common/query_parser.py
+++ b/common/query_parser.py
@@ -17,6 +17,13 @@ class QueryParser:
     rxWithinClause = re.compile(' +within +(s|sentence|u|utterance|p|paragraph|'
                                 't|turn|text|session) *$')
     rxNonemptyQueryPart = re.compile('[^ \t\r\n]')
+    # One segment of an advanced query: "[expression]" plus an optional quantifier
+    rxSegmentQuery = re.compile('^\\[(.*)\\](\\{[0-9,]+\\}|[?*+]|)$')
+    # One term of a segment expression: identifier, operator (= or !=),
+    # quoted value and an optional flag such as /i
+    rxAdvTermQuery = re.compile('^ *([a-zA-Z][a-zA-Z0-9\\-]*(?::[a-zA-Z][a-zA-Z0-9\\-]*)?) *'
+                                '(!?=) *(["\'](.*)["\']) *(/[iIcCld])? *$')
+    acceptableIdentifiers = {'text', 'lemma', 'pos', 'orth', 'norm', 'phonetic'}
 
     def __init__(self):
         pass
@@ -63,6 +70,7 @@ class QueryParser:
         bracketBalance = 0
         curlyBalance = 0
         inQuotes = False
+        inSingleQuotes = False
         for i in range(start, end):
             if inQuotes:
                 if strQuery[i] == '"' and i > 0 and strQuery[i-1] != '\\':
@@ -71,6 +79,13 @@ class QueryParser:
             if strQuery[i] == '"' and i > 0 and strQuery[i-1] != '\\':
                 inQuotes = True
                 continue
+            if inSingleQuotes:
+                if strQuery[i] == "'" and i > 0 and strQuery[i-1] != '\\':
+                    inSingleQuotes = False
+                continue
+            if strQuery[i] == "'" and i > 0 and strQuery[i-1] != '\\':
+                inSingleQuotes = True
+                continue
             if strQuery[i] == '(':
                 parenthBalance += 1
             elif strQuery[i] == ')':
@@ -98,6 +113,49 @@ class QueryParser:
                 return iCurChar - 1, 'SEQUENCE'
         return -1, ''
 
+    @staticmethod
+    def find_operator_adv_expression(strQuery):
+        """
+        Locate the highest |, & or ! operator in a segment expression
+        in the advanced search.
+
+        Return a tuple (position, operator). A leading ! is reported
+        first; otherwise the first | or & found outside of quotes and
+        parentheses is returned. If there is no operator (including
+        the case of an empty or whitespace-only string), return (-1, '').
+        """
+        start = 0
+        end = len(strQuery)
+        while start < end and strQuery[start] in ' \t\n':
+            start += 1
+        while end > start and strQuery[end - 1] in ' \t\n':
+            end -= 1
+        if start >= end:
+            # Nothing but whitespace: strQuery[start] would be out of range
+            return -1, ''
+        if strQuery[start] == '!':
+            return start, '!'
+
+        parenthBalance = 0
+        quoteChar = ''    # The quote character we are inside of, if any
+        for i in range(start, end):
+            c = strQuery[i]
+            escaped = (i > 0 and strQuery[i - 1] == '\\')
+            if quoteChar:
+                if c == quoteChar and not escaped:
+                    quoteChar = ''
+                continue
+            if c in '"\'' and not escaped:
+                quoteChar = c
+                continue
+            if c == '(':
+                parenthBalance += 1
+            elif c == ')':
+                parenthBalance -= 1
+            elif parenthBalance == 0 and c in '|&':
+                return i, c
+        return -1, ''
+
     @staticmethod
     def shift_term_indexes(getParams, shift):
         """
@@ -146,6 +204,55 @@ class QueryParser:
         # Abstract function
         raise NotImplementedError()
 
+    def adv_term_query_proper(self, identifier: str, op: str, value: str, flags: str, config):
+        """
+        Translate one term of an advanced query (identifier, operator,
+        unquoted value, flags) into a corpus-specific query.
+        """
+        # Abstract function
+        raise NotImplementedError()
+
+    def adv_term_query(self, query, config: ResourceConfig):
+        """
+        Parse one term of a segment expression, e.g. pos = "NOUN" /i,
+        validate it and pass its parts to adv_term_query_proper().
+        Raise a Diagnostic if the term is malformed or its identifier
+        is not acceptable.
+        """
+        m = self.rxAdvTermQuery.search(query)
+        if m is None:
+            raise Diagnostic(DiagnosticTypes.sru, 10)
+        # Group 3 is the value with its quotes, group 4 is the value
+        # without them, and group 5 is the optional flag
+        identifier, op, value, flags = m.group(1), m.group(2), m.group(3), m.group(5)
+        if value[0] != value[-1]:
+            raise Diagnostic(DiagnosticTypes.sru, 10)  # Different quotes
+        value = value[1:-1]  # Remove quotes
+        if flags is None:
+            flags = ''
+        else:
+            flags = flags[1:]  # Remove the leading slash
+        if identifier in ('token', 'word'):
+            identifier = 'text'  # Should I do this?
+        if identifier not in self.acceptableIdentifiers:
+            raise Diagnostic(DiagnosticTypes.sru, 10,
+                             message=identifier + ' is not an acceptable identifier in a segment query.')
+        return self.adv_term_query_proper(identifier, op, value, flags, config)
+
+    def adv_binary_bool(self, strOp, operandL, operandR, config):
+        """
+        Combine two translated operands with a binary operator (& or |).
+        """
+        # Abstract function
+        raise NotImplementedError()
+
+    def adv_not_bool(self, operand, config):
+        """
+        Negate a translated operand.
+        """
+        # Abstract function
+        raise NotImplementedError()
+
     def translate_simple(self, query: str, config: ResourceConfig, start=0, end=-1):
         """
         Translate a simple search (CQL) query into a corpus-specific query
@@ -199,6 +306,76 @@ class QueryParser:
             return self.not_bool(resultRight, config)
         return {}
 
+    def adv_expression_query(self, query: str, config: ResourceConfig):
+        """
+        Translate a segment expression, i.e. terms combined with
+        |, & and !, possibly with parentheses, into a corpus-specific
+        query. Raise a Diagnostic if the expression is empty or malformed.
+        """
+        query = query.strip()
+        if len(query) <= 0:
+            raise Diagnostic(DiagnosticTypes.sru, 10)
+        iOpPos, strOp = self.find_operator_adv_expression(query)
+        if iOpPos == -1:
+            if query[0] == '(' and query[-1] == ')':
+                return self.adv_expression_query(query[1:len(query)-1], config)
+            return self.adv_term_query(query, config)
+        if strOp == '!':
+            resultRight = self.adv_expression_query(query[iOpPos+1:], config)
+            # Use the advanced-search negation rather than not_bool(),
+            # which belongs to the simple search
+            return self.adv_not_bool(resultRight, config)
+        resultLeft = self.adv_expression_query(query[:iOpPos], config)
+        resultRight = self.adv_expression_query(query[iOpPos+1:], config)
+        if len(resultLeft) <= 0 or len(resultRight) <= 0:
+            raise Diagnostic(DiagnosticTypes.sru, 10)
+        return self.adv_binary_bool(strOp, resultLeft, resultRight, config)
+
+    def adv_segment_query(self, query: str, config: ResourceConfig):
+        """
+        Translate one segment of an advanced query, i.e. an expression
+        in square brackets with an optional quantifier. Raise a
+        Diagnostic if the query does not look like a segment.
+        """
+        m = self.rxSegmentQuery.search(query)
+        if m is None:
+            raise Diagnostic(DiagnosticTypes.sru, 27)
+        expression = m.group(1).strip()
+        # TODO: quantifier (m.group(2)) is currently ignored
+        return self.adv_expression_query(expression, config)
+
+    def adv_simple_query(self, query: str, config: ResourceConfig, start=0, end=-1):
+        """
+        Translate the part of an advanced query between the positions
+        start (inclusive) and end (exclusive). A parenthesized query is
+        passed on to adv_main_query(), a quoted string is treated as
+        [text="..."], and anything else is treated as a segment.
+        A negative end means the end of the query.
+        """
+        if len(query) <= 0:
+            raise Diagnostic(DiagnosticTypes.sru, 27)
+        if end < 0:
+            # The default value would otherwise make every call fail
+            end = len(query)
+        end = min(end, len(query))
+        if start >= len(query) - 1 or end <= 0:
+            raise Diagnostic(DiagnosticTypes.sru, 10)
+        while start < end and query[start] in ' \t\n':
+            start += 1
+        while end > start and query[end - 1] in ' \t\n':
+            end -= 1
+        if start >= end:
+            raise Diagnostic(DiagnosticTypes.sru, 10)
+        # end is exclusive, so the last character is query[end - 1]
+        first, last = query[start], query[end - 1]
+        if first == '(' and last == ')':
+            return self.adv_main_query(query, config, start=start+1, end=end-1)
+        if (end - start >= 2 and first == last and first in '"\''
+                and query[end - 2] != '\\'):
+            return self.adv_segment_query('[text=' + query[start:end] + ']', config)
+        return self.adv_segment_query(query[start:end], config)
+
+
     def adv_main_query(self, query: str, config: ResourceConfig, start=0, end=-1):
         if len(query) <= 0:
             raise Diagnostic(DiagnosticTypes.sru, 27)
diff --git a/common/tsakorpus_query_parser.py b/common/tsakorpus_query_parser.py
index 0105c4d8353ad9b0e19adac3a2493765b347b6a6..abfe9d3e540f88078533942bcc78eca4305935aa 100644
--- a/common/tsakorpus_query_parser.py
+++ b/common/tsakorpus_query_parser.py
@@ -84,6 +84,32 @@ class TsakorpusQueryParser(QueryParser):
         # TODO: implement
         raise NotImplementedError()
 
+    def adv_term_query_proper(self, identifier: str, op: str, value: str, flags: str, config: ResourceConfig):
+        """
+        Return list of query parameters for one term in an advanced query.
+        Raise a Diagnostic if the value is empty, or if the flags or
+        the identifier are not supported.
+        """
+        # NOTE(review): op is not used, so != is currently treated as =
+        if len(value) <= 0:
+            raise Diagnostic(DiagnosticTypes.sru, 10)
+        if flags not in ('', 'i', 'c'):
+            raise Diagnostic(DiagnosticTypes.sru, 10, message='Tsakorpus does not support regex flags.')
+        getParams = []
+        if identifier == 'text':
+            getParams.append(['wf', 0, value])
+        elif identifier == 'lemma':
+            getParams.append(['lex', 0, value])
+        elif identifier == 'pos':
+            if value in config.pos_convert_reverse:
+                # UD to corpus-specific POS tags
+                value = config.pos_convert_reverse[value]
+            getParams.append(['gr', 0, value])
+        else:
+            raise Diagnostic(DiagnosticTypes.sru, 10,
+                             message='The identifier ' + identifier + ' is not supported in Tsakorpus.')
+        return getParams
+
     def send_query(self, strGetParams: str, config: ResourceConfig):
         """
         Send the translated query to the Tsakorpus instance. Return JSON results