diff --git a/common/diagnostics.py b/common/diagnostics.py
index a9459330e8dff11b01364888b5e2be06837d7b11..d08e3bfa3df6f5b239ec9e97ae0dc11663a9922c 100644
--- a/common/diagnostics.py
+++ b/common/diagnostics.py
@@ -2,7 +2,7 @@ from .enums import *
 import jinja2
 
 
-class Diagnostic:
+class Diagnostic(Exception):
     """
     Contains methods for issuing diagnostic messages (fatal or non-fatal)
     as per FCS specifications.
@@ -12,10 +12,12 @@ class Diagnostic:
     """
 
     fatalFCSDiagnostics = {3, 10, 11} # FCS specifications, 4.2
-    fatalSRUDiagnostics = {8, 10, 27, 235} # A subset actually used by this endpoint
+    fatalSRUDiagnostics = {8, 10, 27, 37, 47, 48, 235} # A subset actually used by this endpoint
 
     stdMessages = {
-        (DiagnosticTypes.sru, 10): 'Something is wrong with the query syntax.'
+        (DiagnosticTypes.sru, 10): 'Something is wrong with the query syntax.',
+        (DiagnosticTypes.sru, 27): 'The query should not be empty.',
+        (DiagnosticTypes.sru, 37): 'Unsupported boolean operator.'
     }
 
     def __init__(self, diagType: DiagnosticTypes, diagID: int,
@@ -55,7 +57,7 @@ class Diagnostic:
             return 'info:srw/diagnostic/1/' + str(self.diagID)
         return ''
 
-    def __repr__(self):
+    def __str__(self):
         """
         Return the XML version of this diagnostic.
         """
@@ -65,6 +67,9 @@ class Diagnostic:
                                   message=self.message)
         return xmlText.strip()
 
+    def __repr__(self):
+        return str(self)
+
 
 if __name__ == '__main__':
     # Test
diff --git a/common/tsakorpus_query_parser.py b/common/tsakorpus_query_parser.py
index 88624a34b21042e5cc774d552c61100fd35c5da2..f9c7b9bd79305b871be7ec40c1aab0541d9b7599 100644
--- a/common/tsakorpus_query_parser.py
+++ b/common/tsakorpus_query_parser.py
@@ -1,3 +1,4 @@
+import copy
 import re
 from .query_parser import QueryParser
 from .config import ResourceConfig
@@ -8,6 +9,8 @@ class TsakorpusQueryParser(QueryParser):
     Parses search queries for Tsakorpus-based corpora.
     """
 
+    rxTsakorpusBool = re.compile('[()|,]')
+
     def term_query(self, query, config):
         """
         Return list of query parameters for one term or sequence of terms.
@@ -21,7 +24,7 @@ class TsakorpusQueryParser(QueryParser):
         for term in query.split(' '):
             if len(term) > 0:
                 iTerm += 1
-                getParams.append(['w', iTerm, term])
+                getParams.append(['wf', iTerm, term])
                 if iTerm >= 2:
                     getParams.append(['word_rel_', [iTerm-1, iTerm], '1'])
                     getParams.append(['word_dist_from_', [iTerm-1, iTerm], '1'])
@@ -29,25 +32,98 @@ class TsakorpusQueryParser(QueryParser):
 
         return getParams
 
+    def term_indexes(self, getParams):
+        """
+        Find all search term indexes used in the GET parameters
+        specified by getParams list. Return list of integers (1-based).
+        """
+        terms = set()
+        for param in getParams:
+            if isinstance(param[1], int):
+                terms.add(param[1])
+            elif isinstance(param[1], (list, tuple)):
+                terms.update(param[1])
+        return sorted(terms)
+
+
+    def shift_term_indexes(self, getParams, shift):
+        """
+        Increase all search term indexes in the GET parameters
+        specified by getParams by shift. Return a new list; the
+        list passed in is not modified.
+        """
+        getParamsShifted = []
+        for param in getParams:
+            if isinstance(param[1], int):
+                newParam = [param[0], param[1] + shift, param[2]]
+            elif isinstance(param[1], (list, tuple)):
+                newParam = [param[0], [i + shift for i in param[1]], param[2]]
+            else:
+                # Not bound to a term index: there is nothing to shift.
+                newParam = list(param)
+            getParamsShifted.append(newParam)
+        return getParamsShifted
+
+
+    def binary_bool(self, strOp, operandL, operandR, config):
+        """
+        Combine the GET parameters of two translated operands with the
+        boolean operator strOp ('AND' or 'OR'). Return the GET parameters
+        of the whole expression or raise a Diagnostic exception if the
+        combination cannot be expressed in Tsakorpus.
+        """
+        if len(operandL) <= 0 or len(operandR) <= 0:
+            raise Diagnostic(DiagnosticTypes.sru, 10)
+        if strOp not in ('AND', 'OR'):
+            raise Diagnostic(DiagnosticTypes.sru, 37, details=strOp)
+        termsL = self.term_indexes(operandL)
+        # default=0: an operand without numbered terms requires no shift.
+        operandR = self.shift_term_indexes(operandR, max(termsL, default=0))
+        termsR = self.term_indexes(operandR)
+        message = 'Tsakorpus does not support queries that combine several ' \
+                  'multi-word sequences with boolean operators.'
+        if strOp == 'AND':
+            if len(termsL) > 1 and len(termsR) > 1:
+                raise Diagnostic(DiagnosticTypes.sru, 48, message=message)
+            return operandL + operandR
+        if len(termsL) > 1 or len(termsR) > 1:
+            raise Diagnostic(DiagnosticTypes.sru, 48, message=message)
+        if operandL[0][0] != 'wf' or operandR[0][0] != 'wf':
+            raise Diagnostic(DiagnosticTypes.sru, 47)
+        # Parenthesize whichever side already contains Tsakorpus boolean
+        # syntax, so that the new '|' cannot regroup its parts.
+        valueL, valueR = operandL[0][2], operandR[0][2]
+        if self.rxTsakorpusBool.search(valueL) is not None:
+            valueL = '(' + valueL + ')'
+        if self.rxTsakorpusBool.search(valueR) is not None:
+            valueR = '(' + valueR + ')'
+        return [['wf', operandL[0][1], valueL + '|' + valueR]]
+
     def translate_fcsql(self, query: str, config: ResourceConfig, basicSearch: bool = False, start=0, end=-1):
         """
         Translate an FCS-QL query into a Tsakorpus GET query.
-        If something is wrong with the query, return a diagnostic.
+        If something is wrong with the query, raise a Diagnostic exception.
         The function is recursive and only looks at the part of the string
         delimited by start and end parameters.
         """
         if end == -1:
             end = len(query)
         if end == 0:
-            return Diagnostic(DiagnosticTypes.sru, 27, message='The query should not be empty.')
+            raise Diagnostic(DiagnosticTypes.sru, 27)
         if self.rxTermQuery.search(query) is not None:
             return self.term_query(query, config)
         # if query.count('(') != query.count(')'):
         #     return None
         if len(query) <= 0:
-            return Diagnostic(DiagnosticTypes.sru, 27, message='The query should not be empty.')
+            raise Diagnostic(DiagnosticTypes.sru, 27)
+        if start >= len(query) - 1 or end <= 0:
+            raise Diagnostic(DiagnosticTypes.sru, 10)
+        while start < len(query) and query[start] in ' \t\n':
+            start += 1
+        while end > 0 and query[end - 1] in ' \t\n':
+            end -= 1
         if start >= end:
-            return Diagnostic(DiagnosticTypes.sru, 10)
+            raise Diagnostic(DiagnosticTypes.sru, 10)
         iOpPos, strOp = self.find_operator(query, start, end)
         if iOpPos == -1:
             if query[start] == '(' and query[end - 1] == ')':
@@ -57,9 +133,8 @@ class TsakorpusQueryParser(QueryParser):
         if strOp in ('AND', 'OR'):
             resultLeft = self.translate_fcsql(query, config, basicSearch=basicSearch, start=start, end=iOpPos)
             resultRight = self.translate_fcsql(query, config, basicSearch=basicSearch, start=iOpPos + len(strOp), end=end)
-            if (type(resultRight) is Diagnostic or type(resultLeft) is Diagnostic
-                    or len(resultLeft) <= 0 or len(resultRight) <= 0):
-                return Diagnostic(DiagnosticTypes.sru, 10)
+            if len(resultLeft) <= 0 or len(resultRight) <= 0:
+                raise Diagnostic(DiagnosticTypes.sru, 10)
             return self.binary_bool(strOp, resultLeft, resultRight, config)
         elif strOp == 'NOT':
             resultRight = self.translate_fcsql(query, config, basicSearch=basicSearch, start=iOpPos + len(strOp),
diff --git a/main.py b/main.py
index 099c5dad906328f17a613a657cc38e7950cf8fa6..c28ff40b5ad47065d1fe0fb666ddfcfeb1c3186e 100644
--- a/main.py
+++ b/main.py
@@ -66,9 +66,10 @@ def endpoint(
 
     if operation == Operation.searchRetrieve:
         if config.platform == CorpPlatform.tsakorpus:
-            res = app.qp_tsakorpus.translate_fcsql(query, config)
-            if type(res) == Diagnostic:
-                return Response(content=str(res), media_type='application/xml')
+            try:
+                res = app.qp_tsakorpus.translate_fcsql(query, config)
+            except Diagnostic as diag:
+                return Response(content=str(diag), media_type='application/xml')
             return str(res)
 
     return {'operation': operation, 'version': version}