diff --git a/common/litterae_query_parser.py b/common/litterae_query_parser.py index 118c3e01d7415f9098e223ad83a8e22da35576a5..ebaf73b9bc7798f13e1c58b331cdb26e1f45697f 100644 --- a/common/litterae_query_parser.py +++ b/common/litterae_query_parser.py @@ -14,11 +14,14 @@ class LitteraeQueryParser(QueryParser): Parses search queries for Formulae, Litterae, Chartae. """ - def build_get_string(self, getParams, config): + def build_get_string(self, getParams, config, withinClause=''): """ Build a GET string (everything after the ?) from a description of the GET parameters in the getParams list. """ + if len(withinClause) > 0 and withinClause not in ('text', 'session'): + raise Diagnostic(DiagnosticTypes.sru, 48, message='FLC web interface only supports multi-word search within' + 'a text.') termIndexes = self.term_indexes(getParams) nWords = len(termIndexes) boolOperatorMentioned = False diff --git a/common/query_parser.py b/common/query_parser.py index 9f3d6c9076d96619ab816eff0599d16265fdf06e..46b1fd558c02e20c48a5f8da6da672203cf1ea65 100644 --- a/common/query_parser.py +++ b/common/query_parser.py @@ -18,8 +18,8 @@ class QueryParser: 't|turn|text|session) *$') rxNonemptyQueryPart = re.compile('[^ \t\r\n]') rxSegmentQuery = re.compile('^\\[(.*)\\](\\{[0-9,]\\}|[?*+]|)$') - rxAdvTermQuery = re.compile('^ *([a-zA-Z][a-zA-Z0-9\-]*(?::[a-zA-Z][a-zA-Z0-9\-]*)?) * ' - '(!?=) *(["\'](.*)["\']) *(/[iIcCld])? *$') + rxAdvTermQuery = re.compile('^ *([a-zA-Z][a-zA-Z0-9\\-]*(?::[a-zA-Z][a-zA-Z0-9\\-]*)?) *' + '(!?=) *(["\'].*["\']) *(/[iIcCld])? *$') acceptableIdentifiers = {'text', 'lemma', 'pos', 'orth', 'norm', 'phonetic'} def __init__(self): @@ -38,10 +38,10 @@ class QueryParser: inQuotes = False for i in range(start, end): if inQuotes: - if strQuery[i] == '"' and i > 0 and strQuery[i-1] != '\\': + if strQuery[i] == '"' and (i <= 0 or strQuery[i-1] != '\\'): inQuotes = False continue - if strQuery[i] == '"' and i > 0 and strQuery[i-1] != '\\': + if strQuery[i] == '"' and (i <= 0 or strQuery[i-1] != '\\'): inQuotes = True continue if strQuery[i] == '(': @@ -70,17 +70,17 @@ class QueryParser: inSingleQuotes = False for i in range(start, end): if inQuotes: - if strQuery[i] == '"' and i > 0 and strQuery[i-1] != '\\': + if strQuery[i] == '"' and (i <= 0 or strQuery[i-1] != '\\'): inQuotes = False continue - if strQuery[i] == '"' and i > 0 and strQuery[i-1] != '\\': + if strQuery[i] == '"' and (i <= 0 or strQuery[i-1] != '\\'): inQuotes = True continue if inSingleQuotes: - if strQuery[i] == "'" and i > 0 and strQuery[i-1] != '\\': + if strQuery[i] == "'" and (i <= 0 or strQuery[i-1] != '\\'): inSingleQuotes = False continue - if strQuery[i] == "'" and i > 0 and strQuery[i-1] != '\\': + if strQuery[i] == "'" and (i <= 0 or strQuery[i-1] != '\\'): inSingleQuotes = True continue if strQuery[i] == '(': @@ -183,7 +183,7 @@ class QueryParser: terms.add(t) return [t for t in sorted(terms)] - def build_get_string(self, getParams, config): + def build_get_string(self, getParams, config, withinClause=''): # Abstract function raise NotImplementedError() @@ -204,10 +204,12 @@ class QueryParser: raise NotImplementedError() def adv_term_query(self, query, config: ResourceConfig): + print('ADVANCED TERM QUERY', query) m = self.rxAdvTermQuery.search(query) if m is None: raise Diagnostic(DiagnosticTypes.sru, 10) identifier, op, value, flags = m.group(1), m.group(2), m.group(3), m.group(4) + print('ADVANCED TERM QUERY', identifier, op, value, flags) if value[0] != value[-1]: raise Diagnostic(DiagnosticTypes.sru, 10) # Different quotes value = value[1:len(value)-1] # Remove quotes @@ -279,10 +281,12 @@ class QueryParser: resultRight = self.translate_simple(query, config, start=iOpPos + len(strOp), end=end) return self.not_bool(resultRight, config) - return {} + raise Diagnostic(DiagnosticTypes.sru, 10) def adv_expression_query(self, query: str, config: ResourceConfig): + query = query.strip() iOpPos, strOp = self.find_operator_adv_expression(query) + print('ADVANED EXPRESSION QUERY', iOpPos, strOp) if iOpPos == -1: if query[0] == '(' and query[-1] == ')': return self.adv_expression_query(query[1:len(query)-1], config) @@ -297,8 +301,10 @@ class QueryParser: elif strOp == '!': resultRight = self.adv_expression_query(query[iOpPos+1:], config) return self.not_bool(resultRight, config) + raise Diagnostic(DiagnosticTypes.sru, 10) def adv_segment_query(self, query: str, config: ResourceConfig): + print('ADVANCED SEGMENT QUERY', query) m = self.rxSegmentQuery.search(query) if m is None: raise Diagnostic(DiagnosticTypes.sru, 27) @@ -321,12 +327,11 @@ class QueryParser: if query[start] == '(' and query[end] == ')': return self.adv_main_query(query, config, start=start+1, end=end-1) if (query[end - 1] != '\\' - and ((query[start] == '"' and query[end] == '"') - or (query[start] == "'" and query[end] == "'"))): + and ((query[start] == '"' and query[end - 1] == '"') + or (query[start] == "'" and query[end - 1] == "'"))): return self.adv_segment_query('[text=' + query[start:end] + ']', config) return self.adv_segment_query(query[start:end], config) - def adv_main_query(self, query: str, config: ResourceConfig, start=0, end=-1): if len(query) <= 0: raise Diagnostic(DiagnosticTypes.sru, 27) @@ -339,6 +344,7 @@ class QueryParser: if start >= end: raise Diagnostic(DiagnosticTypes.sru, 10) iOpPos, strOp = self.find_operator_adv(query, start, end) + print('ADVANCED QUERY', iOpPos, strOp) if iOpPos == -1: return self.adv_simple_query(query, config, start=start, end=end) resultLeft = self.adv_simple_query(query, config, start=start, end=iOpPos) @@ -365,6 +371,7 @@ class QueryParser: in the current query and then calling a respective lower-level function, which may be platform-specific. """ + print('ADVANCED QUERY', query) withinClause = '' end = len(query) m = self.rxWithinClause.search(query) @@ -382,8 +389,8 @@ class QueryParser: end = len(query) if end == 0: raise Diagnostic(DiagnosticTypes.sru, 27) - return self.adv_main_query(query, config, start=0, end=end), withinClause - + return self.build_get_string(self.adv_main_query(query, config, start=0, end=end), config, + withinClause=withinClause) def validate_query(self, operation, version, queryType, query, xFcsEndpointDescription, xFcsContext, diff --git a/common/tsakorpus_query_parser.py b/common/tsakorpus_query_parser.py index 7ea8e10858f4a18a925348b6f88ec1077b8eb2a5..077b9e68fc71387c55df274bb8f100a6148b9914 100644 --- a/common/tsakorpus_query_parser.py +++ b/common/tsakorpus_query_parser.py @@ -14,11 +14,15 @@ class TsakorpusQueryParser(QueryParser): rxTsakorpusBool = re.compile('[()|,]') - def build_get_string(self, getParams, config): + def build_get_string(self, getParams, config: ResourceConfig, withinClause=''): """ Build a GET string (everything after the ?) from a description of the GET parameters in the getParams list. """ + if len(withinClause) > 0 and withinClause not in ('sentence', 'utterance', 'paragraph'): + raise Diagnostic(DiagnosticTypes.sru, 48, message='Tsakorpus only supports multi-word search within' + 'one segment that normally equals one sentence / ' + 'utterance / turn.') termIndexes = self.term_indexes(getParams) nWords = len(termIndexes) s = 'n_words=' + str(nWords) @@ -34,14 +38,14 @@ class TsakorpusQueryParser(QueryParser): s += '&precise=on&sort=random&response_format=json&distance_strict=on' return s - def term_query(self, query, config): + def term_query(self, query: str, config: ResourceConfig): """ Return list of query parameters for one term or sequence of terms. """ if len(query) >= 2 and query.startswith('"') and query.endswith('"'): query = query[1:len(query)-1] if len(query) <= 0: - return Diagnostic(DiagnosticTypes.sru, 10) + raise Diagnostic(DiagnosticTypes.sru, 10) getParams = [] iTerm = 0 for term in query.split(' '): @@ -55,7 +59,7 @@ class TsakorpusQueryParser(QueryParser): getParams.append(['word_dist_to_', iTerm, '1']) return getParams - def binary_bool(self, strOp, operandL, operandR, config): + def binary_bool(self, strOp: str, operandL, operandR, config): if len(operandL) <= 0 or len(operandR) <= 0: raise Diagnostic(DiagnosticTypes.sru, 10) termsL = self.term_indexes(operandL) @@ -89,31 +93,71 @@ class TsakorpusQueryParser(QueryParser): """ Return list of query parameters for one term in an advanced query. """ + flags = flags.strip('/') if len(value) <= 0: - return Diagnostic(DiagnosticTypes.sru, 10) + raise Diagnostic(DiagnosticTypes.sru, 10) if flags not in ('', 'i', 'c'): - return Diagnostic(DiagnosticTypes.sru, 10, message='Tsakorpus does not support regex flags.') + raise Diagnostic(DiagnosticTypes.sru, 48, message='Tsakorpus does not support regex flags.') + if op not in ('=', '!='): + raise Diagnostic(DiagnosticTypes.sru, 10, + message='In token queries, only = and != are allowed as operators.') + if op == '!=': + if identifier != 'pos': + value = '~' + value + else: + value = '~(' + value + ')' getParams = [] if identifier == 'text': - getParams.append(['wf', 0, value]) + getParams.append(['wf', 1, value]) elif identifier == 'lemma': - getParams.append(['lex', 0, value]) + getParams.append(['lex', 1, value]) elif identifier == 'pos': if value in config.pos_convert_reverse: # UD to corpus-specific POS tags value = config.pos_convert_reverse[value] - getParams.append(['gr', 0, value]) + getParams.append(['gr', 1, value]) else: - return Diagnostic(DiagnosticTypes.sru, 10, + raise Diagnostic(DiagnosticTypes.sru, 10, message='The identifier ' + identifier + ' is not supported in Tsakoprus.') return getParams + def adv_binary_bool(self, strOp: str, operandL, operandR, config: ResourceConfig): + # Join multiple constraints on one word in an advanced query + print('ADVANCED INTERNAL BOOL', strOp, str(operandL), str(operandR)) + getParams = [] + if strOp == '&': + strOp = ',' + paramsR = {paramR[0] for paramR in operandR} + for paramR in operandR: + paramExists = False + for paramL in operandL: + if paramL[0] == paramR[0]: + if strOp == ',' and paramL[0] != 'gr': + raise Diagnostic(DiagnosticTypes.sru, 48, + message='Tsakorpus endpoint does not support conjunctions ' + 'of multiple constraints for the same layer ' + 'within the same word.') + paramExists = True + getParams.append([paramL[0], paramL[1], '(' + paramL[2] + ')' + strOp + '(' + paramR[2] + ')']) + if not paramExists: + getParams.append(paramR[:]) + for paramL in operandL: + if paramL[0] not in paramsR: + if strOp == '|': + raise Diagnostic(DiagnosticTypes.sru, 48, + message='Tsakorpus does not support disjunctions ' + 'of constraints for multiple layers ' + 'within the same word.') + getParams.append(paramL[:]) + return getParams + def send_query(self, strGetParams: str, config: ResourceConfig): """ Send the translated query to the Tsakorpus instance. Return JSON results returned by the corpus. """ url = config.resource_base_url.strip('/') + '/search_sent?' + strGetParams + print(url) response = urllib.request.urlopen(url) data = response.read() encoding = response.info().get_content_charset('utf-8') diff --git a/common/tsakorpus_response_parser.py b/common/tsakorpus_response_parser.py index 10d7e68704dc1ea18dac8801629a91c5b9df91a0..4b2d112dc2b33df26e52eb3f5771fd07b5a4d9ee 100644 --- a/common/tsakorpus_response_parser.py +++ b/common/tsakorpus_response_parser.py @@ -21,6 +21,7 @@ class TsakorpusResponseParser: Parse HTML annotation for one word taken from a hit. Add the data to the layers in the record object. """ + print(anno) annoTree = fragment_fromstring(anno, create_parent='div') lemmas = set() @@ -35,13 +36,17 @@ class TsakorpusResponseParser: # This should not happen, but just in case word += '|' word += node.text - lexNodes = annoTree.xpath('div[@class="popup_word"]/div[@class="popup_ana"]/span[@class="popup_lex"]') + lexNodes = annoTree.xpath('div[@class="popup_word"]/' + 'div[contains(@class, \'popup_ana\')]/' + 'span[@class="popup_lex"]') for node in lexNodes: if node.text is not None: lemmas.add(node.text) if len(lemmas) > 0: lemmasStr = '|'.join(l for l in sorted(lemmas)) - posNodes = annoTree.xpath('div[@class="popup_word"]/div[@class="popup_ana"]/span[@class="popup_pos"]') + posNodes = annoTree.xpath('div[@class="popup_word"]/' + 'div[contains(@class, \'popup_ana\')]/' + 'span[@class="popup_pos"]') for node in posNodes: if node.text is not None: posText = re.sub(' |[ \t\ufeff]+', '', node.text) @@ -71,7 +76,6 @@ class TsakorpusResponseParser: 'value': lemmasStr }) - def parse_span(self, el, record, advancedHits=False): """ Parse one <span> element from the HTML representation @@ -123,7 +127,7 @@ class TsakorpusResponseParser: or 'text' not in hit['languages'][lang]): return record contentTxt = re.sub('[\r\n\t\ufeff]+', '', hit['languages'][lang]['text'], flags=re.DOTALL) - print(contentTxt) + # print(contentTxt) content = fragment_fromstring(contentTxt, create_parent='div') for el in content: diff --git a/common/views_logic.py b/common/views_logic.py index 77584b85d00a8ab9b459dda1fb6648eecda5b02d..23d5612338390ab88423456d1bd0970af6cc4484 100644 --- a/common/views_logic.py +++ b/common/views_logic.py @@ -84,7 +84,7 @@ def process_search_retrieve(version: SRUVersion, strGetParams = app.qp_tsakorpus.translate_simple(query, config) else: strGetParams = app.qp_tsakorpus.translate_advanced(query, config) - # print(strGetParams) + print(strGetParams) res = app.qp_tsakorpus.send_query(strGetParams, config) except Diagnostic as diag: return fatal_response(Operation.searchRetrieve, version, diagnostics + [diag], request, templates) @@ -146,6 +146,7 @@ def process_request(operation: Operation, :param diagnostics: List of diagnostics produced by the validation function. """ + print(query) # If something is clearly wrong with the query, return # a response with the list of diagnostics if config is None or any(d.is_fatal() for d in diagnostics): diff --git a/test_queries.txt b/test_queries.txt index c9143d0961971fd91111622f5d61f99fa652e0cf..db607020a7c85206f5349029665d21f372060856 100644 --- a/test_queries.txt +++ b/test_queries.txt @@ -8,3 +8,8 @@ http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=ke%20AND% http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=ke%20no Tsakorpus -- Simple search with multiple words ("ke no", 18 hits) http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=ke%20no Tsakorpus -- Simple search with multiple words ("ke no", 18 hits) http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=ke%20AND%20no&x-fcs-dataviews=adv Tsakorpus -- Simple search with boolean operator, advanced view ("ke" AND "no", 103 hits) +http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=%22ka%22&x-fcs-dataviews=adv&queryType=fcs Tsakorpus -- Advanced search, term only ("ka", 112 hits) +http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=[text=%22mon%22]&x-fcs-dataviews=adv&queryType=fcs Tsakorpus -- Advanced search, term only / word ("mon", 1465 hits) +http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=[lemma=%22mon%22]&x-fcs-dataviews=adv&queryType=fcs Tsakorpus -- Advanced search, term only / lemma ("mon", 2284 hits) +http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=[pos=%22NOUN%22]&x-fcs-dataviews=adv&queryType=fcs Tsakorpus -- Advanced search, term only / POS (NOUN, 22639 hits) +http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=[lemma%3D%22m.*%22%20%26%20(pos=%22NOUN%22|pos=%22VERB%22)]&x-fcs-dataviews=adv&queryType=fcs Tsakorpus -- Advanced search, Boolean operators inside term query (words that start with "m" and are either nouns or verbs, 3878 hits) \ No newline at end of file