From 241542fc4a74fb5b3c71db184614c593fe0e7f75 Mon Sep 17 00:00:00 2001 From: Timofey Arkhangelskiy <timofey.arkhangelskiy@uni-hamburg.de> Date: Sat, 14 Jan 2023 23:00:38 +0100 Subject: [PATCH] Continue with advanced search parsing (not ready yet) --- common/config.py | 40 ++++++++++++++-- common/query_parser.py | 73 +++++++++++++++++++++++++++-- common/tsakorpus_response_parser.py | 22 +-------- config/test.json | 15 ++++-- notes.txt | 11 ++++- 5 files changed, 128 insertions(+), 33 deletions(-) diff --git a/common/config.py b/common/config.py index 1e97077..031c82b 100644 --- a/common/config.py +++ b/common/config.py @@ -41,7 +41,8 @@ class ResourceConfig: self.supported_layers = [] self.resources = [] self.search_lang_id = '' - self.pos_convert = {} + self.pos_convert = [] # corpus-specific to UD (regexes) + self.pos_convert_reverse = {} # UD to corpus-specific self.query_timeout = 60 @@ -52,10 +53,10 @@ class ResourceConfig: self.lsParams = set() # dictionaries where values are strings - self.dict_sParams = {'pos_convert'} + self.dict_sParams = {'pos_convert_reverse'} # dictionaries where values are lists of strings - self.dict_lsParams = set() + self.dict_lsParams = {'pos_convert'} # dictionaries where values are dictionaries {k: string} self.dict_dParams = set() @@ -230,6 +231,39 @@ class ResourceConfig: json.dump(dictConfig, fOut, sort_keys=True, ensure_ascii=False, indent=2) +class POSConvertor: + """ + Convert corpus-specific parts of speech / grammar tags to + UPOS, using regexes correspondences set in the config. + """ + def __init__(self, config: ResourceConfig): + self.posConvert = config.pos_convert + self.posConvertReverse = config.pos_convert_reverse + self.posTests = [(re.compile(k), v) for k, v in self.posConvert] + + def convert_pos(self, pos): + """ + Convert corpus-specific POS tags to UPOS, if possible. + Regexes are sequentially applied to the corpus-specific + string with POS or more detailed grammatical tags. 
The + first regex in the list that matches wins. + """ + for k, v in self.posTests: + if k.search(pos) is not None: + return v + return pos + + def convert_ud_pos(self, udPos): + """ + Convert UD POS to a corpus-specific POS tag or string + with more detailed grammatical tags. + """ + try: + return self.posConvertReverse[udPos] + except KeyError: + return udPos + + def read_configs(configDir='./config'): """ Load all configuration files from the configuration directory diff --git a/common/query_parser.py b/common/query_parser.py index bd04f89..ffb00f3 100644 --- a/common/query_parser.py +++ b/common/query_parser.py @@ -16,12 +16,16 @@ class QueryParser: # Regexes for advanced search rxWithinClause = re.compile(' +within +(s|sentence|u|utterance|p|paragraph|' 't|turn|text|session) *$') + rxNonemptyQueryPart = re.compile('[^ \t\r\n]') def __init__(self): pass @staticmethod def find_operator(strQuery, start=0, end=-1): + """ + Locate the highest NOT, AND or OR operator in a simple query. + """ if end == -1: end = len(strQuery) - 1 if strQuery[start:start+3] == 'NOT': @@ -30,10 +34,10 @@ class QueryParser: inQuotes = False for i in range(start, end): if inQuotes: - if strQuery[i] == '"': + if strQuery[i] == '"' and i > 0 and strQuery[i-1] != '\\': inQuotes = False continue - if strQuery[i] == '"': + if strQuery[i] == '"' and i > 0 and strQuery[i-1] != '\\': inQuotes = True continue if strQuery[i] == '(': @@ -47,6 +51,53 @@ class QueryParser: return i, 'OR' return -1, '' + @staticmethod + def find_operator_adv(strQuery, start=0, end=-1): + """ + Locate the highest SEQUENCE (whitespace[s]) or OR (|) operator + in an advanced query. 
+ """ + if end == -1: + end = len(strQuery) - 1 + parenthBalance = 0 + bracketBalance = 0 + curlyBalance = 0 + inQuotes = False + for i in range(start, end): + if inQuotes: + if strQuery[i] == '"' and i > 0 and strQuery[i-1] != '\\': + inQuotes = False + continue + if strQuery[i] == '"' and i > 0 and strQuery[i-1] != '\\': + inQuotes = True + continue + if strQuery[i] == '(': + parenthBalance += 1 + elif strQuery[i] == ')': + parenthBalance -= 1 + elif strQuery[i] == '[': + bracketBalance += 1 + elif strQuery[i] == ']': + bracketBalance -= 1 + elif strQuery[i] == '{': + curlyBalance += 1 + elif strQuery[i] == '}': + curlyBalance -= 1 + elif (i > 0 and parenthBalance == 0 and bracketBalance == 0 and curlyBalance == 0 + and QueryParser.rxNonemptyQueryPart.search(strQuery[:i]) is not None + and QueryParser.rxNonemptyQueryPart.search(strQuery[i:]) is not None): + iCurChar = i + while iCurChar <= end: + if strQuery[iCurChar] != ' ': + break + iCurChar += 1 + if iCurChar <= end: + if strQuery[iCurChar] == '|': + return iCurChar, 'OR' + elif strQuery[iCurChar] not in '+*?': + return iCurChar - 1, 'SEQUENCE' + return -1, '' + @staticmethod def shift_term_indexes(getParams, shift): """ @@ -159,8 +210,22 @@ class QueryParser: end -= 1 if start >= end: raise Diagnostic(DiagnosticTypes.sru, 10) + iOpPos, strOp = self.find_operator_adv(query, start, end) + if iOpPos == -1: + return self.adv_simple_query(query, config, start=start, end=end) + resultLeft = self.adv_simple_query(query, config, start=start, end=iOpPos) + resultRight = self.adv_main_query(query, config, start=start, end=iOpPos + 1) + if strOp == 'SEQUENCE': + if len(resultLeft) <= 0 or len(resultRight) <= 0: + raise Diagnostic(DiagnosticTypes.sru, 10) + return self.adv_main_sequence(resultLeft, resultRight, config) + elif strOp == 'OR': + resultLeft = self.adv_simple_query(query, config, start=start, end=iOpPos) + resultRight = self.adv_main_query(query, config, start=start, end=iOpPos + 1) + if len(resultLeft) <= 0 
or len(resultRight) <= 0: + raise Diagnostic(DiagnosticTypes.sru, 10) + return self.adv_main_or(resultLeft, resultRight, config) raise NotImplementedError - return {} def translate_advanced(self, query: str, config: ResourceConfig): """ @@ -189,7 +254,7 @@ class QueryParser: end = len(query) if end == 0: raise Diagnostic(DiagnosticTypes.sru, 27) - return self.adv_main_query(query, config, start=0, end=end) + return self.adv_main_query(query, config, start=0, end=end), withinClause def validate_query(self, operation, version, queryType, query, diff --git a/common/tsakorpus_response_parser.py b/common/tsakorpus_response_parser.py index 77a051c..68b8d29 100644 --- a/common/tsakorpus_response_parser.py +++ b/common/tsakorpus_response_parser.py @@ -4,31 +4,11 @@ import json import html from lxml.html import fragment_fromstring from .enums import * -from .config import ResourceConfig +from .config import ResourceConfig, POSConvertor from .search_retrieve import Record from .diagnostics import Diagnostic, DiagnosticTypes -class POSConvertor: - """ - Convert corpus-specific parts of speech / grammar tags to - UPOS, using regexes correspondences set in the config. - """ - def __init__(self, config: ResourceConfig): - self.posConvert = config.pos_convert - self.posTests = [(re.compile(k), v) for k, v in self.posConvert] - - def convert_pos(self, pos): - """ - Convert corpus-specific POS tags to UPOS, if possible. - Ea - """ - for k, v in self.posTests: - if k.search(pos) is not None: - return v - return pos - - class TsakorpusResponseParser: """ Parses responses from a Tsakorpus instance. 
diff --git a/config/test.json b/config/test.json index 5cd0366..54a1cce 100644 --- a/config/test.json +++ b/config/test.json @@ -1,12 +1,21 @@ { - "host": "0.0.0.0", + "host": "multimedia-corpus.beserman.ru", "port": "80", + "transport_protocol": "http", "max_hits": 15, "platform": "tsakorpus", - "resource_base_url": "http://127.0.0.1:7342", + "advanced_search_capability": true, + "adv_supported": true, + "resource_base_url": "http://multimedia-corpus.beserman.ru/", "search_lang_id": "beserman", "pos_convert": [ + ["\\bN\\b.*?\\brel_n\\b", "ADP"], ["\\bN\\b", "NOUN"], ["\\bV\\b", "VERB"] - ] + ], + "pos_convert_reverse": { + "NOUN": "(N,~rel_n)", + "VERB": "V", + "ADP": "N,rel_n" + } } \ No newline at end of file diff --git a/notes.txt b/notes.txt index e05e36d..0974489 100644 --- a/notes.txt +++ b/notes.txt @@ -4,6 +4,8 @@ p. 3: "The value of the @version attribute MUST be 2." But SRU 1.2 EndpointDescr p. 4: Link 'section "Layers"' leads to a CLARIN-internal Trac instance. +p. 6: It is claimed that those and only those layer identifiers that are listed in the table under 2.2.2.1 can be used in FCS-QL queries. However, the example below (p. 7) contains identifiers "word" and "token" not listed there (should have been "text"?), and the BNF grammar allows for any identifier. + p. 8-9: Links to the .xsd's lead to a CLARIN-internal Trac instance. p. 8. It is said that all endpoints must implement the Generic Hits view as 'send-by-default'. No such thing is said about the Advanced view, but it is also designated as 'send-by-default'. Why is that? @@ -14,10 +16,15 @@ p. 9-10: In the advanced search results, how does a client understand which laye p. 10: "Send explain request without version and operation parameter" -- but an explain request has to have the operation parameter with 'explain' as its value, as is stated somewhere nearby. UPDATE: SRU documentation says that, indeed, an empty request is treated as an explain request. 
Maybe this exception is worth mentioning this explicitly here. -p. 12 and elsewhere: http://explain.z3950.org/dtd/2.0/ mentioned as the URL of the record schema, but its actual URL is http://zeerex.z3950.org/dtd/zeerex-2.0.dtd . +p. 12 and elsewhere: http://explain.z3950.org/dtd/2.0/ mentioned as the URL of the record schema, but its actual URL is http://zeerex.z3950.org/dtd/zeerex-2.0.dtd . Should I change that in the Explain response XML? p. 12-13, example: what is "result-id" in SupportedLayer? What do I put as the text of the SupportedLayer element? p. 14: x-cmd-resource-info parameter present in the query example, but never explained (mentioned in some 2013 slides on FCS; should now probably be x-fcs-endpoint-description) -p. 14: What is the sruResponse prefix? It is never mentioned. No namespace is provided here, so the example is actually invalid XML. (Who doesn't validate their examples before showing them to a wide audience -- in an official specification, no less??) Maybe sru was actually meant (see p. 2 and an SRU v. 1.2 example above)? I'm putting sru there for now. \ No newline at end of file +p. 14: What is the sruResponse prefix? It is never mentioned. No namespace is provided here, so the example is actually invalid XML. (Who doesn't validate their examples before showing them to a wide audience -- in an official specification, no less??) Maybe sru was actually meant (see p. 2 and an SRU v. 1.2 example above)? I'm putting sru there for now. + +p. 17: The BNF for FCS-QL makes it impossible to have multi-token queries where the first token has a quantifier, but is not parenthesized, e.g.: +[pos="NOUN"]{2,3} [text="dog"] +I guess this is a consequence of a sloppily written BNF grammar rather than an intended effect? + -- GitLab