Continue with advanced search parsing (not ready yet)

241542fc · Arkhangelskiy, Timofey · 209a749e · 241542fc · 241542fc · 241542fc
Commit 241542fc authored 2 years ago by Arkhangelskiy, Timofey
--- a/common/config.py
+++ b/common/config.py
@@ -41,7 +41,8 @@ class ResourceConfig:
        self.supported_layers = []
        self.resources = []
        self.search_lang_id = ''
-        self.pos_convert = {}
+        self.pos_convert = []           # corpus-specific to UD (regexes)
+        self.pos_convert_reverse = {}   # UD to corpus-specific

        self.query_timeout = 60

@@ -52,10 +53,10 @@ class ResourceConfig:
        self.lsParams = set()

        # dictionaries where values are strings
-        self.dict_sParams = {'pos_convert'}
+        self.dict_sParams = {'pos_convert_reverse'}

        # dictionaries where values are lists of strings
-        self.dict_lsParams = set()
+        self.dict_lsParams = {'pos_convert'}

        # dictionaries where values are dictionaries {k: string}
        self.dict_dParams = set()
@@ -230,6 +231,39 @@ class ResourceConfig:
            json.dump(dictConfig, fOut, sort_keys=True, ensure_ascii=False, indent=2)


+class POSConvertor:
+    """
+    Convert corpus-specific parts of speech / grammar tags to
+    UPOS, using regexes correspondences set in the config.
+    """
+    def __init__(self, config: ResourceConfig):
+        self.posConvert = config.pos_convert
+        self.posConvertReverse = config.pos_convert_reverse
+        self.posTests = [(re.compile(k), v) for k, v in self.posConvert]
+
+    def convert_pos(self, pos):
+        """
+        Convert corpus-specific POS tags to UPOS, if possible.
+        Regexes are sequentially applied to the corpus-specific
+        string with POS or more detailed grammatical tags. The
+        first regex in the list that matches wins.
+        """
+        for k, v in self.posTests:
+            if k.search(pos) is not None:
+                return v
+        return pos
+
+    def convert_ud_pos(self, udPos):
+        """
+        Convert UD POS to a corpus-specific POS tag or string
+        with more detailed grammatical tags.
+        """
+        try:
+            return self.posConvertReverse[udPos]
+        except KeyError:
+            return udPos
+
+
 def read_configs(configDir='./config'):
    """
    Load all configuration files from the configuration directory

--- a/common/query_parser.py
+++ b/common/query_parser.py
@@ -16,12 +16,16 @@ class QueryParser:
    # Regexes for advanced search
    rxWithinClause = re.compile(' +within +(s|sentence|u|utterance|p|paragraph|'
                                't|turn|text|session) *$')
+    rxNonemptyQueryPart = re.compile('[^ \t\r\n]')

    def __init__(self):
        pass

    @staticmethod
    def find_operator(strQuery, start=0, end=-1):
+        """
+        Locate the highest NOT, AND or OR operator in a simple query.
+        """
        if end == -1:
            end = len(strQuery) - 1
        if strQuery[start:start+3] == 'NOT':
@@ -30,10 +34,10 @@ class QueryParser:
        inQuotes = False
        for i in range(start, end):
            if inQuotes:
-                if strQuery[i] == '"':
+                if strQuery[i] == '"' and i > 0 and strQuery[i-1] != '\\':
                    inQuotes = False
                continue
-            if strQuery[i] == '"':
+            if strQuery[i] == '"' and i > 0 and strQuery[i-1] != '\\':
                inQuotes = True
                continue
            if strQuery[i] == '(':
@@ -47,6 +51,53 @@ class QueryParser:
                    return i, 'OR'
        return -1, ''

+    @staticmethod
+    def find_operator_adv(strQuery, start=0, end=-1):
+        """
+        Locate the highest SEQUENCE (whitespace[s]) or OR (|) operator
+        in an advanced query.
+        """
+        if end == -1:
+            end = len(strQuery) - 1
+        parenthBalance = 0
+        bracketBalance = 0
+        curlyBalance = 0
+        inQuotes = False
+        for i in range(start, end):
+            if inQuotes:
+                if strQuery[i] == '"' and i > 0 and strQuery[i-1] != '\\':
+                    inQuotes = False
+                continue
+            if strQuery[i] == '"' and i > 0 and strQuery[i-1] != '\\':
+                inQuotes = True
+                continue
+            if strQuery[i] == '(':
+                parenthBalance += 1
+            elif strQuery[i] == ')':
+                parenthBalance -= 1
+            elif strQuery[i] == '[':
+                bracketBalance += 1
+            elif strQuery[i] == ']':
+                bracketBalance -= 1
+            elif strQuery[i] == '{':
+                curlyBalance += 1
+            elif strQuery[i] == '}':
+                curlyBalance -= 1
+            elif (i > 0 and parenthBalance == 0 and bracketBalance == 0 and curlyBalance == 0
+                  and QueryParser.rxNonemptyQueryPart.search(strQuery[:i]) is not None
+                  and QueryParser.rxNonemptyQueryPart.search(strQuery[i:]) is not None):
+                iCurChar = i
+                while iCurChar <= end:
+                    if strQuery[iCurChar] != ' ':
+                        break
+                    iCurChar += 1
+                if iCurChar <= end:
+                    if strQuery[iCurChar] == '|':
+                        return iCurChar, 'OR'
+                    elif strQuery[iCurChar] not in '+*?':
+                        return iCurChar - 1, 'SEQUENCE'
+        return -1, ''
+
    @staticmethod
    def shift_term_indexes(getParams, shift):
        """
@@ -159,8 +210,22 @@ class QueryParser:
            end -= 1
        if start >= end:
            raise Diagnostic(DiagnosticTypes.sru, 10)
+        iOpPos, strOp = self.find_operator_adv(query, start, end)
+        if iOpPos == -1:
+            return self.adv_simple_query(query, config, start=start, end=end)
+        resultLeft = self.adv_simple_query(query, config, start=start, end=iOpPos)
+        resultRight = self.adv_main_query(query, config, start=start, end=iOpPos + 1)
+        if strOp == 'SEQUENCE':
+            if len(resultLeft) <= 0 or len(resultRight) <= 0:
+                raise Diagnostic(DiagnosticTypes.sru, 10)
+            return self.adv_main_sequence(resultLeft, resultRight, config)
+        elif strOp == 'OR':
+            resultLeft = self.adv_simple_query(query, config, start=start, end=iOpPos)
+            resultRight = self.adv_main_query(query, config, start=start, end=iOpPos + 1)
+            if len(resultLeft) <= 0 or len(resultRight) <= 0:
+                raise Diagnostic(DiagnosticTypes.sru, 10)
+            return self.adv_main_or(resultLeft, resultRight, config)
        raise NotImplementedError
-        return {}

    def translate_advanced(self, query: str, config: ResourceConfig):
        """
@@ -189,7 +254,7 @@ class QueryParser:
            end = len(query)
        if end == 0:
            raise Diagnostic(DiagnosticTypes.sru, 27)
-        return self.adv_main_query(query, config, start=0, end=end)
+        return self.adv_main_query(query, config, start=0, end=end), withinClause


    def validate_query(self, operation, version, queryType, query,

--- a/common/tsakorpus_response_parser.py
+++ b/common/tsakorpus_response_parser.py
@@ -4,31 +4,11 @@ import json
 import html
 from lxml.html import fragment_fromstring
 from .enums import *
-from .config import ResourceConfig
+from .config import ResourceConfig, POSConvertor
 from .search_retrieve import Record
 from .diagnostics import Diagnostic, DiagnosticTypes


-class POSConvertor:
-    """
-    Convert corpus-specific parts of speech / grammar tags to
-    UPOS, using regexes correspondences set in the config.
-    """
-    def __init__(self, config: ResourceConfig):
-        self.posConvert = config.pos_convert
-        self.posTests = [(re.compile(k), v) for k, v in self.posConvert]
-
-    def convert_pos(self, pos):
-        """
-        Convert corpus-specific POS tags to UPOS, if possible.
-        Ea
-        """
-        for k, v in self.posTests:
-            if k.search(pos) is not None:
-                return v
-        return pos
-
-
 class TsakorpusResponseParser:
    """
    Parses responses from a Tsakorpus instance.

--- a/config/test.json
+++ b/config/test.json
 {
-	"host": "0.0.0.0",
+	"host": "multimedia-corpus.beserman.ru",
 	"port": "80",
+	"transport_protocol": "http",
 	"max_hits": 15,
 	"platform": "tsakorpus",
-	"resource_base_url": "http://127.0.0.1:7342",
+	"advanced_search_capability": true,
+	"adv_supported": true,
+	"resource_base_url": "http://multimedia-corpus.beserman.ru/",
 	"search_lang_id": "beserman",
 	"pos_convert": [
+		["\\bN\\b.*?\\brel_n\\b", "ADP"],
 		["\\bN\\b", "NOUN"],
 		["\\bV\\b", "VERB"]
-	]
+	],
+	"pos_convert_reverse": {
+		"NOUN": "(N,~rel_n)",
+		"VERB": "V",
+		"ADP": "N,rel_n"
+	}
 }
\ No newline at end of file
--- a/notes.txt
+++ b/notes.txt
@@ -4,6 +4,8 @@ p. 3: "The value of the @version attribute MUST be 2." But SRU 1.2 EndpointDescr

 p. 4: Link 'section "Layers"' leads to a CLARIN-internal Trac instance.

+p. 6: It is claimed that those and only those layer identifiers that are listed in the table under 2.2.2.1 can be used in FCS-QL queries. However, the example below (p. 7) contains identifiers "word" and "token" not listed there (should have been "text"?), and the BNF grammar allows for any identifier.
+
 p. 8-9: Links to the .xsd's lead to a CLARIN-internal Trac instance.

 p. 8. It is said that all endpoints must implement the Generic Hits view as 'send-by-default'. No such thing is said about the Advanced view, but it is also designated as 'send-by-default'. Why is that?
@@ -14,10 +16,15 @@ p. 9-10: In the advanced search results, how does a client understand which laye

 p. 10: "Send explain request without version and operation parameter" -- but an explain request has to have the operation parameter with 'explain' as its value, as is stated somewhere nearby. UPDATE: SRU documentation says that, indeed, an empty request is treated as an explain request. Maybe this exception is worth mentioning this explicitly here.

-p. 12 and elsewhere: http://explain.z3950.org/dtd/2.0/ mentioned as the URL of the record schema, but its actual URL is http://zeerex.z3950.org/dtd/zeerex-2.0.dtd .
+p. 12 and elsewhere: http://explain.z3950.org/dtd/2.0/ mentioned as the URL of the record schema, but its actual URL is http://zeerex.z3950.org/dtd/zeerex-2.0.dtd . Should I change that in the Explain response XML?

 p. 12-13, example: what is "result-id" in SupportedLayer? What do I put as the text of the SupportedLayer element?

 p. 14: x-cmd-resource-info parameter present in the query example, but never explained (mentioned in some 2013 slides on FCS; should now probably be x-fcs-endpoint-description)

 p. 14: What is the sruResponse prefix? It is never mentioned. No namespace is provided here, so the example is actually invalid XML. (Who doesn't validate their examples before showing them to a wide audience -- in an official specification, no less??) Maybe sru was actually meant (see p. 2 and an SRU v. 1.2 example above)? I'm putting sru there for now.
+
+p. 17: The BNF for FCS-QL makes it impossible to have multi-token queries where the first token has a quantifier, but is not parenthesized, e.g.:
+[pos="NOUN"]{2,3} [text="dog"]
+I guess this is a consequence of a sloppily written BNF grammar rather than an intended effect?
+