From 241542fc4a74fb5b3c71db184614c593fe0e7f75 Mon Sep 17 00:00:00 2001
From: Timofey Arkhangelskiy <timofey.arkhangelskiy@uni-hamburg.de>
Date: Sat, 14 Jan 2023 23:00:38 +0100
Subject: [PATCH] Continue with advanced search parsing (not ready yet)

---
 common/config.py                    | 40 ++++++++++++++--
 common/query_parser.py              | 73 +++++++++++++++++++++++++++--
 common/tsakorpus_response_parser.py | 22 +--------
 config/test.json                    | 15 ++++--
 notes.txt                           | 11 ++++-
 5 files changed, 128 insertions(+), 33 deletions(-)

diff --git a/common/config.py b/common/config.py
index 1e97077..031c82b 100644
--- a/common/config.py
+++ b/common/config.py
@@ -41,7 +41,8 @@ class ResourceConfig:
         self.supported_layers = []
         self.resources = []
         self.search_lang_id = ''
-        self.pos_convert = {}
+        self.pos_convert = []           # corpus-specific to UD (regexes)
+        self.pos_convert_reverse = {}   # UD to corpus-specific
 
         self.query_timeout = 60
 
@@ -52,10 +53,10 @@ class ResourceConfig:
         self.lsParams = set()
 
         # dictionaries where values are strings
-        self.dict_sParams = {'pos_convert'}
+        self.dict_sParams = {'pos_convert_reverse'}
 
         # dictionaries where values are lists of strings
-        self.dict_lsParams = set()
+        self.dict_lsParams = {'pos_convert'}
 
         # dictionaries where values are dictionaries {k: string}
         self.dict_dParams = set()
@@ -230,6 +231,39 @@ class ResourceConfig:
             json.dump(dictConfig, fOut, sort_keys=True, ensure_ascii=False, indent=2)
 
 
+class POSConvertor:
+    """
+    Convert corpus-specific parts of speech / grammar tags to
+    UPOS, using regexes correspondences set in the config.
+    """
+    def __init__(self, config: ResourceConfig):
+        self.posConvert = config.pos_convert
+        self.posConvertReverse = config.pos_convert_reverse
+        self.posTests = [(re.compile(k), v) for k, v in self.posConvert]
+
+    def convert_pos(self, pos):
+        """
+        Convert corpus-specific POS tags to UPOS, if possible.
+        Regexes are sequentially applied to the corpus-specific
+        string with POS or more detailed grammatical tags. The
+        first regex in the list that matches wins.
+        """
+        for k, v in self.posTests:
+            if k.search(pos) is not None:
+                return v
+        return pos
+
+    def convert_ud_pos(self, udPos):
+        """
+        Convert UD POS to a corpus-specific POS tag or string
+        with more detailed grammatical tags.
+        """
+        try:
+            return self.posConvertReverse[udPos]
+        except KeyError:
+            return udPos
+
+
 def read_configs(configDir='./config'):
     """
     Load all configuration files from the configuration directory
diff --git a/common/query_parser.py b/common/query_parser.py
index bd04f89..ffb00f3 100644
--- a/common/query_parser.py
+++ b/common/query_parser.py
@@ -16,12 +16,16 @@ class QueryParser:
     # Regexes for advanced search
     rxWithinClause = re.compile(' +within +(s|sentence|u|utterance|p|paragraph|'
                                 't|turn|text|session) *$')
+    rxNonemptyQueryPart = re.compile('[^ \t\r\n]')
 
     def __init__(self):
         pass
 
     @staticmethod
     def find_operator(strQuery, start=0, end=-1):
+        """
+        Locate the highest NOT, AND or OR operator in a simple query.
+        """
         if end == -1:
             end = len(strQuery) - 1
         if strQuery[start:start+3] == 'NOT':
@@ -30,10 +34,10 @@ class QueryParser:
         inQuotes = False
         for i in range(start, end):
             if inQuotes:
-                if strQuery[i] == '"':
+                if strQuery[i] == '"' and i > 0 and strQuery[i-1] != '\\':
                     inQuotes = False
                 continue
-            if strQuery[i] == '"':
+            if strQuery[i] == '"' and i > 0 and strQuery[i-1] != '\\':
                 inQuotes = True
                 continue
             if strQuery[i] == '(':
@@ -47,6 +51,53 @@ class QueryParser:
                     return i, 'OR'
         return -1, ''
 
+    @staticmethod
+    def find_operator_adv(strQuery, start=0, end=-1):
+        """
+        Locate the highest SEQUENCE (whitespace[s]) or OR (|) operator
+        in an advanced query.
+        """
+        if end == -1:
+            end = len(strQuery) - 1
+        parenthBalance = 0
+        bracketBalance = 0
+        curlyBalance = 0
+        inQuotes = False
+        for i in range(start, end):
+            if inQuotes:
+                if strQuery[i] == '"' and i > 0 and strQuery[i-1] != '\\':
+                    inQuotes = False
+                continue
+            if strQuery[i] == '"' and i > 0 and strQuery[i-1] != '\\':
+                inQuotes = True
+                continue
+            if strQuery[i] == '(':
+                parenthBalance += 1
+            elif strQuery[i] == ')':
+                parenthBalance -= 1
+            elif strQuery[i] == '[':
+                bracketBalance += 1
+            elif strQuery[i] == ']':
+                bracketBalance -= 1
+            elif strQuery[i] == '{':
+                curlyBalance += 1
+            elif strQuery[i] == '}':
+                curlyBalance -= 1
+            elif (i > 0 and parenthBalance == 0 and bracketBalance == 0 and curlyBalance == 0
+                  and QueryParser.rxNonemptyQueryPart.search(strQuery[:i]) is not None
+                  and QueryParser.rxNonemptyQueryPart.search(strQuery[i:]) is not None):
+                iCurChar = i
+                while iCurChar <= end:
+                    if strQuery[iCurChar] != ' ':
+                        break
+                    iCurChar += 1
+                if iCurChar <= end:
+                    if strQuery[iCurChar] == '|':
+                        return iCurChar, 'OR'
+                    elif strQuery[iCurChar] not in '+*?':
+                        return iCurChar - 1, 'SEQUENCE'
+        return -1, ''
+
     @staticmethod
     def shift_term_indexes(getParams, shift):
         """
@@ -159,8 +210,22 @@ class QueryParser:
             end -= 1
         if start >= end:
             raise Diagnostic(DiagnosticTypes.sru, 10)
+        iOpPos, strOp = self.find_operator_adv(query, start, end)
+        if iOpPos == -1:
+            return self.adv_simple_query(query, config, start=start, end=end)
+        resultLeft = self.adv_simple_query(query, config, start=start, end=iOpPos)
+        resultRight = self.adv_main_query(query, config, start=start, end=iOpPos + 1)
+        if strOp == 'SEQUENCE':
+            if len(resultLeft) <= 0 or len(resultRight) <= 0:
+                raise Diagnostic(DiagnosticTypes.sru, 10)
+            return self.adv_main_sequence(resultLeft, resultRight, config)
+        elif strOp == 'OR':
+            resultLeft = self.adv_simple_query(query, config, start=start, end=iOpPos)
+            resultRight = self.adv_main_query(query, config, start=start, end=iOpPos + 1)
+            if len(resultLeft) <= 0 or len(resultRight) <= 0:
+                raise Diagnostic(DiagnosticTypes.sru, 10)
+            return self.adv_main_or(resultLeft, resultRight, config)
         raise NotImplementedError
-        return {}
 
     def translate_advanced(self, query: str, config: ResourceConfig):
         """
@@ -189,7 +254,7 @@ class QueryParser:
             end = len(query)
         if end == 0:
             raise Diagnostic(DiagnosticTypes.sru, 27)
-        return self.adv_main_query(query, config, start=0, end=end)
+        return self.adv_main_query(query, config, start=0, end=end), withinClause
 
 
     def validate_query(self, operation, version, queryType, query,
diff --git a/common/tsakorpus_response_parser.py b/common/tsakorpus_response_parser.py
index 77a051c..68b8d29 100644
--- a/common/tsakorpus_response_parser.py
+++ b/common/tsakorpus_response_parser.py
@@ -4,31 +4,11 @@ import json
 import html
 from lxml.html import fragment_fromstring
 from .enums import *
-from .config import ResourceConfig
+from .config import ResourceConfig, POSConvertor
 from .search_retrieve import Record
 from .diagnostics import Diagnostic, DiagnosticTypes
 
 
-class POSConvertor:
-    """
-    Convert corpus-specific parts of speech / grammar tags to
-    UPOS, using regexes correspondences set in the config.
-    """
-    def __init__(self, config: ResourceConfig):
-        self.posConvert = config.pos_convert
-        self.posTests = [(re.compile(k), v) for k, v in self.posConvert]
-
-    def convert_pos(self, pos):
-        """
-        Convert corpus-specific POS tags to UPOS, if possible.
-        Ea
-        """
-        for k, v in self.posTests:
-            if k.search(pos) is not None:
-                return v
-        return pos
-
-
 class TsakorpusResponseParser:
     """
     Parses responses from a Tsakorpus instance.
diff --git a/config/test.json b/config/test.json
index 5cd0366..54a1cce 100644
--- a/config/test.json
+++ b/config/test.json
@@ -1,12 +1,21 @@
 {
-	"host": "0.0.0.0",
+	"host": "multimedia-corpus.beserman.ru",
 	"port": "80",
+	"transport_protocol": "http",
 	"max_hits": 15,
 	"platform": "tsakorpus",
-	"resource_base_url": "http://127.0.0.1:7342",
+	"advanced_search_capability": true,
+	"adv_supported": true,
+	"resource_base_url": "http://multimedia-corpus.beserman.ru/",
 	"search_lang_id": "beserman",
 	"pos_convert": [
+		["\\bN\\b.*?\\brel_n\\b", "ADP"],
 		["\\bN\\b", "NOUN"],
 		["\\bV\\b", "VERB"]
-	]
+	],
+	"pos_convert_reverse": {
+		"NOUN": "(N,~rel_n)",
+		"VERB": "V",
+		"ADP": "N,rel_n"
+	}
 }
\ No newline at end of file
diff --git a/notes.txt b/notes.txt
index e05e36d..0974489 100644
--- a/notes.txt
+++ b/notes.txt
@@ -4,6 +4,8 @@ p. 3: "The value of the @version attribute MUST be 2." But SRU 1.2 EndpointDescr
 
 p. 4: Link 'section "Layers"' leads to a CLARIN-internal Trac instance.
 
+p. 6: It is claimed that those and only those layer identifiers that are listed in the table under 2.2.2.1 can be used in FCS-QL queries. However, the example below (p. 7) contains identifiers "word" and "token" not listed there (should have been "text"?), and the BNF grammar allows for any identifier.
+
 p. 8-9: Links to the .xsd's lead to a CLARIN-internal Trac instance.
 
 p. 8. It is said that all endpoints must implement the Generic Hits view as 'send-by-default'. No such thing is said about the Advanced view, but it is also designated as 'send-by-default'. Why is that?
@@ -14,10 +16,15 @@ p. 9-10: In the advanced search results, how does a client understand which laye
 
 p. 10: "Send explain request without version and operation parameter" -- but an explain request has to have the operation parameter with 'explain' as its value, as is stated somewhere nearby. UPDATE: SRU documentation says that, indeed, an empty request is treated as an explain request. Maybe this exception is worth mentioning this explicitly here.
 
-p. 12 and elsewhere: http://explain.z3950.org/dtd/2.0/ mentioned as the URL of the record schema, but its actual URL is http://zeerex.z3950.org/dtd/zeerex-2.0.dtd .
+p. 12 and elsewhere: http://explain.z3950.org/dtd/2.0/ mentioned as the URL of the record schema, but its actual URL is http://zeerex.z3950.org/dtd/zeerex-2.0.dtd . Should I change that in the Explain response XML?
 
 p. 12-13, example: what is "result-id" in SupportedLayer? What do I put as the text of the SupportedLayer element?
 
 p. 14: x-cmd-resource-info parameter present in the query example, but never explained (mentioned in some 2013 slides on FCS; should now probably be x-fcs-endpoint-description)
 
-p. 14: What is the sruResponse prefix? It is never mentioned. No namespace is provided here, so the example is actually invalid XML. (Who doesn't validate their examples before showing them to a wide audience -- in an official specification, no less??) Maybe sru was actually meant (see p. 2 and an SRU v. 1.2 example above)? I'm putting sru there for now.
\ No newline at end of file
+p. 14: What is the sruResponse prefix? It is never mentioned. No namespace is provided here, so the example is actually invalid XML. (Who doesn't validate their examples before showing them to a wide audience -- in an official specification, no less??) Maybe sru was actually meant (see p. 2 and an SRU v. 1.2 example above)? I'm putting sru there for now.
+
+p. 17: The BNF for FCS-QL makes it impossible to have multi-token queries where the first token has a quantifier, but is not parenthesized, e.g.:
+[pos="NOUN"]{2,3} [text="dog"]
+I guess this is a consequence of a sloppily written BNF grammar rather than an intended effect?
+
-- 
GitLab