Skip to content
Snippets Groups Projects
Commit 241542fc authored by Arkhangelskiy, Timofey's avatar Arkhangelskiy, Timofey
Browse files

Continue with advanced search parsing (not ready yet)

parent 209a749e
No related branches found
No related tags found
No related merge requests found
......@@ -41,7 +41,8 @@ class ResourceConfig:
self.supported_layers = []
self.resources = []
self.search_lang_id = ''
self.pos_convert = {}
self.pos_convert = [] # corpus-specific to UD (regexes)
self.pos_convert_reverse = {} # UD to corpus-specific
self.query_timeout = 60
......@@ -52,10 +53,10 @@ class ResourceConfig:
self.lsParams = set()
# dictionaries where values are strings
self.dict_sParams = {'pos_convert'}
self.dict_sParams = {'pos_convert_reverse'}
# dictionaries where values are lists of strings
self.dict_lsParams = set()
self.dict_lsParams = {'pos_convert'}
# dictionaries where values are dictionaries {k: string}
self.dict_dParams = set()
......@@ -230,6 +231,39 @@ class ResourceConfig:
json.dump(dictConfig, fOut, sort_keys=True, ensure_ascii=False, indent=2)
class POSConvertor:
    """
    Translate part-of-speech labels between the corpus-specific
    tagset and UPOS.

    The corpus-to-UPOS direction uses the (regex, UPOS tag) pairs
    from the config, tried in the order in which they are listed.
    The UPOS-to-corpus direction is a plain dictionary lookup.
    """

    def __init__(self, config: ResourceConfig):
        self.posConvert = config.pos_convert
        self.posConvertReverse = config.pos_convert_reverse
        # Compile each pattern once, preserving the order of the config,
        # because the order decides which rule wins in convert_pos().
        compiledTests = []
        for strRegex, udTag in self.posConvert:
            compiledTests.append((re.compile(strRegex), udTag))
        self.posTests = compiledTests

    def convert_pos(self, pos):
        """
        Return the UPOS tag for a corpus-specific string with a POS
        tag or more detailed grammatical tags.

        The regexes are tried one after another against the string;
        the tag paired with the first regex that matches is returned.
        If none of them matches, the input string is returned as is.
        """
        matchingTags = (udTag for rxTest, udTag in self.posTests
                        if rxTest.search(pos) is not None)
        return next(matchingTags, pos)

    def convert_ud_pos(self, udPos):
        """
        Return the corpus-specific POS tag (or string with more
        detailed grammatical tags) that corresponds to a UD POS tag.
        Tags without a correspondence are returned unchanged.
        """
        return self.posConvertReverse.get(udPos, udPos)
def read_configs(configDir='./config'):
"""
Load all configuration files from the configuration directory
......
......@@ -16,12 +16,16 @@ class QueryParser:
# Regexes for advanced search
rxWithinClause = re.compile(' +within +(s|sentence|u|utterance|p|paragraph|'
't|turn|text|session) *$')
rxNonemptyQueryPart = re.compile('[^ \t\r\n]')
def __init__(self):
    """
    Create a parser; no instance attributes are initialized here.
    """
    pass
@staticmethod
def find_operator(strQuery, start=0, end=-1):
"""
Locate the highest NOT, AND or OR operator in a simple query.
"""
if end == -1:
end = len(strQuery) - 1
if strQuery[start:start+3] == 'NOT':
......@@ -30,10 +34,10 @@ class QueryParser:
inQuotes = False
for i in range(start, end):
if inQuotes:
if strQuery[i] == '"':
if strQuery[i] == '"' and i > 0 and strQuery[i-1] != '\\':
inQuotes = False
continue
if strQuery[i] == '"':
if strQuery[i] == '"' and i > 0 and strQuery[i-1] != '\\':
inQuotes = True
continue
if strQuery[i] == '(':
......@@ -47,6 +51,53 @@ class QueryParser:
return i, 'OR'
return -1, ''
@staticmethod
def find_operator_adv(strQuery, start=0, end=-1):
    """
    Locate the highest SEQUENCE (whitespace[s]) or OR (|) operator
    in an advanced query.

    Positions start <= i < end are examined; if end is -1, it is
    replaced with len(strQuery) - 1. Characters inside double quotes
    are skipped, and no operator is reported while a parenthesis,
    square bracket or curly brace is still open.

    Return a tuple (position, operator):
    - (index of the '|' character, 'OR');
    - (index right before the first non-space character of the
      right-hand part, 'SEQUENCE');
    - (-1, '') if no operator was found.
    """
    if end == -1:
        end = len(strQuery) - 1
    whitespace = ' \t\r\n'
    parenthBalance = 0
    bracketBalance = 0
    curlyBalance = 0
    inQuotes = False
    for i in range(start, end):
        curChar = strQuery[i]
        # A double quote opens or closes a quoted string unless it is
        # escaped with a backslash. A quote at position 0 has nothing
        # in front of it, so it is always unescaped; previously it was
        # ignored, which made queries starting with a quoted token
        # be scanned as if they were not quoted at all.
        if curChar == '"' and (i == 0 or strQuery[i - 1] != '\\'):
            inQuotes = not inQuotes
            continue
        if inQuotes:
            continue
        if curChar == '(':
            parenthBalance += 1
        elif curChar == ')':
            parenthBalance -= 1
        elif curChar == '[':
            bracketBalance += 1
        elif curChar == ']':
            bracketBalance -= 1
        elif curChar == '{':
            curlyBalance += 1
        elif curChar == '}':
            curlyBalance -= 1
        elif (i > 0 and parenthBalance == 0 and bracketBalance == 0
              and curlyBalance == 0
              # An operator needs a non-empty operand on both sides.
              and strQuery[:i].strip(whitespace)
              and strQuery[i:].strip(whitespace)):
            # NOTE(review): this branch is entered for any character
            # that is not a quote or a bracket, not only for a space
            # or '|' -- confirm that this is intended.
            iCurChar = i
            # Skip the run of spaces to see what follows it.
            while iCurChar <= end and strQuery[iCurChar] == ' ':
                iCurChar += 1
            if iCurChar <= end:
                if strQuery[iCurChar] == '|':
                    return iCurChar, 'OR'
                elif strQuery[iCurChar] not in '+*?':
                    # Quantifiers belong to the preceding token, so
                    # they do not start a new element of a sequence.
                    return iCurChar - 1, 'SEQUENCE'
    return -1, ''
@staticmethod
def shift_term_indexes(getParams, shift):
"""
......@@ -159,8 +210,22 @@ class QueryParser:
end -= 1
if start >= end:
raise Diagnostic(DiagnosticTypes.sru, 10)
iOpPos, strOp = self.find_operator_adv(query, start, end)
if iOpPos == -1:
return self.adv_simple_query(query, config, start=start, end=end)
resultLeft = self.adv_simple_query(query, config, start=start, end=iOpPos)
resultRight = self.adv_main_query(query, config, start=start, end=iOpPos + 1)
if strOp == 'SEQUENCE':
if len(resultLeft) <= 0 or len(resultRight) <= 0:
raise Diagnostic(DiagnosticTypes.sru, 10)
return self.adv_main_sequence(resultLeft, resultRight, config)
elif strOp == 'OR':
resultLeft = self.adv_simple_query(query, config, start=start, end=iOpPos)
resultRight = self.adv_main_query(query, config, start=start, end=iOpPos + 1)
if len(resultLeft) <= 0 or len(resultRight) <= 0:
raise Diagnostic(DiagnosticTypes.sru, 10)
return self.adv_main_or(resultLeft, resultRight, config)
raise NotImplementedError
return {}
def translate_advanced(self, query: str, config: ResourceConfig):
"""
......@@ -189,7 +254,7 @@ class QueryParser:
end = len(query)
if end == 0:
raise Diagnostic(DiagnosticTypes.sru, 27)
return self.adv_main_query(query, config, start=0, end=end)
return self.adv_main_query(query, config, start=0, end=end), withinClause
def validate_query(self, operation, version, queryType, query,
......
......@@ -4,31 +4,11 @@ import json
import html
from lxml.html import fragment_fromstring
from .enums import *
from .config import ResourceConfig
from .config import ResourceConfig, POSConvertor
from .search_retrieve import Record
from .diagnostics import Diagnostic, DiagnosticTypes
class POSConvertor:
    """
    Map corpus-specific parts of speech / grammar tags to UPOS
    with the help of the regex correspondences set in the config.
    """

    def __init__(self, config: ResourceConfig):
        self.posConvert = config.pos_convert
        # Compile the patterns once, keeping the order of the config.
        compiledTests = []
        for rule in self.posConvert:
            strRegex, udTag = rule
            compiledTests.append((re.compile(strRegex), udTag))
        self.posTests = compiledTests

    def convert_pos(self, pos):
        """
        Convert a corpus-specific POS tag to UPOS, if possible.
        The regexes are tried in order; the tag paired with the
        first one that matches is returned. If nothing matches,
        the input is returned unchanged.
        """
        for rxTest, udTag in self.posTests:
            if rxTest.search(pos) is None:
                continue
            return udTag
        return pos
class TsakorpusResponseParser:
"""
Parses responses from a Tsakorpus instance.
......
{
"host": "0.0.0.0",
"host": "multimedia-corpus.beserman.ru",
"port": "80",
"transport_protocol": "http",
"max_hits": 15,
"platform": "tsakorpus",
"resource_base_url": "http://127.0.0.1:7342",
"advanced_search_capability": true,
"adv_supported": true,
"resource_base_url": "http://multimedia-corpus.beserman.ru/",
"search_lang_id": "beserman",
"pos_convert": [
["\\bN\\b.*?\\brel_n\\b", "ADP"],
["\\bN\\b", "NOUN"],
["\\bV\\b", "VERB"]
]
],
"pos_convert_reverse": {
"NOUN": "(N,~rel_n)",
"VERB": "V",
"ADP": "N,rel_n"
}
}
\ No newline at end of file
......@@ -4,6 +4,8 @@ p. 3: "The value of the @version attribute MUST be 2." But SRU 1.2 EndpointDescr
p. 4: Link 'section "Layers"' leads to a CLARIN-internal Trac instance.
p. 6: It is claimed that those and only those layer identifiers that are listed in the table under 2.2.2.1 can be used in FCS-QL queries. However, the example below (p. 7) contains identifiers "word" and "token" not listed there (should have been "text"?), and the BNF grammar allows for any identifier.
p. 8-9: Links to the .xsd's lead to a CLARIN-internal Trac instance.
p. 8: It is said that all endpoints must implement the Generic Hits view as 'send-by-default'. No such thing is said about the Advanced view, but it is also designated as 'send-by-default'. Why is that?
......@@ -14,10 +16,15 @@ p. 9-10: In the advanced search results, how does a client understand which laye
p. 10: "Send explain request without version and operation parameter" -- but an explain request has to have the operation parameter with 'explain' as its value, as is stated somewhere nearby. UPDATE: SRU documentation says that, indeed, an empty request is treated as an explain request. Maybe this exception is worth mentioning explicitly here.
p. 12 and elsewhere: http://explain.z3950.org/dtd/2.0/ mentioned as the URL of the record schema, but its actual URL is http://zeerex.z3950.org/dtd/zeerex-2.0.dtd .
p. 12 and elsewhere: http://explain.z3950.org/dtd/2.0/ mentioned as the URL of the record schema, but its actual URL is http://zeerex.z3950.org/dtd/zeerex-2.0.dtd . Should I change that in the Explain response XML?
p. 12-13, example: what is "result-id" in SupportedLayer? What do I put as the text of the SupportedLayer element?
p. 14: x-cmd-resource-info parameter present in the query example, but never explained (mentioned in some 2013 slides on FCS; should now probably be x-fcs-endpoint-description)
p. 14: What is the sruResponse prefix? It is never mentioned. No namespace is provided here, so the example is actually invalid XML. (Who doesn't validate their examples before showing them to a wide audience -- in an official specification, no less??) Maybe sru was actually meant (see p. 2 and an SRU v. 1.2 example above)? I'm putting sru there for now.
p. 17: The BNF for FCS-QL makes it impossible to have multi-token queries where the first token has a quantifier, but is not parenthesized, e.g.:
[pos="NOUN"]{2,3} [text="dog"]
I guess this is a consequence of a sloppily written BNF grammar rather than an intended effect?
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment