Commit 241542fc authored by Arkhangelskiy, Timofey

Continue with advanced search parsing (not ready yet)

parent 209a749e
@@ -41,7 +41,8 @@ class ResourceConfig:
         self.supported_layers = []
         self.resources = []
         self.search_lang_id = ''
-        self.pos_convert = {}
+        self.pos_convert = []          # corpus-specific to UD (regexes)
+        self.pos_convert_reverse = {}  # UD to corpus-specific
         self.query_timeout = 60
@@ -52,10 +53,10 @@ class ResourceConfig:
         self.lsParams = set()
         # dictionaries where values are strings
-        self.dict_sParams = {'pos_convert'}
+        self.dict_sParams = {'pos_convert_reverse'}
         # dictionaries where values are lists of strings
-        self.dict_lsParams = set()
+        self.dict_lsParams = {'pos_convert'}
         # dictionaries where values are dictionaries {k: string}
         self.dict_dParams = set()
@@ -230,6 +231,39 @@ class ResourceConfig:
             json.dump(dictConfig, fOut, sort_keys=True, ensure_ascii=False, indent=2)
+
+
+class POSConvertor:
+    """
+    Convert corpus-specific parts of speech / grammar tags to
+    UPOS, using regex correspondences set in the config.
+    """
+    def __init__(self, config: ResourceConfig):
+        self.posConvert = config.pos_convert
+        self.posConvertReverse = config.pos_convert_reverse
+        self.posTests = [(re.compile(k), v) for k, v in self.posConvert]
+
+    def convert_pos(self, pos):
+        """
+        Convert corpus-specific POS tags to UPOS, if possible.
+        Regexes are sequentially applied to the corpus-specific
+        string with POS or more detailed grammatical tags. The
+        first regex in the list that matches wins.
+        """
+        for k, v in self.posTests:
+            if k.search(pos) is not None:
+                return v
+        return pos
+
+    def convert_ud_pos(self, udPos):
+        """
+        Convert UD POS to a corpus-specific POS tag or string
+        with more detailed grammatical tags.
+        """
+        try:
+            return self.posConvertReverse[udPos]
+        except KeyError:
+            return udPos
+
+
 def read_configs(configDir='./config'):
     """
     Load all configuration files from the configuration directory
...
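As an illustration of the forward conversion that POSConvertor.convert_pos performs, here is a minimal, self-contained sketch (not the endpoint's own module) that applies the pos_convert rules from the sample configuration further down in this commit; the regexes are tried in order against the corpus-specific tag string, and the first match wins:

import re

# pos_convert rules from the sample config: (regex, UPOS), tried in order.
pos_convert = [
    (r"\bN\b.*?\brel_n\b", "ADP"),
    (r"\bN\b", "NOUN"),
    (r"\bV\b", "VERB"),
]
pos_tests = [(re.compile(rx), upos) for rx, upos in pos_convert]

def convert_pos(tags: str) -> str:
    # First regex in the list that matches the tag string wins.
    for rx, upos in pos_tests:
        if rx.search(tags) is not None:
            return upos
    # No rule matched: return the corpus-specific tags unchanged.
    return tags

print(convert_pos("N,rel_n,sg"))  # ADP   (the more specific rule is listed first)
print(convert_pos("N,sg"))        # NOUN
print(convert_pos("PRO,sg"))      # PRO,sg (no matching rule)

The example tag strings ("N,rel_n,sg" etc.) are made up for illustration; only the regex-to-UPOS pairs come from the config.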
@@ -16,12 +16,16 @@ class QueryParser:
     # Regexes for advanced search
     rxWithinClause = re.compile(' +within +(s|sentence|u|utterance|p|paragraph|'
                                 't|turn|text|session) *$')
+    rxNonemptyQueryPart = re.compile('[^ \t\r\n]')
 
     def __init__(self):
         pass
 
     @staticmethod
     def find_operator(strQuery, start=0, end=-1):
+        """
+        Locate the highest NOT, AND or OR operator in a simple query.
+        """
         if end == -1:
             end = len(strQuery) - 1
         if strQuery[start:start+3] == 'NOT':
@@ -30,10 +34,10 @@ class QueryParser:
         inQuotes = False
         for i in range(start, end):
             if inQuotes:
-                if strQuery[i] == '"':
+                if strQuery[i] == '"' and i > 0 and strQuery[i-1] != '\\':
                     inQuotes = False
                 continue
-            if strQuery[i] == '"':
+            if strQuery[i] == '"' and i > 0 and strQuery[i-1] != '\\':
                 inQuotes = True
                 continue
             if strQuery[i] == '(':
@@ -47,6 +51,53 @@ class QueryParser:
                 return i, 'OR'
         return -1, ''
 
+    @staticmethod
+    def find_operator_adv(strQuery, start=0, end=-1):
+        """
+        Locate the highest SEQUENCE (whitespace[s]) or OR (|) operator
+        in an advanced query.
+        """
+        if end == -1:
+            end = len(strQuery) - 1
+        parenthBalance = 0
+        bracketBalance = 0
+        curlyBalance = 0
+        inQuotes = False
+        for i in range(start, end):
+            if inQuotes:
+                if strQuery[i] == '"' and i > 0 and strQuery[i-1] != '\\':
+                    inQuotes = False
+                continue
+            if strQuery[i] == '"' and i > 0 and strQuery[i-1] != '\\':
+                inQuotes = True
+                continue
+            if strQuery[i] == '(':
+                parenthBalance += 1
+            elif strQuery[i] == ')':
+                parenthBalance -= 1
+            elif strQuery[i] == '[':
+                bracketBalance += 1
+            elif strQuery[i] == ']':
+                bracketBalance -= 1
+            elif strQuery[i] == '{':
+                curlyBalance += 1
+            elif strQuery[i] == '}':
+                curlyBalance -= 1
+            elif (i > 0 and parenthBalance == 0 and bracketBalance == 0 and curlyBalance == 0
+                  and QueryParser.rxNonemptyQueryPart.search(strQuery[:i]) is not None
+                  and QueryParser.rxNonemptyQueryPart.search(strQuery[i:]) is not None):
+                iCurChar = i
+                while iCurChar <= end:
+                    if strQuery[iCurChar] != ' ':
+                        break
+                    iCurChar += 1
+                if iCurChar <= end:
+                    if strQuery[iCurChar] == '|':
+                        return iCurChar, 'OR'
+                    elif strQuery[iCurChar] not in '+*?':
+                        return iCurChar - 1, 'SEQUENCE'
+        return -1, ''
+
     @staticmethod
     def shift_term_indexes(getParams, shift):
         """
@@ -159,8 +210,22 @@ class QueryParser:
             end -= 1
         if start >= end:
             raise Diagnostic(DiagnosticTypes.sru, 10)
+        iOpPos, strOp = self.find_operator_adv(query, start, end)
+        if iOpPos == -1:
+            return self.adv_simple_query(query, config, start=start, end=end)
+        resultLeft = self.adv_simple_query(query, config, start=start, end=iOpPos)
+        resultRight = self.adv_main_query(query, config, start=start, end=iOpPos + 1)
+        if strOp == 'SEQUENCE':
+            if len(resultLeft) <= 0 or len(resultRight) <= 0:
+                raise Diagnostic(DiagnosticTypes.sru, 10)
+            return self.adv_main_sequence(resultLeft, resultRight, config)
+        elif strOp == 'OR':
+            resultLeft = self.adv_simple_query(query, config, start=start, end=iOpPos)
+            resultRight = self.adv_main_query(query, config, start=start, end=iOpPos + 1)
+            if len(resultLeft) <= 0 or len(resultRight) <= 0:
+                raise Diagnostic(DiagnosticTypes.sru, 10)
+            return self.adv_main_or(resultLeft, resultRight, config)
         raise NotImplementedError
-        return {}
 
     def translate_advanced(self, query: str, config: ResourceConfig):
         """
@@ -189,7 +254,7 @@ class QueryParser:
         end = len(query)
         if end == 0:
             raise Diagnostic(DiagnosticTypes.sru, 27)
-        return self.adv_main_query(query, config, start=0, end=end)
+        return self.adv_main_query(query, config, start=0, end=end), withinClause
 
     def validate_query(self, operation, version, queryType, query,
...
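The splitting rule that the new find_operator_adv method implements, and that adv_main_query now recurses on, can be summarized as: scan the query left to right, ignore anything inside quotes, parentheses, square brackets or curly braces, and report the first top-level split point, either OR (a '|') or SEQUENCE (a whitespace gap between two terms, unless the next character is a quantifier such as +, * or ?). The following condensed, self-contained approximation (not the endpoint's own code, and slightly simplified, e.g. in which index of a multi-space gap it reports) shows the idea:

import re

RX_NONEMPTY = re.compile(r'[^ \t\r\n]')

def find_top_level_split(q: str):
    """Report the first top-level OR ('|') or SEQUENCE (whitespace gap)
    operator, skipping quoted strings and bracketed sub-expressions."""
    balance = {'(': 0, '[': 0, '{': 0}
    closing = {')': '(', ']': '[', '}': '{'}
    in_quotes = False
    for i, c in enumerate(q):
        if in_quotes:
            if c == '"' and q[i - 1] != '\\':
                in_quotes = False
            continue
        if c == '"':
            in_quotes = True
        elif c in balance:
            balance[c] += 1
        elif c in closing:
            balance[closing[c]] -= 1
        elif c == '|' and all(v == 0 for v in balance.values()):
            return i, 'OR'
        elif (c == ' ' and all(v == 0 for v in balance.values())
              and RX_NONEMPTY.search(q[:i]) and RX_NONEMPTY.search(q[i:])):
            j = i
            while j < len(q) and q[j] == ' ':
                j += 1                       # skip the whitespace gap
            if j < len(q) and q[j] == '|':
                return j, 'OR'               # alternation of two terms
            if j < len(q) and q[j] not in '+*?':
                return i, 'SEQUENCE'         # two adjacent terms
    return -1, ''                            # single term, nothing to split

print(find_top_level_split('[word="dog"] [pos="NOUN"]'))    # (12, 'SEQUENCE')
print(find_top_level_split('[pos="NOUN"] | [pos="VERB"]'))  # (13, 'OR')
print(find_top_level_split('[pos="NOUN"]'))                 # (-1, '')

In the diff above, adv_main_query then parses the left-hand part with adv_simple_query, recurses on the remainder, and combines the two results with adv_main_sequence or adv_main_or.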
@@ -4,31 +4,11 @@ import json
 import html
 from lxml.html import fragment_fromstring
 from .enums import *
-from .config import ResourceConfig
+from .config import ResourceConfig, POSConvertor
 from .search_retrieve import Record
 from .diagnostics import Diagnostic, DiagnosticTypes
 
 
-class POSConvertor:
-    """
-    Convert corpus-specific parts of speech / grammar tags to
-    UPOS, using regexes correspondences set in the config.
-    """
-    def __init__(self, config: ResourceConfig):
-        self.posConvert = config.pos_convert
-        self.posTests = [(re.compile(k), v) for k, v in self.posConvert]
-
-    def convert_pos(self, pos):
-        """
-        Convert corpus-specific POS tags to UPOS, if possible.
-        Ea
-        """
-        for k, v in self.posTests:
-            if k.search(pos) is not None:
-                return v
-        return pos
-
-
 class TsakorpusResponseParser:
     """
     Parses responses from a Tsakorpus instance.
...
 {
-  "host": "0.0.0.0",
+  "host": "multimedia-corpus.beserman.ru",
   "port": "80",
+  "transport_protocol": "http",
   "max_hits": 15,
   "platform": "tsakorpus",
-  "resource_base_url": "http://127.0.0.1:7342",
+  "advanced_search_capability": true,
+  "adv_supported": true,
+  "resource_base_url": "http://multimedia-corpus.beserman.ru/",
   "search_lang_id": "beserman",
   "pos_convert": [
+    ["\\bN\\b.*?\\brel_n\\b", "ADP"],
     ["\\bN\\b", "NOUN"],
     ["\\bV\\b", "VERB"]
-  ]
+  ],
+  "pos_convert_reverse": {
+    "NOUN": "(N,~rel_n)",
+    "VERB": "V",
+    "ADP": "N,rel_n"
+  }
 }
\ No newline at end of file
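The pos_convert_reverse dictionary added above is the counterpart used by POSConvertor.convert_ud_pos: a UD category requested in an advanced query is looked up and replaced by a corpus-specific grammatical-tag string, and unknown categories are passed through unchanged. A small sketch with the values from this config (the lookup mirrors the try/except in the class; ADJ is just an example of an unmapped category):

pos_convert_reverse = {
    "NOUN": "(N,~rel_n)",
    "VERB": "V",
    "ADP": "N,rel_n",
}

def convert_ud_pos(ud_pos: str) -> str:
    # Fall back to the original UD tag if no corpus-specific mapping exists.
    return pos_convert_reverse.get(ud_pos, ud_pos)

print(convert_ud_pos("NOUN"))  # (N,~rel_n)
print(convert_ud_pos("ADJ"))   # ADJ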
@@ -4,6 +4,8 @@ p. 3: "The value of the @version attribute MUST be 2." But SRU 1.2 EndpointDescr
 p. 4: Link 'section "Layers"' leads to a CLARIN-internal Trac instance.
+p. 6: It is claimed that those and only those layer identifiers that are listed in the table under 2.2.2.1 can be used in FCS-QL queries. However, the example below (p. 7) contains identifiers "word" and "token" not listed there (should have been "text"?), and the BNF grammar allows for any identifier.
 p. 8-9: Links to the .xsd's lead to a CLARIN-internal Trac instance.
 p. 8: It is said that all endpoints must implement the Generic Hits view as 'send-by-default'. No such thing is said about the Advanced view, but it is also designated as 'send-by-default'. Why is that?
@@ -14,10 +16,15 @@ p. 9-10: In the advanced search results, how does a client understand which laye
 p. 10: "Send explain request without version and operation parameter" -- but an explain request has to have the operation parameter with 'explain' as its value, as is stated somewhere nearby. UPDATE: SRU documentation says that, indeed, an empty request is treated as an explain request. Maybe this exception is worth mentioning explicitly here.
-p. 12 and elsewhere: http://explain.z3950.org/dtd/2.0/ mentioned as the URL of the record schema, but its actual URL is http://zeerex.z3950.org/dtd/zeerex-2.0.dtd .
+p. 12 and elsewhere: http://explain.z3950.org/dtd/2.0/ mentioned as the URL of the record schema, but its actual URL is http://zeerex.z3950.org/dtd/zeerex-2.0.dtd . Should I change that in the Explain response XML?
 p. 12-13, example: what is "result-id" in SupportedLayer? What do I put as the text of the SupportedLayer element?
 p. 14: x-cmd-resource-info parameter present in the query example, but never explained (mentioned in some 2013 slides on FCS; should now probably be x-fcs-endpoint-description)
 p. 14: What is the sruResponse prefix? It is never mentioned. No namespace is provided here, so the example is actually invalid XML. (Who doesn't validate their examples before showing them to a wide audience -- in an official specification, no less??) Maybe sru was actually meant (see p. 2 and an SRU v. 1.2 example above)? I'm putting sru there for now.
+p. 17: The BNF for FCS-QL makes it impossible to have multi-token queries where the first token has a quantifier, but is not parenthesized, e.g.:
+[pos="NOUN"]{2,3} [text="dog"]
+I guess this is a consequence of a sloppily written BNF grammar rather than an intended effect?