Skip to content
Snippets Groups Projects
Commit 5ff4a239 authored by Arkhangelskiy, Timofey's avatar Arkhangelskiy, Timofey
Browse files

Continue with advanced search parsing (not ready yet)

parent 241542fc
Branches
No related tags found
No related merge requests found
......@@ -17,6 +17,10 @@ class QueryParser:
# Trailing "within <context>" clause at the very end of a query.
rxWithinClause = re.compile(r' +within +(s|sentence|u|utterance|p|paragraph|'
                            r't|turn|text|session) *$')
# Any character other than space, tab, CR or LF.
rxNonemptyQueryPart = re.compile(r'[^ \t\r\n]')
# One bracketed segment with an optional quantifier: group 1 is the
# expression inside the brackets, group 2 is the quantifier (may be empty).
# The curly-brace quantifier may contain several digits/commas, e.g. {2,3}.
rxSegmentQuery = re.compile(r'^\[(.*)\](\{[0-9,]+\}|[?*+]|)$')
# One attribute/value term. Groups: 1 = identifier (optionally qualified as
# "prefix:name"), 2 = operator ("=" or "!="), 3 = quoted value,
# 4 = value without the quotes, 5 = optional "/x" flag.
# Whitespace around the operator is optional, so 'text="x"' is accepted.
rxAdvTermQuery = re.compile(r'^ *([a-zA-Z][a-zA-Z0-9\-]*(?::[a-zA-Z][a-zA-Z0-9\-]*)?) *'
                            r'(!?=) *(["\'](.*)["\']) *(/[iIcCld])? *$')
# Identifiers accepted in a segment query.
acceptableIdentifiers = {'text', 'lemma', 'pos', 'orth', 'norm', 'phonetic'}


def __init__(self):
    """No instance state is set up here."""
......@@ -63,6 +67,7 @@ class QueryParser:
bracketBalance = 0
curlyBalance = 0
inQuotes = False
inSingleQuotes = False
for i in range(start, end):
if inQuotes:
if strQuery[i] == '"' and i > 0 and strQuery[i-1] != '\\':
......@@ -71,6 +76,13 @@ class QueryParser:
if strQuery[i] == '"' and i > 0 and strQuery[i-1] != '\\':
inQuotes = True
continue
if inSingleQuotes:
if strQuery[i] == "'" and i > 0 and strQuery[i-1] != '\\':
inSingleQuotes = False
continue
if strQuery[i] == "'" and i > 0 and strQuery[i-1] != '\\':
inSingleQuotes = True
continue
if strQuery[i] == '(':
parenthBalance += 1
elif strQuery[i] == ')':
......@@ -98,6 +110,47 @@ class QueryParser:
return iCurChar - 1, 'SEQUENCE'
return -1, ''
@staticmethod
def find_operator_adv_expression(strQuery):
    """
    Locate the highest |, & or ! operator in a segment expression
    in the advanced search.

    Leading and trailing spaces, tabs and newlines are ignored. If the
    first significant character is "!", its position is returned right
    away. Otherwise the first "|" or "&" that is outside of quotes and
    outside of any parentheses is returned.

    Return a tuple (position, operator); (-1, '') means that no operator
    was found, which is also the answer for an empty or blank string.
    """
    start = 0
    end = len(strQuery)
    while start < end and strQuery[start] in ' \t\n':
        start += 1
    while end > start and strQuery[end - 1] in ' \t\n':
        end -= 1
    if start >= end:
        # Nothing but whitespace: there is no character to inspect.
        return -1, ''
    if strQuery[start] == '!':
        return start, '!'
    parenthBalance = 0
    # The quote character that opened the current quoted value, or ''
    # when outside of quotes. Tracking one state for both quote kinds
    # keeps a '"' inside '...' (and vice versa) from opening a new value.
    openQuote = ''
    for i in range(start, end):
        curChar = strQuery[i]
        # A quote at index 0 cannot be escaped, so it does count as a quote.
        escaped = i > 0 and strQuery[i - 1] == '\\'
        if openQuote:
            if curChar == openQuote and not escaped:
                openQuote = ''
            continue
        if curChar in ('"', "'") and not escaped:
            openQuote = curChar
            continue
        if curChar == '(':
            parenthBalance += 1
        elif curChar == ')':
            parenthBalance -= 1
        elif parenthBalance == 0 and curChar in ('|', '&'):
            return i, curChar
    return -1, ''
@staticmethod
def shift_term_indexes(getParams, shift):
"""
......@@ -146,6 +199,35 @@ class QueryParser:
# Abstract function
raise NotImplementedError()
def adv_term_query_proper(self, identifier: str, op: str, value: str, flags: str, config):
    """
    Abstract function: translate one term of an advanced query.
    Always raises NotImplementedError here.
    """
    raise NotImplementedError
def adv_term_query(self, query, config: ResourceConfig):
    """
    Parse one term of an advanced-search segment expression, such as
    lemma = "dog" /i, and pass its parts to adv_term_query_proper().

    The term is matched against rxAdvTermQuery. The identifiers "token"
    and "word" are replaced with "text". Flags are passed on without the
    leading slash; an absent flag becomes an empty string.

    Raise Diagnostic (sru, 10) if the term does not match the pattern,
    if the opening and closing quotes differ, or if the identifier is
    not in acceptableIdentifiers.
    """
    m = self.rxAdvTermQuery.search(query)
    if m is None:
        raise Diagnostic(DiagnosticTypes.sru, 10)
    # Group 3 is the value with its quotes. Group 4 is the same value
    # without quotes, so the optional "/x" flag is group 5, not group 4.
    identifier, op, value, flags = m.group(1), m.group(2), m.group(3), m.group(5)
    if value[0] != value[-1]:
        raise Diagnostic(DiagnosticTypes.sru, 10)    # Different quotes
    value = value[1:-1]    # Remove quotes
    if flags is None:
        flags = ''
    else:
        flags = flags.lstrip('/')    # '/i' -> 'i'
    if identifier in ('token', 'word'):
        identifier = 'text'    # Should I do this?
    if identifier not in self.acceptableIdentifiers:
        raise Diagnostic(DiagnosticTypes.sru, 10,
                         message=identifier + ' is not an acceptable identifier in a segment query.')
    return self.adv_term_query_proper(identifier, op, value, flags, config)
def adv_binary_bool(self, strOp, operandL, operandR, config):
    """
    Abstract function: combine two operands of an advanced query
    with a binary operator. Always raises NotImplementedError here.
    """
    raise NotImplementedError
def adv_not_bool(self, operand, config):
    """
    Abstract function: negate an operand of an advanced query.
    Always raises NotImplementedError here.
    """
    raise NotImplementedError
def translate_simple(self, query: str, config: ResourceConfig, start=0, end=-1):
"""
Translate a simple search (CQL) query into a corpus-specific query
......@@ -199,6 +281,52 @@ class QueryParser:
return self.not_bool(resultRight, config)
return {}
def adv_expression_query(self, query: str, config: ResourceConfig):
    """
    Recursively translate a segment expression of the advanced search.

    The expression is split at the operator reported by
    find_operator_adv_expression(). If there is no operator, the
    expression is either a parenthesised sub-expression (the parentheses
    are removed and the contents are processed recursively) or a single
    term, which is handed to adv_term_query(). Binary operators go to
    adv_binary_bool(), negation goes to adv_not_bool().

    Raise Diagnostic (sru, 10) if the expression is empty or if one of
    the operands of a binary operator yields an empty result.
    """
    # Strip first, so that the checks of the first and the last character
    # below look at the actual expression rather than at padding left over
    # from splitting at an operator.
    query = query.strip()
    if len(query) <= 0:
        raise Diagnostic(DiagnosticTypes.sru, 10)
    iOpPos, strOp = self.find_operator_adv_expression(query)
    if iOpPos == -1:
        if query[0] == '(' and query[-1] == ')':
            return self.adv_expression_query(query[1:-1], config)
        return self.adv_term_query(query, config)
    if strOp in ('&', '|'):
        resultLeft = self.adv_expression_query(query[:iOpPos], config)
        resultRight = self.adv_expression_query(query[iOpPos+1:], config)
        if len(resultLeft) <= 0 or len(resultRight) <= 0:
            raise Diagnostic(DiagnosticTypes.sru, 10)
        return self.adv_binary_bool(strOp, resultLeft, resultRight, config)
    if strOp == '!':
        resultRight = self.adv_expression_query(query[iOpPos+1:], config)
        # Use the advanced-search negation hook, like the binary case
        # uses adv_binary_bool().
        return self.adv_not_bool(resultRight, config)
    # An operator other than &, | or ! is not something this parser knows.
    raise Diagnostic(DiagnosticTypes.sru, 10)
def adv_segment_query(self, query: str, config: ResourceConfig):
    """
    Translate one bracketed segment of an advanced query.

    The segment is matched against rxSegmentQuery; the expression inside
    the brackets is stripped and passed to adv_expression_query().
    Raise Diagnostic (sru, 27) if the segment does not match.
    """
    match = self.rxSegmentQuery.search(query)
    if match is None:
        raise Diagnostic(DiagnosticTypes.sru, 27)
    # TODO: quantifier (the second group is read, but not used yet)
    innerExpression, quantifier = match.group(1), match.group(2)
    return self.adv_expression_query(innerExpression.strip(), config)
def adv_simple_query(self, query: str, config: ResourceConfig, start=0, end=-1):
    """
    Translate the part of an advanced query between the positions start
    (inclusive) and end (exclusive).

    After trimming spaces, tabs and newlines on both sides, the part is
    treated as follows:
    - "( ... )": the contents are passed to adv_main_query();
    - a quoted string: it is wrapped as [text=...] and passed to
      adv_segment_query();
    - anything else is passed to adv_segment_query() as is.

    Raise Diagnostic (sru, 27) for an empty query and Diagnostic
    (sru, 10) if the requested range is empty.
    NOTE(review): with the default end=-1 the range check below always
    raises, so callers have to pass end explicitly.
    """
    if len(query) <= 0:
        raise Diagnostic(DiagnosticTypes.sru, 27)
    if start >= len(query) - 1 or end <= 0:
        raise Diagnostic(DiagnosticTypes.sru, 10)
    while start < len(query) and query[start] in ' \t\n':
        start += 1
    while end > 0 and query[end - 1] in ' \t\n':
        end -= 1
    if start >= end:
        raise Diagnostic(DiagnosticTypes.sru, 10)
    # end is exclusive, so the last character of the part is query[end - 1].
    firstChar = query[start]
    lastChar = query[end - 1]
    if firstChar == '(' and lastChar == ')':
        return self.adv_main_query(query, config, start=start+1, end=end-1)
    if (end - start >= 2
            and firstChar == lastChar
            and firstChar in ('"', "'")
            and query[end - 2] != '\\'):
        # A bare quoted string: the closing quote must not be escaped.
        return self.adv_segment_query('[text=' + query[start:end] + ']', config)
    return self.adv_segment_query(query[start:end], config)
def adv_main_query(self, query: str, config: ResourceConfig, start=0, end=-1):
if len(query) <= 0:
raise Diagnostic(DiagnosticTypes.sru, 27)
......
......@@ -84,6 +84,29 @@ class TsakorpusQueryParser(QueryParser):
# TODO: implement
raise NotImplementedError()
def adv_term_query_proper(self, identifier: str, op: str, value: str, flags: str, config: ResourceConfig):
    """
    Return list of query parameters for one term in an advanced query.

    Each parameter is a list [field, 0, value], where the field is "wf"
    for the identifier "text", "lex" for "lemma" and "gr" for "pos".
    A "pos" value found in config.pos_convert_reverse is replaced with
    its corpus-specific counterpart.

    Raise Diagnostic (sru, 10) if the value is empty, if flags is
    anything other than '', 'i' or 'c', or if the identifier is not one
    of the three listed above.
    NOTE(review): op is not consulted here, so "!=" is currently
    translated the same way as "=" - confirm whether that is intended.
    """
    # Diagnostics are raised (not returned), so that callers never get
    # an exception object where they expect a list of parameters.
    if len(value) <= 0:
        raise Diagnostic(DiagnosticTypes.sru, 10)
    if flags not in ('', 'i', 'c'):
        raise Diagnostic(DiagnosticTypes.sru, 10, message='Tsakorpus does not support regex flags.')
    getParams = []
    if identifier == 'text':
        getParams.append(['wf', 0, value])
    elif identifier == 'lemma':
        getParams.append(['lex', 0, value])
    elif identifier == 'pos':
        if value in config.pos_convert_reverse:
            # UD to corpus-specific POS tags
            value = config.pos_convert_reverse[value]
        getParams.append(['gr', 0, value])
    else:
        raise Diagnostic(DiagnosticTypes.sru, 10,
                         message='The identifier ' + identifier + ' is not supported in Tsakorpus.')
    return getParams
def send_query(self, strGetParams: str, config: ResourceConfig):
"""
Send the translated query to the Tsakorpus instance. Return JSON results
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment