Skip to content
Snippets Groups Projects
Commit 5ff4a239 authored by Arkhangelskiy, Timofey's avatar Arkhangelskiy, Timofey
Browse files

Continue with advanced search parsing (not ready yet)

parent 241542fc
No related branches found
No related tags found
No related merge requests found
...@@ -17,6 +17,10 @@ class QueryParser: ...@@ -17,6 +17,10 @@ class QueryParser:
rxWithinClause = re.compile(' +within +(s|sentence|u|utterance|p|paragraph|' rxWithinClause = re.compile(' +within +(s|sentence|u|utterance|p|paragraph|'
't|turn|text|session) *$') 't|turn|text|session) *$')
rxNonemptyQueryPart = re.compile('[^ \t\r\n]') rxNonemptyQueryPart = re.compile('[^ \t\r\n]')
# One segment of an advanced query: "[expression]" optionally followed by a
# quantifier ({n}, {n,m}, ?, * or +).
# Groups: 1 = expression inside the brackets, 2 = quantifier (may be empty).
rxSegmentQuery = re.compile(r'^\[(.*)\](\{[0-9,]+\}|[?*+]|)$')
# One term of a segment expression: identifier (optionally "qualifier:identifier"),
# "=" or "!=", a quoted value and an optional single flag. Whitespace around
# the operator is optional.
# Groups: 1 = identifier, 2 = operator, 3 = value with quotes,
#         4 = value without quotes, 5 = flag including the leading slash.
rxAdvTermQuery = re.compile(r'^ *([a-zA-Z][a-zA-Z0-9\-]*(?::[a-zA-Z][a-zA-Z0-9\-]*)?) *'
                            r'(!?=) *(["\'](.*)["\']) *(/[iIcCld])? *$')
# Identifiers that may appear on the left-hand side of a term.
acceptableIdentifiers = {'text', 'lemma', 'pos', 'orth', 'norm', 'phonetic'}
def __init__(self): def __init__(self):
pass pass
...@@ -63,6 +67,7 @@ class QueryParser: ...@@ -63,6 +67,7 @@ class QueryParser:
bracketBalance = 0 bracketBalance = 0
curlyBalance = 0 curlyBalance = 0
inQuotes = False inQuotes = False
inSingleQuotes = False
for i in range(start, end): for i in range(start, end):
if inQuotes: if inQuotes:
if strQuery[i] == '"' and i > 0 and strQuery[i-1] != '\\': if strQuery[i] == '"' and i > 0 and strQuery[i-1] != '\\':
...@@ -71,6 +76,13 @@ class QueryParser: ...@@ -71,6 +76,13 @@ class QueryParser:
if strQuery[i] == '"' and i > 0 and strQuery[i-1] != '\\': if strQuery[i] == '"' and i > 0 and strQuery[i-1] != '\\':
inQuotes = True inQuotes = True
continue continue
if inSingleQuotes:
if strQuery[i] == "'" and i > 0 and strQuery[i-1] != '\\':
inSingleQuotes = False
continue
if strQuery[i] == "'" and i > 0 and strQuery[i-1] != '\\':
inSingleQuotes = True
continue
if strQuery[i] == '(': if strQuery[i] == '(':
parenthBalance += 1 parenthBalance += 1
elif strQuery[i] == ')': elif strQuery[i] == ')':
...@@ -98,6 +110,47 @@ class QueryParser: ...@@ -98,6 +110,47 @@ class QueryParser:
return iCurChar - 1, 'SEQUENCE' return iCurChar - 1, 'SEQUENCE'
return -1, '' return -1, ''
@staticmethod
def find_operator_adv_expression(strQuery):
    """
    Locate the highest |, & or ! operator in a segment expression
    in the advanced search.

    Leading and trailing spaces, tabs and newlines are ignored. If the
    first remaining character is "!", its position is returned right away.
    Otherwise, the first "|" or "&" that is outside of quotes and outside
    of parentheses is returned.

    Return a tuple (position in strQuery, operator). If the string is
    empty or contains no such operator, return (-1, '').
    """
    start = 0
    end = len(strQuery)
    while start < end and strQuery[start] in ' \t\n':
        start += 1
    while end > start and strQuery[end - 1] in ' \t\n':
        end -= 1
    if start >= end:
        # Nothing but whitespace: there is no character to look at.
        return -1, ''
    if strQuery[start] == '!':
        return start, '!'
    parenthBalance = 0
    # Quote character of the string we are currently inside, or '' if none.
    # A single state variable keeps a quote of the other kind inside
    # a quoted string from being taken for a delimiter.
    quoteChar = ''
    for i in range(start, end):
        c = strQuery[i]
        escaped = i > 0 and strQuery[i - 1] == '\\'
        if quoteChar:
            if c == quoteChar and not escaped:
                quoteChar = ''
            continue
        if (c == '"' or c == "'") and not escaped:
            quoteChar = c
            continue
        if c == '(':
            parenthBalance += 1
        elif c == ')':
            parenthBalance -= 1
        elif parenthBalance == 0 and (c == '|' or c == '&'):
            return i, c
    return -1, ''
@staticmethod @staticmethod
def shift_term_indexes(getParams, shift): def shift_term_indexes(getParams, shift):
""" """
...@@ -146,6 +199,35 @@ class QueryParser: ...@@ -146,6 +199,35 @@ class QueryParser:
# Abstract function # Abstract function
raise NotImplementedError() raise NotImplementedError()
def adv_term_query_proper(self, identifier: str, op: str, value: str, flags: str, config):
    """
    Abstract function. Build the corpus-specific representation of one
    term (identifier, operator, value, flags) of an advanced query.
    Has to be overridden in a subclass; this base version always raises
    NotImplementedError.
    """
    raise NotImplementedError
def adv_term_query(self, query, config: ResourceConfig):
    """
    Parse one term of an advanced search segment expression
    (identifier, "=" or "!=", quoted value, optional flag), validate it
    and hand the parts over to adv_term_query_proper().

    The value is passed on without its quotes and the flag without its
    leading slash ('' if there is no flag). The identifiers "token" and
    "word" are treated as "text".

    Raise Diagnostic (sru, 10) if the term does not match rxAdvTermQuery,
    if the opening and the closing quotes differ, or if the identifier
    is not in acceptableIdentifiers.
    """
    m = self.rxAdvTermQuery.search(query)
    if m is None:
        raise Diagnostic(DiagnosticTypes.sru, 10)
    # Group 3 is the value together with its quotes, group 4 is the same
    # value without them, so the optional flag is group 5.
    identifier, op, value, flags = m.group(1), m.group(2), m.group(3), m.group(5)
    if value[0] != value[-1]:
        raise Diagnostic(DiagnosticTypes.sru, 10)   # Different quotes
    value = value[1:-1]     # Remove quotes
    # The regex captures the flag with its slash ("/i"); pass on the bare letter.
    flags = '' if flags is None else flags.lstrip('/')
    if identifier in ('token', 'word'):
        identifier = 'text'     # Should I do this?
    if identifier not in self.acceptableIdentifiers:
        raise Diagnostic(DiagnosticTypes.sru, 10,
                         message=identifier + ' is not an acceptable identifier in a segment query.')
    return self.adv_term_query_proper(identifier, op, value, flags, config)
def adv_binary_bool(self, strOp, operandL, operandR, config):
    """
    Abstract function. Combine two already translated operands of an
    advanced query with the binary operator strOp. Has to be overridden
    in a subclass; this base version always raises NotImplementedError.
    """
    raise NotImplementedError
def adv_not_bool(self, operand, config):
    """
    Abstract function. Negate an already translated operand of an
    advanced query. Has to be overridden in a subclass; this base
    version always raises NotImplementedError.
    """
    raise NotImplementedError
def translate_simple(self, query: str, config: ResourceConfig, start=0, end=-1): def translate_simple(self, query: str, config: ResourceConfig, start=0, end=-1):
""" """
Translate a simple search (CQL) query into a corpus-specific query Translate a simple search (CQL) query into a corpus-specific query
...@@ -199,6 +281,52 @@ class QueryParser: ...@@ -199,6 +281,52 @@ class QueryParser:
return self.not_bool(resultRight, config) return self.not_bool(resultRight, config)
return {} return {}
def adv_expression_query(self, query: str, config: ResourceConfig):
    """
    Recursively translate a segment expression of an advanced query,
    i.e. terms combined with &, | and !, possibly in parentheses.

    If find_operator_adv_expression() finds no operator, the expression
    is either unwrapped from its outer parentheses and processed again
    or parsed as a single term by adv_term_query(). Otherwise the
    operands are translated recursively and combined.

    Raise Diagnostic (sru, 10) if the expression or one of the operands
    of a binary operator is empty.
    """
    # Operands cut out around an operator usually carry whitespace;
    # remove it so that the parenthesis check below sees the real
    # first and last characters.
    query = query.strip()
    if not query:
        raise Diagnostic(DiagnosticTypes.sru, 10)
    iOpPos, strOp = self.find_operator_adv_expression(query)
    if iOpPos == -1:
        if query[0] == '(' and query[-1] == ')':
            return self.adv_expression_query(query[1:-1], config)
        return self.adv_term_query(query, config)
    if strOp in ('&', '|'):
        resultLeft = self.adv_expression_query(query[:iOpPos], config)
        resultRight = self.adv_expression_query(query[iOpPos+1:], config)
        if len(resultLeft) <= 0 or len(resultRight) <= 0:
            raise Diagnostic(DiagnosticTypes.sru, 10)
        return self.adv_binary_bool(strOp, resultLeft, resultRight, config)
    elif strOp == '!':
        resultRight = self.adv_expression_query(query[iOpPos+1:], config)
        # NOTE(review): binary operators go through adv_binary_bool(), but
        # negation uses not_bool() rather than adv_not_bool() -- confirm
        # that this is intended.
        return self.not_bool(resultRight, config)
def adv_segment_query(self, query: str, config: ResourceConfig):
    """
    Translate one bracketed segment of an advanced query. The segment
    has to match rxSegmentQuery, otherwise Diagnostic (sru, 27) is
    raised. The expression inside the brackets is stripped and passed
    to adv_expression_query(), whose result is returned.
    """
    segment = self.rxSegmentQuery.search(query)
    if segment is None:
        raise Diagnostic(DiagnosticTypes.sru, 27)
    # TODO: quantifier (segment.group(2)) is recognized, but not used yet
    return self.adv_expression_query(segment.group(1).strip(), config)
def adv_simple_query(self, query: str, config: ResourceConfig, start=0, end=-1):
    """
    Translate the part of an advanced query between the positions start
    (inclusive) and end (exclusive) that contains no sequence operators.
    end=-1 stands for the end of the query.

    After trimming whitespace on both sides, the part is treated as
    - a parenthesized query, which is passed to adv_main_query() without
      the outer parentheses,
    - a bare quoted string, which is wrapped as [text=...] and passed to
      adv_segment_query(), or
    - a segment, which is passed to adv_segment_query() as is.

    Raise Diagnostic (sru, 27) for an empty query and Diagnostic
    (sru, 10) for an empty or invalid range.
    """
    if len(query) <= 0:
        raise Diagnostic(DiagnosticTypes.sru, 27)
    if end == -1:
        # Default value: process the query up to its end.
        end = len(query)
    if start >= len(query) - 1 or end <= 0:
        raise Diagnostic(DiagnosticTypes.sru, 10)
    while start < len(query) and query[start] in ' \t\n':
        start += 1
    while end > 0 and query[end - 1] in ' \t\n':
        end -= 1
    if start >= end:
        raise Diagnostic(DiagnosticTypes.sru, 10)
    # end is exclusive, so the last character of the part is query[end - 1].
    first, last = query[start], query[end - 1]
    if first == '(' and last == ')':
        return self.adv_main_query(query, config, start=start+1, end=end-1)
    if (end - start >= 2
            and first == last
            and (first == '"' or first == "'")
            and query[end - 2] != '\\'):    # closing quote must not be escaped
        return self.adv_segment_query('[text=' + query[start:end] + ']', config)
    return self.adv_segment_query(query[start:end], config)
def adv_main_query(self, query: str, config: ResourceConfig, start=0, end=-1): def adv_main_query(self, query: str, config: ResourceConfig, start=0, end=-1):
if len(query) <= 0: if len(query) <= 0:
raise Diagnostic(DiagnosticTypes.sru, 27) raise Diagnostic(DiagnosticTypes.sru, 27)
......
...@@ -84,6 +84,29 @@ class TsakorpusQueryParser(QueryParser): ...@@ -84,6 +84,29 @@ class TsakorpusQueryParser(QueryParser):
# TODO: implement # TODO: implement
raise NotImplementedError() raise NotImplementedError()
def adv_term_query_proper(self, identifier: str, op: str, value: str, flags: str, config: ResourceConfig):
    """
    Return list of query parameters for one term in an advanced query.

    Each parameter is a list [field name, 0, value], where the field
    name is "wf" for text, "lex" for lemma and "gr" for pos. A pos value
    found in config.pos_convert_reverse is replaced by its mapped value.

    Raise Diagnostic (sru, 10) if the value is empty, if flags is
    anything other than '', 'i' or 'c', or if the identifier is not one
    of text, lemma and pos.
    """
    # NOTE(review): op is not looked at here, so "!=" is currently
    # translated exactly like "=" -- confirm whether that is intended.
    if not value:
        raise Diagnostic(DiagnosticTypes.sru, 10)
    if flags not in ('', 'i', 'c'):
        raise Diagnostic(DiagnosticTypes.sru, 10, message='Tsakorpus does not support regex flags.')
    getParams = []
    if identifier == 'text':
        getParams.append(['wf', 0, value])
    elif identifier == 'lemma':
        getParams.append(['lex', 0, value])
    elif identifier == 'pos':
        if value in config.pos_convert_reverse:
            # UD to corpus-specific POS tags
            value = config.pos_convert_reverse[value]
        getParams.append(['gr', 0, value])
    else:
        raise Diagnostic(DiagnosticTypes.sru, 10,
                         message='The identifier ' + identifier + ' is not supported in Tsakorpus.')
    return getParams
def send_query(self, strGetParams: str, config: ResourceConfig): def send_query(self, strGetParams: str, config: ResourceConfig):
""" """
Send the translated query to the Tsakorpus instance. Return JSON results Send the translated query to the Tsakorpus instance. Return JSON results
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment