Skip to content
Snippets Groups Projects
Commit 717f43b9 authored by Timofey Arkhangelskiy
Browse files

Parts of advanced queries now work

parent fb7fc800
Branches
No related tags found
No related merge requests found
......@@ -14,11 +14,14 @@ class LitteraeQueryParser(QueryParser):
Parses search queries for Formulae, Litterae, Chartae.
"""
def build_get_string(self, getParams, config):
def build_get_string(self, getParams, config, withinClause=''):
"""
Build a GET string (everything after the ?) from a description
of the GET parameters in the getParams list.
"""
if len(withinClause) > 0 and withinClause not in ('text', 'session'):
raise Diagnostic(DiagnosticTypes.sru, 48, message='FLC web interface only supports multi-word search within'
'a text.')
termIndexes = self.term_indexes(getParams)
nWords = len(termIndexes)
boolOperatorMentioned = False
......
......@@ -18,8 +18,8 @@ class QueryParser:
't|turn|text|session) *$')
rxNonemptyQueryPart = re.compile('[^ \t\r\n]')
rxSegmentQuery = re.compile('^\\[(.*)\\](\\{[0-9,]\\}|[?*+]|)$')
rxAdvTermQuery = re.compile('^ *([a-zA-Z][a-zA-Z0-9\-]*(?::[a-zA-Z][a-zA-Z0-9\-]*)?) * '
'(!?=) *(["\'](.*)["\']) *(/[iIcCld])? *$')
rxAdvTermQuery = re.compile('^ *([a-zA-Z][a-zA-Z0-9\\-]*(?::[a-zA-Z][a-zA-Z0-9\\-]*)?) *'
'(!?=) *(["\'].*["\']) *(/[iIcCld])? *$')
acceptableIdentifiers = {'text', 'lemma', 'pos', 'orth', 'norm', 'phonetic'}
def __init__(self):
......@@ -38,10 +38,10 @@ class QueryParser:
inQuotes = False
for i in range(start, end):
if inQuotes:
if strQuery[i] == '"' and i > 0 and strQuery[i-1] != '\\':
if strQuery[i] == '"' and (i <= 0 or strQuery[i-1] != '\\'):
inQuotes = False
continue
if strQuery[i] == '"' and i > 0 and strQuery[i-1] != '\\':
if strQuery[i] == '"' and (i <= 0 or strQuery[i-1] != '\\'):
inQuotes = True
continue
if strQuery[i] == '(':
......@@ -70,17 +70,17 @@ class QueryParser:
inSingleQuotes = False
for i in range(start, end):
if inQuotes:
if strQuery[i] == '"' and i > 0 and strQuery[i-1] != '\\':
if strQuery[i] == '"' and (i <= 0 or strQuery[i-1] != '\\'):
inQuotes = False
continue
if strQuery[i] == '"' and i > 0 and strQuery[i-1] != '\\':
if strQuery[i] == '"' and (i <= 0 or strQuery[i-1] != '\\'):
inQuotes = True
continue
if inSingleQuotes:
if strQuery[i] == "'" and i > 0 and strQuery[i-1] != '\\':
if strQuery[i] == "'" and (i <= 0 or strQuery[i-1] != '\\'):
inSingleQuotes = False
continue
if strQuery[i] == "'" and i > 0 and strQuery[i-1] != '\\':
if strQuery[i] == "'" and (i <= 0 or strQuery[i-1] != '\\'):
inSingleQuotes = True
continue
if strQuery[i] == '(':
......@@ -183,7 +183,7 @@ class QueryParser:
terms.add(t)
return [t for t in sorted(terms)]
def build_get_string(self, getParams, config):
def build_get_string(self, getParams, config, withinClause=''):
# Abstract function
raise NotImplementedError()
......@@ -204,10 +204,12 @@ class QueryParser:
raise NotImplementedError()
def adv_term_query(self, query, config: ResourceConfig):
print('ADVANCED TERM QUERY', query)
m = self.rxAdvTermQuery.search(query)
if m is None:
raise Diagnostic(DiagnosticTypes.sru, 10)
identifier, op, value, flags = m.group(1), m.group(2), m.group(3), m.group(4)
print('ADVANCED TERM QUERY', identifier, op, value, flags)
if value[0] != value[-1]:
raise Diagnostic(DiagnosticTypes.sru, 10) # Different quotes
value = value[1:len(value)-1] # Remove quotes
......@@ -279,10 +281,12 @@ class QueryParser:
resultRight = self.translate_simple(query, config, start=iOpPos + len(strOp),
end=end)
return self.not_bool(resultRight, config)
return {}
raise Diagnostic(DiagnosticTypes.sru, 10)
def adv_expression_query(self, query: str, config: ResourceConfig):
query = query.strip()
iOpPos, strOp = self.find_operator_adv_expression(query)
print('ADVANED EXPRESSION QUERY', iOpPos, strOp)
if iOpPos == -1:
if query[0] == '(' and query[-1] == ')':
return self.adv_expression_query(query[1:len(query)-1], config)
......@@ -297,8 +301,10 @@ class QueryParser:
elif strOp == '!':
resultRight = self.adv_expression_query(query[iOpPos+1:], config)
return self.not_bool(resultRight, config)
raise Diagnostic(DiagnosticTypes.sru, 10)
def adv_segment_query(self, query: str, config: ResourceConfig):
print('ADVANCED SEGMENT QUERY', query)
m = self.rxSegmentQuery.search(query)
if m is None:
raise Diagnostic(DiagnosticTypes.sru, 27)
......@@ -321,12 +327,11 @@ class QueryParser:
if query[start] == '(' and query[end] == ')':
return self.adv_main_query(query, config, start=start+1, end=end-1)
if (query[end - 1] != '\\'
and ((query[start] == '"' and query[end] == '"')
or (query[start] == "'" and query[end] == "'"))):
and ((query[start] == '"' and query[end - 1] == '"')
or (query[start] == "'" and query[end - 1] == "'"))):
return self.adv_segment_query('[text=' + query[start:end] + ']', config)
return self.adv_segment_query(query[start:end], config)
def adv_main_query(self, query: str, config: ResourceConfig, start=0, end=-1):
if len(query) <= 0:
raise Diagnostic(DiagnosticTypes.sru, 27)
......@@ -339,6 +344,7 @@ class QueryParser:
if start >= end:
raise Diagnostic(DiagnosticTypes.sru, 10)
iOpPos, strOp = self.find_operator_adv(query, start, end)
print('ADVANCED QUERY', iOpPos, strOp)
if iOpPos == -1:
return self.adv_simple_query(query, config, start=start, end=end)
resultLeft = self.adv_simple_query(query, config, start=start, end=iOpPos)
......@@ -365,6 +371,7 @@ class QueryParser:
in the current query and then calling a respective lower-level
function, which may be platform-specific.
"""
print('ADVANCED QUERY', query)
withinClause = ''
end = len(query)
m = self.rxWithinClause.search(query)
......@@ -382,8 +389,8 @@ class QueryParser:
end = len(query)
if end == 0:
raise Diagnostic(DiagnosticTypes.sru, 27)
return self.adv_main_query(query, config, start=0, end=end), withinClause
return self.build_get_string(self.adv_main_query(query, config, start=0, end=end), config,
withinClause=withinClause)
def validate_query(self, operation, version, queryType, query,
xFcsEndpointDescription, xFcsContext,
......
......@@ -14,11 +14,15 @@ class TsakorpusQueryParser(QueryParser):
rxTsakorpusBool = re.compile('[()|,]')
def build_get_string(self, getParams, config):
def build_get_string(self, getParams, config: ResourceConfig, withinClause=''):
"""
Build a GET string (everything after the ?) from a description
of the GET parameters in the getParams list.
"""
if len(withinClause) > 0 and withinClause not in ('sentence', 'utterance', 'paragraph'):
raise Diagnostic(DiagnosticTypes.sru, 48, message='Tsakorpus only supports multi-word search within'
'one segment that normally equals one sentence / '
'utterance / turn.')
termIndexes = self.term_indexes(getParams)
nWords = len(termIndexes)
s = 'n_words=' + str(nWords)
......@@ -34,14 +38,14 @@ class TsakorpusQueryParser(QueryParser):
s += '&precise=on&sort=random&response_format=json&distance_strict=on'
return s
def term_query(self, query, config):
def term_query(self, query: str, config: ResourceConfig):
"""
Return list of query parameters for one term or sequence of terms.
"""
if len(query) >= 2 and query.startswith('"') and query.endswith('"'):
query = query[1:len(query)-1]
if len(query) <= 0:
return Diagnostic(DiagnosticTypes.sru, 10)
raise Diagnostic(DiagnosticTypes.sru, 10)
getParams = []
iTerm = 0
for term in query.split(' '):
......@@ -55,7 +59,7 @@ class TsakorpusQueryParser(QueryParser):
getParams.append(['word_dist_to_', iTerm, '1'])
return getParams
def binary_bool(self, strOp, operandL, operandR, config):
def binary_bool(self, strOp: str, operandL, operandR, config):
if len(operandL) <= 0 or len(operandR) <= 0:
raise Diagnostic(DiagnosticTypes.sru, 10)
termsL = self.term_indexes(operandL)
......@@ -89,31 +93,71 @@ class TsakorpusQueryParser(QueryParser):
"""
Return list of query parameters for one term in an advanced query.
"""
flags = flags.strip('/')
if len(value) <= 0:
return Diagnostic(DiagnosticTypes.sru, 10)
raise Diagnostic(DiagnosticTypes.sru, 10)
if flags not in ('', 'i', 'c'):
return Diagnostic(DiagnosticTypes.sru, 10, message='Tsakorpus does not support regex flags.')
raise Diagnostic(DiagnosticTypes.sru, 48, message='Tsakorpus does not support regex flags.')
if op not in ('=', '!='):
raise Diagnostic(DiagnosticTypes.sru, 10,
message='In token queries, only = and != are allowed as operators.')
if op == '!=':
if identifier != 'pos':
value = '~' + value
else:
value = '~(' + value + ')'
getParams = []
if identifier == 'text':
getParams.append(['wf', 0, value])
getParams.append(['wf', 1, value])
elif identifier == 'lemma':
getParams.append(['lex', 0, value])
getParams.append(['lex', 1, value])
elif identifier == 'pos':
if value in config.pos_convert_reverse:
# UD to corpus-specific POS tags
value = config.pos_convert_reverse[value]
getParams.append(['gr', 0, value])
getParams.append(['gr', 1, value])
else:
return Diagnostic(DiagnosticTypes.sru, 10,
raise Diagnostic(DiagnosticTypes.sru, 10,
message='The identifier ' + identifier + ' is not supported in Tsakoprus.')
return getParams
def adv_binary_bool(self, strOp: str, operandL, operandR, config: 'ResourceConfig'):
    """
    Join two sets of constraints on one word in an advanced query.

    Each operand is a list of three-item parameter descriptions
    (layer name, index, value), e.g. ['gr', 1, 'N']. Constraints
    that refer to the same layer in both operands are merged into
    one parameter whose value is '(left)OP(right)'; constraints
    whose layer occurs in only one operand are copied unchanged.

    :param strOp: Boolean operator; '&' is rendered as ',' in the
        resulting value, '|' is kept as is.
    :param operandL: parameter list for the left operand.
    :param operandR: parameter list for the right operand.
    :param config: resource configuration (not used here).
    :return: new list of parameters: first those derived from
        operandR (merged or copied), then the operandL parameters
        whose layer does not occur in operandR.
    :raises Diagnostic: (SRU 48) for a conjunction of two constraints
        on the same layer other than 'gr', or for a disjunction whose
        operands do not constrain exactly the same layers.
    """
    import logging

    # Debug trace goes to the logger rather than to stdout.
    logging.getLogger(__name__).debug(
        'ADVANCED INTERNAL BOOL %s %s %s', strOp, operandL, operandR)
    if strOp == '&':
        strOp = ','
    layersL = {paramL[0] for paramL in operandL}
    layersR = {paramR[0] for paramR in operandR}
    # A layer present on only one side would be emitted as an extra
    # stand-alone constraint, which would silently change the meaning
    # of a disjunction. Reject it regardless of which side has the
    # additional layer.
    if strOp == '|' and layersL != layersR:
        raise Diagnostic(DiagnosticTypes.sru, 48,
                         message='Tsakorpus does not support disjunctions '
                                 'of constraints for multiple layers '
                                 'within the same word.')
    getParams = []
    for paramR in operandR:
        if paramR[0] not in layersL:
            # Layer is constrained only on the right: copy it as is.
            getParams.append(paramR[:])
            continue
        if strOp == ',' and paramR[0] != 'gr':
            raise Diagnostic(DiagnosticTypes.sru, 48,
                             message='Tsakorpus endpoint does not support conjunctions '
                                     'of multiple constraints for the same layer '
                                     'within the same word.')
        for paramL in operandL:
            if paramL[0] == paramR[0]:
                getParams.append([paramL[0], paramL[1],
                                  '(' + paramL[2] + ')' + strOp + '(' + paramR[2] + ')'])
    # Layers constrained only on the left are carried over unchanged.
    getParams.extend(paramL[:] for paramL in operandL if paramL[0] not in layersR)
    return getParams
def send_query(self, strGetParams: str, config: ResourceConfig):
"""
Send the translated query to the Tsakorpus instance. Return JSON results
returned by the corpus.
"""
url = config.resource_base_url.strip('/') + '/search_sent?' + strGetParams
print(url)
response = urllib.request.urlopen(url)
data = response.read()
encoding = response.info().get_content_charset('utf-8')
......
......@@ -21,6 +21,7 @@ class TsakorpusResponseParser:
Parse HTML annotation for one word taken from a hit.
Add the data to the layers in the record object.
"""
print(anno)
annoTree = fragment_fromstring(anno,
create_parent='div')
lemmas = set()
......@@ -35,13 +36,17 @@ class TsakorpusResponseParser:
# This should not happen, but just in case
word += '|'
word += node.text
lexNodes = annoTree.xpath('div[@class="popup_word"]/div[@class="popup_ana"]/span[@class="popup_lex"]')
lexNodes = annoTree.xpath('div[@class="popup_word"]/'
'div[contains(@class, \'popup_ana\')]/'
'span[@class="popup_lex"]')
for node in lexNodes:
if node.text is not None:
lemmas.add(node.text)
if len(lemmas) > 0:
lemmasStr = '|'.join(l for l in sorted(lemmas))
posNodes = annoTree.xpath('div[@class="popup_word"]/div[@class="popup_ana"]/span[@class="popup_pos"]')
posNodes = annoTree.xpath('div[@class="popup_word"]/'
'div[contains(@class, \'popup_ana\')]/'
'span[@class="popup_pos"]')
for node in posNodes:
if node.text is not None:
posText = re.sub('&nbsp;|[ \t\ufeff]+', '', node.text)
......@@ -71,7 +76,6 @@ class TsakorpusResponseParser:
'value': lemmasStr
})
def parse_span(self, el, record, advancedHits=False):
"""
Parse one <span> element from the HTML representation
......@@ -123,7 +127,7 @@ class TsakorpusResponseParser:
or 'text' not in hit['languages'][lang]):
return record
contentTxt = re.sub('[\r\n\t\ufeff]+', '', hit['languages'][lang]['text'], flags=re.DOTALL)
print(contentTxt)
# print(contentTxt)
content = fragment_fromstring(contentTxt,
create_parent='div')
for el in content:
......
......@@ -84,7 +84,7 @@ def process_search_retrieve(version: SRUVersion,
strGetParams = app.qp_tsakorpus.translate_simple(query, config)
else:
strGetParams = app.qp_tsakorpus.translate_advanced(query, config)
# print(strGetParams)
print(strGetParams)
res = app.qp_tsakorpus.send_query(strGetParams, config)
except Diagnostic as diag:
return fatal_response(Operation.searchRetrieve, version, diagnostics + [diag], request, templates)
......@@ -146,6 +146,7 @@ def process_request(operation: Operation,
:param diagnostics: List of diagnostics produced by the validation
function.
"""
print(query)
# If something is clearly wrong with the query, return
# a response with the list of diagnostics
if config is None or any(d.is_fatal() for d in diagnostics):
......
......@@ -8,3 +8,8 @@ http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=ke%20AND%
http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=ke%20no Tsakorpus -- Simple search with multiple words ("ke no", 18 hits)
http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=ke%20no Tsakorpus -- Simple search with multiple words ("ke no", 18 hits)
http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=ke%20AND%20no&x-fcs-dataviews=adv Tsakorpus -- Simple search with boolean operator, advanced view ("ke" AND "no", 103 hits)
http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=%22ka%22&x-fcs-dataviews=adv&queryType=fcs Tsakorpus -- Advanced search, term only ("ka", 112 hits)
http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=[text=%22mon%22]&x-fcs-dataviews=adv&queryType=fcs Tsakorpus -- Advanced search, term only / word ("mon", 1465 hits)
http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=[lemma=%22mon%22]&x-fcs-dataviews=adv&queryType=fcs Tsakorpus -- Advanced search, term only / lemma ("mon", 2284 hits)
http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=[pos=%22NOUN%22]&x-fcs-dataviews=adv&queryType=fcs Tsakorpus -- Advanced search, term only / POS (NOUN, 22639 hits)
http://127.0.0.1:5000/fcs-endpoint/test?operation=searchRetrieve&query=[lemma%3D%22m.*%22%20%26%20(pos=%22NOUN%22|pos=%22VERB%22)]&x-fcs-dataviews=adv&queryType=fcs Tsakorpus -- Advanced search, Boolean operators inside term query (words that start with "m" and are either nouns or verbs, 3878 hits)
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment