Skip to content
Snippets Groups Projects
Commit 5bc4ddfe authored by Arkhangelskiy, Timofey
Browse files

Start translating FCS-QL queries into Tsakorpus GET queries

parent 177d5028
No related branches found
No related tags found
No related merge requests found
...@@ -16,9 +16,10 @@ class ResourceConfig: ...@@ -16,9 +16,10 @@ class ResourceConfig:
for one of the resources this endpoint communicates with. for one of the resources this endpoint communicates with.
""" """
rxExt = re.compile('\\.[^.]*$')
def __init__(self, fnameConfig=None): def __init__(self, fnameConfig=None):
self.platform = CorpPlatform.annis self.platform = CorpPlatform.annis
self.corpus_id = ''
self.transport_protocol = 'https' self.transport_protocol = 'https'
self.host = '127.0.0.1' self.host = '127.0.0.1'
self.port = '5000' self.port = '5000'
...@@ -168,3 +169,20 @@ class ResourceConfig: ...@@ -168,3 +169,20 @@ class ResourceConfig:
dictConfig = self.processed_gui_config(data) dictConfig = self.processed_gui_config(data)
with open(fnameOut, 'w', encoding='utf-8') as fOut: with open(fnameOut, 'w', encoding='utf-8') as fOut:
json.dump(dictConfig, fOut, sort_keys=True, ensure_ascii=False, indent=2) json.dump(dictConfig, fOut, sort_keys=True, ensure_ascii=False, indent=2)
def read_configs(configDir='./config'):
    """
    Build a ResourceConfig object for every JSON file found in the
    configuration directory (one file describes one resource).

    Entries whose names do not end in ".json" (compared
    case-insensitively) are ignored. Return a dictionary that maps each
    file name, with its extension removed, to the ResourceConfig
    initialized from that file.
    """
    jsonNames = (name for name in os.listdir(configDir)
                 if name.lower().endswith('.json'))
    return {
        ResourceConfig.rxExt.sub('', name): ResourceConfig(os.path.join(configDir, name))
        for name in jsonNames
    }
...@@ -12,7 +12,11 @@ class Diagnostic: ...@@ -12,7 +12,11 @@ class Diagnostic:
""" """
fatalFCSDiagnostics = {3, 10, 11} # FCS specifications, 4.2 fatalFCSDiagnostics = {3, 10, 11} # FCS specifications, 4.2
fatalSRUDiagnostics = {8, 10} # A subset actually used by this endpoint fatalSRUDiagnostics = {8, 10, 27, 235} # A subset actually used by this endpoint
stdMessages = {
(DiagnosticTypes.sru, 10): 'Something is wrong with the query syntax.'
}
def __init__(self, diagType: DiagnosticTypes, diagID: int, def __init__(self, diagType: DiagnosticTypes, diagID: int,
details: str = '', details: str = '',
...@@ -24,6 +28,8 @@ class Diagnostic: ...@@ -24,6 +28,8 @@ class Diagnostic:
self.diagID = diagID self.diagID = diagID
self.details = details self.details = details
self.message = message self.message = message
if len(self.message) <= 0 and (diagType, diagID) in self.stdMessages:
self.message = self.stdMessages[(diagType, diagID)]
self.templateLoader = jinja2.FileSystemLoader(searchpath="./static") self.templateLoader = jinja2.FileSystemLoader(searchpath="./static")
self.templateEnv = jinja2.Environment(loader=self.templateLoader) self.templateEnv = jinja2.Environment(loader=self.templateLoader)
......
from .enums import * from .enums import *
from .diagnostics import Diagnostic from .diagnostics import Diagnostic
import re
class QueryParser: class QueryParser:
...@@ -7,9 +8,39 @@ class QueryParser: ...@@ -7,9 +8,39 @@ class QueryParser:
This class contains commonly used methods for initial parsing of a GET This class contains commonly used methods for initial parsing of a GET
query. It does not include platform-specific methods. query. It does not include platform-specific methods.
""" """
rxTermQuery = re.compile('^(?:(?:[^ "]|\\\\")*|"(?:[^"]|\\\\")*")$')
def __init__(self): def __init__(self):
pass pass
@staticmethod
def find_operator(strQuery, start=0, end=-1):
    """
    Find the first boolean operator in strQuery[start:end] that is not
    enclosed in double quotes or parentheses.

    A leading NOT (optionally preceded by whitespace) takes precedence;
    otherwise the window is scanned left to right for AND / OR at
    parenthesis depth zero. An operator is only recognised when it is a
    separate token, i.e. not glued to a letter, digit or underscore on
    either side, so words such as "ORANGE" or "NOTHING" are not split.

    :param strQuery: the full query string.
    :param start: index of the first character to look at.
    :param end: index one past the last character to look at;
                -1 means "until the end of the string".
    :return: tuple (position, operator), or (-1, '') if no operator
             was found.
    """
    if end == -1:
        end = len(strQuery)
    end = min(end, len(strQuery))

    def is_word_char(ch):
        return ch.isalnum() or ch == '_'

    def operator_at(pos, op):
        # The operator must fit into the window and be delimited by
        # non-word characters (or the window edges) on both sides.
        stop = pos + len(op)
        if stop > end or strQuery[pos:stop] != op:
            return False
        if pos > start and is_word_char(strQuery[pos - 1]):
            return False
        return stop >= end or not is_word_char(strQuery[stop])

    # Skip leading whitespace so that " NOT x" is still seen as a negation.
    first = start
    while first < end and strQuery[first].isspace():
        first += 1
    if operator_at(first, 'NOT'):
        return first, 'NOT'

    parenthBalance = 0
    inQuotes = False
    for i in range(first, end):
        ch = strQuery[i]
        if ch == '"':
            # Every double quote toggles the quoted state.
            inQuotes = not inQuotes
        elif inQuotes:
            continue
        elif ch == '(':
            parenthBalance += 1
        elif ch == ')':
            parenthBalance -= 1
        elif parenthBalance == 0:
            if operator_at(i, 'AND'):
                return i, 'AND'
            if operator_at(i, 'OR'):
                return i, 'OR'
    return -1, ''
def validate_query(self, operation, version, queryType, query, def validate_query(self, operation, version, queryType, query,
xFcsEndpointDescription, xFcsContext, xFcsEndpointDescription, xFcsContext,
xFcsDataviews, xFcsRewritesAllowed): xFcsDataviews, xFcsRewritesAllowed):
......
import re
from .query_parser import QueryParser
from .config import ResourceConfig
from .diagnostics import Diagnostic, DiagnosticTypes
class TsakorpusQueryParser(QueryParser):
    """
    Parses search queries for Tsakorpus-based corpora.
    """

    def term_query(self, query, config):
        """
        Return list of query parameters for one term or sequence of terms.

        Surrounding whitespace and one pair of enclosing double quotes are
        removed first. Each whitespace-separated term number i yields
        ['w', i, term]; starting from the second term, three additional
        entries ('word_rel_', 'word_dist_from_', 'word_dist_to_') tie it
        to the preceding term with the value '1'.

        If the query contains no terms, a Diagnostic (SRU, 10) is returned
        instead of a list. The config argument is currently not used.
        """
        query = query.strip()
        if len(query) >= 2 and query.startswith('"') and query.endswith('"'):
            query = query[1:-1]
        terms = query.split()
        if not terms:
            return Diagnostic(DiagnosticTypes.sru, 10)
        getParams = []
        for iTerm, term in enumerate(terms, start=1):
            getParams.append(['w', iTerm, term])
            if iTerm >= 2:
                getParams.append(['word_rel_', [iTerm-1, iTerm], '1'])
                getParams.append(['word_dist_from_', [iTerm-1, iTerm], '1'])
                getParams.append(['word_dist_to_', [iTerm-1, iTerm], '1'])
        return getParams

    def translate_fcsql(self, query: str, config: ResourceConfig, basicSearch: bool = False, start=0, end=-1):
        """
        Translate an FCS-QL query into a Tsakorpus GET query.
        If something is wrong with the query, return a diagnostic.
        The function is recursive and only looks at the part of the string
        delimited by start and end parameters (end is exclusive; -1 means
        "until the end of the string").

        basicSearch is only handed down to the recursive calls.
        """
        if end == -1:
            end = len(query)
        if len(query) <= 0 or end == 0:
            return Diagnostic(DiagnosticTypes.sru, 27, message='The query should not be empty.')
        # Shortcut for a query that is a single (possibly quoted) term.
        # It is only valid when the window covers the whole string,
        # because the pattern is anchored to the full query.
        if start == 0 and end == len(query) and self.rxTermQuery.search(query) is not None:
            return self.term_query(query, config)
        # Ignore whitespace around the operand, so that e.g. the left part
        # of '(a) AND (b)' is '(a)' rather than '(a) '.
        while start < end and query[start].isspace():
            start += 1
        while end > start and query[end - 1].isspace():
            end -= 1
        if start >= end:
            return Diagnostic(DiagnosticTypes.sru, 10)
        iOpPos, strOp = self.find_operator(query, start, end)
        if iOpPos == -1:
            # NOTE(review): this does not verify that the first '(' is
            # closed by the last ')', e.g. in '(a) (b)' -- confirm whether
            # such input has to be supported.
            if query[start] == '(' and query[end - 1] == ')':
                return self.translate_fcsql(query, config, basicSearch=basicSearch, start=start + 1, end=end - 1)
            return self.term_query(query[start:end], config)
        # NOTE(review): binary_bool and not_bool are not defined in the
        # visible part of this class -- confirm they exist elsewhere.
        if strOp in ('AND', 'OR'):
            resultLeft = self.translate_fcsql(query, config, basicSearch=basicSearch, start=start, end=iOpPos)
            resultRight = self.translate_fcsql(query, config, basicSearch=basicSearch,
                                               start=iOpPos + len(strOp), end=end)
            if (isinstance(resultRight, Diagnostic) or isinstance(resultLeft, Diagnostic)
                    or len(resultLeft) <= 0 or len(resultRight) <= 0):
                return Diagnostic(DiagnosticTypes.sru, 10)
            return self.binary_bool(strOp, resultLeft, resultRight, config)
        elif strOp == 'NOT':
            resultRight = self.translate_fcsql(query, config, basicSearch=basicSearch,
                                               start=iOpPos + len(strOp), end=end)
            # A broken or empty operand is reported the same way as in
            # the binary case instead of being passed on to not_bool.
            if isinstance(resultRight, Diagnostic) or len(resultRight) <= 0:
                return Diagnostic(DiagnosticTypes.sru, 10)
            return self.not_bool(resultRight)
        # Fallback for an operator name this method does not handle.
        return {}
# Running this module directly does nothing.
if __name__ == '__main__':
    pass
{ {
"host": "0.0.0.0", "host": "0.0.0.0",
"port": "80", "port": "80",
"max_hits": 20 "max_hits": 20,
"platform": "tsakorpus"
} }
\ No newline at end of file
from fastapi import FastAPI, Request, Query from fastapi import FastAPI, Request, Query, Response
from fastapi.staticfiles import StaticFiles from fastapi.staticfiles import StaticFiles
from fastapi.templating import Jinja2Templates from fastapi.templating import Jinja2Templates
from fastapi.encoders import jsonable_encoder from fastapi.encoders import jsonable_encoder
from fastapi.responses import JSONResponse from fastapi.responses import JSONResponse
from common.query_parser import QueryParser from common.query_parser import QueryParser
from common.tsakorpus_query_parser import TsakorpusQueryParser
from common.enums import * from common.enums import *
from common.diagnostics import Diagnostic from common.diagnostics import Diagnostic
from common.config import ResourceConfig from common.config import ResourceConfig, read_configs
import json import json
import os import os
import re import re
import uvicorn import uvicorn
rxExt = re.compile('\\.[^.]*$')
app = FastAPI() app = FastAPI()
app.mount('/static', StaticFiles(directory='static'), name='static') app.mount('/static', StaticFiles(directory='static'), name='static')
templates = Jinja2Templates(directory='static') templates = Jinja2Templates(directory='static')
app.qp = QueryParser() app.qp = QueryParser()
app.configs = {} app.qp_tsakorpus = TsakorpusQueryParser()
i = 0 app.configs = read_configs()
for fname in os.listdir('config'):
if not fname.lower().endswith('.json'):
continue
i += 1
fnameFull = os.path.join('config', fname)
fnameNoExt = rxExt.sub('', fname)
app.configs[fnameNoExt] = ResourceConfig(fnameFull)
@app.get('/') @app.get('/')
...@@ -35,9 +27,8 @@ def root(): ...@@ -35,9 +27,8 @@ def root():
return {"message": "Hello World"} return {"message": "Hello World"}
@app.get('/fcs-endpoint/{platform}/{corpusID}') @app.get('/fcs-endpoint/{corpusID}')
def endpoint( def endpoint(
platform: CorpPlatform,
corpusID: str, corpusID: str,
operation: Operation = Operation.explain, operation: Operation = Operation.explain,
version: SRUVersion = SRUVersion.v2_0, version: SRUVersion = SRUVersion.v2_0,
...@@ -60,19 +51,27 @@ def endpoint( ...@@ -60,19 +51,27 @@ def endpoint(
alias='x-fcs-rewrites-allowed' alias='x-fcs-rewrites-allowed'
) )
): ):
if corpusID not in app.configs:
message = 'No corpus with this ID (' + corpusID +') is served by this Endpoint. ' \
'Valid corpus IDs are: ' + '; '.join(cID for cID in sorted(app.configs)) + '.'
diagBody = str(Diagnostic(DiagnosticTypes.sru, 235,
message=message)) # "Database does not exist"
return Response(content=diagBody, media_type='application/xml')
config = app.configs[corpusID]
diagnostics = app.qp.validate_query(operation, version, queryType, query, diagnostics = app.qp.validate_query(operation, version, queryType, query,
xFcsEndpointDescription, xFcsContext, xFcsEndpointDescription, xFcsContext,
xFcsDataviews, xFcsRewritesAllowed) xFcsDataviews, xFcsRewritesAllowed)
if any(d.is_fatal() for d in diagnostics): if any(d.is_fatal() for d in diagnostics):
return '\n'.join(str(d) for d in diagnostics) return '\n'.join(str(d) for d in diagnostics)
if platform == CorpPlatform.annis: if operation == Operation.searchRetrieve:
return {'platform': 'annis', 'operation': operation, 'version': version} if config.platform == CorpPlatform.tsakorpus:
elif platform == CorpPlatform.litterae: res = app.qp_tsakorpus.translate_fcsql(query, config)
return {'platform': 'litterae', 'operation': operation, 'version': version} if type(res) == Diagnostic:
elif platform == CorpPlatform.tsakorpus: return Response(content=str(res), media_type='application/xml')
return {'platform': 'tsakorpus', 'operation': operation, 'version': version} return str(res)
return
return {'operation': operation, 'version': version}
if __name__ == '__main__': if __name__ == '__main__':
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment