From 5bc4ddfe63ae87a0acd55bd8d0aca0458825b756 Mon Sep 17 00:00:00 2001 From: Timofey Arkhangelskiy <timofey.arkhangelskiy@uni-hamburg.de> Date: Sat, 10 Dec 2022 23:20:00 +0100 Subject: [PATCH] Start translating FCS-QL queries into Tsakorpus GET queries --- common/config.py | 20 ++++++++- common/diagnostics.py | 8 +++- common/query_parser.py | 31 ++++++++++++++ common/tsakorpus_query_parser.py | 73 ++++++++++++++++++++++++++++++++ config/test.json | 3 +- main.py | 43 +++++++++---------- 6 files changed, 153 insertions(+), 25 deletions(-) create mode 100644 common/tsakorpus_query_parser.py diff --git a/common/config.py b/common/config.py index 370b108..688a145 100644 --- a/common/config.py +++ b/common/config.py @@ -16,9 +16,10 @@ class ResourceConfig: for one of the resources this endpoint communicates with. """ + rxExt = re.compile('\\.[^.]*$') + def __init__(self, fnameConfig=None): self.platform = CorpPlatform.annis - self.corpus_id = '' self.transport_protocol = 'https' self.host = '127.0.0.1' self.port = '5000' @@ -168,3 +169,20 @@ class ResourceConfig: dictConfig = self.processed_gui_config(data) with open(fnameOut, 'w', encoding='utf-8') as fOut: json.dump(dictConfig, fOut, sort_keys=True, ensure_ascii=False, indent=2) + + +def read_configs(configDir='./config'): + """ + Load all configuration files from the configuration directory + (one file per resource). Initialize ResourceConfig instances for + each of them. Return a dictionary where the keys are filenames and + the values contain respective ResourceConfig objects. + """ + configs = {} + for fname in os.listdir(configDir): + if not fname.lower().endswith('.json'): + continue + fnameFull = os.path.join(configDir, fname) + fnameNoExt = ResourceConfig.rxExt.sub('', fname) + configs[fnameNoExt] = ResourceConfig(fnameFull) + return configs diff --git a/common/diagnostics.py b/common/diagnostics.py index e26add3..a945933 100644 --- a/common/diagnostics.py +++ b/common/diagnostics.py @@ -12,7 +12,11 @@ class Diagnostic: """ fatalFCSDiagnostics = {3, 10, 11} # FCS specifications, 4.2 - fatalSRUDiagnostics = {8, 10} # A subset actually used by this endpoint + fatalSRUDiagnostics = {8, 10, 27, 235} # A subset actually used by this endpoint + + stdMessages = { + (DiagnosticTypes.sru, 10): 'Something is wrong with the query syntax.' + } def __init__(self, diagType: DiagnosticTypes, diagID: int, details: str = '', @@ -24,6 +28,8 @@ class Diagnostic: self.diagID = diagID self.details = details self.message = message + if len(self.message) <= 0 and (diagType, diagID) in self.stdMessages: + self.message = self.stdMessages[(diagType, diagID)] self.templateLoader = jinja2.FileSystemLoader(searchpath="./static") self.templateEnv = jinja2.Environment(loader=self.templateLoader) diff --git a/common/query_parser.py b/common/query_parser.py index 32497a4..17affdb 100644 --- a/common/query_parser.py +++ b/common/query_parser.py @@ -1,5 +1,6 @@ from .enums import * from .diagnostics import Diagnostic +import re class QueryParser: @@ -7,9 +8,39 @@ class QueryParser: This class contains commonly used methods for initial parsing of a GET query. It does not include platform-specific methods. """ + + rxTermQuery = re.compile('^(?:(?:[^ "]|\\\\")*|"(?:[^"]|\\\\")*")$') + def __init__(self): pass + @staticmethod + def find_operator(strQuery, start=0, end=-1): + if end == -1: + end = len(strQuery) - 1 + if strQuery[start:start+3] == 'NOT': + return start, 'NOT' + parenthBalance = 0 + inQuotes = False + for i in range(start, end): + if inQuotes: + if strQuery[i] == '"': + inQuotes = False + continue + if strQuery[i] == '"': + inQuotes = True + continue + if strQuery[i] == '(': + parenthBalance += 1 + elif strQuery[i] == ')': + parenthBalance -= 1 + elif parenthBalance == 0: + if strQuery[i:i+3] == 'AND': + return i, 'AND' + elif strQuery[i:i+2] == 'OR': + return i, 'OR' + return -1, '' + def validate_query(self, operation, version, queryType, query, xFcsEndpointDescription, xFcsContext, xFcsDataviews, xFcsRewritesAllowed): diff --git a/common/tsakorpus_query_parser.py b/common/tsakorpus_query_parser.py new file mode 100644 index 0000000..88624a3 --- /dev/null +++ b/common/tsakorpus_query_parser.py @@ -0,0 +1,73 @@ +import re +from .query_parser import QueryParser +from .config import ResourceConfig +from .diagnostics import Diagnostic, DiagnosticTypes + +class TsakorpusQueryParser(QueryParser): + """ + Parses search queries for Tsakorpus-based corpora. + """ + + def term_query(self, query, config): + """ + Return list of query parameters for one term or sequence of terms. + """ + if len(query) >= 2 and query.startswith('"') and query.endswith('"'): + query = query[1:len(query)-1] + if len(query) <= 0: + return Diagnostic(DiagnosticTypes.sru, 10) + getParams = [] + iTerm = 0 + for term in query.split(' '): + if len(term) > 0: + iTerm += 1 + getParams.append(['w', iTerm, term]) + if iTerm >= 2: + getParams.append(['word_rel_', [iTerm-1, iTerm], '1']) + getParams.append(['word_dist_from_', [iTerm-1, iTerm], '1']) + getParams.append(['word_dist_to_', [iTerm-1, iTerm], '1']) + return getParams + + + def translate_fcsql(self, query: str, config: ResourceConfig, basicSearch: bool = False, start=0, end=-1): + """ + Translate an FCS-QL query into a Tsakorpus GET query. + If something is wrong with the query, return a diagnostic. + The function is recursive and only looks at the part of the string + delimited by start and end parameters. + """ + if end == -1: + end = len(query) + if end == 0: + return Diagnostic(DiagnosticTypes.sru, 27, message='The query should not be empty.') + if self.rxTermQuery.search(query) is not None: + return self.term_query(query, config) + # if query.count('(') != query.count(')'): + # return None + if len(query) <= 0: + return Diagnostic(DiagnosticTypes.sru, 27, message='The query should not be empty.') + if start >= end: + return Diagnostic(DiagnosticTypes.sru, 10) + iOpPos, strOp = self.find_operator(query, start, end) + if iOpPos == -1: + if query[start] == '(' and query[end - 1] == ')': + return self.translate_fcsql(query, config, basicSearch=basicSearch, start=start + 1, end=end - 1) + else: + return self.term_query(query[start:end], config) + if strOp in ('AND', 'OR'): + resultLeft = self.translate_fcsql(query, config, basicSearch=basicSearch, start=start, end=iOpPos) + resultRight = self.translate_fcsql(query, config, basicSearch=basicSearch, start=iOpPos + len(strOp), end=end) + if (type(resultRight) is Diagnostic or type(resultLeft) is Diagnostic + or len(resultLeft) <= 0 or len(resultRight) <= 0): + return Diagnostic(DiagnosticTypes.sru, 10) + return self.binary_bool(strOp, resultLeft, resultRight, config) + elif strOp == 'NOT': + resultRight = self.translate_fcsql(query, config, basicSearch=basicSearch, start=iOpPos + len(strOp), + end=end) + return self.not_bool(resultRight) + return {} + + +if __name__ == '__main__': + pass + diff --git a/config/test.json b/config/test.json index 1b9ba57..31ad20d 100644 --- a/config/test.json +++ b/config/test.json @@ -1,5 +1,6 @@ { "host": "0.0.0.0", "port": "80", - "max_hits": 20 + "max_hits": 20, + "platform": "tsakorpus" } \ No newline at end of file diff --git a/main.py b/main.py index d4b3b29..099c5da 100644 --- a/main.py +++ b/main.py @@ -1,33 +1,25 @@ -from fastapi import FastAPI, Request, Query +from fastapi import FastAPI, Request, Query, Response from fastapi.staticfiles import StaticFiles from fastapi.templating import Jinja2Templates from fastapi.encoders import jsonable_encoder from fastapi.responses import JSONResponse from common.query_parser import QueryParser +from common.tsakorpus_query_parser import TsakorpusQueryParser from common.enums import * from common.diagnostics import Diagnostic -from common.config import ResourceConfig +from common.config import ResourceConfig, read_configs import json import os import re import uvicorn -rxExt = re.compile('\\.[^.]*$') - app = FastAPI() app.mount('/static', StaticFiles(directory='static'), name='static') templates = Jinja2Templates(directory='static') app.qp = QueryParser() -app.configs = {} -i = 0 -for fname in os.listdir('config'): - if not fname.lower().endswith('.json'): - continue - i += 1 - fnameFull = os.path.join('config', fname) - fnameNoExt = rxExt.sub('', fname) - app.configs[fnameNoExt] = ResourceConfig(fnameFull) +app.qp_tsakorpus = TsakorpusQueryParser() +app.configs = read_configs() @app.get('/') @@ -35,9 +27,8 @@ def root(): return {"message": "Hello World"} -@app.get('/fcs-endpoint/{platform}/{corpusID}') +@app.get('/fcs-endpoint/{corpusID}') def endpoint( - platform: CorpPlatform, corpusID: str, operation: Operation = Operation.explain, version: SRUVersion = SRUVersion.v2_0, @@ -60,19 +51,27 @@ def endpoint( alias='x-fcs-rewrites-allowed' ) ): + if corpusID not in app.configs: + message = 'No corpus with this ID (' + corpusID +') is served by this Endpoint. ' \ + 'Valid corpus IDs are: ' + '; '.join(cID for cID in sorted(app.configs)) + '.' + diagBody = str(Diagnostic(DiagnosticTypes.sru, 235, + message=message)) # "Database does not exist" + return Response(content=diagBody, media_type='application/xml') + config = app.configs[corpusID] diagnostics = app.qp.validate_query(operation, version, queryType, query, xFcsEndpointDescription, xFcsContext, xFcsDataviews, xFcsRewritesAllowed) if any(d.is_fatal() for d in diagnostics): return '\n'.join(str(d) for d in diagnostics) - if platform == CorpPlatform.annis: - return {'platform': 'annis', 'operation': operation, 'version': version} - elif platform == CorpPlatform.litterae: - return {'platform': 'litterae', 'operation': operation, 'version': version} - elif platform == CorpPlatform.tsakorpus: - return {'platform': 'tsakorpus', 'operation': operation, 'version': version} - return + if operation == Operation.searchRetrieve: + if config.platform == CorpPlatform.tsakorpus: + res = app.qp_tsakorpus.translate_fcsql(query, config) + if type(res) == Diagnostic: + return Response(content=str(res), media_type='application/xml') + return str(res) + + return {'operation': operation, 'version': version} if __name__ == '__main__': -- GitLab