From 4903a3e237984d2db3dc69c18940c7ff882461d1 Mon Sep 17 00:00:00 2001 From: Timofey Arkhangelskiy <timofey.arkhangelskiy@uni-hamburg.de> Date: Tue, 6 Jun 2023 16:59:35 +0200 Subject: [PATCH] Fix some problems with CLARIN endpoint tester --- common/annis_query_parser.py | 5 +- common/config.py | 15 ++- common/diagnostics.py | 19 +++- common/litterae_query_parser.py | 2 +- common/litterae_response_parser.py | 23 +++- common/query_parser.py | 29 +++-- common/tsakorpus_query_parser.py | 4 +- common/tsakorpus_response_parser.py | 14 ++- common/views_logic.py | 170 ++++++++++++++++++++++++---- main.py | 64 +++++++++-- notes.txt | 2 + requirements.txt | 3 +- static/diagnostic.xml | 3 +- static/endpoint_description.xml | 2 +- static/explain_response.xml | 2 +- static/search_retrieve_response.xml | 5 +- 16 files changed, 290 insertions(+), 72 deletions(-) diff --git a/common/annis_query_parser.py b/common/annis_query_parser.py index b9edca5..1fc38bf 100644 --- a/common/annis_query_parser.py +++ b/common/annis_query_parser.py @@ -16,7 +16,8 @@ class AnnisQueryParser(QueryParser): rxRelOps = re.compile('^(?:\\^\\*|\\||\\.[*,0-9]*)|_=_$') # Operators for setting relations between query words rxFramingQuotes = re.compile('^[/"]|(?<!\\\\)[/"]$') - def build_get_string(self, params, config: ResourceConfig, withinClause=''): + def build_get_string(self, params, config: ResourceConfig, + searchOptions: dict, withinClause=''): """ Build a payload for an ANNIS search request. ANNIS uses POST with JSON payload rather than GET, but the @@ -31,7 +32,7 @@ class AnnisQueryParser(QueryParser): 'query': '', 'query_language': 'AQL', 'corpora': config.annis_corpus_list, - 'limit': config.max_hits, + 'limit': min(config.max_hits, searchOptions['maximumRecords']), 'order': 'Randomized' } termIndexes = self.term_indexes(params) diff --git a/common/config.py b/common/config.py index eb22b99..fab703d 100644 --- a/common/config.py +++ b/common/config.py @@ -47,18 +47,18 @@ class ResourceConfig: self.query_timeout = 60 + # NB: The following properties are not used right now. + # They may be used if somebody develops a GUI for editing + # configuration files in the future. self.boolParams = set(k for k in self.__dict__ if type(self.__dict__[k]) == bool) self.intParams = set(k for k in self.__dict__ if type(self.__dict__[k]) == int) self.lsParams = set() - # dictionaries where values are strings self.dict_sParams = {'pos_convert_reverse'} - # dictionaries where values are lists of strings self.dict_lsParams = {'pos_convert'} - # dictionaries where values are dictionaries {k: string} self.dict_dParams = set() @@ -143,6 +143,9 @@ class ResourceConfig: def gui_str_to_dict(self, s, value_type='list'): """ Process one input string that describes a dictionary. + NB: This function is not used right now. It may be used if + somebody develops a GUI for editing configuration files + in the future. """ d = {} s = s.replace('\r', '').strip() @@ -187,6 +190,9 @@ class ResourceConfig: """ Turn form data filled by the user in the configuration GUI to a dictionary in the correct format. + NB: This function is not used right now. It may be used if + somebody develops a GUI for editing configuration files + in the future. """ dictConfig = {} for f in self.boolParams: @@ -223,6 +229,9 @@ class ResourceConfig: """ Save current or new configuration as a JSON file (can be used to edit configuration files through a web interface). + NB: This function is not used right now. It may be used if + somebody develops a GUI for editing configuration files + in the future. """ if data is None or len(data) <= 0: dictConfig = self.as_dict() diff --git a/common/diagnostics.py b/common/diagnostics.py index 47103c3..e90db7b 100644 --- a/common/diagnostics.py +++ b/common/diagnostics.py @@ -12,20 +12,23 @@ class Diagnostic(Exception): """ fatalFCSDiagnostics = {3, 10, 11} # FCS specifications, 4.2 - fatalSRUDiagnostics = {1, 4, 8, 10, 27, 37, 47, 48, 235} # A subset actually used by this endpoint + fatalSRUDiagnostics = {1, 4, 5, 6, 8, 10, 27, 37, 47, 48, 61, 71, 235} # A subset actually used by this endpoint stdMessages = { (DiagnosticTypes.fcs, 4): 'Requested Data View not valid for this resource.', - (DiagnosticTypes.sru, 4): 'Unsupported operation.', + (DiagnosticTypes.sru, 4): 'Unsupported operation. Supported operation: explain, searchRetrieve, scan.', + (DiagnosticTypes.sru, 5): 'Unsupported version. Supported SRU versions: 1.2 and 2.0.', (DiagnosticTypes.sru, 8): 'Unsupported parameter.', (DiagnosticTypes.sru, 10): 'Something is wrong with the query syntax.', (DiagnosticTypes.sru, 27): 'The query should not be empty.', - (DiagnosticTypes.sru, 37): 'Unsupported boolean operator.' + (DiagnosticTypes.sru, 37): 'Unsupported boolean operator.', + (DiagnosticTypes.sru, 61): 'Start record position out of range.' } def __init__(self, diagType: DiagnosticTypes, diagID: int, details: str = '', - message: str = ''): + message: str = '', + version: SRUVersion = SRUVersion.v2_0): """ Initialize a diagnostic with a given numerical ID. """ @@ -33,6 +36,7 @@ class Diagnostic(Exception): self.diagID = diagID self.details = details self.message = message + self.version = version if len(self.message) <= 0 and (diagType, diagID) in self.stdMessages: self.message = self.stdMessages[(diagType, diagID)] @@ -64,10 +68,15 @@ class Diagnostic(Exception): """ Return the XML version of this diagnostic. """ + if self.version == SRUVersion.v1_2: + templateVersion = 1 + else: + templateVersion = 2 template = self.templateEnv.get_template('diagnostic.xml') xmlText = template.render(uri=self.uri(), details=self.details, - message=self.message) + message=self.message, + version=templateVersion) return xmlText.strip() def __repr__(self): diff --git a/common/litterae_query_parser.py b/common/litterae_query_parser.py index 2d0ca1f..34a5441 100644 --- a/common/litterae_query_parser.py +++ b/common/litterae_query_parser.py @@ -14,7 +14,7 @@ class LitteraeQueryParser(QueryParser): Parses search queries for Formulae, Litterae, Chartae. """ - def build_get_string(self, getParams, config, withinClause=''): + def build_get_string(self, getParams, config, searchOptions: dict, withinClause=''): """ Build a GET string (everything after the ?) from a description of the GET parameters in the getParams list. diff --git a/common/litterae_response_parser.py b/common/litterae_response_parser.py index d6476bf..a9233b2 100644 --- a/common/litterae_response_parser.py +++ b/common/litterae_response_parser.py @@ -20,7 +20,8 @@ class LitteraeResponseParser: def __init__(self): self.pc = None # POS convertor, rebuilt with each parse call - def process_hits(self, tableNode, config: ResourceConfig, diagnostics: list[Diagnostic], advancedHits=False): + def process_hits(self, tableNode, config: ResourceConfig, searchOptions: dict, + diagnostics: list[Diagnostic], advancedHits=False): """ Process hits from an HTML node with the results table. If anything goes wrong, add Diagnostic objects to diagnostics list. @@ -32,7 +33,8 @@ class LitteraeResponseParser: rows = tableNode.xpath('tr') iRow = 0 iRowOffset = 0 - while iRow < len(rows) and iRow - iRowOffset < config.max_hits: + maxHits = min(config.max_hits, searchOptions['maximumRecords']) + while iRow < len(rows) and iRow - iRowOffset < maxHits: row = rows[iRow] iRow += 1 paragraphs = row.xpath('td/p') @@ -51,7 +53,7 @@ class LitteraeResponseParser: records.append(record) return records - def parse(self, response, config: ResourceConfig, xFcsDataviews): + def parse(self, response, config: ResourceConfig, searchOptions: dict): """ Read HTML response with the first N hits returned by a Litterae instance. Return a list of Record objects and the total number of @@ -61,7 +63,7 @@ class LitteraeResponseParser: """ diagnostics = [] advancedHits = False - dataViewsRequested = {v.strip() for v in xFcsDataviews.split(',') if len(v.strip()) > 0} + dataViewsRequested = {v.strip() for v in searchOptions['x-fcs-dataviews'].split(',') if len(v.strip()) > 0} if 'adv' in dataViewsRequested: advancedHits = True srcTree = fromstring(response) @@ -73,11 +75,20 @@ class LitteraeResponseParser: nRecords = int(m.group(1)) resTableNodes = srcTree.xpath('//table[@id="partsSearchResultTable"]/tbody') records = [] + if searchOptions['startRecord'] > 1 and nRecords < searchOptions['startRecord']: + # We don't actually care about startRecord, but we should + # return a fatal diagnostic if it is larger than the number + # of hits. + diagnostics.append(Diagnostic(DiagnosticTypes.sru, 61)) + return records, nRecords, diagnostics if len(resTableNodes) <= 0: nRecords = 0 else: - records = self.process_hits(resTableNodes[0], config, diagnostics, advancedHits=advancedHits) - if len(records) < nRecords and len(records) < config.max_hits: + records = self.process_hits(resTableNodes[0], config, searchOptions, + diagnostics, advancedHits=advancedHits) + if (len(records) < nRecords + and len(records) < config.max_hits + and len(records) < searchOptions['maximumRecords']): diagnostics.append(Diagnostic(DiagnosticTypes.sru, 59, message='Some results could not be shown due to copyright restrictions.')) return records, nRecords, diagnostics diff --git a/common/query_parser.py b/common/query_parser.py index 736f3bc..56afd4d 100644 --- a/common/query_parser.py +++ b/common/query_parser.py @@ -12,6 +12,7 @@ class QueryParser: """ # Regexes for simple search rxTermQuery = re.compile('^(?:(?:[^ "]|\\\\")*|"(?:[^"]|\\\\")*")$') + rxQueryWSpaces = re.compile('[^ \t][ \t]+[^ \t]') # Regexes for advanced search rxWithinClause = re.compile(' +within +(s|sentence|u|utterance|p|paragraph|' @@ -200,7 +201,7 @@ class QueryParser: return [0] return [t for t in sorted(terms)] - def build_get_string(self, getParams, config: ResourceConfig, withinClause=''): + def build_get_string(self, getParams, config: ResourceConfig, searchOptions: dict, withinClause=''): # Abstract function raise NotImplementedError() @@ -257,7 +258,7 @@ class QueryParser: # Abstract function raise NotImplementedError() - def translate_simple(self, query: str, config: ResourceConfig, start=0, end=-1): + def translate_simple(self, query: str, config: ResourceConfig, searchOptions: dict, start=0, end=-1): """ Translate a simple search (CQL) query into a corpus-specific query (GET query, JSON Elasticsearch query or whatever). @@ -275,10 +276,10 @@ class QueryParser: if end == 0: raise Diagnostic(DiagnosticTypes.sru, 27) if self.rxTermQuery.search(query) is not None: - return self.build_get_string(self.term_query(query, config), config) - return self.build_get_string(self.translate_simple(query, config, + return self.build_get_string(self.term_query(query, config), config, searchOptions) + return self.build_get_string(self.translate_simple(query, config, searchOptions, start=start, end=end), - config) + config, searchOptions) # if query.count('(') != query.count(')'): # return None if len(query) <= 0: @@ -294,18 +295,22 @@ class QueryParser: iOpPos, strOp = self.find_operator(query, start, end) if iOpPos == -1: if query[start] == '(' and query[end - 1] == ')': - return self.translate_simple(query, config, start=start + 1, end=end - 1) + return self.translate_simple(query, config, searchOptions, start=start + 1, end=end - 1) else: - return self.term_query(query[start:end], config) + queryPart = query[start:end] + if ((not queryPart.startswith('"') or not queryPart.endswith('"')) + and self.rxQueryWSpaces.search(queryPart.strip('"')) is not None): + raise Diagnostic(DiagnosticTypes.sru, 10) + return self.term_query(queryPart, config) if strOp in ('AND', 'OR'): - resultLeft = self.translate_simple(query, config, start=start, end=iOpPos) - resultRight = self.translate_simple(query, config, start=iOpPos + len(strOp), + resultLeft = self.translate_simple(query, config, searchOptions, start=start, end=iOpPos) + resultRight = self.translate_simple(query, config, searchOptions, start=iOpPos + len(strOp), end=end) if len(resultLeft) <= 0 or len(resultRight) <= 0: raise Diagnostic(DiagnosticTypes.sru, 10) return self.binary_bool(strOp, resultLeft, resultRight, config) elif strOp == 'NOT': - resultRight = self.translate_simple(query, config, start=iOpPos + len(strOp), + resultRight = self.translate_simple(query, config, searchOptions, start=iOpPos + len(strOp), end=end) return self.not_bool(resultRight, config) raise Diagnostic(DiagnosticTypes.sru, 10) @@ -401,7 +406,7 @@ class QueryParser: return self.adv_main_or(resultLeft, resultRight, config) raise NotImplementedError - def translate_advanced(self, query: str, config: ResourceConfig): + def translate_advanced(self, query: str, config: ResourceConfig, searchOptions: dict): """ Translate an advanced search (FCS-QL) query into a corpus-specific query (GET query, JSON Elasticsearch query or whatever). @@ -429,7 +434,7 @@ class QueryParser: end = len(query) if end == 0: raise Diagnostic(DiagnosticTypes.sru, 27) - return self.build_get_string(self.adv_main_query(query, config, start=0, end=end), config, + return self.build_get_string(self.adv_main_query(query, config, start=0, end=end), config, searchOptions, withinClause=withinClause) def validate_query(self, operation, version, queryType, query, diff --git a/common/tsakorpus_query_parser.py b/common/tsakorpus_query_parser.py index f6a61e4..8602bc9 100644 --- a/common/tsakorpus_query_parser.py +++ b/common/tsakorpus_query_parser.py @@ -14,7 +14,7 @@ class TsakorpusQueryParser(QueryParser): rxTsakorpusBool = re.compile('[()|,]') - def build_get_string(self, getParams, config: ResourceConfig, withinClause=''): + def build_get_string(self, getParams, config: ResourceConfig, searchOptions: dict, withinClause=''): """ Build a GET string (everything after the ?) from a description of the GET parameters in the getParams list. @@ -34,7 +34,7 @@ class TsakorpusQueryParser(QueryParser): s += '&' + param[0] + str(param[1]) + sfx + '=' + quote(str(param[2])) for i in termIndexes: s += '&lang' + str(i) + '=' + config.search_lang_id - s += '&page_size=' + str(config.max_hits) + s += '&page_size=' + str(min(config.max_hits, searchOptions['maximumRecords'])) s += '&precise=on&sort=random&response_format=json&distance_strict=on' return s diff --git a/common/tsakorpus_response_parser.py b/common/tsakorpus_response_parser.py index 4b2d112..be5ca79 100644 --- a/common/tsakorpus_response_parser.py +++ b/common/tsakorpus_response_parser.py @@ -135,7 +135,7 @@ class TsakorpusResponseParser: return record - def parse(self, response, config: ResourceConfig, xFcsDataviews, lang=''): + def parse(self, response, config: ResourceConfig, searchOptions: dict, lang=''): """ Read a dictionary with the first N hits returned by a Tsakorpus instance. Return a list of Record objects and the total number of @@ -144,15 +144,21 @@ class TsakorpusResponseParser: self.pc = POSConvertor(config) diagnostics = [] advancedHits = False - dataViewsRequested = {v.strip() for v in xFcsDataviews.split(',') if len(v.strip()) > 0} + dataViewsRequested = {v.strip() for v in searchOptions['x-fcs-dataviews'].split(',') if len(v.strip()) > 0} if 'adv' in dataViewsRequested: advancedHits = True nRecords = 0 + records = [] if 'n_sentences' in response: nRecords = response['n_sentences'] + if searchOptions['startRecord'] > 1 and nRecords < searchOptions['startRecord']: + # We don't actually care about startRecord, but we should + # return a fatal diagnostic if it is larger than the number + # of hits. + diagnostics.append(Diagnostic(DiagnosticTypes.sru, 61)) + return records, nRecords, diagnostics if nRecords <= 0 or 'contexts' not in response: - return [], nRecords, diagnostics - records = [] + return records, nRecords, diagnostics for context in response['contexts']: records.append(self.parse_context(context, config, lang, advancedHits)) return records, nRecords, diagnostics diff --git a/common/views_logic.py b/common/views_logic.py index ca27bd2..e8aff25 100644 --- a/common/views_logic.py +++ b/common/views_logic.py @@ -8,24 +8,133 @@ from .diagnostics import Diagnostic from .config import ResourceConfig +def initial_validation(operation, version, queryType, searchOptions, query): + """ + Validate and convert values of the main request parameters. + Return converted values and a list of fatal diagnostics, if anything is wrong. + """ + failDiagnoctics = [] + + if version == '1.2': + version = SRUVersion.v1_2 + elif version == '2.0': + version = SRUVersion.v2_0 + else: + version = SRUVersion.v2_0 + failDiagnoctics.append(Diagnostic(DiagnosticTypes.sru, 5, details='2.0')) + + if operation == '': + if len(query) > 0: + operation = Operation.searchRetrieve + else: + operation = Operation.explain + elif operation == 'explain': + operation = Operation.explain + elif operation == 'searchRetrieve': + operation = Operation.searchRetrieve + elif operation == 'scan': + operation = Operation.scan + else: + operation = Operation.explain + failDiagnoctics.append(Diagnostic(DiagnosticTypes.sru, 4, version=version)) + + if queryType == 'fcs': + queryType = QueryType.fcs + elif queryType == 'cql': + queryType = QueryType.cql + else: + queryType = QueryType.cql + failDiagnoctics.append(Diagnostic(DiagnosticTypes.sru, 6, message='Supported query types: fcs and cql.', + version=version)) + + try: + searchOptions['startRecord'] = int(searchOptions['startRecord']) + except ValueError: + searchOptions['startRecord'] = 1 + failDiagnoctics.append(Diagnostic(DiagnosticTypes.sru, 6, message='startRecord should be a positive integer.', + version=version)) + if searchOptions['startRecord'] < 1: + failDiagnoctics.append(Diagnostic(DiagnosticTypes.sru, 6, message='startRecord should be a positive integer.', + version=version)) + + try: + searchOptions['maximumRecords'] = int(searchOptions['maximumRecords']) + except ValueError: + searchOptions['maximumRecords'] = 0 + failDiagnoctics.append( + Diagnostic(DiagnosticTypes.sru, 6, message='maximumRecords should be a non-negative integer.', + version=version)) + if searchOptions['maximumRecords'] < 0: + failDiagnoctics.append( + Diagnostic(DiagnosticTypes.sru, 6, message='maximumRecords should be a non-negative integer.', + version=version)) + + # recordPacking has entirely different semantics in SRU 1.2 and SRU 2.0 + if version == SRUVersion.v1_2: + if searchOptions['recordPacking'] == '': + searchOptions['recordPacking'] = 'xml' + if searchOptions['recordPacking'] not in ('xml', 'string'): + failDiagnoctics.append( + Diagnostic(DiagnosticTypes.sru, 71, message='recordPacking should equal "xml" or "string".', + version=version)) + else: + if searchOptions['recordXMLEscaping'] == '': + searchOptions['recordXMLEscaping'] = 'xml' + if searchOptions['recordPacking'] == '': + searchOptions['recordPacking'] = 'packed' + if searchOptions['recordXMLEscaping'] not in ('xml', 'string'): + failDiagnoctics.append( + Diagnostic(DiagnosticTypes.sru, 71, message='recordXMLEscaping should equal "xml" or "string".', + version=version)) + if searchOptions['recordPacking'] not in ('packed', 'unpacked'): + failDiagnoctics.append( + Diagnostic(DiagnosticTypes.sru, 6, message='recordPacking should equal "packed" or "unpacked".', + version=version)) + + try: + searchOptions['resultSetTTL'] = int(searchOptions['resultSetTTL']) + except ValueError: + searchOptions['resultSetTTL'] = 0 + failDiagnoctics.append( + Diagnostic(DiagnosticTypes.sru, 6, message='resultSetTTL should be a positive integer.', + version=version)) + if searchOptions['resultSetTTL'] < 0: + # This does not look good, but we don't care because this + # value is not used anyway + pass + # failDiagnoctics.append( + # Diagnostic(DiagnosticTypes.sru, 6, message='resultSetTTL should be a positive integer.', + # version=version)) + + return operation, version, queryType, searchOptions, failDiagnoctics + + def fatal_response(operation: Operation, version: SRUVersion, + config: Optional[ResourceConfig], diagnostics: list[Diagnostic], request, templates): """ Return a response with the fatal diagnostics and no other payload. """ + if config is None: + configStr = '' + else: + configStr = config.as_dict() + if version == SRUVersion.v1_2: + templateVersion = 1 + else: + templateVersion = 2 + for diag in diagnostics: + diag.version = version diagStr = [str(d) for d in diagnostics] if operation in (Operation.explain, Operation.scan): - if version == SRUVersion.v1_2: - templateVersion = 1 - else: - templateVersion = 2 return templates.TemplateResponse('explain_response.xml', { 'request': request, 'diagnostics': diagStr, + 'config': configStr, 'version': templateVersion }, media_type='application/xml') @@ -34,7 +143,8 @@ def fatal_response(operation: Operation, { 'request': request, 'diagnostics': diagStr, - 'n_hits': 0 + 'n_hits': 0, + 'version': templateVersion }, media_type='application/xml') @@ -52,6 +162,8 @@ def process_explain(version: SRUVersion, templateVersion = 1 else: templateVersion = 2 + for diag in diagnostics: + diag.version = version endpointDescNeeded = False if 'x-fcs-endpoint-description' in searchOptions and searchOptions['x-fcs-endpoint-description'] == 'true': endpointDescNeeded = True @@ -70,7 +182,7 @@ def process_explain(version: SRUVersion, def process_search_retrieve(version: SRUVersion, queryType: QueryType, query: str, - searchOptions: dict[str, str], + searchOptions: dict, config: Optional[ResourceConfig], diagnostics: list[Diagnostic], app, request, templates): @@ -78,18 +190,26 @@ def process_search_retrieve(version: SRUVersion, Process a searchRetrieve request. Return a rendered XML response. """ + if version == SRUVersion.v1_2: + templateVersion = 1 + else: + templateVersion = 2 + for diag in diagnostics: + diag.version = version if config.platform == CorpPlatform.annis: try: if queryType == QueryType.cql: - query = app.qp_annis.translate_simple(query, config) + query = app.qp_annis.translate_simple(query, config, searchOptions) else: - query = app.qp_annis.translate_advanced(query, config) + query = app.qp_annis.translate_advanced(query, config, searchOptions) print(query) # res = app.qp_annis.send_query(query, config) except Diagnostic as diag: - return fatal_response(Operation.searchRetrieve, version, diagnostics + [diag], request, templates) + return fatal_response(Operation.searchRetrieve, version, config, diagnostics + [diag], request, templates) return query['query'] # records, nHits, diagnostics = app.rp_annis.parse(res, config, searchOptions['x-fcs-dataviews']) + # if any(diag.is_fatal() for diag in diagnostics): + # return fatal_response(Operation.searchRetrieve, version, config, diagnostics, request, templates) # records = [r.as_dict() for r in records] # diagnostics = [str(d) for d in diagnostics] # return templates.TemplateResponse('search_retrieve_response.xml', @@ -97,20 +217,23 @@ def process_search_retrieve(version: SRUVersion, # 'request': request, # 'n_hits': nHits, # 'records': records, + # 'version': templateVersion, # 'diagnostics': diagnostics # }, # media_type='application/xml') if config.platform == CorpPlatform.tsakorpus: try: if queryType == QueryType.cql: - strGetParams = app.qp_tsakorpus.translate_simple(query, config) + strGetParams = app.qp_tsakorpus.translate_simple(query, config, searchOptions) else: - strGetParams = app.qp_tsakorpus.translate_advanced(query, config) + strGetParams = app.qp_tsakorpus.translate_advanced(query, config, searchOptions) print(strGetParams) res = app.qp_tsakorpus.send_query(strGetParams, config) except Diagnostic as diag: - return fatal_response(Operation.searchRetrieve, version, diagnostics + [diag], request, templates) - records, nHits, diagnostics = app.rp_tsakorpus.parse(res, config, searchOptions['x-fcs-dataviews']) + return fatal_response(Operation.searchRetrieve, version, config, diagnostics + [diag], request, templates) + records, nHits, diagnostics = app.rp_tsakorpus.parse(res, config, searchOptions) + if any(diag.is_fatal() for diag in diagnostics): + return fatal_response(Operation.searchRetrieve, version, config, diagnostics, request, templates) records = [r.as_dict() for r in records] diagnostics = [str(d) for d in diagnostics] return templates.TemplateResponse('search_retrieve_response.xml', @@ -118,20 +241,22 @@ def process_search_retrieve(version: SRUVersion, 'request': request, 'n_hits': nHits, 'records': records, + 'version': templateVersion, 'diagnostics': diagnostics }, media_type='application/xml') elif config.platform == CorpPlatform.litterae: try: if queryType == QueryType.cql: - strGetParams = app.qp_litterae.translate_simple(query, config) + strGetParams = app.qp_litterae.translate_simple(query, config, searchOptions) else: - strGetParams = app.qp_litterae.translate_simple(query, config) + # No advanced search for Litterae + strGetParams = app.qp_litterae.translate_simple(query, config, searchOptions) # print(strGetParams) res = app.qp_litterae.send_query(strGetParams, config) print(res) except Diagnostic as diag: - return fatal_response(Operation.searchRetrieve, version, diagnostics + [diag], request, templates) + return fatal_response(Operation.searchRetrieve, version, config, diagnostics + [diag], request, templates) for dv in searchOptions['x-fcs-dataviews'].split(','): dv = dv.strip() if dv != 'hits' and version == SRUVersion.v2_0: @@ -139,8 +264,10 @@ def process_search_retrieve(version: SRUVersion, # are available as a data view. # If SRU 1.2 is used, such a diagnostic has already been added # at a previous step. - diagnostics.append(Diagnostic(DiagnosticTypes.fcs, 4, details=dv)) - records, nHits, diagnostics = app.rp_litterae.parse(res, config, searchOptions['x-fcs-dataviews']) + diagnostics.append(Diagnostic(DiagnosticTypes.fcs, 4, details=dv, version=version)) + records, nHits, diagnostics = app.rp_litterae.parse(res, config, searchOptions) + if any (diag.is_fatal() for diag in diagnostics): + return fatal_response(Operation.searchRetrieve, version, config, diagnostics, request, templates) records = [r.as_dict() for r in records] diagnostics = [str(d) for d in diagnostics] return templates.TemplateResponse('search_retrieve_response.xml', @@ -148,6 +275,7 @@ def process_search_retrieve(version: SRUVersion, 'request': request, 'n_hits': nHits, 'records': records, + 'version': templateVersion, 'diagnostics': diagnostics }, media_type='application/xml') @@ -172,7 +300,7 @@ def process_request(operation: Operation, # If something is clearly wrong with the query, return # a response with the list of diagnostics if config is None or any(d.is_fatal() for d in diagnostics): - return fatal_response(operation, version, diagnostics, request, templates) + return fatal_response(operation, version, config, diagnostics, request, templates) # If everything looks good, proceed to query parsing if operation == Operation.searchRetrieve: @@ -183,8 +311,8 @@ def process_request(operation: Operation, # We should not end up here, but if we did, something went wrong and # no fatal diagnostic describes the problem. Add a generic fatal diagnostic # and return a fatal response. - diagnostics.append(Diagnostic(DiagnosticTypes.sru, 1)) - return fatal_response(operation, version, diagnostics, request, templates) + diagnostics.append(Diagnostic(DiagnosticTypes.sru, 1, version=version)) + return fatal_response(operation, version, config, diagnostics, request, templates) if __name__ == '__main__': diff --git a/main.py b/main.py index 53717de..ebcfdd4 100644 --- a/main.py +++ b/main.py @@ -13,11 +13,9 @@ from common.enums import * from common.diagnostics import Diagnostic from common.config import ResourceConfig, read_configs from common.views_logic import * -import json -import os -import re -import copy import uvicorn +from a2wsgi import ASGIMiddleware +from datetime import datetime app = FastAPI() app.mount('/static', StaticFiles(directory='static'), name='static') @@ -31,6 +29,12 @@ app.qp_tsakorpus = TsakorpusQueryParser() app.rp_tsakorpus = TsakorpusResponseParser() app.qp_annis = AnnisQueryParser() app.configs = read_configs() +app.logging = True + +# The following line is needed in case you want to deploy the endpoint +# under Apache2 with WSGI. Apache's mod_wsgi will import the variable +# named 'application' from this file. +application = ASGIMiddleware(app) @app.get('/') @@ -42,10 +46,19 @@ def root(): def endpoint( request: Request, corpusID: str, - operation: Operation = Operation.explain, - version: SRUVersion = SRUVersion.v2_0, - queryType: QueryType = QueryType.cql, + operation: str = '', + version: str = '2.0', + queryType: str = 'cql', query: str = '', + startRecord: str = '1', + maximumRecords: str = '999999', + recordPacking: str = '', + recordXMLEscaping: str = '', + recordSchema: str = '', + resultSetTTL: str = '999999', + stylesheet: str = '', + extraRequestData: str = '', + httpAccept: str = 'application/sru+xml', xFcsEndpointDescription: str = Query( default='', alias='x-fcs-endpoint-description' @@ -63,20 +76,51 @@ def endpoint( alias='x-fcs-rewrites-allowed' ) ): + """ + Process incoming HTTP requests. Return an XML response. + Main parameters are defined here: https://www.loc.gov/standards/sru/sru-1-2.html (SRU 1.2) + and here http://docs.oasis-open.org/search-ws/searchRetrieve/v1.0/os/part3-sru2.0/searchRetrieve-v1.0-os-part3-sru2.0.html (SRU 2.0). + Only a part of them is actually taken into account. + Extra parameters (starting with x-) are defined in the FCS specifications. + """ searchOptions = { + 'startRecord': startRecord, + 'maximumRecords': maximumRecords, + 'recordPacking': recordPacking, + 'recordXMLEscaping': recordXMLEscaping, + 'recordSchema': recordSchema, + 'resultSetTTL': resultSetTTL, + 'stylesheet': stylesheet, + 'extraRequestData': extraRequestData, + 'httpAccept': httpAccept, 'x-fcs-endpoint-description': xFcsEndpointDescription, 'x-fcs-context': xFcsContext, 'x-fcs-dataviews': xFcsDataviews, 'x-fcs-rewrites-allowed': xFcsRewritesAllowed } + if app.logging: + msg = str(datetime.now()) + '\t' + str(request.query_params) + '\n' + with open('query_log.txt', 'a', encoding='utf-8') as fLog: + fLog.write(msg) + + # Validate values of operation, version, queryType and some optional parameters + operation, version, queryType, searchOptions, failDiagnoctics = initial_validation(operation, version, queryType, + searchOptions, query) + # Check if the corpus ID is correct if corpusID not in app.configs: message = 'No corpus with this ID (' + corpusID +') is served by this Endpoint. ' \ 'Valid corpus IDs are: ' + '; '.join(cID for cID in sorted(app.configs)) + '.' - diag = Diagnostic(DiagnosticTypes.sru, 235, message=message) # "Database does not exist" - return process_request(operation, version, queryType, query, searchOptions, None, [diag], app, request, templates) - config = app.configs[corpusID] + failDiagnoctics.append(Diagnostic(DiagnosticTypes.sru, 235, message=message, version=version)) # "Database does not exist" + config = None + else: + config = app.configs[corpusID] + + if len(failDiagnoctics) > 0: + # This is as far as we can get with bad parameter values + return process_request(operation, version, queryType, query, searchOptions, config, failDiagnoctics, app, request, + templates) # Check for common problems with parameter values diagnostics = app.qp.validate_query(operation, version, queryType, query, diff --git a/notes.txt b/notes.txt index 0974489..c5bebb2 100644 --- a/notes.txt +++ b/notes.txt @@ -20,6 +20,8 @@ p. 12 and elsewhere: http://explain.z3950.org/dtd/2.0/ mentioned as the URL of t p. 12-13, example: what is "result-id" in SupportedLayer? What do I put as the text of the SupportedLayer element? +p. 13: It says 'scan' operation is not used for now, but CLARIN endpoint tester has tests that send scan requests. Why? + p. 14: x-cmd-resource-info parameter present in the query example, but never explained (mentioned in some 2013 slides on FCS; should now probably be x-fcs-endpoint-description) p. 14: What is the sruResponse prefix? It is never mentioned. No namespace is provided here, so the example is actually invalid XML. (Who doesn't validate their examples before showing them to a wide audience -- in an official specification, no less??) Maybe sru was actually meant (see p. 2 and an SRU v. 1.2 example above)? I'm putting sru there for now. diff --git a/requirements.txt b/requirements.txt index 3e16ef4..8886832 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,5 @@ fastapi>=0.88.0 uvicorn>=0.20.0 lxml Jinja2>=3.0.3 -requests \ No newline at end of file +requests +a2wsgi \ No newline at end of file diff --git a/static/diagnostic.xml b/static/diagnostic.xml index 7a157a9..0292e4c 100644 --- a/static/diagnostic.xml +++ b/static/diagnostic.xml @@ -1,5 +1,6 @@ {# Per FCS specifications (section 1.5), SRU 2.0 diagnostics must use this namespace and prefixed namespace syntax #} -<diag:diagnostic xmlns:diag="http://docs.oasis-open.org/ns/search-ws/diagnostic"> +{% if version == 1 -%}{% set sru_version = '1.2' %}{% else -%}{% set sru_version = '2.0' %}{% endif -%} +<diag:diagnostic xmlns:diag="{% if sru_version == '2.0' %}http://docs.oasis-open.org/ns/search-ws/diagnostic{% else %}http://www.loc.gov/zing/srw/diagnostic/{% endif %}"> <diag:uri>{{ uri }}</diag:uri>{% if details|length > 0 %} <diag:details>{{ details }}</diag:details>{% endif %}{% if message|length > 0 %} <diag:message>{{ message }}</diag:message>{% endif %} diff --git a/static/endpoint_description.xml b/static/endpoint_description.xml index 0f5c3c3..a695229 100644 --- a/static/endpoint_description.xml +++ b/static/endpoint_description.xml @@ -1,5 +1,5 @@ <sru:extraResponseData> - <ed:EndpointDescription xmlns:ed="http://clarin.eu/fcs/endpoint-description" version="{{ sru_version }}"> + <ed:EndpointDescription xmlns:ed="http://clarin.eu/fcs/endpoint-description" version="1.0"> <ed:Capabilities>{% if config.basic_search_capability %} <ed:Capability>http://clarin.eu/fcs/capability/basic-search</ed:Capability>{% endif %}{% if config.advanced_search_capability and version >= 2 %} <ed:Capability>http://clarin.eu/fcs/capability/advanced-search</ed:Capability>{% endif %} diff --git a/static/explain_response.xml b/static/explain_response.xml index 3941c8a..69356e0 100644 --- a/static/explain_response.xml +++ b/static/explain_response.xml @@ -1,6 +1,6 @@ {% if version == 1 -%}{% set sru_version = '1.2' %}{% else -%}{% set sru_version = '2.0' %}{% endif -%} <?xml version='1.0' encoding='utf-8'?> -<sru:explainResponse xmlns:sru="http://www.loc.gov/zing/srw/"> +<sru:explainResponse xmlns:sru="{% if sru_version == '2.0' %}http://docs.oasis-open.org/ns/search-ws/sruResponse{% else %}http://www.loc.gov/zing/srw/{% endif %}"> <sru:version>{{ sru_version }}</sru:version>{% if config %} <sru:record> <sru:recordSchema>http://explain.z3950.org/dtd/2.0/</sru:recordSchema> diff --git a/static/search_retrieve_response.xml b/static/search_retrieve_response.xml index 0e73071..544e999 100644 --- a/static/search_retrieve_response.xml +++ b/static/search_retrieve_response.xml @@ -1,6 +1,7 @@ <?xml version='1.0' encoding='utf-8'?> -<sru:searchRetrieveResponse xmlns:sru="http://docs.oasis-open.org/ns/search-ws/sruResponse"> - <sru:version>2.0</sru:version> +{% if version == 1 -%}{% set sru_version = '1.2' %}{% else -%}{% set sru_version = '2.0' %}{% endif -%} +<sru:searchRetrieveResponse xmlns:sru="{% if sru_version == '2.0' %}http://docs.oasis-open.org/ns/search-ws/sruResponse{% else %}http://www.loc.gov/zing/srw/{% endif %}"> + <sru:version>{{ sru_version }}</sru:version> <sru:numberOfRecords>{{ n_hits }}</sru:numberOfRecords>{% if records %} <sru:records>{% for record in records %} <sru:record> -- GitLab