diff --git a/common/litterae_query_parser.py b/common/litterae_query_parser.py index 13e2b8f8560efeaf191aa88430e36e24b00610a1..118c3e01d7415f9098e223ad83a8e22da35576a5 100644 --- a/common/litterae_query_parser.py +++ b/common/litterae_query_parser.py @@ -22,7 +22,7 @@ class LitteraeQueryParser(QueryParser): termIndexes = self.term_indexes(getParams) nWords = len(termIndexes) boolOperatorMentioned = False - s = 'source=advanced&sort=urn&lemma_search=False&simple_search_id=' + str(random.randint(100000, 1000000)) + s = 'source=advanced&sort=urn&lemma_search=False' for param in getParams: nSfx = '' if param[1] > 0: diff --git a/common/litterae_response_parser.py b/common/litterae_response_parser.py index c91b7bd4d27b4ddd414164c5137b803dcb24f69a..7c5d1cf97e31b3861204dc7acd0f565160efab56 100644 --- a/common/litterae_response_parser.py +++ b/common/litterae_response_parser.py @@ -2,147 +2,59 @@ from urllib.parse import quote import re import json import html -from lxml.html import fragment_fromstring +from lxml.html import fromstring, tostring from .enums import * from .config import ResourceConfig from .search_retrieve import Record from .diagnostics import Diagnostic, DiagnosticTypes -class POSConvertor: - """ - Convert corpus-specific parts of speech / grammar tags to - UPOS, using regexes correspondences set in the config. - """ - def __init__(self, config: ResourceConfig): - self.posConvert = config.pos_convert - self.posTests = [(re.compile(k), v) for k, v in self.posConvert] - - def convert_pos(self, pos): - """ - Convert corpus-specific POS tags to UPOS, if possible. - Ea - """ - for k, v in self.posTests: - if k.search(pos) is not None: - return v - return pos - - class LitteraeResponseParser: """ Parses responses from a Litterae instance. - TODO: implement """ + rxNHits = re.compile('(?:Suchergebnisse:|Search [rR]esults:)[ \t\r\n]*([0-9]+)') + rxUselessTags = re.compile('</?(?:p|small)[^\r\n<>]*>') + rxHitTag = re.compile('(</?)strong>') + def __init__(self): self.pc = None # POS convertor, rebuilt with each parse call - def parse_annotation(self, anno, segID, record): - """ - Parse HTML annotation for one word taken from a hit. - Add the data to the layers in the record object. - """ - annoTree = fragment_fromstring(anno, - create_parent='div') - lemmas = set() - lemmasStr = '_' - pos = set() - posStr = '_' - lexNodes = annoTree.xpath('div[@class="popup_word"]/div[@class="popup_ana"]/span[@class="popup_lex"]') - for node in lexNodes: - if node.text is not None: - lemmas.add(node.text) - if len(lemmas) > 0: - lemmasStr = '|'.join(l for l in sorted(lemmas)) - posNodes = annoTree.xpath('div[@class="popup_word"]/div[@class="popup_ana"]/span[@class="popup_pos"]') - for node in posNodes: - if node.text is not None: - posText = re.sub(' |[ \t\ufeff]+', '', node.text) - posText = self.pc.convert_pos(posText) - pos.add(posText) - if len(pos) > 0: - posStr = '|'.join(p for p in sorted(pos)) - - if 'pos' not in record.layers: - record.layers['pos'] = [] - record.layers['pos'].append({ - 'ref': segID, - 'value': posStr - }) - - if 'lemma' not in record.layers: - record.layers['lemma'] = [] - record.layers['lemma'].append({ - 'ref': segID, - 'value': lemmasStr - }) - - - def parse_span(self, el, record, advancedHits=False): + def process_hits(self, tableNode, config: ResourceConfig, diagnostics: list[Diagnostic], advancedHits=False): """ - Parse one <span> element from the HTML representation - of one hit returned by a Litterae instance. Add the extracted - text to the record object. + Process hits from an HTML node with the results table. + If anything goes wrong, add Diagnostic objects to diagnostics list. + Return a list of Record objects. """ - if el.tag == 'span' and 'class' in el.attrib and 'sentence_meta' in el.attrib['class']: - # This is the introductory span that only contains the header - # (title, author etc.) - if el.tail is not None: - record.text += el.tail.strip('\n\t ') - return - - if el.text is not None: - bMatch = False - if 'class' in el.attrib and re.search('\\bword\\b', el.attrib['class']) is not None: - if re.search('\\bwmatch\\b', el.attrib['class']) is not None: - bMatch = True - record.textNoHighlight += el.text - if advancedHits: - segID = 's' + str(len(record.segments)) - segment = { - 'id': segID, - 'start': len(record.textNoHighlight) + 1, - 'end': len(record.textNoHighlight) + len(el.text) - } - record.segments.append(segment) - if 'data-ana' in el.attrib: - self.parse_annotation(el.attrib['data-ana'], segID, record) - if bMatch: - record.text += '<hits:Hit>' + el.text + '</hits:Hit>' - else: - record.text += el.text - if el.tail is not None: - record.text += el.tail - record.textNoHighlight += el.tail - - - def parse_context(self, hit, config: ResourceConfig, lang='', advancedHits=False): - """ - Parse one hit. Return it as a Record object. - """ - record = Record(advancedHits=advancedHits) - if len(lang) <= 0: - lang = config.search_lang_id - if ('languages' not in hit - or lang not in hit['languages'] - or 'text' not in hit['languages'][lang]): - return record - contentTxt = re.sub('[\r\n\t\ufeff]+', '', hit['languages'][lang]['text'], flags=re.DOTALL) - print(contentTxt) - content = fragment_fromstring(contentTxt, - create_parent='div') - for el in content: - self.parse_span(el, record, advancedHits) - return record - - - def parse(self, response, config: ResourceConfig, xFcsDataviews, lang=''): + records = [] + rows = tableNode.xpath('tr') + iRow = 0 + iRowOffset = 0 + while iRow < len(rows) and iRow - iRowOffset < config.max_hits: + row = rows[iRow] + iRow += 1 + paragraphs = row.xpath('td/p') + if len(paragraphs) <= 0: + iRowOffset += 1 + continue + record = Record(advancedHits=advancedHits) + txtParagraphs = [] + for p in paragraphs: + txt = tostring(p, encoding='utf-8').decode('utf-8') + print(txt, type(txt)) + txt = self.rxUselessTags.sub('', txt) + txt = self.rxHitTag.sub('\\1hits:Hit>', txt) + txtParagraphs.append(txt.strip()) + record.text = ' <...> '.join(txtParagraphs).strip() + records.append(record) + return records + + def parse(self, response, config: ResourceConfig, xFcsDataviews): """ - Read a dictionary with the first N hits returned by a Litterae + Read HTML response with the first N hits returned by a Litterae instance. Return a list of Record objects and the total number of records found. """ - self.pc = POSConvertor(config) diagnostics = [] advancedHits = False dataViewsRequested = {v.strip() for v in xFcsDataviews.split(',') if len(v.strip()) > 0} @@ -151,15 +63,23 @@ class LitteraeResponseParser: for v in dataViewsRequested: if v not in ('hits', 'adv'): diagnostics.append(Diagnostic(DiagnosticTypes.fcs, 4, details=v)) + srcTree = fromstring(response) nRecords = 0 - if 'n_sentences' in response: - nRecords = response['n_sentences'] - if nRecords <= 0 or 'contexts' not in response: - return [], nRecords + nHitsNode = srcTree.xpath('//article[@class="container-fluid"]/header/h1') + if len(nHitsNode) > 0 and nHitsNode[0].text is not None: + m = self.rxNHits.search(nHitsNode[0].text) + if m is not None: + nRecords = int(m.group(1)) + resTableNodes = srcTree.xpath('//table[@id="partsSearchResultTable"]/tbody') records = [] - for context in response['contexts']: - records.append(self.parse_context(context, config, lang, advancedHits)) - return records, nRecords + if len(resTableNodes) <= 0: + nRecords = 0 + else: + records = self.process_hits(resTableNodes[0], config, diagnostics, advancedHits=advancedHits) + if len(records) < nRecords and len(records) < config.max_hits: + diagnostics.append(Diagnostic(DiagnosticTypes.sru, 59, + message='Some results could not be shown due to copyright restrictions.')) + return records, nRecords, diagnostics if __name__ == '__main__': diff --git a/common/tsakorpus_response_parser.py b/common/tsakorpus_response_parser.py index 17e5e43e95bfe3526d19f13bcb496d0d95101ed3..63bef0a5bc91de0eed3b5b9af2873318bfe8f946 100644 --- a/common/tsakorpus_response_parser.py +++ b/common/tsakorpus_response_parser.py @@ -154,11 +154,11 @@ class TsakorpusResponseParser: if 'n_sentences' in response: nRecords = response['n_sentences'] if nRecords <= 0 or 'contexts' not in response: - return [], nRecords + return [], nRecords, diagnostics records = [] for context in response['contexts']: records.append(self.parse_context(context, config, lang, advancedHits)) - return records, nRecords + return records, nRecords, diagnostics if __name__ == '__main__': diff --git a/common/views_logic.py b/common/views_logic.py new file mode 100644 index 0000000000000000000000000000000000000000..bda11f20f91ce85786f7b60d7ac22b0ffe48d302 --- /dev/null +++ b/common/views_logic.py @@ -0,0 +1,117 @@ +# Contains functions called by the top-level view functions +# that process the user's request and return a rendered XML +# template +from typing import Optional +from fastapi import Request, Query, Response +from .enums import * +from .diagnostics import Diagnostic +from .config import ResourceConfig + + +def fatal_response(operation: Operation, + version: SRUVersion, + diagnostics: list[Diagnostic], + request, templates): + """ + Return a response with the fatal diagnostics + and no other payload. + """ + diagStr = [str(d) for d in diagnostics] + if operation == Operation.explain: + templateName = 'explain_response_2.0.xml' + if version == SRUVersion.v1_2: + templateName = 'explain_response_1.2.xml' + return templates.TemplateResponse(templateName, + { + 'request': request, + 'diagnostics': diagStr + }, + media_type='application/xml') + elif operation == Operation.searchRetrieve: + return templates.TemplateResponse('search_retrieve_response.xml', + { + 'request': request, + 'diagnostics': diagStr, + 'n_hits': 0, + }, + media_type='application/xml') + + +def process_search_retrieve(version: SRUVersion, + queryType: QueryType, + query: str, + searchOptions: dict[str, str], + config: Optional[ResourceConfig], + diagnostics: list[Diagnostic], + app, request, templates): + """ + Process a searchRetrieve request. + Return a rendered XML response. + """ + if config.platform == CorpPlatform.tsakorpus: + try: + strGetParams = app.qp_tsakorpus.translate_fcsql(query, config) + print(strGetParams) + res = app.qp_tsakorpus.send_query(strGetParams, config) + except Diagnostic as diag: + return fatal_response(Operation.searchRetrieve, version, [diag], request, templates) + records, nHits, diagnostics = app.rp_tsakorpus.parse(res, config, searchOptions['x-fcs-dataviews']) + records = [r.as_dict() for r in records] + diagnostics = [str(d) for d in diagnostics] + return templates.TemplateResponse('search_retrieve_response.xml', + { + 'request': request, + 'n_hits': nHits, + 'records': records, + 'diagnostics': diagnostics + }, + media_type='application/xml') + elif config.platform == CorpPlatform.litterae: + try: + strGetParams = app.qp_litterae.translate_fcsql(query, config) + print(strGetParams) + # return strGetParams + res = app.qp_litterae.send_query(strGetParams, config) + print(res) + except Diagnostic as diag: + return fatal_response(Operation.searchRetrieve, version, [diag], request, templates) + records, nHits, diagnostics = app.rp_litterae.parse(res, config, searchOptions['x-fcs-dataviews']) + records = [r.as_dict() for r in records] + diagnostics = [str(d) for d in diagnostics] + return templates.TemplateResponse('search_retrieve_response.xml', + { + 'request': request, + 'n_hits': nHits, + 'records': records, + 'diagnostics': diagnostics + }, + media_type='application/xml') + + +def process_request(operation: Operation, + version: SRUVersion, + queryType: QueryType, + query: str, + searchOptions: dict[str, str], + config: Optional[ResourceConfig], + diagnostics: list[Diagnostic], + app, request, templates): + """ + Process validated user request that came in through the endpoint() + function in main.py. + Return a rendered template. + :param diagnostics: List of diagnostics produced by the validation + function. + """ + # If something is clearly wrong with the query, return + # a response with the list of diagnostics + if config is None or any(d.is_fatal() for d in diagnostics): + return fatal_response(operation, version, diagnostics, request, templates) + + # If everything looks good, proceed to query parsing + if operation == Operation.searchRetrieve: + return process_search_retrieve(version, queryType, query, searchOptions, config, diagnostics, app, request, templates) + + +if __name__ == '__main__': + pass diff --git a/main.py b/main.py index 289b57b33985e11df51abd43ec2e7b8b6f1bcf21..fdb2842388476a5bae76df352b03e4a71e5ec4b5 100644 --- a/main.py +++ b/main.py @@ -11,9 +11,11 @@ from common.tsakorpus_response_parser import TsakorpusResponseParser from common.enums import * from common.diagnostics import Diagnostic from common.config import ResourceConfig, read_configs +from common.views_logic import * import json import os import re +import copy import uvicorn app = FastAPI() @@ -59,58 +61,28 @@ def endpoint( alias='x-fcs-rewrites-allowed' ) ): + searchOptions = { + 'x-fcs-endpoint-description': xFcsEndpointDescription, + 'x-fcs-context': xFcsContext, + 'x-fcs-dataviews': xFcsDataviews, + 'x-fcs-rewrites-allowed': xFcsRewritesAllowed + } + + # Check if the corpus ID is correct if corpusID not in app.configs: message = 'No corpus with this ID (' + corpusID +') is served by this Endpoint. ' \ 'Valid corpus IDs are: ' + '; '.join(cID for cID in sorted(app.configs)) + '.' - diagBody = str(Diagnostic(DiagnosticTypes.sru, 235, - message=message)) # "Database does not exist" - return Response(content=diagBody, media_type='application/xml') + diag = Diagnostic(DiagnosticTypes.sru, 235, message=message) # "Database does not exist" + return process_request(operation, version, queryType, query, searchOptions, None, [diag], app, request, templates) config = app.configs[corpusID] + + # Check for common problems with parameter values diagnostics = app.qp.validate_query(operation, version, queryType, query, xFcsEndpointDescription, xFcsContext, xFcsDataviews, xFcsRewritesAllowed) - if any(d.is_fatal() for d in diagnostics): - return '\n'.join(str(d) for d in diagnostics) - - if operation == Operation.searchRetrieve: - if config.platform == CorpPlatform.tsakorpus: - try: - strGetParams = app.qp_tsakorpus.translate_fcsql(query, config) - print(strGetParams) - res = app.qp_tsakorpus.send_query(strGetParams, config) - except Diagnostic as diag: - print('diag', str(diag)) - return Response(content=str(diag), media_type='application/xml') - records, nHits = app.rp_tsakorpus.parse(res, config, xFcsDataviews) - records = [r.as_dict() for r in records] - return templates.TemplateResponse('search_retrieve_response.xml', - { - 'request': request, - 'n_hits': nHits, - 'records': records - }, - media_type='application/xml') - elif config.platform == CorpPlatform.litterae: - try: - strGetParams = app.qp_litterae.translate_fcsql(query, config) - print(strGetParams) - return strGetParams - res = app.qp_litterae.send_query(strGetParams, config) - except Diagnostic as diag: - print('diag', str(diag)) - return Response(content=str(diag), media_type='application/xml') - records, nHits = app.rp_litterae.parse(res, config, xFcsDataviews) - records = [r.as_dict() for r in records] - return templates.TemplateResponse('search_retrieve_response.xml', - { - 'request': request, - 'n_hits': nHits, - 'records': records - }, - media_type='application/xml') - # return str(res) - - return {'operation': operation, 'version': version} + # Now, do the substantial things + return process_request(operation, version, queryType, query, searchOptions, config, diagnostics, app, request, templates) + # return {'operation': operation, 'version': version} if __name__ == '__main__': diff --git a/static/explain_response_1.2.xml b/static/explain_response_1.2.xml index 3462e1d5af9b1078cd0c8bcb88a88dc818e52185..2015939f8016a230427f1a1643e38e4f493f25b0 100644 --- a/static/explain_response_1.2.xml +++ b/static/explain_response_1.2.xml @@ -1,7 +1,7 @@ {% set ep_version = 1 %} <?xml version='1.0' encoding='utf-8'?> <sru:explainResponse xmlns:sru="http://www.loc.gov/zing/srw/"> - <sru:version>1.2</sru:version> + <sru:version>1.2</sru:version>{% if config %} <sru:record> <sru:recordSchema>http://explain.z3950.org/dtd/2.0/</sru:recordSchema> <sru:recordPacking>xml</sru:recordPacking> @@ -39,7 +39,10 @@ <!-- <sru:echoedExplainRequest> is OPTIONAL --> <sru:echoedExplainRequest> <sru:version>1.2</sru:version> - <sru:baseUrl>{{ base_url }}</sru:baseUrl> + <sru:baseUrl>{{ config.base_url }}</sru:baseUrl> </sru:echoedExplainRequest>{% if endpoint_desc_needed %} -{% include 'endpoint_description.xml' }{% endif %} +{% include 'endpoint_description.xml' }{% endif %}{% endif %}{% if diagnostics and diagnostics|length > 0 %} + <sru:diagnostics>{% for d diagnostics %} + {{ d|safe }}{% endfor %} + </sru:diagnostics>{% endif %} </sru:explainResponse> \ No newline at end of file diff --git a/static/search_retrieve_response.xml b/static/search_retrieve_response.xml index c9664a70a4ecabe449de27977fbdb48d3e8d6aa4..0e73071b311d29bdbcfc6a607dc23786af92f407 100644 --- a/static/search_retrieve_response.xml +++ b/static/search_retrieve_response.xml @@ -1,19 +1,22 @@ <?xml version='1.0' encoding='utf-8'?> <sru:searchRetrieveResponse xmlns:sru="http://docs.oasis-open.org/ns/search-ws/sruResponse"> -<sru:version>2.0</sru:version> -<sru:numberOfRecords>{{ n_hits }}</sru:numberOfRecords> -<sru:records>{% for record in records %} - <sru:record> - <sru:recordSchema>http://clarin.eu/fcs/resource</sru:recordSchema> - <sru:recordXMLEscaping>xml</sru:recordXMLEscaping> - <sru:recordData>{% for resource in record.resources %} - {% include 'resource.xml' %} - {% endfor %} - </sru:recordData> - <sru:recordPosition>{{ loop.index }}</sru:recordPosition> - </sru:record> -{% endfor %} -</sru:records>{% if n_hits > records|length %} -<sru:nextRecordPosition>{{ records|length + 1 }}</sru:nextRecordPosition>{% endif %} -<sru:resultCountPrecision>info:srw/vocabulary/resultCountPrecision/1/exact</sru:resultCountPrecision> + <sru:version>2.0</sru:version> + <sru:numberOfRecords>{{ n_hits }}</sru:numberOfRecords>{% if records %} + <sru:records>{% for record in records %} + <sru:record> + <sru:recordSchema>http://clarin.eu/fcs/resource</sru:recordSchema> + <sru:recordXMLEscaping>xml</sru:recordXMLEscaping> + <sru:recordData>{% for resource in record.resources %} + {% include 'resource.xml' %} + {% endfor %} + </sru:recordData> + <sru:recordPosition>{{ loop.index }}</sru:recordPosition> + </sru:record> + {% endfor %} + </sru:records>{% if n_hits > records|length %} + <sru:nextRecordPosition>{{ records|length + 1 }}</sru:nextRecordPosition>{% endif %} + <sru:resultCountPrecision>info:srw/vocabulary/resultCountPrecision/1/exact</sru:resultCountPrecision>{% endif %}{% if diagnostics and diagnostics|length > 0 %} + <sru:diagnostics>{% for d in diagnostics %} + {{ d|safe }}{% endfor %} + </sru:diagnostics>{% endif %} </sru:searchRetrieveResponse> \ No newline at end of file