class TsakorpusResponseParser:
    """
    Parses responses from a Tsakorpus instance.
    """

    # Elements whose class attribute contains the standalone word "wmatch"
    # are wrapped in <hits:Hit> tags. Compiled once instead of per element.
    _MATCH_CLASS_RE = re.compile(r'\bwmatch\b')

    def __init__(self):
        pass

    def parse_context(self, hit, config: ResourceConfig, lang=''):
        """
        Parse one hit. Return it as a Record object.

        The HTML of the hit is read from hit['languages'][lang]['text'].
        If lang is empty, config.search_lang_id is used instead. If any
        of those keys is missing, the Record is returned without its
        text being set.

        Elements whose class contains "sentence_meta" are skipped (only
        their stripped tail text is kept); elements whose class contains
        the word "wmatch" are wrapped in <hits:Hit>...</hits:Hit>.
        """
        record = Record(dataView=DataView.hits)
        if not lang:
            lang = config.search_lang_id
        if ('languages' not in hit
                or lang not in hit['languages']
                or 'text' not in hit['languages'][lang]):
            return record
        content = fragment_fromstring(hit['languages'][lang]['text'],
                                      create_parent='div')
        # With create_parent, any text that precedes the first child
        # element is stored on the wrapper <div> itself, not on a child,
        # so it has to be picked up explicitly or it would be lost.
        parts = [content.text or '']
        # NOTE(review): element text is inserted verbatim next to literal
        # <hits:Hit> tags; presumably the template renders the record text
        # unescaped, in which case '&' or '<' coming from the corpus would
        # need XML escaping -- confirm against the template.
        for el in content:
            css_class = el.get('class') or ''
            if el.tag == 'span' and 'sentence_meta' in css_class:
                # Metadata is not part of the hit text; keep only what
                # follows it, without the surrounding whitespace.
                if el.tail is not None:
                    parts.append(el.tail.strip('\n\t '))
                continue
            if el.text is not None:
                if TsakorpusResponseParser._MATCH_CLASS_RE.search(css_class) is not None:
                    parts.append('<hits:Hit>' + el.text + '</hits:Hit>')
                else:
                    parts.append(el.text)
            if el.tail is not None:
                parts.append(el.tail)
        record.text = ''.join(parts)
        return record

    def parse(self, response, config: ResourceConfig, lang=''):
        """
        Read a dictionary with the first N hits returned by a Tsakorpus
        instance. Return a list of Record objects and the total number of
        records found.

        The total is taken from response['n_sentences'] (0 if absent).
        If it is not positive, or if response has no 'contexts' key, the
        list of records is empty.
        """
        n_records = 0
        if 'n_sentences' in response:
            n_records = response['n_sentences']
        if n_records <= 0 or 'contexts' not in response:
            return [], n_records
        records = [self.parse_context(context, config, lang)
                   for context in response['contexts']]
        return records, n_records
100644 --- a/static/search_retrieve_response.xml +++ b/static/search_retrieve_response.xml @@ -13,7 +13,7 @@ <sruResponse:recordPosition>{{ loop.index }}</sruResponse:recordPosition> </sruResponse:record> {% endfor %} -</sruResponse:records>{% if n_hits > record.resources|length %} -<sruResponse:nextRecordPosition>{{ record.resources|length + 1 }}</sruResponse:nextRecordPosition>{% endif %} +</sruResponse:records>{% if n_hits > records|length %} +<sruResponse:nextRecordPosition>{{ records|length + 1 }}</sruResponse:nextRecordPosition>{% endif %} <sruResponse:resultCountPrecision>info:srw/vocabulary/resultCountPrecision/1/exact</sruResponse:resultCountPrecision> </sruResponse:searchRetrieveResponse> \ No newline at end of file