diff --git a/common/search_retrieve.py b/common/search_retrieve.py index 32ba11362fff534665ef25112c0ffd9e6f9a95fd..4738e17fc5193b5be0eb88ddc9d1f6fce3ed4dcb 100644 --- a/common/search_retrieve.py +++ b/common/search_retrieve.py @@ -11,8 +11,8 @@ class Record: multiple hits. Here, each record contains exactly one resource with exactly one fragment with exactly one hit. """ - def __init__(self, dataView: DataView): - self.dataView = dataView + def __init__(self, advancedHits: bool = False): + self.advancedHits = advancedHits # For simple search: self.text = '' self.textNoHighlight = '' # no <hits:Hit> elements, just text @@ -34,14 +34,11 @@ class Record: 'dv_hits': [{ 'text': self.text }], - 'dv_adv': [{ - 'segments': self.segments, - 'layers': self.layers - }] + 'dv_adv': [] }] }] } - if self.dataView == DataView.adv: + if self.advancedHits: record['resources'][0]['resource_fragments'][0]['dv_adv'].append({ 'segments': self.segments, 'layers': self.layers diff --git a/common/tsakorpus_response_parser.py b/common/tsakorpus_response_parser.py index d70b8a41e1661bc100c169ea8dcc5b758604f105..17e5e43e95bfe3526d19f13bcb496d0d95101ed3 100644 --- a/common/tsakorpus_response_parser.py +++ b/common/tsakorpus_response_parser.py @@ -77,7 +77,7 @@ class TsakorpusResponseParser: }) - def parse_span(self, el, record): + def parse_span(self, el, record, advancedHits=False): """ Parse one <span> element from the HTML representation of one hit returned by a Tsakorpus instance. Add the extracted @@ -95,16 +95,17 @@ class TsakorpusResponseParser: if 'class' in el.attrib and re.search('\\bword\\b', el.attrib['class']) is not None: if re.search('\\bwmatch\\b', el.attrib['class']) is not None: bMatch = True - segID = 's' + str(len(record.segments)) - segment = { - 'id': segID, - 'start': len(record.textNoHighlight) + 1, - 'end': len(record.textNoHighlight) + len(el.text) - } - record.segments.append(segment) record.textNoHighlight += el.text - if 'data-ana' in el.attrib: - self.parse_annotation(el.attrib['data-ana'], segID, record) + if advancedHits: + segID = 's' + str(len(record.segments)) + segment = { + 'id': segID, + 'start': len(record.textNoHighlight) + 1, + 'end': len(record.textNoHighlight) + len(el.text) + } + record.segments.append(segment) + if 'data-ana' in el.attrib: + self.parse_annotation(el.attrib['data-ana'], segID, record) if bMatch: record.text += '<hits:Hit>' + el.text + '</hits:Hit>' else: @@ -114,11 +115,11 @@ class TsakorpusResponseParser: record.textNoHighlight += el.tail - def parse_context(self, hit, config: ResourceConfig, lang=''): + def parse_context(self, hit, config: ResourceConfig, lang='', advancedHits=False): """ Parse one hit. Return it as a Record object. """ - record = Record(dataView=DataView.hits) + record = Record(advancedHits=advancedHits) if len(lang) <= 0: lang = config.search_lang_id if ('languages' not in hit @@ -130,17 +131,25 @@ class TsakorpusResponseParser: content = fragment_fromstring(contentTxt, create_parent='div') for el in content: - self.parse_span(el, record) + self.parse_span(el, record, advancedHits) return record - def parse(self, response, config: ResourceConfig, lang=''): + def parse(self, response, config: ResourceConfig, xFcsDataviews, lang=''): """ Read a dictionary with the first N hits returned by a Tsakorpus instance. Return a list of Record objects and the total number of records found. """ self.pc = POSConvertor(config) + diagnostics = [] + advancedHits = False + dataViewsRequested = {v.strip() for v in xFcsDataviews.split(',') if len(v.strip()) > 0} + if 'adv' in dataViewsRequested: + advancedHits = True + for v in dataViewsRequested: + if v not in ('hits', 'adv'): + diagnostics.append(Diagnostic(DiagnosticTypes.fcs, 4, details=v)) nRecords = 0 if 'n_sentences' in response: nRecords = response['n_sentences'] @@ -148,7 +157,7 @@ class TsakorpusResponseParser: return [], nRecords records = [] for context in response['contexts']: - records.append(self.parse_context(context, config, lang)) + records.append(self.parse_context(context, config, lang, advancedHits)) return records, nRecords diff --git a/main.py b/main.py index d43431cb9dbfc6138b070ddd0a547e50dd989d4a..476cc39c7e86e2a7e4b5325cd00f842a029520ad 100644 --- a/main.py +++ b/main.py @@ -76,7 +76,7 @@ def endpoint( except Diagnostic as diag: print('diag', str(diag)) return Response(content=str(diag), media_type='application/xml') - records, nHits = app.rp_tsakorpus.parse(res, config) + records, nHits = app.rp_tsakorpus.parse(res, config, xFcsDataviews) records = [r.as_dict() for r in records] return templates.TemplateResponse('search_retrieve_response.xml', {