From 8e4aefeea5ca3ba563f30cf057f4fea404c69f05 Mon Sep 17 00:00:00 2001 From: Timofey Arkhangelskiy <timofey.arkhangelskiy@uni-hamburg.de> Date: Wed, 21 Dec 2022 22:16:53 +0100 Subject: [PATCH] Enable advanced data view for Tsakorpus --- common/search_retrieve.py | 18 ++++-- common/tsakorpus_response_parser.py | 98 ++++++++++++++++++++++++----- main.py | 4 +- notes.txt | 2 +- static/dataview_adv.xml | 8 +-- static/search_retrieve_response.xml | 30 ++++----- 6 files changed, 117 insertions(+), 43 deletions(-) diff --git a/common/search_retrieve.py b/common/search_retrieve.py index e25928d..32ba113 100644 --- a/common/search_retrieve.py +++ b/common/search_retrieve.py @@ -15,26 +15,34 @@ class Record: self.dataView = dataView # For simple search: self.text = '' + self.textNoHighlight = '' # no <hits:Hit> elements, just text # For advanced search: self.segments = [] - self.layers = [] + self.layers = {} # ID -> content def as_dict(self): """ Returns a dictionary for insertion into the XML template. """ + # In generic SRU, records, resources and resource fragments + # are all distinct entities. In FCS, it's the same thing. Still, + # I model the templates in a generic way, in case that changes + # in FCS in the future record = { - 'resources': { + 'resources': [{ 'resource_fragments': [{ 'dv_hits': [{ 'text': self.text }], - 'dv_adv': [] + 'dv_adv': [{ + 'segments': self.segments, + 'layers': self.layers + }] }] - } + }] } if self.dataView == DataView.adv: - record['resources']['resource_fragments'][0]['dv_adv'].append({ + record['resources'][0]['resource_fragments'][0]['dv_adv'].append({ 'segments': self.segments, 'layers': self.layers }) diff --git a/common/tsakorpus_response_parser.py b/common/tsakorpus_response_parser.py index 683c004..2af8482 100644 --- a/common/tsakorpus_response_parser.py +++ b/common/tsakorpus_response_parser.py @@ -1,7 +1,7 @@ from urllib.parse import quote import re import json -import urllib.request +import html from lxml.html import fragment_fromstring from .enums import * from .config import ResourceConfig @@ -16,6 +16,83 @@ class TsakorpusResponseParser: def __init__(self): pass + def parse_annotation(self, anno, segID, record): + """ + Parse HTML annotation for one word taken from a hit. + Add the data to the layers in the record object. + """ + annoTree = fragment_fromstring(anno, + create_parent='div') + lemmas = set() + lemmasStr = '_' + pos = set() + posStr = '_' + lexNodes = annoTree.xpath('div[@class="popup_word"]/div[@class="popup_ana"]/span[@class="popup_lex"]') + for node in lexNodes: + if node.text is not None: + lemmas.add(node.text) + if len(lemmas) > 0: + lemmasStr = '|'.join(l for l in sorted(lemmas)) + posNodes = annoTree.xpath('div[@class="popup_word"]/div[@class="popup_ana"]/span[@class="popup_pos"]') + for node in posNodes: + if node.text is not None: + posText = re.sub(' |[ \t\ufeff]+', '', node.text) + pos.add(posText) + if len(pos) > 0: + posStr = '|'.join(p for p in sorted(pos)) + + if 'pos' not in record.layers: + record.layers['pos'] = [] + record.layers['pos'].append({ + 'ref': segID, + 'value': posStr + }) + + if 'lemma' not in record.layers: + record.layers['lemma'] = [] + record.layers['lemma'].append({ + 'ref': segID, + 'value': lemmasStr + }) + + + def parse_span(self, el, record): + """ + Parse one <span> element from the HTML representation + of one hit returned by a Tsakorpus instance. Add the extracted + text to the record object. + """ + if el.tag == 'span' and 'class' in el.attrib and 'sentence_meta' in el.attrib['class']: + # This is the introductory span that only contains the header + # (title, author etc.) + if el.tail is not None: + record.text += el.tail.strip('\n\t ') + return + + if el.text is not None: + bMatch = False + if 'class' in el.attrib and re.search('\\bword\\b', el.attrib['class']) is not None: + if re.search('\\bwmatch\\b', el.attrib['class']) is not None: + bMatch = True + segID = 's' + str(len(record.segments)) + segment = { + 'id': segID, + 'start': len(record.textNoHighlight) + 1, + 'end': len(record.textNoHighlight) + len(el.text) + } + record.segments.append(segment) + record.textNoHighlight += el.text + if 'data-ana' in el.attrib: + self.parse_annotation(el.attrib['data-ana'], segID, record) + if bMatch: + record.text += '<hits:Hit>' + el.text + '</hits:Hit>' + else: + record.text += el.text + if el.tail is not None: + record.text += el.tail + record.textNoHighlight += el.tail + + def parse_context(self, hit, config: ResourceConfig, lang=''): """ Parse one hit. Return it as a Record object. @@ -27,23 +104,12 @@ class TsakorpusResponseParser: or lang not in hit['languages'] or 'text' not in hit['languages'][lang]): return record - content = fragment_fromstring(hit['languages'][lang]['text'], + contentTxt = re.sub('[\r\n\t\ufeff]+', '', hit['languages'][lang]['text'], flags=re.DOTALL) + print(contentTxt) + content = fragment_fromstring(contentTxt, create_parent='div') - text = '' for el in content: - if el.tag == 'span' and 'class' in el.attrib and 'sentence_meta' in el.attrib['class']: - if el.tail is not None: - text += el.tail.strip('\n\t ') - continue - if el.text is not None: - if 'class' in el.attrib and re.search('\\bwmatch\\b', el.attrib['class']) is not None: - text += '<hits:Hit>' + el.text + '</hits:Hit>' - else: - text += el.text - if el.tail is not None: - text += el.tail - print(text) - record.text = text + self.parse_span(el, record) return record diff --git a/main.py b/main.py index 8ef74a4..d43431c 100644 --- a/main.py +++ b/main.py @@ -83,8 +83,8 @@ def endpoint( 'request': request, 'n_hits': nHits, 'records': records - }) - # media_type='application/xml') + }, + media_type='application/xml') # return str(res) return {'operation': operation, 'version': version} diff --git a/notes.txt b/notes.txt index fdef6aa..4f5f782 100644 --- a/notes.txt +++ b/notes.txt @@ -12,4 +12,4 @@ p. 14: x-cmd-resource-info parameter present in the query example, but never exp p. 12 and elsewhere: http://explain.z3950.org/dtd/2.0/ mentioned as the URL of the record schema, but its actual URL is http://zeerex.z3950.org/dtd/zeerex-2.0.dtd . -p. 14: What is the sruResponse prefix? It is never mentioned. No namespace is provided here, so the example is actually invalid XML. (Who doesn't validate their examples before showing them to a wide audience -- in an official specification, no less??) Maybe sru was actually meant (see p. 2 and an SRU v. 1.2 example above)? \ No newline at end of file +p. 14: What is the sruResponse prefix? It is never mentioned. No namespace is provided here, so the example is actually invalid XML. (Who doesn't validate their examples before showing them to a wide audience -- in an official specification, no less??) Maybe sru was actually meant (see p. 2 and an SRU v. 1.2 example above)? I'm putting sru there for now. \ No newline at end of file diff --git a/static/dataview_adv.xml b/static/dataview_adv.xml index 6b1d3a1..203f02c 100644 --- a/static/dataview_adv.xml +++ b/static/dataview_adv.xml @@ -1,10 +1,10 @@ <fcs:DataView type="application/x-clarin-fcs-adv+xml"> {% for hit in rf.dv_adv %} - <adv:Advanced unit="item"> + <adv:Advanced unit="item" xmlns:adv="http://clarin.eu/fcs/dataview/advanced"> <adv:Segments>{% for seg in hit.segments %} - <adv:Segment id="{{ seg.id }}" start="{{ seg.start }}" end="{{ seg.end }}"{% if seg.ref %} ref="{{ seg.ref }}"{% endif %}/> - </adv:Segments>{% for layer in hit.layers %} - <Layer id="{{ layer.id }}">{% for span in layer.spans %} + <adv:Segment id="{{ seg.id }}" start="{{ seg.start }}" end="{{ seg.end }}"{% if seg.ref %} ref="{{ seg.ref }}"{% endif %}/>{% endfor %} + </adv:Segments>{% for layer_id in hit.layers %} + <Layer id="{{ layer_id }}">{% for span in hit.layers[layer_id] %} <Span ref="{{ span.ref }}"{% if span.alt_value %} alt-value="{{ span.alt_value }}"{% endif %}{% if span.highlight %} highlight="{{ span.highlight }}"{% endif %}>{{ span.value }}</Span> {% endfor %} </Layer> diff --git a/static/search_retrieve_response.xml b/static/search_retrieve_response.xml index b356b31..c9664a7 100644 --- a/static/search_retrieve_response.xml +++ b/static/search_retrieve_response.xml @@ -1,19 +1,19 @@ <?xml version='1.0' encoding='utf-8'?> -<sruResponse:searchRetrieveResponse> -<sruResponse:version>2.0</sruResponse:version> -<sruResponse:numberOfRecords>{{ n_hits }}</sruResponse:numberOfRecords> -<sruResponse:records>{% for record in records %} - <sruResponse:record> - <sruResponse:recordSchema>http://clarin.eu/fcs/resource</sruResponse:recordSchema> - <sruResponse:recordXMLEscaping>xml</sruResponse:recordXMLEscaping> - <sruResponse:recordData>{% for resource in record.resources %} +<sru:searchRetrieveResponse xmlns:sru="http://docs.oasis-open.org/ns/search-ws/sruResponse"> +<sru:version>2.0</sru:version> +<sru:numberOfRecords>{{ n_hits }}</sru:numberOfRecords> +<sru:records>{% for record in records %} + <sru:record> + <sru:recordSchema>http://clarin.eu/fcs/resource</sru:recordSchema> + <sru:recordXMLEscaping>xml</sru:recordXMLEscaping> + <sru:recordData>{% for resource in record.resources %} {% include 'resource.xml' %} {% endfor %} - </sruResponse:recordData> - <sruResponse:recordPosition>{{ loop.index }}</sruResponse:recordPosition> - </sruResponse:record> + </sru:recordData> + <sru:recordPosition>{{ loop.index }}</sru:recordPosition> + </sru:record> {% endfor %} -</sruResponse:records>{% if n_hits > records|length %} -<sruResponse:nextRecordPosition>{{ records|length + 1 }}</sruResponse:nextRecordPosition>{% endif %} -<sruResponse:resultCountPrecision>info:srw/vocabulary/resultCountPrecision/1/exact</sruResponse:resultCountPrecision> -</sruResponse:searchRetrieveResponse> \ No newline at end of file +</sru:records>{% if n_hits > records|length %} +<sru:nextRecordPosition>{{ records|length + 1 }}</sru:nextRecordPosition>{% endif %} +<sru:resultCountPrecision>info:srw/vocabulary/resultCountPrecision/1/exact</sru:resultCountPrecision> +</sru:searchRetrieveResponse> \ No newline at end of file -- GitLab