Skip to content
Snippets Groups Projects
Commit 8e4aefee authored by Arkhangelskiy, Timofey
Browse files

Enable advanced data view for Tsakorpus

parent 7f1707db
Branches
No related tags found
No related merge requests found
......@@ -15,26 +15,34 @@ class Record:
self.dataView = dataView
# For simple search:
self.text = ''
self.textNoHighlight = '' # no <hits:Hit> elements, just text
# For advanced search:
self.segments = []
self.layers = []
self.layers = {} # ID -> content
def as_dict(self):
"""
Returns a dictionary for insertion into the XML template.
"""
# In generic SRU, records, resources and resource fragments
# are all distinct entities. In FCS, it's the same thing. Still,
# I model the templates in a generic way, in case that changes
# in FCS in the future
record = {
'resources': {
'resources': [{
'resource_fragments': [{
'dv_hits': [{
'text': self.text
}],
'dv_adv': []
'dv_adv': [{
'segments': self.segments,
'layers': self.layers
}]
}]
}
}]
}
if self.dataView == DataView.adv:
record['resources']['resource_fragments'][0]['dv_adv'].append({
record['resources'][0]['resource_fragments'][0]['dv_adv'].append({
'segments': self.segments,
'layers': self.layers
})
......
from urllib.parse import quote
import re
import json
import urllib.request
import html
from lxml.html import fragment_fromstring
from .enums import *
from .config import ResourceConfig
......@@ -16,6 +16,83 @@ class TsakorpusResponseParser:
def __init__(self):
pass
def parse_annotation(self, anno, segID, record):
    """
    Parse HTML annotation for one word taken from a hit.
    Add the data to the layers in the record object.

    anno is the HTML source of the annotation, segID is the ID of the
    segment the annotation refers to, and record is the object whose
    "layers" dictionary is updated. Exactly one span is appended to
    each of the "pos" and "lemma" layers. Distinct values are sorted
    and joined with "|"; "_" is used when no value was found.
    """
    annoTree = fragment_fromstring(anno,
                                   create_parent='div')

    lemmas = set()
    lexNodes = annoTree.xpath('div[@class="popup_word"]/div[@class="popup_ana"]/span[@class="popup_lex"]')
    for node in lexNodes:
        if node.text is not None:
            lemmas.add(node.text)

    pos = set()
    posNodes = annoTree.xpath('div[@class="popup_word"]/div[@class="popup_ana"]/span[@class="popup_pos"]')
    for node in posNodes:
        if node.text is None:
            continue
        # lxml decodes the &nbsp; entity to U+00A0 while parsing the
        # fragment, so that character is stripped as well. The literal
        # entity is still matched in case the annotation arrives
        # escaped twice.
        posText = re.sub('&nbsp;|[ \t\xa0\ufeff]+', '', node.text)
        # A tag that consisted of whitespace only carries no
        # information; skipping it lets the "_" placeholder apply.
        if posText:
            pos.add(posText)

    lemmasStr = '|'.join(sorted(lemmas)) if lemmas else '_'
    posStr = '|'.join(sorted(pos)) if pos else '_'

    # "pos" is created before "lemma", as before, so the insertion
    # order of the layers dictionary does not change.
    record.layers.setdefault('pos', []).append({
        'ref': segID,
        'value': posStr
    })
    record.layers.setdefault('lemma', []).append({
        'ref': segID,
        'value': lemmasStr
    })
def parse_span(self, el, record):
    """
    Parse one <span> element from the HTML representation
    of one hit returned by a Tsakorpus instance. Add the extracted
    text to the record object.

    record.text receives the text with matched words wrapped in
    <hits:Hit> tags; record.textNoHighlight receives exactly the same
    text without those tags. For every element whose class contains
    "word", a segment with 1-based inclusive character offsets into
    record.textNoHighlight is appended to record.segments, and its
    "data-ana" attribute, if present, is passed to parse_annotation().
    """
    elClass = el.attrib.get('class', '')

    if el.tag == 'span' and 'sentence_meta' in elClass:
        # This is the introductory span that only contains the header
        # (title, author etc.). Only its tail belongs to the sentence.
        if el.tail is not None:
            headerTail = el.tail.strip('\n\t ')
            record.text += headerTail
            # Keep both strings in sync; otherwise the segment offsets,
            # which are computed from textNoHighlight, would not line
            # up with the text.
            record.textNoHighlight += headerTail
        return

    if el.text is not None:
        bWord = re.search('\\bword\\b', elClass) is not None
        bMatch = bWord and re.search('\\bwmatch\\b', elClass) is not None
        segID = None
        if bWord:
            # Offsets must be taken before the word itself is appended.
            segID = 's' + str(len(record.segments))
            record.segments.append({
                'id': segID,
                'start': len(record.textNoHighlight) + 1,
                'end': len(record.textNoHighlight) + len(el.text)
            })
        # Text of any element, word or not, goes to both strings.
        record.textNoHighlight += el.text
        if bWord and 'data-ana' in el.attrib:
            self.parse_annotation(el.attrib['data-ana'], segID, record)
        if bMatch:
            record.text += '<hits:Hit>' + el.text + '</hits:Hit>'
        else:
            record.text += el.text

    if el.tail is not None:
        record.text += el.tail
        record.textNoHighlight += el.tail
def parse_context(self, hit, config: ResourceConfig, lang=''):
"""
Parse one hit. Return it as a Record object.
......@@ -27,23 +104,12 @@ class TsakorpusResponseParser:
or lang not in hit['languages']
or 'text' not in hit['languages'][lang]):
return record
content = fragment_fromstring(hit['languages'][lang]['text'],
contentTxt = re.sub('[\r\n\t\ufeff]+', '', hit['languages'][lang]['text'], flags=re.DOTALL)
print(contentTxt)
content = fragment_fromstring(contentTxt,
create_parent='div')
text = ''
for el in content:
if el.tag == 'span' and 'class' in el.attrib and 'sentence_meta' in el.attrib['class']:
if el.tail is not None:
text += el.tail.strip('\n\t ')
continue
if el.text is not None:
if 'class' in el.attrib and re.search('\\bwmatch\\b', el.attrib['class']) is not None:
text += '<hits:Hit>' + el.text + '</hits:Hit>'
else:
text += el.text
if el.tail is not None:
text += el.tail
print(text)
record.text = text
self.parse_span(el, record)
return record
......
......@@ -83,8 +83,8 @@ def endpoint(
'request': request,
'n_hits': nHits,
'records': records
})
# media_type='application/xml')
},
media_type='application/xml')
# return str(res)
return {'operation': operation, 'version': version}
......
......@@ -12,4 +12,4 @@ p. 14: x-cmd-resource-info parameter present in the query example, but never exp
p. 12 and elsewhere: http://explain.z3950.org/dtd/2.0/ mentioned as the URL of the record schema, but its actual URL is http://zeerex.z3950.org/dtd/zeerex-2.0.dtd .
p. 14: What is the sruResponse prefix? It is never mentioned. No namespace is provided here, so the example is actually invalid XML. (Who doesn't validate their examples before showing them to a wide audience -- in an official specification, no less??) Maybe sru was actually meant (see p. 2 and an SRU v. 1.2 example above)?
\ No newline at end of file
p. 14: What is the sruResponse prefix? It is never mentioned. No namespace is provided here, so the example is actually invalid XML. (Who doesn't validate their examples before showing them to a wide audience -- in an official specification, no less??) Maybe sru was actually meant (see p. 2 and an SRU v. 1.2 example above)? I'm putting sru there for now.
\ No newline at end of file
<fcs:DataView type="application/x-clarin-fcs-adv+xml">
{% for hit in rf.dv_adv %}
<adv:Advanced unit="item">
<adv:Advanced unit="item" xmlns:adv="http://clarin.eu/fcs/dataview/advanced">
<adv:Segments>{% for seg in hit.segments %}
<adv:Segment id="{{ seg.id }}" start="{{ seg.start }}" end="{{ seg.end }}"{% if seg.ref %} ref="{{ seg.ref }}"{% endif %}/>
</adv:Segments>{% for layer in hit.layers %}
<Layer id="{{ layer.id }}">{% for span in layer.spans %}
<adv:Segment id="{{ seg.id }}" start="{{ seg.start }}" end="{{ seg.end }}"{% if seg.ref %} ref="{{ seg.ref }}"{% endif %}/>{% endfor %}
</adv:Segments>{% for layer_id in hit.layers %}
<Layer id="{{ layer_id }}">{% for span in hit.layers[layer_id] %}
<Span ref="{{ span.ref }}"{% if span.alt_value %} alt-value="{{ span.alt_value }}"{% endif %}{% if span.highlight %} highlight="{{ span.highlight }}"{% endif %}>{{ span.value }}</Span>
{% endfor %}
</Layer>
......
<?xml version='1.0' encoding='utf-8'?>
<sruResponse:searchRetrieveResponse>
<sruResponse:version>2.0</sruResponse:version>
<sruResponse:numberOfRecords>{{ n_hits }}</sruResponse:numberOfRecords>
<sruResponse:records>{% for record in records %}
<sruResponse:record>
<sruResponse:recordSchema>http://clarin.eu/fcs/resource</sruResponse:recordSchema>
<sruResponse:recordXMLEscaping>xml</sruResponse:recordXMLEscaping>
<sruResponse:recordData>{% for resource in record.resources %}
<sru:searchRetrieveResponse xmlns:sru="http://docs.oasis-open.org/ns/search-ws/sruResponse">
<sru:version>2.0</sru:version>
<sru:numberOfRecords>{{ n_hits }}</sru:numberOfRecords>
<sru:records>{% for record in records %}
<sru:record>
<sru:recordSchema>http://clarin.eu/fcs/resource</sru:recordSchema>
<sru:recordXMLEscaping>xml</sru:recordXMLEscaping>
<sru:recordData>{% for resource in record.resources %}
{% include 'resource.xml' %}
{% endfor %}
</sruResponse:recordData>
<sruResponse:recordPosition>{{ loop.index }}</sruResponse:recordPosition>
</sruResponse:record>
</sru:recordData>
<sru:recordPosition>{{ loop.index }}</sru:recordPosition>
</sru:record>
{% endfor %}
</sruResponse:records>{% if n_hits > records|length %}
<sruResponse:nextRecordPosition>{{ records|length + 1 }}</sruResponse:nextRecordPosition>{% endif %}
<sruResponse:resultCountPrecision>info:srw/vocabulary/resultCountPrecision/1/exact</sruResponse:resultCountPrecision>
</sruResponse:searchRetrieveResponse>
\ No newline at end of file
</sru:records>{% if n_hits > records|length %}
<sru:nextRecordPosition>{{ records|length + 1 }}</sru:nextRecordPosition>{% endif %}
<sru:resultCountPrecision>info:srw/vocabulary/resultCountPrecision/1/exact</sru:resultCountPrecision>
</sru:searchRetrieveResponse>
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment