Skip to content
Snippets Groups Projects
Commit 1672c9d3 authored by Arkhangelskiy, Timofey's avatar Arkhangelskiy, Timofey
Browse files

Improve logic, parse Litterae hits

parent 397e4286
No related branches found
No related tags found
No related merge requests found
...@@ -22,7 +22,7 @@ class LitteraeQueryParser(QueryParser): ...@@ -22,7 +22,7 @@ class LitteraeQueryParser(QueryParser):
termIndexes = self.term_indexes(getParams) termIndexes = self.term_indexes(getParams)
nWords = len(termIndexes) nWords = len(termIndexes)
boolOperatorMentioned = False boolOperatorMentioned = False
s = 'source=advanced&sort=urn&lemma_search=False&simple_search_id=' + str(random.randint(100000, 1000000)) s = 'source=advanced&sort=urn&lemma_search=False'
for param in getParams: for param in getParams:
nSfx = '' nSfx = ''
if param[1] > 0: if param[1] > 0:
......
...@@ -2,147 +2,59 @@ from urllib.parse import quote ...@@ -2,147 +2,59 @@ from urllib.parse import quote
import re import re
import json import json
import html import html
from lxml.html import fragment_fromstring from lxml.html import fromstring, tostring
from .enums import * from .enums import *
from .config import ResourceConfig from .config import ResourceConfig
from .search_retrieve import Record from .search_retrieve import Record
from .diagnostics import Diagnostic, DiagnosticTypes from .diagnostics import Diagnostic, DiagnosticTypes
class POSConvertor:
    """
    Translate corpus-specific part-of-speech / grammar tags into UPOS
    with the help of the regex correspondences listed in the config.
    """

    def __init__(self, config: ResourceConfig):
        # Correspondences exactly as stored in the config; they are
        # iterated as two-item (regex, replacement tag) entries.
        self.posConvert = config.pos_convert
        # The same entries with every regex compiled once up front.
        self.posTests = []
        for rxSource, upos in self.posConvert:
            self.posTests.append((re.compile(rxSource), upos))

    def convert_pos(self, pos):
        """
        Return the tag paired with the first regex that matches
        anywhere in pos. If no regex matches, return pos unchanged.
        """
        for rxTag, upos in self.posTests:
            if rxTag.search(pos):
                return upos
        return pos
class LitteraeResponseParser: class LitteraeResponseParser:
""" """
Parses responses from a Litterae instance. Parses responses from a Litterae instance.
TODO: implement
""" """
rxNHits = re.compile('(?:Suchergebnisse:|Search [rR]esults:)[ \t\r\n]*([0-9]+)')
rxUselessTags = re.compile('</?(?:p|small)[^\r\n<>]*>')
rxHitTag = re.compile('(</?)strong>')
def __init__(self): def __init__(self):
self.pc = None # POS convertor, rebuilt with each parse call self.pc = None # POS convertor, rebuilt with each parse call
def parse_annotation(self, anno, segID, record): def process_hits(self, tableNode, config: ResourceConfig, diagnostics: list[Diagnostic], advancedHits=False):
"""
Parse HTML annotation for one word taken from a hit.
Add the data to the layers in the record object.
""" """
annoTree = fragment_fromstring(anno, Process hits from an HTML node with the results table.
create_parent='div') If anything goes wrong, add Diagnostic objects to diagnostics list.
lemmas = set() Return a list of Record objects.
lemmasStr = '_'
pos = set()
posStr = '_'
lexNodes = annoTree.xpath('div[@class="popup_word"]/div[@class="popup_ana"]/span[@class="popup_lex"]')
for node in lexNodes:
if node.text is not None:
lemmas.add(node.text)
if len(lemmas) > 0:
lemmasStr = '|'.join(l for l in sorted(lemmas))
posNodes = annoTree.xpath('div[@class="popup_word"]/div[@class="popup_ana"]/span[@class="popup_pos"]')
for node in posNodes:
if node.text is not None:
posText = re.sub('&nbsp;|[ \t\ufeff]+', '', node.text)
posText = self.pc.convert_pos(posText)
pos.add(posText)
if len(pos) > 0:
posStr = '|'.join(p for p in sorted(pos))
if 'pos' not in record.layers:
record.layers['pos'] = []
record.layers['pos'].append({
'ref': segID,
'value': posStr
})
if 'lemma' not in record.layers:
record.layers['lemma'] = []
record.layers['lemma'].append({
'ref': segID,
'value': lemmasStr
})
def parse_span(self, el, record, advancedHits=False):
"""
Parse one <span> element from the HTML representation
of one hit returned by a Litterae instance. Add the extracted
text to the record object.
"""
if el.tag == 'span' and 'class' in el.attrib and 'sentence_meta' in el.attrib['class']:
# This is the introductory span that only contains the header
# (title, author etc.)
if el.tail is not None:
record.text += el.tail.strip('\n\t ')
return
if el.text is not None:
bMatch = False
if 'class' in el.attrib and re.search('\\bword\\b', el.attrib['class']) is not None:
if re.search('\\bwmatch\\b', el.attrib['class']) is not None:
bMatch = True
record.textNoHighlight += el.text
if advancedHits:
segID = 's' + str(len(record.segments))
segment = {
'id': segID,
'start': len(record.textNoHighlight) + 1,
'end': len(record.textNoHighlight) + len(el.text)
}
record.segments.append(segment)
if 'data-ana' in el.attrib:
self.parse_annotation(el.attrib['data-ana'], segID, record)
if bMatch:
record.text += '<hits:Hit>' + el.text + '</hits:Hit>'
else:
record.text += el.text
if el.tail is not None:
record.text += el.tail
record.textNoHighlight += el.tail
def parse_context(self, hit, config: ResourceConfig, lang='', advancedHits=False):
"""
Parse one hit. Return it as a Record object.
""" """
records = []
rows = tableNode.xpath('tr')
iRow = 0
iRowOffset = 0
while iRow < len(rows) and iRow - iRowOffset < config.max_hits:
row = rows[iRow]
iRow += 1
paragraphs = row.xpath('td/p')
if len(paragraphs) <= 0:
iRowOffset += 1
continue
record = Record(advancedHits=advancedHits) record = Record(advancedHits=advancedHits)
if len(lang) <= 0: txtParagraphs = []
lang = config.search_lang_id for p in paragraphs:
if ('languages' not in hit txt = tostring(p, encoding='utf-8').decode('utf-8')
or lang not in hit['languages'] print(txt, type(txt))
or 'text' not in hit['languages'][lang]): txt = self.rxUselessTags.sub('', txt)
return record txt = self.rxHitTag.sub('\\1hits:Hit>', txt)
contentTxt = re.sub('[\r\n\t\ufeff]+', '', hit['languages'][lang]['text'], flags=re.DOTALL) txtParagraphs.append(txt.strip())
print(contentTxt) record.text = ' &lt;...&gt; '.join(txtParagraphs).strip()
content = fragment_fromstring(contentTxt, records.append(record)
create_parent='div') return records
for el in content:
self.parse_span(el, record, advancedHits) def parse(self, response, config: ResourceConfig, xFcsDataviews):
return record
def parse(self, response, config: ResourceConfig, xFcsDataviews, lang=''):
""" """
Read a dictionary with the first N hits returned by a Litterae Read HTML response with the first N hits returned by a Litterae
instance. Return a list of Record objects and the total number of instance. Return a list of Record objects and the total number of
records found. records found.
""" """
self.pc = POSConvertor(config)
diagnostics = [] diagnostics = []
advancedHits = False advancedHits = False
dataViewsRequested = {v.strip() for v in xFcsDataviews.split(',') if len(v.strip()) > 0} dataViewsRequested = {v.strip() for v in xFcsDataviews.split(',') if len(v.strip()) > 0}
...@@ -151,15 +63,23 @@ class LitteraeResponseParser: ...@@ -151,15 +63,23 @@ class LitteraeResponseParser:
for v in dataViewsRequested: for v in dataViewsRequested:
if v not in ('hits', 'adv'): if v not in ('hits', 'adv'):
diagnostics.append(Diagnostic(DiagnosticTypes.fcs, 4, details=v)) diagnostics.append(Diagnostic(DiagnosticTypes.fcs, 4, details=v))
srcTree = fromstring(response)
nRecords = 0 nRecords = 0
if 'n_sentences' in response: nHitsNode = srcTree.xpath('//article[@class="container-fluid"]/header/h1')
nRecords = response['n_sentences'] if len(nHitsNode) > 0 and nHitsNode[0].text is not None:
if nRecords <= 0 or 'contexts' not in response: m = self.rxNHits.search(nHitsNode[0].text)
return [], nRecords if m is not None:
nRecords = int(m.group(1))
resTableNodes = srcTree.xpath('//table[@id="partsSearchResultTable"]/tbody')
records = [] records = []
for context in response['contexts']: if len(resTableNodes) <= 0:
records.append(self.parse_context(context, config, lang, advancedHits)) nRecords = 0
return records, nRecords else:
records = self.process_hits(resTableNodes[0], config, diagnostics, advancedHits=advancedHits)
if len(records) < nRecords and len(records) < config.max_hits:
diagnostics.append(Diagnostic(DiagnosticTypes.sru, 59,
message='Some results could not be shown due to copyright restrictions.'))
return records, nRecords, diagnostics
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -154,11 +154,11 @@ class TsakorpusResponseParser: ...@@ -154,11 +154,11 @@ class TsakorpusResponseParser:
if 'n_sentences' in response: if 'n_sentences' in response:
nRecords = response['n_sentences'] nRecords = response['n_sentences']
if nRecords <= 0 or 'contexts' not in response: if nRecords <= 0 or 'contexts' not in response:
return [], nRecords return [], nRecords, diagnostics
records = [] records = []
for context in response['contexts']: for context in response['contexts']:
records.append(self.parse_context(context, config, lang, advancedHits)) records.append(self.parse_context(context, config, lang, advancedHits))
return records, nRecords return records, nRecords, diagnostics
if __name__ == '__main__': if __name__ == '__main__':
......
# Contains functions called by the top-level view functions
# that process the user's request and return a rendered XML
# template
from typing import Optional
from fastapi import Request, Query, Response
from .enums import *
from .diagnostics import Diagnostic
from .config import ResourceConfig
def fatal_response(operation: Operation,
                   version: SRUVersion,
                   diagnostics: list[Diagnostic],
                   request, templates):
    """
    Return a response with the fatal diagnostics
    and no other payload.

    :param operation: Operation requested by the client. It decides
        which response template is rendered.
    :param version: SRU version; only used to pick between the 1.2 and
        the 2.0 explain template.
    :param diagnostics: Diagnostics to report. Each one is serialized
        with str() before being handed to the template.
    :param request: Request object, passed through to the template.
    :param templates: Object whose TemplateResponse() renders a template.
    :return: The rendered XML template response. Unlike before, this is
        never None: operations other than searchRetrieve are answered
        with the explain template.
    """
    context = {
        'request': request,
        'diagnostics': [str(d) for d in diagnostics]
    }
    if operation == Operation.searchRetrieve:
        templateName = 'search_retrieve_response.xml'
        context['n_hits'] = 0
    else:
        # Explain, or any operation without a dedicated template.
        # Previously the latter fell off the end of the function and
        # None was returned; the explain response is used as the generic
        # carrier for the diagnostics instead.
        templateName = 'explain_response_2.0.xml'
        if version == SRUVersion.v1_2:
            templateName = 'explain_response_1.2.xml'
    return templates.TemplateResponse(templateName,
                                      context,
                                      media_type='application/xml')
def process_search_retrieve(version: SRUVersion,
                            queryType: QueryType,
                            query: str,
                            searchOptions: dict[str, str],
                            config: Optional[ResourceConfig],
                            diagnostics: list[Diagnostic],
                            app, request, templates):
    """
    Process a searchRetrieve request.
    Return a rendered XML response.

    :param version: SRU version; forwarded to fatal_response() on failure.
    :param queryType: Query type; currently not used here.
    :param query: Query string that is translated for the corpus platform.
    :param searchOptions: Extra request parameters; 'x-fcs-dataviews'
        is handed to the response parser.
    :param config: Configuration of the corpus being queried.
    :param diagnostics: Non-fatal diagnostics produced by validation.
        They are reported together with the ones produced while parsing
        the platform response (previously they were silently discarded).
    :param app: Application object that holds the query parsers
        (qp_*) and response parsers (rp_*) for each platform.
    :param request: Request object, passed through to the template.
    :param templates: Object whose TemplateResponse() renders a template.
    :return: The rendered XML template response.
    """
    # Function-scope import so that the block does not depend on
    # the file-level import list.
    import logging
    logger = logging.getLogger(__name__)

    if config is None:
        return fatal_response(Operation.searchRetrieve, version, diagnostics, request, templates)

    # Both platforms follow the same steps and only differ in the pair
    # of parsers used, so pick the pair and share the rest of the code.
    if config.platform == CorpPlatform.tsakorpus:
        queryParser, responseParser = app.qp_tsakorpus, app.rp_tsakorpus
    elif config.platform == CorpPlatform.litterae:
        queryParser, responseParser = app.qp_litterae, app.rp_litterae
    else:
        # Previously None was returned for an unknown platform.
        logger.warning('Unsupported corpus platform: %s', config.platform)
        return fatal_response(Operation.searchRetrieve, version, diagnostics, request, templates)

    try:
        strGetParams = queryParser.translate_fcsql(query, config)
        logger.debug('Translated query: %s', strGetParams)
        res = queryParser.send_query(strGetParams, config)
    except Diagnostic as diag:
        # The parsers signal fatal problems by raising a Diagnostic.
        return fatal_response(Operation.searchRetrieve, version,
                              [*diagnostics, diag], request, templates)

    records, nHits, parseDiagnostics = responseParser.parse(res, config,
                                                            searchOptions['x-fcs-dataviews'])
    # Validation diagnostics first, then the parser's ones; dict.fromkeys()
    # drops exact duplicates while preserving the order.
    diagStrs = list(dict.fromkeys(str(d) for d in [*diagnostics, *parseDiagnostics]))
    return templates.TemplateResponse('search_retrieve_response.xml',
                                      {
                                          'request': request,
                                          'n_hits': nHits,
                                          'records': [r.as_dict() for r in records],
                                          'diagnostics': diagStrs
                                      },
                                      media_type='application/xml')
def process_request(operation: Operation,
                    version: SRUVersion,
                    queryType: QueryType,
                    query: str,
                    searchOptions: dict[str, str],
                    config: Optional[ResourceConfig],
                    diagnostics: list[Diagnostic],
                    app, request, templates):
    """
    Dispatch a validated user request received through the endpoint()
    function in main.py and return a rendered template.

    :param diagnostics: List of diagnostics produced by the validation
        function.
    :return: The rendered response. None is returned when nothing is
        wrong with the request but the operation is not searchRetrieve.
    """
    # A missing config or at least one fatal diagnostic means the
    # request cannot be served: answer with the diagnostics only.
    cannotProceed = config is None or any(d.is_fatal() for d in diagnostics)
    if cannotProceed:
        return fatal_response(operation, version, diagnostics, request, templates)

    # NOTE(review): searchRetrieve is the only operation handled
    # here; every other operation yields None.
    if operation != Operation.searchRetrieve:
        return None

    return process_search_retrieve(version, queryType, query, searchOptions,
                                   config, diagnostics, app, request, templates)
if __name__ == '__main__':
    # Nothing to do when run as a script: this module only provides
    # helper functions for the view functions.
    pass
...@@ -11,9 +11,11 @@ from common.tsakorpus_response_parser import TsakorpusResponseParser ...@@ -11,9 +11,11 @@ from common.tsakorpus_response_parser import TsakorpusResponseParser
from common.enums import * from common.enums import *
from common.diagnostics import Diagnostic from common.diagnostics import Diagnostic
from common.config import ResourceConfig, read_configs from common.config import ResourceConfig, read_configs
from common.views_logic import *
import json import json
import os import os
import re import re
import copy
import uvicorn import uvicorn
app = FastAPI() app = FastAPI()
...@@ -59,58 +61,28 @@ def endpoint( ...@@ -59,58 +61,28 @@ def endpoint(
alias='x-fcs-rewrites-allowed' alias='x-fcs-rewrites-allowed'
) )
): ):
searchOptions = {
'x-fcs-endpoint-description': xFcsEndpointDescription,
'x-fcs-context': xFcsContext,
'x-fcs-dataviews': xFcsDataviews,
'x-fcs-rewrites-allowed': xFcsRewritesAllowed
}
# Check if the corpus ID is correct
if corpusID not in app.configs: if corpusID not in app.configs:
message = 'No corpus with this ID (' + corpusID +') is served by this Endpoint. ' \ message = 'No corpus with this ID (' + corpusID +') is served by this Endpoint. ' \
'Valid corpus IDs are: ' + '; '.join(cID for cID in sorted(app.configs)) + '.' 'Valid corpus IDs are: ' + '; '.join(cID for cID in sorted(app.configs)) + '.'
diagBody = str(Diagnostic(DiagnosticTypes.sru, 235, diag = Diagnostic(DiagnosticTypes.sru, 235, message=message) # "Database does not exist"
message=message)) # "Database does not exist" return process_request(operation, version, queryType, query, searchOptions, None, [diag], app, request, templates)
return Response(content=diagBody, media_type='application/xml')
config = app.configs[corpusID] config = app.configs[corpusID]
# Check for common problems with parameter values
diagnostics = app.qp.validate_query(operation, version, queryType, query, diagnostics = app.qp.validate_query(operation, version, queryType, query,
xFcsEndpointDescription, xFcsContext, xFcsEndpointDescription, xFcsContext,
xFcsDataviews, xFcsRewritesAllowed) xFcsDataviews, xFcsRewritesAllowed)
if any(d.is_fatal() for d in diagnostics): # Now, do the substantial things
return '\n'.join(str(d) for d in diagnostics) return process_request(operation, version, queryType, query, searchOptions, config, diagnostics, app, request, templates)
# return {'operation': operation, 'version': version}
if operation == Operation.searchRetrieve:
if config.platform == CorpPlatform.tsakorpus:
try:
strGetParams = app.qp_tsakorpus.translate_fcsql(query, config)
print(strGetParams)
res = app.qp_tsakorpus.send_query(strGetParams, config)
except Diagnostic as diag:
print('diag', str(diag))
return Response(content=str(diag), media_type='application/xml')
records, nHits = app.rp_tsakorpus.parse(res, config, xFcsDataviews)
records = [r.as_dict() for r in records]
return templates.TemplateResponse('search_retrieve_response.xml',
{
'request': request,
'n_hits': nHits,
'records': records
},
media_type='application/xml')
elif config.platform == CorpPlatform.litterae:
try:
strGetParams = app.qp_litterae.translate_fcsql(query, config)
print(strGetParams)
return strGetParams
res = app.qp_litterae.send_query(strGetParams, config)
except Diagnostic as diag:
print('diag', str(diag))
return Response(content=str(diag), media_type='application/xml')
records, nHits = app.rp_litterae.parse(res, config, xFcsDataviews)
records = [r.as_dict() for r in records]
return templates.TemplateResponse('search_retrieve_response.xml',
{
'request': request,
'n_hits': nHits,
'records': records
},
media_type='application/xml')
# return str(res)
return {'operation': operation, 'version': version}
if __name__ == '__main__': if __name__ == '__main__':
......
{% set ep_version = 1 %} {% set ep_version = 1 %}
<?xml version='1.0' encoding='utf-8'?> <?xml version='1.0' encoding='utf-8'?>
<sru:explainResponse xmlns:sru="http://www.loc.gov/zing/srw/"> <sru:explainResponse xmlns:sru="http://www.loc.gov/zing/srw/">
<sru:version>1.2</sru:version> <sru:version>1.2</sru:version>{% if config %}
<sru:record> <sru:record>
<sru:recordSchema>http://explain.z3950.org/dtd/2.0/</sru:recordSchema> <sru:recordSchema>http://explain.z3950.org/dtd/2.0/</sru:recordSchema>
<sru:recordPacking>xml</sru:recordPacking> <sru:recordPacking>xml</sru:recordPacking>
...@@ -39,7 +39,10 @@ ...@@ -39,7 +39,10 @@
<!-- <sru:echoedExplainRequest> is OPTIONAL --> <!-- <sru:echoedExplainRequest> is OPTIONAL -->
<sru:echoedExplainRequest> <sru:echoedExplainRequest>
<sru:version>1.2</sru:version> <sru:version>1.2</sru:version>
<sru:baseUrl>{{ base_url }}</sru:baseUrl> <sru:baseUrl>{{ config.base_url }}</sru:baseUrl>
</sru:echoedExplainRequest>{% if endpoint_desc_needed %} </sru:echoedExplainRequest>{% if endpoint_desc_needed %}
{% include 'endpoint_description.xml' %}{% endif %} {% include 'endpoint_description.xml' %}{% endif %}{% endif %}{% if diagnostics and diagnostics|length > 0 %}
<sru:diagnostics>{% for d in diagnostics %}
{{ d|safe }}{% endfor %}
</sru:diagnostics>{% endif %}
</sru:explainResponse> </sru:explainResponse>
\ No newline at end of file
<?xml version='1.0' encoding='utf-8'?> <?xml version='1.0' encoding='utf-8'?>
<sru:searchRetrieveResponse xmlns:sru="http://docs.oasis-open.org/ns/search-ws/sruResponse"> <sru:searchRetrieveResponse xmlns:sru="http://docs.oasis-open.org/ns/search-ws/sruResponse">
<sru:version>2.0</sru:version> <sru:version>2.0</sru:version>
<sru:numberOfRecords>{{ n_hits }}</sru:numberOfRecords> <sru:numberOfRecords>{{ n_hits }}</sru:numberOfRecords>{% if records %}
<sru:records>{% for record in records %} <sru:records>{% for record in records %}
<sru:record> <sru:record>
<sru:recordSchema>http://clarin.eu/fcs/resource</sru:recordSchema> <sru:recordSchema>http://clarin.eu/fcs/resource</sru:recordSchema>
...@@ -15,5 +15,8 @@ ...@@ -15,5 +15,8 @@
{% endfor %} {% endfor %}
</sru:records>{% if n_hits > records|length %} </sru:records>{% if n_hits > records|length %}
<sru:nextRecordPosition>{{ records|length + 1 }}</sru:nextRecordPosition>{% endif %} <sru:nextRecordPosition>{{ records|length + 1 }}</sru:nextRecordPosition>{% endif %}
<sru:resultCountPrecision>info:srw/vocabulary/resultCountPrecision/1/exact</sru:resultCountPrecision> <sru:resultCountPrecision>info:srw/vocabulary/resultCountPrecision/1/exact</sru:resultCountPrecision>{% endif %}{% if diagnostics and diagnostics|length > 0 %}
<sru:diagnostics>{% for d in diagnostics %}
{{ d|safe }}{% endfor %}
</sru:diagnostics>{% endif %}
</sru:searchRetrieveResponse> </sru:searchRetrieveResponse>
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment