Improve logic, parse Litterae hits

1672c9d3 · Arkhangelskiy, Timofey · 397e4286 · 1672c9d3 · 1672c9d3 · 1672c9d3
Commit 1672c9d3 authored 2 years ago by Arkhangelskiy, Timofey
--- a/common/litterae_query_parser.py
+++ b/common/litterae_query_parser.py
@@ -22,7 +22,7 @@ class LitteraeQueryParser(QueryParser):
        termIndexes = self.term_indexes(getParams)
        nWords = len(termIndexes)
        boolOperatorMentioned = False
-        s = 'source=advanced&sort=urn&lemma_search=False&simple_search_id=' + str(random.randint(100000, 1000000))
+        s = 'source=advanced&sort=urn&lemma_search=False'
        for param in getParams:
            nSfx = ''
            if param[1] > 0:

--- a/common/litterae_response_parser.py
+++ b/common/litterae_response_parser.py
@@ -2,147 +2,59 @@ from urllib.parse import quote
 import re
 import json
 import html
-from lxml.html import fragment_fromstring
+from lxml.html import fromstring, tostring
 from .enums import *
 from .config import ResourceConfig
 from .search_retrieve import Record
 from .diagnostics import Diagnostic, DiagnosticTypes
-class POSConvertor:
-    """
-    Convert corpus-specific parts of speech / grammar tags to
-    UPOS, using regexes correspondences set in the config.
-    """
-    def __init__(self, config: ResourceConfig):
-        self.posConvert = config.pos_convert
-        self.posTests = [(re.compile(k), v) for k, v in self.posConvert]
-    def convert_pos(self, pos):
-        """
-        Convert corpus-specific POS tags to UPOS, if possible.
-        Ea
-        """
-        for k, v in self.posTests:
-            if k.search(pos) is not None:
-                return v
-        return pos
 class LitteraeResponseParser:
    """
    Parses responses from a Litterae instance.
-    TODO: implement
    """
+    rxNHits = re.compile('(?:Suchergebnisse:|Search [rR]esults:)[ \t\r\n]*([0-9]+)')
+    rxUselessTags = re.compile('</?(?:p|small)[^\r\n<>]*>')
+    rxHitTag = re.compile('(</?)strong>')
    def __init__(self):
        self.pc = None      # POS convertor, rebuilt with each parse call
-    def parse_annotation(self, anno, segID, record):
+    def process_hits(self, tableNode, config: ResourceConfig, diagnostics: list[Diagnostic], advancedHits=False):
-        """
-        Parse HTML annotation for one word taken from a hit.
-        Add the data to the layers in the record object.
        """
-        annoTree = fragment_fromstring(anno,
+        Process hits from an HTML node with the results table.
-                                       create_parent='div')
+        If anything goes wrong, add Diagnostic objects to diagnostics list.
-        lemmas = set()
+        Return a list of Record objects.
-        lemmasStr = '_'
-        pos = set()
-        posStr = '_'
-        lexNodes = annoTree.xpath('div[@class="popup_word"]/div[@class="popup_ana"]/span[@class="popup_lex"]')
-        for node in lexNodes:
-            if node.text is not None:
-                lemmas.add(node.text)
-        if len(lemmas) > 0:
-            lemmasStr = '|'.join(l for l in sorted(lemmas))
-        posNodes = annoTree.xpath('div[@class="popup_word"]/div[@class="popup_ana"]/span[@class="popup_pos"]')
-        for node in posNodes:
-            if node.text is not None:
-                posText = re.sub('&nbsp;|[  \t\ufeff]+', '', node.text)
-                posText = self.pc.convert_pos(posText)
-                pos.add(posText)
-        if len(pos) > 0:
-            posStr = '|'.join(p for p in sorted(pos))
-        if 'pos' not in record.layers:
-            record.layers['pos'] = []
-        record.layers['pos'].append({
-            'ref': segID,
-            'value': posStr
-        })
-        if 'lemma' not in record.layers:
-            record.layers['lemma'] = []
-        record.layers['lemma'].append({
-            'ref': segID,
-            'value': lemmasStr
-        })
-    def parse_span(self, el, record, advancedHits=False):
-        """
-        Parse one <span> element from the HTML representation
-        of one hit returned by a Litterae instance. Add the extracted
-        text to the record object.
-        """
-        if el.tag == 'span' and 'class' in el.attrib and 'sentence_meta' in el.attrib['class']:
-            # This is the introductory span that only contains the header
-            # (title, author etc.)
-            if el.tail is not None:
-                record.text += el.tail.strip('\n\t ')
-            return
-        if el.text is not None:
-            bMatch = False
-            if 'class' in el.attrib and re.search('\\bword\\b', el.attrib['class']) is not None:
-                if re.search('\\bwmatch\\b', el.attrib['class']) is not None:
-                    bMatch = True
-                record.textNoHighlight += el.text
-                if advancedHits:
-                    segID = 's' + str(len(record.segments))
-                    segment = {
-                        'id': segID,
-                        'start': len(record.textNoHighlight) + 1,
-                        'end': len(record.textNoHighlight) + len(el.text)
-                    }
-                    record.segments.append(segment)
-                    if 'data-ana' in el.attrib:
-                        self.parse_annotation(el.attrib['data-ana'], segID, record)
-            if bMatch:
-                record.text += '<hits:Hit>' + el.text + '</hits:Hit>'
-            else:
-                record.text += el.text
-        if el.tail is not None:
-            record.text += el.tail
-            record.textNoHighlight += el.tail
-    def parse_context(self, hit, config: ResourceConfig, lang='', advancedHits=False):
-        """
-        Parse one hit. Return it as a Record object.
        """
+        records = []
+        rows = tableNode.xpath('tr')
+        iRow = 0
+        iRowOffset = 0
+        while iRow < len(rows) and iRow - iRowOffset < config.max_hits:
+            row = rows[iRow]
+            iRow += 1
+            paragraphs = row.xpath('td/p')
+            if len(paragraphs) <= 0:
+                iRowOffset += 1
+                continue
            record = Record(advancedHits=advancedHits)
-        if len(lang) <= 0:
+            txtParagraphs = []
-            lang = config.search_lang_id
+            for p in paragraphs:
-        if ('languages' not in hit
+                txt = tostring(p, encoding='utf-8').decode('utf-8')
-                or lang not in hit['languages']
+                print(txt, type(txt))
-                or 'text' not in hit['languages'][lang]):
+                txt = self.rxUselessTags.sub('', txt)
-            return record
+                txt = self.rxHitTag.sub('\\1hits:Hit>', txt)
-        contentTxt = re.sub('[\r\n\t\ufeff]+', '', hit['languages'][lang]['text'], flags=re.DOTALL)
+                txtParagraphs.append(txt.strip())
-        print(contentTxt)
+            record.text = ' &lt;...&gt; '.join(txtParagraphs).strip()
-        content = fragment_fromstring(contentTxt,
+            records.append(record)
-                                      create_parent='div')
+        return records
-        for el in content:
-            self.parse_span(el, record, advancedHits)
+    def parse(self, response, config: ResourceConfig, xFcsDataviews):
-        return record
-    def parse(self, response, config: ResourceConfig, xFcsDataviews, lang=''):
        """
-        Read a dictionary with the first N hits returned by a Litterae
+        Read HTML response with the first N hits returned by a Litterae
        instance. Return a list of Record objects and the total number of
        records found.
        """
-        self.pc = POSConvertor(config)
        diagnostics = []
        advancedHits = False
        dataViewsRequested = {v.strip() for v in xFcsDataviews.split(',') if len(v.strip()) > 0}
@@ -151,15 +63,23 @@ class LitteraeResponseParser:
        for v in dataViewsRequested:
            if v not in ('hits', 'adv'):
                diagnostics.append(Diagnostic(DiagnosticTypes.fcs, 4, details=v))
+        srcTree = fromstring(response)
        nRecords = 0
-        if 'n_sentences' in response:
+        nHitsNode = srcTree.xpath('//article[@class="container-fluid"]/header/h1')
-            nRecords = response['n_sentences']
+        if len(nHitsNode) > 0 and nHitsNode[0].text is not None:
-        if nRecords <= 0 or 'contexts' not in response:
+            m = self.rxNHits.search(nHitsNode[0].text)
-            return [], nRecords
+            if m is not None:
+                nRecords = int(m.group(1))
+        resTableNodes = srcTree.xpath('//table[@id="partsSearchResultTable"]/tbody')
        records = []
-        for context in response['contexts']:
+        if len(resTableNodes) <= 0:
-            records.append(self.parse_context(context, config, lang, advancedHits))
+            nRecords = 0
-        return records, nRecords
+        else:
+            records = self.process_hits(resTableNodes[0], config, diagnostics, advancedHits=advancedHits)
+        if len(records) < nRecords and len(records) < config.max_hits:
+            diagnostics.append(Diagnostic(DiagnosticTypes.sru, 59,
+                                          message='Some results could not be shown due to copyright restrictions.'))
+        return records, nRecords, diagnostics
 if __name__ == '__main__':

--- a/common/tsakorpus_response_parser.py
+++ b/common/tsakorpus_response_parser.py
@@ -154,11 +154,11 @@ class TsakorpusResponseParser:
        if 'n_sentences' in response:
            nRecords = response['n_sentences']
        if nRecords <= 0 or 'contexts' not in response:
-            return [], nRecords
+            return [], nRecords, diagnostics
        records = []
        for context in response['contexts']:
            records.append(self.parse_context(context, config, lang, advancedHits))
-        return records, nRecords
+        return records, nRecords, diagnostics
 if __name__ == '__main__':

--- a/common/views_logic.py
+++ b/common/views_logic.py
+# Contains functions called by the top-level view functions
+# that process the user's request and return a rendered XML
+# template
+from typing import Optional
+from fastapi import Request, Query, Response
+from .enums import *
+from .diagnostics import Diagnostic
+from .config import ResourceConfig
+def fatal_response(operation: Operation,
+                   version: SRUVersion,
+                   diagnostics: list[Diagnostic],
+                   request, templates):
+    """
+    Render a response that carries nothing but the (fatal)
+    diagnostics.
+    For explain, the template is chosen by SRU version; for
+    searchRetrieve, the hit count is reported as 0. Any other
+    operation yields None.
+    """
+    # Values shared by all templates; diagnostics are passed
+    # to the template as strings
+    context = {
+        'request': request,
+        'diagnostics': list(map(str, diagnostics))
+    }
+    if operation == Operation.explain:
+        if version == SRUVersion.v1_2:
+            templateName = 'explain_response_1.2.xml'
+        else:
+            templateName = 'explain_response_2.0.xml'
+    elif operation == Operation.searchRetrieve:
+        templateName = 'search_retrieve_response.xml'
+        context['n_hits'] = 0
+    else:
+        # No template for other operations
+        return None
+    return templates.TemplateResponse(templateName, context, media_type='application/xml')
+def process_search_retrieve(version: SRUVersion,
+                            queryType: QueryType,
+                            query: str,
+                            searchOptions: dict[str, str],
+                            config: Optional[ResourceConfig],
+                            diagnostics: list[Diagnostic],
+                            app, request, templates):
+    """
+    Process a searchRetrieve request.
+    Translate the query for the corpus platform set in the config,
+    send it to the corpus and parse what comes back.
+    Return a rendered XML response.
+    :param diagnostics: Diagnostics collected before the search was
+    run. They are included in the response together with those
+    produced by the response parser.
+    """
+    # Each supported platform has its own query parser / response
+    # parser pair on the app object; the rest of the logic is shared.
+    if config.platform == CorpPlatform.tsakorpus:
+        queryParser, responseParser = app.qp_tsakorpus, app.rp_tsakorpus
+    elif config.platform == CorpPlatform.litterae:
+        queryParser, responseParser = app.qp_litterae, app.rp_litterae
+    else:
+        # Unknown platform: answer with an empty searchRetrieve response
+        # that carries the existing diagnostics instead of returning None.
+        return fatal_response(Operation.searchRetrieve, version, diagnostics, request, templates)
+    try:
+        strGetParams = queryParser.translate_fcsql(query, config)
+        print(strGetParams)
+        res = queryParser.send_query(strGetParams, config)
+    except Diagnostic as diag:
+        # A Diagnostic raised while translating or sending the query
+        # is reported on its own, without any records.
+        return fatal_response(Operation.searchRetrieve, version, [diag], request, templates)
+    records, nHits, parseDiagnostics = responseParser.parse(res, config, searchOptions['x-fcs-dataviews'])
+    # Keep the diagnostics passed in by the caller rather than overwriting
+    # them with the parser's. dict.fromkeys() drops exact duplicates
+    # while preserving the order.
+    diagStr = list(dict.fromkeys(str(d) for d in [*diagnostics, *parseDiagnostics]))
+    return templates.TemplateResponse('search_retrieve_response.xml',
+                                      {
+                                          'request': request,
+                                          'n_hits': nHits,
+                                          'records': [r.as_dict() for r in records],
+                                          'diagnostics': diagStr
+                                      },
+                                      media_type='application/xml')
+def process_request(operation: Operation,
+                    version: SRUVersion,
+                    queryType: QueryType,
+                    query: str,
+                    searchOptions: dict[str, str],
+                    config: Optional[ResourceConfig],
+                    diagnostics: list[Diagnostic],
+                    app, request, templates):
+    """
+    Dispatch a validated user request handed over by the endpoint()
+    function in main.py.
+    Return a rendered template.
+    :param diagnostics: List of diagnostics produced by the validation
+    function.
+    """
+    requestIsSound = config is not None and not any(d.is_fatal() for d in diagnostics)
+    if not requestIsSound:
+        # No usable config or at least one fatal diagnostic:
+        # answer with the diagnostics only
+        return fatal_response(operation, version, diagnostics, request, templates)
+    if operation == Operation.searchRetrieve:
+        # Nothing is obviously wrong, so go on to query parsing
+        return process_search_retrieve(version, queryType, query, searchOptions,
+                                       config, diagnostics, app, request, templates)
+    # Other operations are not handled here
+    return None
+if __name__ == '__main__':
+    pass
--- a/main.py
+++ b/main.py
@@ -11,9 +11,11 @@ from common.tsakorpus_response_parser import TsakorpusResponseParser
 from common.enums import *
 from common.diagnostics import Diagnostic
 from common.config import ResourceConfig, read_configs
+from common.views_logic import *
 import json
 import os
 import re
+import copy
 import uvicorn
 app = FastAPI()
@@ -59,58 +61,28 @@ def endpoint(
            alias='x-fcs-rewrites-allowed'
        )
        ):
+    searchOptions = {
+        'x-fcs-endpoint-description': xFcsEndpointDescription,
+        'x-fcs-context': xFcsContext,
+        'x-fcs-dataviews': xFcsDataviews,
+        'x-fcs-rewrites-allowed': xFcsRewritesAllowed
+    }
+    # Check if the corpus ID is correct
    if corpusID not in app.configs:
        message = 'No corpus with this ID (' + corpusID +') is served by this Endpoint. ' \
                  'Valid corpus IDs are: ' + '; '.join(cID for cID in sorted(app.configs)) + '.'
-        diagBody = str(Diagnostic(DiagnosticTypes.sru, 235,
+        diag = Diagnostic(DiagnosticTypes.sru, 235, message=message)  # "Database does not exist"
-                       message=message))  # "Database does not exist"
+        return process_request(operation, version, queryType, query, searchOptions, None, [diag], app, request, templates)
-        return Response(content=diagBody, media_type='application/xml')
    config = app.configs[corpusID]
+    # Check for common problems with parameter values
    diagnostics = app.qp.validate_query(operation, version, queryType, query,
                                        xFcsEndpointDescription, xFcsContext,
                                        xFcsDataviews, xFcsRewritesAllowed)
-    if any(d.is_fatal() for d in diagnostics):
+    # Now, do the substantial things
-        return '\n'.join(str(d) for d in diagnostics)
+    return process_request(operation, version, queryType, query, searchOptions, config, diagnostics, app, request, templates)
+    # return {'operation': operation, 'version': version}
-    if operation == Operation.searchRetrieve:
-        if config.platform == CorpPlatform.tsakorpus:
-            try:
-                strGetParams = app.qp_tsakorpus.translate_fcsql(query, config)
-                print(strGetParams)
-                res = app.qp_tsakorpus.send_query(strGetParams, config)
-            except Diagnostic as diag:
-                print('diag', str(diag))
-                return Response(content=str(diag), media_type='application/xml')
-            records, nHits = app.rp_tsakorpus.parse(res, config, xFcsDataviews)
-            records = [r.as_dict() for r in records]
-            return templates.TemplateResponse('search_retrieve_response.xml',
-                                              {
-                                                  'request': request,
-                                                  'n_hits': nHits,
-                                                  'records': records
-                                              },
-                                              media_type='application/xml')
-        elif config.platform == CorpPlatform.litterae:
-            try:
-                strGetParams = app.qp_litterae.translate_fcsql(query, config)
-                print(strGetParams)
-                return strGetParams
-                res = app.qp_litterae.send_query(strGetParams, config)
-            except Diagnostic as diag:
-                print('diag', str(diag))
-                return Response(content=str(diag), media_type='application/xml')
-            records, nHits = app.rp_litterae.parse(res, config, xFcsDataviews)
-            records = [r.as_dict() for r in records]
-            return templates.TemplateResponse('search_retrieve_response.xml',
-                                              {
-                                                  'request': request,
-                                                  'n_hits': nHits,
-                                                  'records': records
-                                              },
-                                              media_type='application/xml')
-            # return str(res)
-    return {'operation': operation, 'version': version}
 if __name__ == '__main__':

--- a/static/explain_response_1.2.xml
+++ b/static/explain_response_1.2.xml
 {% set ep_version = 1 %}
 <?xml version='1.0' encoding='utf-8'?>
 <sru:explainResponse xmlns:sru="http://www.loc.gov/zing/srw/">
-    <sru:version>1.2</sru:version>
+    <sru:version>1.2</sru:version>{% if config %}
    <sru:record>
        <sru:recordSchema>http://explain.z3950.org/dtd/2.0/</sru:recordSchema>
        <sru:recordPacking>xml</sru:recordPacking>
@@ -39,7 +39,10 @@
    <!-- <sru:echoedExplainRequest> is OPTIONAL -->
    <sru:echoedExplainRequest>
        <sru:version>1.2</sru:version>
-        <sru:baseUrl>{{ base_url }}</sru:baseUrl>
+        <sru:baseUrl>{{ config.base_url }}</sru:baseUrl>
    </sru:echoedExplainRequest>{% if endpoint_desc_needed %}
-{% include 'endpoint_description.xml' }{% endif %}
+{% include 'endpoint_description.xml' %}{% endif %}{% endif %}{% if diagnostics and diagnostics|length > 0 %}
+    <sru:diagnostics>{# each item is an already rendered diagnostic #}{% for d in diagnostics %}
+        {{ d|safe }}{% endfor %}
+    </sru:diagnostics>{% endif %}
 </sru:explainResponse>
\ No newline at end of file
--- a/static/search_retrieve_response.xml
+++ b/static/search_retrieve_response.xml
 <?xml version='1.0' encoding='utf-8'?>
 <sru:searchRetrieveResponse xmlns:sru="http://docs.oasis-open.org/ns/search-ws/sruResponse">
 	<sru:version>2.0</sru:version>
-<sru:numberOfRecords>{{ n_hits }}</sru:numberOfRecords>
+	<sru:numberOfRecords>{{ n_hits }}</sru:numberOfRecords>{% if records %}
 	<sru:records>{% for record in records %}
 		<sru:record>
 			<sru:recordSchema>http://clarin.eu/fcs/resource</sru:recordSchema>
@@ -15,5 +15,8 @@
 	{% endfor %}
 	</sru:records>{% if n_hits > records|length %}
 	<sru:nextRecordPosition>{{ records|length + 1 }}</sru:nextRecordPosition>{% endif %}
-<sru:resultCountPrecision>info:srw/vocabulary/resultCountPrecision/1/exact</sru:resultCountPrecision>
+	<sru:resultCountPrecision>info:srw/vocabulary/resultCountPrecision/1/exact</sru:resultCountPrecision>{% endif %}{% if diagnostics and diagnostics|length > 0 %}
+	<sru:diagnostics>{% for d in diagnostics %}
+		{{ d|safe }}{% endfor %}
+	</sru:diagnostics>{% endif %}
 </sru:searchRetrieveResponse>
\ No newline at end of file