Skip to content
Snippets Groups Projects
Commit 628d8a55 authored by Arkhangelskiy, Timofey's avatar Arkhangelskiy, Timofey
Browse files

Explain now works

parent 1672c9d3
Branches
No related tags found
No related merge requests found
......@@ -19,6 +19,7 @@ class ResourceConfig:
rxExt = re.compile('\\.[^.]*$')
def __init__(self, fnameConfig=None):
self.corpus_id = ''
self.platform = 'tsakorpus'
self.transport_protocol = 'https'
self.host = '127.0.0.1'
......@@ -63,14 +64,69 @@ class ResourceConfig:
self.load_settings(fnameConfig)
def add_default_lang(self, elements):
    """
    Make sure every element of the list has a "lang" key.
    Elements without one get "lang"="en"; elements that
    already carry a "lang" key are left untouched.
    The list is modified in place, nothing is returned.
    """
    for element in elements:
        element.setdefault('lang', 'en')
def load_settings(self, fnameConfig):
    """
    Read the configuration of one resource from the JSON file
    fnameConfig and copy every top-level key onto this object
    as an attribute. Afterwards, fill in the defaults: the corpus ID
    (taken from the file name if it was empty before loading),
    the "lang" attribute of the descriptive lists, the supported
    layers and the per-resource fields.
    """
    with open(fnameConfig, 'r', encoding='utf-8') as fConfig:
        settings = json.load(fConfig)

    if len(self.corpus_id) == 0:
        # No ID yet: use the file name without its directories and extension
        self.corpus_id = re.sub('.*?([^/\\\\]+)\\.[^.]*$', '\\1', fnameConfig)

    for key, value in settings.items():
        setattr(self, key, value)

    for attrName in ('titles', 'descriptions', 'authors', 'contacts',
                     'extents', 'history', 'restrictions'):
        self.add_default_lang(getattr(self, attrName))

    if (self.platform == 'tsakorpus' and self.adv_supported
            and len(self.supported_layers) == 0):
        # Default layers for a Tsakorpus corpus with advanced search.
        # NOTE(review): the meaning of 'result-id' is unclear, it just
        # mirrors 'id' here. The endpoint description template reads
        # layer.result_id -- TODO confirm the key name matches.
        self.supported_layers = [
            {'id': layerID, 'result-id': layerID, 'layer_type': layerType}
            for layerID, layerType in (('word', 'text'),
                                       ('lemma', 'lemma'),
                                       ('pos', 'pos'))
        ]

    for resource in self.resources:
        resource.setdefault('pid', '')
        resource.setdefault('titles', [])
        resource.setdefault('descriptions', [])
        resource.setdefault('languages', [])
        if 'data_views' not in resource or len(resource['data_views']) == 0:
            # Only the Generic Hits view is assumed if nothing is listed
            resource['data_views'] = 'hits'
        if 'layers' not in resource:
            if self.adv_supported and self.platform == 'tsakorpus':
                resource['layers'] = 'word lemma pos'
            else:
                resource['layers'] = ''
        self.add_default_lang(resource['titles'])
        self.add_default_lang(resource['descriptions'])
def as_dict(self):
"""
......
......@@ -12,9 +12,12 @@ class Diagnostic(Exception):
"""
fatalFCSDiagnostics = {3, 10, 11} # FCS specifications, 4.2
fatalSRUDiagnostics = {8, 10, 27, 37, 47, 48, 235} # A subset actually used by this endpoint
fatalSRUDiagnostics = {1, 4, 8, 10, 27, 37, 47, 48, 235} # A subset actually used by this endpoint
stdMessages = {
(DiagnosticTypes.fcs, 4): 'Requested Data View not valid for this resource.',
(DiagnosticTypes.sru, 4): 'Unsupported operation.',
(DiagnosticTypes.sru, 8): 'Unsupported parameter.',
(DiagnosticTypes.sru, 10): 'Something is wrong with the query syntax.',
(DiagnosticTypes.sru, 27): 'The query should not be empty.',
(DiagnosticTypes.sru, 37): 'Unsupported boolean operator.'
......
......@@ -25,6 +25,8 @@ class LitteraeResponseParser:
Process hits from an HTML node with the results table.
If anything goes wrong, add Diagnostic objects to diagnostics list.
Return a list of Record objects.
Since only the Generic Hits view is available for Litterae,
advancedHits parameter will not actually be used.
"""
records = []
rows = tableNode.xpath('tr')
......@@ -37,7 +39,7 @@ class LitteraeResponseParser:
if len(paragraphs) <= 0:
iRowOffset += 1
continue
record = Record(advancedHits=advancedHits)
record = Record(advancedHits=False)
txtParagraphs = []
for p in paragraphs:
txt = tostring(p, encoding='utf-8').decode('utf-8')
......@@ -54,15 +56,14 @@ class LitteraeResponseParser:
Read HTML response with the first N hits returned by a Litterae
instance. Return a list of Record objects and the total number of
records found.
Since only the Generic Hits view is available for Litterae,
xFcsDataviews parameter will not actually be used.
"""
diagnostics = []
advancedHits = False
dataViewsRequested = {v.strip() for v in xFcsDataviews.split(',') if len(v.strip()) > 0}
if 'adv' in dataViewsRequested:
advancedHits = True
for v in dataViewsRequested:
if v not in ('hits', 'adv'):
diagnostics.append(Diagnostic(DiagnosticTypes.fcs, 4, details=v))
srcTree = fromstring(response)
nRecords = 0
nHitsNode = srcTree.xpath('//article[@class="container-fluid"]/header/h1')
......
......@@ -149,20 +149,16 @@ class QueryParser:
xFcsDataviews, xFcsRewritesAllowed):
"""
Check if the query parameters contain a valid combination of values.
:param operation:
:param version:
:param queryType:
:param query:
:param xFcsEndpointDescription:
:param xFcsContext:
:param xFcsDataviews:
:param xFcsRewritesAllowed:
:return: Return a list of diagnostics describing problems with the query.
Return a list of diagnostics describing problems with the query.
If the query is prima facie valid and can be processed further, an empty
list will be returned.
"""
diagnostics = []
# The scan operation name is reserved, but not described by the specifications
if operation == Operation.scan:
diagnostics.append(Diagnostic(DiagnosticTypes.sru, 4, details='scan'))
# Check if additional parameters combine with the operation requested
# (FCS specifications, 4.1)
if len(xFcsEndpointDescription) > 0 and operation != Operation.explain:
......@@ -173,6 +169,30 @@ class QueryParser:
diagnostics.append(Diagnostic(DiagnosticTypes.sru, 8, details='x-fcs-dataviews'))
if len(xFcsRewritesAllowed) > 0 and operation != Operation.searchRetrieve:
diagnostics.append(Diagnostic(DiagnosticTypes.sru, 8, details='x-fcs-rewrites-allowed'))
if xFcsRewritesAllowed not in ('', 'true'):
diagnostics.append(Diagnostic(DiagnosticTypes.sru, 6,
message='The value of the parameter x-fcs-rewrites-allowed '
'can only equal "true".',
details=xFcsRewritesAllowed))
if xFcsEndpointDescription not in ('', 'true'):
diagnostics.append(Diagnostic(DiagnosticTypes.sru, 6,
message='The value of the parameter x-fcs-endpoint-description '
'can only equal "true".',
details=xFcsEndpointDescription))
# Check version-specific parameters and values
if version == SRUVersion.v1_2:
if queryType == QueryType.cql:
diagnostics.append(Diagnostic(DiagnosticTypes.sru, 8, details='queryType'))
for dv in xFcsDataviews.split(','):
dv = dv.strip()
if len(dv) <= 0:
continue
if dv != 'hits' and version == SRUVersion.v1_2:
diagnostics.append(Diagnostic(DiagnosticTypes.fcs, 4, details=dv))
elif dv not in ('hits', 'adv'):
# There actually can be other data view IDs, but they are described in a protected Trac instance
diagnostics.append(Diagnostic(DiagnosticTypes.fcs, 4, details=dv))
return diagnostics
......
......@@ -147,9 +147,6 @@ class TsakorpusResponseParser:
dataViewsRequested = {v.strip() for v in xFcsDataviews.split(',') if len(v.strip()) > 0}
if 'adv' in dataViewsRequested:
advancedHits = True
for v in dataViewsRequested:
if v not in ('hits', 'adv'):
diagnostics.append(Diagnostic(DiagnosticTypes.fcs, 4, details=v))
nRecords = 0
if 'n_sentences' in response:
nRecords = response['n_sentences']
......
......@@ -17,14 +17,16 @@ def fatal_response(operation: Operation,
and no other payload.
"""
diagStr = [str(d) for d in diagnostics]
if operation == Operation.explain:
templateName = 'explain_response_2.0.xml'
if operation in (Operation.explain, Operation.scan):
if version == SRUVersion.v1_2:
templateName = 'explain_response_1.2.xml'
return templates.TemplateResponse(templateName,
templateVersion = 1
else:
templateVersion = 2
return templates.TemplateResponse('explain_response.xml',
{
'request': request,
'diagnostics': diagStr
'diagnostics': diagStr,
'version': templateVersion
},
media_type='application/xml')
elif operation == Operation.searchRetrieve:
......@@ -32,11 +34,39 @@ def fatal_response(operation: Operation,
{
'request': request,
'diagnostics': diagStr,
'n_hits': 0,
'n_hits': 0
},
media_type='application/xml')
def process_explain(version: SRUVersion,
                    searchOptions: dict[str, str],
                    config: Optional[ResourceConfig],
                    diagnostics: list[Diagnostic],
                    request, templates):
    """
    Process an explain request.
    Return a rendered XML response.

    version determines the template version (1 for SRU 1.2, 2 otherwise).
    searchOptions is checked for x-fcs-endpoint-description: the endpoint
    description is only requested from the template if it equals "true".
    config may be None, in which case None is passed to the template
    instead of the configuration dictionary.
    diagnostics are converted to strings before being passed on.
    """
    templateVersion = 1 if version == SRUVersion.v1_2 else 2
    endpointDescNeeded = (searchOptions.get('x-fcs-endpoint-description') == 'true')
    diagStr = [str(d) for d in diagnostics]
    # config is declared Optional, so do not call as_dict() on None.
    # NOTE(review): the template is expected to skip the record part
    # when config is empty -- confirm against explain_response.xml.
    configDict = config.as_dict() if config is not None else None
    return templates.TemplateResponse('explain_response.xml',
                                      {
                                          'request': request,
                                          'diagnostics': diagStr,
                                          'config': configDict,
                                          'version': templateVersion,
                                          'endpoint_desc_needed': endpointDescNeeded
                                      },
                                      media_type='application/xml')
def process_search_retrieve(version: SRUVersion,
queryType: QueryType,
query: str,
......@@ -54,7 +84,7 @@ def process_search_retrieve(version: SRUVersion,
print(strGetParams)
res = app.qp_tsakorpus.send_query(strGetParams, config)
except Diagnostic as diag:
return fatal_response(Operation.searchRetrieve, version, [diag], request, templates)
return fatal_response(Operation.searchRetrieve, version, diagnostics + [diag], request, templates)
records, nHits, diagnostics = app.rp_tsakorpus.parse(res, config, searchOptions['x-fcs-dataviews'])
records = [r.as_dict() for r in records]
diagnostics = [str(d) for d in diagnostics]
......@@ -74,7 +104,15 @@ def process_search_retrieve(version: SRUVersion,
res = app.qp_litterae.send_query(strGetParams, config)
print(res)
except Diagnostic as diag:
return fatal_response(Operation.searchRetrieve, version, [diag], request, templates)
return fatal_response(Operation.searchRetrieve, version, diagnostics + [diag], request, templates)
for dv in searchOptions['x-fcs-dataviews'].split(','):
dv = dv.strip()
if dv != 'hits' and version == SRUVersion.v2_0:
# Litterae does not provide any additional annotation, so only Generic Hits
# are available as a data view.
# If SRU 1.2 is used, such a diagnostic has already been added
# at a previous step.
diagnostics.append(Diagnostic(DiagnosticTypes.fcs, 4, details=dv))
records, nHits, diagnostics = app.rp_litterae.parse(res, config, searchOptions['x-fcs-dataviews'])
records = [r.as_dict() for r in records]
diagnostics = [str(d) for d in diagnostics]
......@@ -111,6 +149,14 @@ def process_request(operation: Operation,
# If everything looks good, proceed to query parsing
if operation == Operation.searchRetrieve:
return process_search_retrieve(version, queryType, query, searchOptions, config, diagnostics, app, request, templates)
elif operation == Operation.explain:
return process_explain(version, searchOptions, config, diagnostics, request, templates)
# We should not end up here, but if we did, something went wrong and
# no fatal diagnostic describes the problem. Add a generic fatal diagnostic
# and return a fatal response.
diagnostics.append(Diagnostic(DiagnosticTypes.sru, 1))
return fatal_response(operation, version, diagnostics, request, templates)
if __name__ == '__main__':
......
{
"host": "0.0.0.0",
"port": "80",
"max_hits": 8,
"host": "127.0.0.1",
"port": "5000",
"transport_protocol": "http",
"url_path": "http://127.0.0.1:5000/",
"max_hits": 20,
"advanced_search_capability": false,
"adv_supported": false,
"platform": "litterae",
"resource_base_url": "https://werkstatt.formulae.uni-hamburg.de/search/",
"pos_convert": [
"titles": [
{
"content": "Formulae - Litterae - Chartae Werkstatt",
"primary": true,
"lang": "en"
}
],
"descriptions": [
{
"content": "The Formulae - Litterae - Chartae Werkstatt brings together hundreds of examples of early medieval formulaic writing, primarily formulae, charters, and letters, to allow scholars to examine these sources in context with each other. In the Werkstatt, the user can read several texts side-by-side in order to compare their content, style, language, etc. It is also possible to do complex searches in order to find formulae and/or charters that are related in terms of vocabulary, phraseology, or date.",
"primary": true,
"lang": "en"
},
{
"content": "Die Formulae - Litterae - Chartae Werkstatt enthält Beispiele für formelhaftes Schreiben aus dem frühen Mittelalter in Form von Formeln, Urkunden und Briefe. Sie bietet Wissenschaftler damit neue Möglichkeiten, Verbindungen zwischen einzelnen Texten zu erforschen. Mehrere Texte lassen sich parallel konsultieren, um Inhalt, Stil und Sprache zu vergleichen. Komplexe Suchen auf Basis von Vokabular, Ausdrucksweise oder Datum erlauben es, ähnliche Formeln oder Urkunden schnell zu identifizieren.",
"lang": "de"
},
{
"content": "Le projet Formulae - Litterae - Chartae comprend des exemples d'écritures formulaires du Haut Moyen-Âge sous forme de formules, de chartes et de lettres. Il offre ainsi aux spécialistes de nouvelles opportunités de recherche sur les liens entre différents textes. Plusieurs textes sont comparables sur le contenu, le style et la langue. La recherche appuyée permet d'identifier des formules ou des chartes similaires grâce au vocabulaire, à l'impression ou à la date.",
"lang": "fr"
}
],
"authors": [
{
"content": "Philippe Depreux",
"lang": "en"
},
{
"content": "Horst Lößlein",
"lang": "en"
},
{
"content": "Matthew Munson",
"lang": "en"
},
{
"content": "Christina Rothe",
"lang": "en"
},
{
"content": "Christoph Walther",
"lang": "en"
}
],
"resources": [
{
"titles": [
{
"content": "Formulae - Litterae - Chartae Werkstatt",
"primary": true,
"lang": "en"
}
],
"descriptions": [
{
"content": "The Formulae - Litterae - Chartae Werkstatt brings together hundreds of examples of early medieval formulaic writing, primarily formulae, charters, and letters, to allow scholars to examine these sources in context with each other. In the Werkstatt, the user can read several texts side-by-side in order to compare their content, style, language, etc. It is also possible to do complex searches in order to find formulae and/or charters that are related in terms of vocabulary, phraseology, or date.",
"primary": true,
"lang": "en"
},
{
"content": "Die Formulae - Litterae - Chartae Werkstatt enthält Beispiele für formelhaftes Schreiben aus dem frühen Mittelalter in Form von Formeln, Urkunden und Briefe. Sie bietet Wissenschaftler damit neue Möglichkeiten, Verbindungen zwischen einzelnen Texten zu erforschen. Mehrere Texte lassen sich parallel konsultieren, um Inhalt, Stil und Sprache zu vergleichen. Komplexe Suchen auf Basis von Vokabular, Ausdrucksweise oder Datum erlauben es, ähnliche Formeln oder Urkunden schnell zu identifizieren.",
"lang": "de"
},
{
"content": "Le projet Formulae - Litterae - Chartae comprend des exemples d'écritures formulaires du Haut Moyen-Âge sous forme de formules, de chartes et de lettres. Il offre ainsi aux spécialistes de nouvelles opportunités de recherche sur les liens entre différents textes. Plusieurs textes sont comparables sur le contenu, le style et la langue. La recherche appuyée permet d'identifier des formules ou des chartes similaires grâce au vocabulaire, à l'impression ou à la date.",
"lang": "fr"
}
],
"landing_page": "https://werkstatt.formulae.uni-hamburg.de/",
"languages": [
"lat"
],
"data_views": "hits"
}
]
}
\ No newline at end of file
p. 2: Link 'CLARIN-FCS-DataViews' leads to a CLARIN-internal Trac instance.
p. 3: "The value of the @version attribute MUST be 2." But SRU 1.2 EndpointDescription will have version="1", per example on p. 12. (By the way, some examples are numbered and some are not.)
p. 4: Link 'section "Layers"' leads to a CLARIN-internal Trac instance.
p. 8-9: Links to the .xsd's lead to a CLARIN-internal Trac instance.
p. 8: It is said that all endpoints must implement the Generic Hits view as 'send-by-default'. Nothing of the kind is said about the Advanced view, yet it is also designated as 'send-by-default'. Why is that?
p. 9: Advanced data view is 'send-by-default', but Advanced search should only be available for SRU 2.0 queries. So for 1.2, it shouldn't be available at all.
p. 9-10: In the advanced search results, how does a client understand which layer is which? They don't contain any unique identifiers such as 'pos' or 'word'. They only contain some arbitrary IDs (<Layer id="...">). By the way, I don't understand what those IDs are or where I am supposed to get them from. Should I just make something up?
p. 10: "Send explain request without version and operation parameter" -- but an explain request has to have the operation parameter with 'explain' as its value, as is stated somewhere nearby. UPDATE: The SRU documentation says that, indeed, an empty request is treated as an explain request. Maybe this exception is worth mentioning explicitly here.
p. 14: x-cmd-resource-info parameter present in the query example, but never explained (mentioned in some 2013 slides on FCS; should now probably be x-fcs-endpoint-description)
p. 12 and elsewhere: http://explain.z3950.org/dtd/2.0/ mentioned as the URL of the record schema, but its actual URL is http://zeerex.z3950.org/dtd/zeerex-2.0.dtd .
p. 12-13, example: what is "result-id" in SupportedLayer? What do I put as the text of the SupportedLayer element?
p. 14: x-cmd-resource-info parameter present in the query example, but never explained (mentioned in some 2013 slides on FCS; should now probably be x-fcs-endpoint-description)
p. 14: What is the sruResponse prefix? It is never mentioned. No namespace is provided here, so the example is actually invalid XML. (Who doesn't validate their examples before showing them to a wide audience -- in an official specification, no less??) Maybe sru was actually meant (see p. 2 and an SRU v. 1.2 example above)? I'm putting sru there for now.
\ No newline at end of file
<sru:extraResponseData>
<ed:EndpointDescription xmlns:ed="http://clarin.eu/fcs/endpoint-description" version="{{ ep_version }}">
<ed:EndpointDescription xmlns:ed="http://clarin.eu/fcs/endpoint-description" version="{{ sru_version }}">
<ed:Capabilities>{% if config.basic_search_capability %}
<ed:Capability>http://clarin.eu/fcs/capability/basic-search</ed:Capability>{% endif %}{% if config.advanced_search_capability and ep_version >= 2 %}
<ed:Capability>http://clarin.eu/fcs/capability/basic-search</ed:Capability>{% endif %}{% if config.advanced_search_capability and version >= 2 %}
<ed:Capability>http://clarin.eu/fcs/capability/advanced-search</ed:Capability>{% endif %}
</ed:Capabilities>
<ed:SupportedDataViews>{% if config.hits_supported %}
<ed:SupportedDataView id="hits" delivery-policy="send-by-default">application/x-clarin-fcs-hits+xml</ed:SupportedDataView>{% endif %}{% if config.adv_supported and ep_version >= 2%}
<ed:SupportedDataView id="hits" delivery-policy="send-by-default">application/x-clarin-fcs-hits+xml</ed:SupportedDataView>{% endif %}{% if config.adv_supported and version >= 2%}
<ed:SupportedDataView id="adv" delivery-policy="send-by-default">application/x-clarin-fcs-adv+xml</ed:SupportedDataView>{% endif %}
</ed:SupportedDataViews>{% if config.advanced_search_capability and ep_version >= 2 %}
</ed:SupportedDataViews>{% if config.advanced_search_capability and version >= 2 %}
<ed:SupportedLayers>{% for layer in config.supported_layers %}
<ed:SupportedLayer id="{{ layer.id }}" result-id="{{ layer.result_id }}"{% if layer.alt_value_info %} alt-value-info="{{ layer.alt_value_info }}"{% endif %}{% if layer.alt_value_info_uri %} alt-value-info-uri="{{ layer.alt_value_info_uri }}"{% endif %}{% if layer.qualifier %} qualifier="{{ layer.qualifier }}"{% endif %}>{{ layer.layer_type }}</ed:SupportedLayer>{% endfor %}
</ed:SupportedLayers>
......@@ -15,12 +15,13 @@
<ed:Resources>{% for r in config.resources %}
<ed:Resource pid="{{ r.pid }}">{% for title in r.titles %}
<ed:Title xml:lang="{{ title.lang }}">{{ title.content }}</ed:Title>{% endfor %}{% for desc in r.descriptions %}
<ed:Description xml:lang="{{ desc.lang }}">{{ desc.content }}</ed:Description>{% if landing_page|length > 0%}
<ed:Description xml:lang="{{ desc.lang }}">{{ desc.content }}</ed:Description>{% endfor %}{% if r.landing_page and r.landing_page|length > 0%}
<ed:LandingPageURI>{{ r.landing_page }}</ed:LandingPageURI>{% endif %}
<ed:Languages>{% for lang in r.languages %}
<ed:Language>{{ lang }}</ed:Language>{% endfor %}
</ed:Languages>
<ed:AvailableDataViews ref="{{ r.data_views }}"/>
<ed:AvailableDataViews ref="{{ r.data_views }}"/>{% if version >= 2 %}
<ed:AvailableLayers ref="{{ r.layers }}"/>{% endif %}
</ed:Resource>{% endfor %}
</ed:Resources>
</ed:EndpointDescription>
......
{% set ep_version = 1 %}
{% if version == 1 -%}{% set sru_version = '1.2' %}{% else -%}{% set sru_version = '2.0' %}{% endif -%}
<?xml version='1.0' encoding='utf-8'?>
<sru:explainResponse xmlns:sru="http://www.loc.gov/zing/srw/">
<sru:version>1.2</sru:version>{% if config %}
<sru:version>{{ sru_version }}</sru:version>{% if config %}
<sru:record>
<sru:recordSchema>http://explain.z3950.org/dtd/2.0/</sru:recordSchema>
<sru:recordPacking>xml</sru:recordPacking>
<sru:recordData>
<zr:explain xmlns:zr="http://explain.z3950.org/dtd/2.0/">
<!-- <zr:serverInfo > is REQUIRED -->
<zr:serverInfo protocol="SRU" version="1.2" transport="{{ transport_protocol }}" method="GET">
<zr:serverInfo protocol="SRU" version="{{ sru_version }}" transport="{{ config.transport_protocol }}" method="GET">
<zr:host>{{ config.host }}</zr:host>
<zr:port>{{ config.port }}</zr:port>
<zr:database>{{ config.url_path }}/fcs-endpoint/{{ config.platform }}/{{ config.corpus_id }}</zr:database>
<zr:database>{{ config.corpus_id }}</zr:database>
</zr:serverInfo>
<!-- <zr:databaseInfo> is REQUIRED -->
<zr:databaseInfo>{% for title in config.titles %}
......@@ -38,11 +38,11 @@
</sru:record>
<!-- <sru:echoedExplainRequest> is OPTIONAL -->
<sru:echoedExplainRequest>
<sru:version>1.2</sru:version>
<sru:baseUrl>{{ config.base_url }}</sru:baseUrl>
<sru:version>{{ sru_version }}</sru:version>
<sru:baseUrl>{{ config.url_path|trim('/') }}/fcs-endpoint/{{ config.corpus_id }}</sru:baseUrl>
</sru:echoedExplainRequest>{% if endpoint_desc_needed %}
{% include 'endpoint_description.xml' }{% endif %}{% endif %}{% if diagnostics and diagnostics|length > 0 %}
<sru:diagnostics>{% for d diagnostics %}
{% include 'endpoint_description.xml' %}{% endif %}{% endif %}{% if diagnostics and diagnostics|length > 0 %}
<sru:diagnostics>{% for d in diagnostics %}
{{ d|safe }}{% endfor %}
</sru:diagnostics>{% endif %}
</sru:explainResponse>
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment