diff --git a/common/enums.py b/common/enums.py index ea51cdda673b1694151605408c466025f71ba38a..862a55454ab325be8752d9183f2b8d08ba3f9531 100644 --- a/common/enums.py +++ b/common/enums.py @@ -30,6 +30,13 @@ class DiagnosticTypes(str, Enum): sru = 'sru' # Defined at http://www.loc.gov/standards/sru/diagnostics/diagnosticsList.html fcs = 'fcs' # Defined in the FCS specifications, 4.2 + +class DataView(str, Enum): + # Data view (simple hits / multi-layered hits with annotation) + hits = 'hits' # simple hits + adv = 'adv' # multi-layered hits with annotation + + if __name__ == '__main__': pass diff --git a/common/search_retrieve.py b/common/search_retrieve.py new file mode 100644 index 0000000000000000000000000000000000000000..e25928d31d86e58a139659951e5f4b3207e238cf --- /dev/null +++ b/common/search_retrieve.py @@ -0,0 +1,45 @@ +from .enums import * +from .diagnostics import Diagnostic +from .config import ResourceConfig + + +class Record: + """ + Per FCS specifications, one hit should be encoded as one record, + even though in SRU, records can contain resources, resources + can contain resource fragments, and they, in turn, can contain + multiple hits. Here, each record contains exactly one resource + with exactly one fragment with exactly one hit. + """ + def __init__(self, dataView: DataView): + self.dataView = dataView # as_dict() fills the advanced view only for DataView.adv + # For simple search: + self.text = '' # emitted by as_dict() as the text of the single hit + # For advanced search: + self.segments = [] # passed through unchanged by as_dict() + self.layers = [] # passed through unchanged by as_dict() + + def as_dict(self): + """ + Returns a dictionary for insertion into the XML template. 
+ """ + record = { + 'resources': [{ # a list: the response template loops over record.resources + 'resource_fragments': [{ + 'dv_hits': [{ + 'text': self.text + }], + 'dv_adv': [] + }] + }] + } + if self.dataView == DataView.adv: # the hits view is always present; the advanced view is added on top + record['resources'][0]['resource_fragments'][0]['dv_adv'].append({ + 'segments': self.segments, + 'layers': self.layers + }) + return record + +if __name__ == '__main__': + pass + diff --git a/notes.txt b/notes.txt index 1c8646222f7acce4fb08179556d7ed4e69c9c8d2..fdef6aab2aa854fd8311c66400c4d4857b0cb79e 100644 --- a/notes.txt +++ b/notes.txt @@ -2,8 +2,14 @@ p. 3: "The value of the @version attribute MUST be 2." But SRU 1.2 EndpointDescr p. 4: Link 'section "Layers"' leads to a CLARIN-internal Trac instance. -p. 10: "Send explain request without version and operation parameter" -- but an explain request has to have the operation parameter with 'explain' as its value +p. 8-9: Links to the .xsd's lead to a CLARIN-internal Trac instance. + +p. 9-10: In the advanced search results, how does a client understand which layer is which? They don't contain any unique identifiers such as 'pos' or 'word'. They only contain some arbitrary IDs (<Layer id="...">). By the way, I don't understand what those IDs are and where I get them from. Just write something random out of my head? + +p. 10: "Send explain request without version and operation parameter" -- but an explain request has to have the operation parameter with 'explain' as its value, as is stated somewhere nearby. UPDATE: SRU documentation says that, indeed, an empty request is treated as an explain request. Maybe this exception is worth mentioning this explicitly here. p. 14: x-cmd-resource-info parameter present in the query example, but never explained (mentioned in some 2013 slides on FCS; should now probably be x-fcs-endpoint-description) -p. 12 and elsewhere: http://explain.z3950.org/dtd/2.0/ mentioned as the URL of the record schema, but its actual URL is http://zeerex.z3950.org/dtd/zeerex-2.0.dtd . \ No newline at end of file +p. 
12 and elsewhere: http://explain.z3950.org/dtd/2.0/ mentioned as the URL of the record schema, but its actual URL is http://zeerex.z3950.org/dtd/zeerex-2.0.dtd . + +p. 14: What is the sruResponse prefix? It is never mentioned. No namespace is provided here, so the example is actually invalid XML. (Who doesn't validate their examples before showing them to a wide audience -- in an official specification, no less??) Maybe sru was actually meant (see p. 2 and an SRU v. 1.2 example above)? \ No newline at end of file diff --git a/static/dataview_adv.xml b/static/dataview_adv.xml new file mode 100644 index 0000000000000000000000000000000000000000..6b1d3a1a24ac80d5131fe82cdc6c5d0cd1e5d600 --- /dev/null +++ b/static/dataview_adv.xml @@ -0,0 +1,14 @@ +<fcs:DataView type="application/x-clarin-fcs-adv+xml"> +{% for hit in rf.dv_adv %} + <adv:Advanced unit="item"> + <adv:Segments>{% for seg in hit.segments %} + <adv:Segment id="{{ seg.id }}" start="{{ seg.start }}" end="{{ seg.end }}"{% if seg.ref %} ref="{{ seg.ref }}"{% endif %}/>{% endfor %}{# closes the seg loop before the Segments element ends #} + </adv:Segments>{% for layer in hit.layers %} + <Layer id="{{ layer.id }}">{% for span in layer.spans %} + <Span ref="{{ span.ref }}"{% if span.alt_value %} alt-value="{{ span.alt_value }}"{% endif %}{% if span.highlight %} highlight="{{ span.highlight }}"{% endif %}>{{ span.value }}</Span> + {% endfor %} + </Layer> + {% endfor %} + </adv:Advanced> +{% endfor %} +</fcs:DataView> \ No newline at end of file diff --git a/static/gitkeep b/static/dataview_cmdi.xml similarity index 100% rename from static/gitkeep rename to static/dataview_cmdi.xml diff --git a/static/dataview_hits.xml b/static/dataview_hits.xml new file mode 100644 index 0000000000000000000000000000000000000000..3f7a378dc93465c340659c2582a09bcdd3736057 --- /dev/null +++ b/static/dataview_hits.xml @@ -0,0 +1,7 @@ +<fcs:DataView type="application/x-clarin-fcs-hits+xml"> +{% for hit in rf.dv_hits %} +<hits:Result xmlns:hits="http://clarin.eu/fcs/dataview/hits"> +{{ hit.text | safe }} 
+</hits:Result> +{% endfor %} +</fcs:DataView> \ No newline at end of file diff --git a/static/resource.xml b/static/resource.xml new file mode 100644 index 0000000000000000000000000000000000000000..a71b2143f50dcc2c08bceff90f4706198adbd414 --- /dev/null +++ b/static/resource.xml @@ -0,0 +1,5 @@ +<fcs:Resource xmlns:fcs="http://clarin.eu/fcs/resource"{% if resource.pid %} pid="{{ resource.pid }}"{% endif %}{% if resource.ref %} ref="{{ resource.ref }}"{% endif %}>{# attribute is named ref, same as on ResourceFragment #}{% if resource.dv_cmdi %}{% include 'dataview_cmdi.xml' %}{% endif %} +{% for rf in resource.resource_fragments %} +{% include 'resource_fragment.xml' %} +{% endfor %} +</fcs:Resource> \ No newline at end of file diff --git a/static/resource_fragment.xml b/static/resource_fragment.xml new file mode 100644 index 0000000000000000000000000000000000000000..66966a2799a8c48aaa6659475670357bbfbc2256 --- /dev/null +++ b/static/resource_fragment.xml @@ -0,0 +1,4 @@ +<fcs:ResourceFragment {% if rf.pid %} pid="{{ rf.pid }}"{% endif %}{% if rf.ref %} ref="{{ rf.ref }}"{% endif %}> +{% if rf.dv_hits %}{% include 'dataview_hits.xml' %}{% endif %} +{% if rf.dv_adv %}{% include 'dataview_adv.xml' %}{% endif %} +</fcs:ResourceFragment> \ No newline at end of file diff --git a/static/search_retrieve_response.xml b/static/search_retrieve_response.xml new file mode 100644 index 0000000000000000000000000000000000000000..0648ffea265aa6903ef42cd9ec73a8fecb3567d9 --- /dev/null +++ b/static/search_retrieve_response.xml @@ -0,0 +1,19 @@ +<?xml version='1.0' encoding='utf-8'?> +<sruResponse:searchRetrieveResponse xmlns:sruResponse="http://docs.oasis-open.org/ns/search-ws/sruResponse"> +<sruResponse:version>2.0</sruResponse:version> +<sruResponse:numberOfRecords>{{ n_hits }}</sruResponse:numberOfRecords> +<sruResponse:records>{% for record in records %} + <sruResponse:record> + <sruResponse:recordSchema>http://clarin.eu/fcs/resource</sruResponse:recordSchema> + <sruResponse:recordXMLEscaping>xml</sruResponse:recordXMLEscaping> + <sruResponse:recordData>{% for resource in record.resources %} + {% 
include 'resource.xml' %} + {% endfor %} + </sruResponse:recordData> + <sruResponse:recordPosition>{{ loop.index }}</sruResponse:recordPosition> + </sruResponse:record> +{% endfor %} +</sruResponse:records>{# the loop variable is out of scope here, so count the records returned; like recordPosition, this assumes the page starts at position 1 #}{% if n_hits > records|length %} +<sruResponse:nextRecordPosition>{{ records|length + 1 }}</sruResponse:nextRecordPosition>{% endif %} +<sruResponse:resultCountPrecision>info:srw/vocabulary/resultCountPrecision/1/exact</sruResponse:resultCountPrecision> +</sruResponse:searchRetrieveResponse> \ No newline at end of file