Skip to content
Snippets Groups Projects
Commit d8ebf70d authored by felixwelter
Browse files

Add context aware search index

parent 9b2c67b2
No related branches found
No related tags found
No related merge requests found
FROM python:3.8 FROM python:3.8
RUN pip install pdfplumber Whoosh Flask RUN pip install pdfplumber Whoosh Flask nltk
RUN python -m nltk.downloader punkt
RUN apt-get update RUN apt-get update
RUN apt-get install -y libmagickwand-dev ghostscript RUN apt-get install -y libmagickwand-dev ghostscript
RUN rm /etc/ImageMagick-6/policy.xml RUN rm /etc/ImageMagick-6/policy.xml
......
...@@ -53,7 +53,7 @@ def query(): ...@@ -53,7 +53,7 @@ def query():
index = Index() index = Index()
query = request.form.get("term") query = request.form.get("term")
context = request.form.get("context") # TODO: Use context to find better results context = request.form.get("context") # TODO: Use context to find better results
result = index.search(query) result = index.search(query, context)
img_name = result["path"][7:] + "_" + str(result["page"]) + ".jpg" img_name = result["path"][7:] + "_" + str(result["page"]) + ".jpg"
return jsonify({ return jsonify({
"type": "image", "type": "image",
......
from whoosh.qparser import QueryParser, OrGroup
from search_index import BasicSearchIndex
from nltk import word_tokenize
class ContextAwareSearchIndex(BasicSearchIndex):
    """Return fitting slides, using the surrounding context as a weak hint.

    Tokens of the query itself are boosted with weight 1 each, while the
    tokens of the context share a total weight of roughly 1 between them,
    so a long context cannot drown out the actual query.
    """

    def result_list(self, query, context):
        """Search the index for ``query``, nudged by ``context``.

        Args:
            query: The search term entered by the user.
            context: Free text surrounding the query. May be ``None`` or
                empty, in which case only the query tokens are searched.

        Returns:
            The results object produced by the index searcher for the
            boosted query string.
        """
        # Whoosh query syntax: "token^boost". Every query token gets full weight.
        boosted_terms = [token + "^1" for token in word_tokenize(query)]

        # A missing form field arrives as None; tokenizing None would fail,
        # so treat it the same as an empty context.
        context_tokens = word_tokenize(context) if context else []
        if context_tokens:
            # Split a total weight of 1 evenly across all context tokens.
            context_weight = str(round(1 / len(context_tokens), 3))
            boosted_terms.extend(
                token + "^" + context_weight for token in context_tokens
            )

        query_parser = QueryParser(
            "content", self.ix.schema, group=OrGroup.factory(0.9)
        )
        # NOTE(review): the searcher is never closed here; presumably the
        # results must stay readable by the caller -- confirm ownership.
        return self.ix.searcher().search(
            query_parser.parse(" ".join(boosted_terms))
        )
...@@ -24,12 +24,12 @@ class BasicSearchIndex: ...@@ -24,12 +24,12 @@ class BasicSearchIndex:
writer.add_document(path=path, page=page, content=content) writer.add_document(path=path, page=page, content=content)
writer.commit() writer.commit()
def result_list(self, query): def result_list(self, query, context):
query_parser = QueryParser("content", self.ix.schema, group=OrGroup.factory(0.9)) query_parser = QueryParser("content", self.ix.schema, group=OrGroup.factory(0.9))
return self.ix.searcher().search(query_parser.parse(query)) return self.ix.searcher().search(query_parser.parse(query))
def search(self, query): def search(self, query, context):
return self.result_list(query)[0] return self.result_list(query, context)[0]
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -4,8 +4,8 @@ from search_index import BasicSearchIndex ...@@ -4,8 +4,8 @@ from search_index import BasicSearchIndex
class TitleFocusSearchIndex(BasicSearchIndex): class TitleFocusSearchIndex(BasicSearchIndex):
"""Return fitting slides, favouring slides with the query contained in the title""" """Return fitting slides, favouring slides with the query contained in the title"""
def search(self, query): def search(self, query, context):
results = self.result_list(query) results = self.result_list(query, context)
for result in results: for result in results:
if query.lower() in result["content"].split("\n")[0].lower(): if query.lower() in result["content"].split("\n")[0].lower():
return result return result
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment