Add context aware search index

d8ebf70d · felixwelter · 9b2c67b2 · d8ebf70d · d8ebf70d · d8ebf70d
Commit d8ebf70d authored 4 years ago by felixwelter
--- a/Dockerfile
+++ b/Dockerfile
 FROM python:3.8
-RUN pip install pdfplumber Whoosh Flask
+RUN pip install pdfplumber Whoosh Flask nltk
+# Fetch NLTK's "punkt" tokenizer data at build time; presumably required by the
+# nltk.word_tokenize calls in context_aware_search_index.py -- TODO confirm.
+RUN python -m nltk.downloader punkt
 RUN apt-get update
 RUN apt-get install -y libmagickwand-dev ghostscript
+# NOTE(review): presumably removes ImageMagick's default policy so that PDF
+# conversion via ghostscript is permitted -- confirm.
 RUN rm /etc/ImageMagick-6/policy.xml

--- a/app.py
+++ b/app.py
@@ -53,7 +53,7 @@ def query():
        index = Index()
        query = request.form.get("term")
        context = request.form.get("context")  # TODO: Use context to find better results
-        result = index.search(query)
+        result = index.search(query, context)
        img_name = result["path"][7:] + "_" + str(result["page"]) + ".jpg"
        return jsonify({
            "type": "image",

--- a/context_aware_search_index.py
+++ b/context_aware_search_index.py
+from whoosh.qparser import QueryParser, OrGroup
+from search_index import BasicSearchIndex
+from nltk import word_tokenize
+class ContextAwareSearchIndex(BasicSearchIndex):
+    """Return fitting slides, ranking by the query terms and, more weakly, by the context terms."""
+    def result_list(self, query, context):
+        """Search the "content" field for the query tokens plus weighted context tokens.
+
+        Every query token is boosted with ``^1``. The context tokens share a
+        total boost of roughly 1: each one gets ``1 / len(context_tokens)``,
+        rounded to three decimals. A missing or empty ``context`` adds no
+        context terms instead of failing inside the tokenizer.
+        """
+        boosted_terms = [token + "^1" for token in word_tokenize(query)]
+        # The caller may pass None (e.g. a missing form field); treat it as no context.
+        context_tokens = word_tokenize(context) if context else []
+        if context_tokens:
+            context_weight = str(round(1 / len(context_tokens), 3))
+            boosted_terms.extend(token + "^" + context_weight for token in context_tokens)
+        query_parser = QueryParser("content", self.ix.schema, group=OrGroup.factory(0.9))
+        return self.ix.searcher().search(query_parser.parse(" ".join(boosted_terms)))
--- a/search_index.py
+++ b/search_index.py
@@ -24,12 +24,12 @@ class BasicSearchIndex:
        writer.add_document(path=path, page=page, content=content)
        writer.commit()
-    def result_list(self, query):
+    def result_list(self, query, context):
+        """Return the search results for ``query`` in the "content" field.
+
+        ``context`` is accepted but not used by this implementation.
+        """
        query_parser = QueryParser("content", self.ix.schema, group=OrGroup.factory(0.9))
        return self.ix.searcher().search(query_parser.parse(query))
-    def search(self, query):
+    def search(self, query, context):
+        """Return the first entry of ``result_list(query, context)``.
+
+        NOTE(review): indexing with ``[0]`` presumably fails when there are no hits -- confirm.
+        """
-        return self.result_list(query)[0]
+        return self.result_list(query, context)[0]
 if __name__ == "__main__":

--- a/title_focus_search_index.py
+++ b/title_focus_search_index.py
@@ -4,8 +4,8 @@ from search_index import BasicSearchIndex
 class TitleFocusSearchIndex(BasicSearchIndex):
    """Return fitting slides, favouring slides with the query contained in the title"""
-    def search(self, query):
+    def search(self, query, context):
-        results = self.result_list(query)
+        results = self.result_list(query, context)
        for result in results:
            if query.lower() in result["content"].split("\n")[0].lower():
                return result