From 0ae7508557811729a4fc909f6f944b6a23d6e310 Mon Sep 17 00:00:00 2001
From: felixwelter <felixwelter@gmail.com>
Date: Fri, 4 Sep 2020 12:36:43 +0200
Subject: [PATCH] Refactor; Move index classes into module

---
 Dockerfile                                    |  2 +-
 app.py                                        |  4 +-
 search_index/__init__.py                      |  4 ++
 .../basic_search_index.py                     |  0
 .../cherry_pick_title_search_index.py         | 23 +++++++++++
 .../context_aware_search_index.py             |  8 ++--
 search_index/title_focus_search_index.py      | 17 ++++++++
 title_focus_search_index.py                   | 41 -------------------
 8 files changed, 52 insertions(+), 47 deletions(-)
 create mode 100644 search_index/__init__.py
 rename search_index.py => search_index/basic_search_index.py (100%)
 create mode 100644 search_index/cherry_pick_title_search_index.py
 rename context_aware_search_index.py => search_index/context_aware_search_index.py (71%)
 create mode 100644 search_index/title_focus_search_index.py
 delete mode 100644 title_focus_search_index.py

diff --git a/Dockerfile b/Dockerfile
index 2c82b9e..03b465b 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,6 +1,6 @@
 FROM python:3.8
 
-RUN pip install pdfplumber Whoosh Flask nltk
+RUN pip install pdfplumber Whoosh Flask nltk Deprecated
 RUN python -m nltk.downloader punkt
 RUN apt-get update
 RUN apt-get install -y libmagickwand-dev ghostscript
diff --git a/app.py b/app.py
index 5007778..5645317 100644
--- a/app.py
+++ b/app.py
@@ -6,14 +6,14 @@ import pdfplumber
 from flask import Flask, render_template, request, redirect, send_file, jsonify
 from werkzeug.utils import secure_filename
 
-from title_focus_search_index import IndexBasedTitleFocusSearchIndex
+from search_index import TitleFocusSearchIndex
 
 app = Flask(__name__)
 
 SLIDE_DIR = "slides"
 IMAGE_DIR = "img_cache"
 
-Index = IndexBasedTitleFocusSearchIndex
+Index = TitleFocusSearchIndex
 
 
 @app.route('/')
diff --git a/search_index/__init__.py b/search_index/__init__.py
new file mode 100644
index 0000000..0f7d6aa
--- /dev/null
+++ b/search_index/__init__.py
@@ -0,0 +1,4 @@
+from .basic_search_index import BasicSearchIndex
+from .cherry_pick_title_search_index import CherryPickTitleSearchIndex
+from .context_aware_search_index import ContextAwareSearchIndex
+from .title_focus_search_index import TitleFocusSearchIndex
diff --git a/search_index.py b/search_index/basic_search_index.py
similarity index 100%
rename from search_index.py
rename to search_index/basic_search_index.py
diff --git a/search_index/cherry_pick_title_search_index.py b/search_index/cherry_pick_title_search_index.py
new file mode 100644
index 0000000..1457e52
--- /dev/null
+++ b/search_index/cherry_pick_title_search_index.py
@@ -0,0 +1,23 @@
+from .basic_search_index import BasicSearchIndex
+
+
+class CherryPickTitleSearchIndex(BasicSearchIndex):
+    """Return fitting slides, favouring slides with the query contained in the title."""
+
+    def __init__(self, booster_words=None):
+        self.booster_words = booster_words or ["concept", "definition"]
+        super().__init__()
+
+    def search(self, query, context):
+        """Return the hit whose title best matches the query, else the top-ranked hit."""
+        results = self.result_list(query, context)
+        needle = query.lower()  # lower-case once instead of once per hit
+        # Prefer a title with the query plus a booster word, then a title with the query.
+        for hit in results:
+            title = hit["title"].lower()
+            if needle in title and any(word in title for word in self.booster_words):
+                return hit
+        for hit in results:
+            if needle in hit["title"].lower():
+                return hit
+        return results[0]  # NOTE(review): unguarded for an empty result set -- confirm callers
diff --git a/context_aware_search_index.py b/search_index/context_aware_search_index.py
similarity index 71%
rename from context_aware_search_index.py
rename to search_index/context_aware_search_index.py
index b6f3f64..9b20216 100644
--- a/context_aware_search_index.py
+++ b/search_index/context_aware_search_index.py
@@ -1,17 +1,19 @@
 from whoosh.qparser import QueryParser, OrGroup
 
-from search_index import BasicSearchIndex
 from nltk import word_tokenize
+from .basic_search_index import BasicSearchIndex
+from deprecated import deprecated
 
 
+@deprecated(reason="Uses overly complicated query creation. Use TitleFocusSearchIndex instead.")
 class ContextAwareSearchIndex(BasicSearchIndex):
-    """Return fitting slides, favouring slides with the query contained in the title"""
+    """Return fitting slides based on context and title"""
 
     def result_list(self, query, context):
         term = query + ""  # Make new object
         query = " ".join([token + "^1" for token in word_tokenize(term)])
         context_tokens = word_tokenize(context)
-        if len(context_tokens):
+        if len(context_tokens) > 0:
             individual_context_token_weight = str(round(1 / len(context_tokens), 3))
             query += " " + " ".join([token + "^" + individual_context_token_weight for token in context_tokens])
         query_parser = QueryParser("content", self.ix.schema, group=OrGroup.factory(0.9))
diff --git a/search_index/title_focus_search_index.py b/search_index/title_focus_search_index.py
new file mode 100644
index 0000000..1b452da
--- /dev/null
+++ b/search_index/title_focus_search_index.py
@@ -0,0 +1,17 @@
+from whoosh.qparser import QueryParser, OrGroup
+from .basic_search_index import BasicSearchIndex
+
+
+class TitleFocusSearchIndex(BasicSearchIndex):
+    """Return fitting slides, weighting title matches above content matches."""
+
+    def result_list(self, query, context):
+        """Search title and content for the query; a non-blank context adds low-weight terms."""
+        # Boosts: the user's query dominates, context terms only nudge the ranking.
+        clauses = ["title:({})^2".format(query), " content:({})^1.2".format(query)]
+        if context.strip():
+            clauses.append(" title:({})^0.2".format(context))
+            clauses.append(" content:({})^0.1".format(context))
+        parser = QueryParser("content", self.ix.schema, group=OrGroup.factory(0.9))
+        # NOTE(review): the searcher opened here is never closed -- confirm who owns its lifetime.
+        return self.ix.searcher().search(parser.parse("".join(clauses)))
diff --git a/title_focus_search_index.py b/title_focus_search_index.py
deleted file mode 100644
index 6eb0f8e..0000000
--- a/title_focus_search_index.py
+++ /dev/null
@@ -1,41 +0,0 @@
-from nltk import word_tokenize
-from whoosh.qparser import QueryParser, OrGroup
-
-from search_index import BasicSearchIndex
-
-
-class TitleFocusSearchIndex(BasicSearchIndex):
-    """Return fitting slides, favouring slides with the query contained in the title"""
-
-    def __init__(self, booster_words=None):
-        self.booster_words = booster_words or ["concept", "definition"]
-        super().__init__()
-
-    def search(self, query, context):
-        results = self.result_list(query, context)
-
-        # Look for result with query and booster word in the title
-        for result in results:
-            if query.lower() in result["title"].lower() \
-                    and any([bw in result["title"].lower() for bw in self.booster_words]):
-                return result
-
-        for result in results:
-            if query.lower() in result["title"].lower():
-                return result
-        return results[0]
-
-
-class IndexBasedTitleFocusSearchIndex(BasicSearchIndex):
-    """Return fitting slides, querying for slides with the query contained in the title"""
-
-    def result_list(self, query, context):
-        user_query = query
-        query = "title:({})^2".format(user_query)
-        query += " content:({})^1.2".format(user_query)
-        if len(context.strip()) > 0:
-            query += " title:({})^0.2".format(context)
-            query += " content:({})^0.1".format(context)
-        query_parser = QueryParser("content", self.ix.schema, group=OrGroup.factory(0.9))
-        parsed_query = query_parser.parse(query)
-        return self.ix.searcher().search(parsed_query)
-- 
GitLab