diff --git a/Dockerfile b/Dockerfile
index 2c82b9eaf73f037375d7004f9e82ccf206310ffb..03b465b4026cca14ac53aacc8baa357adfcbef11 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,6 +1,6 @@
FROM python:3.8
-RUN pip install pdfplumber Whoosh Flask nltk
+RUN pip install pdfplumber Whoosh Flask nltk Deprecated
RUN python -m nltk.downloader punkt
RUN apt-get update
RUN apt-get install -y libmagickwand-dev ghostscript
diff --git a/app.py b/app.py
index 5007778e9ff1b640aaa0aec24e823a928dd4fa08..5645317215ac1de255662ebe676a1d4902b90f19 100644
--- a/app.py
+++ b/app.py
@@ -6,14 +6,14 @@ import pdfplumber
from flask import Flask, render_template, request, redirect, send_file, jsonify
from werkzeug.utils import secure_filename
-from title_focus_search_index import IndexBasedTitleFocusSearchIndex
+from search_index import TitleFocusSearchIndex
app = Flask(__name__)
SLIDE_DIR = "slides"
IMAGE_DIR = "img_cache"
-Index = IndexBasedTitleFocusSearchIndex
+Index = TitleFocusSearchIndex
@app.route('/')
diff --git a/search_index/__init__.py b/search_index/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f7d6aa6d840f3079a3656255ac432058624ccdd
--- /dev/null
+++ b/search_index/__init__.py
@@ -0,0 +1,13 @@
+"""Search index variants exposed by the ``search_index`` package."""
+
+from .basic_search_index import BasicSearchIndex
+from .cherry_pick_title_search_index import CherryPickTitleSearchIndex
+from .context_aware_search_index import ContextAwareSearchIndex
+from .title_focus_search_index import TitleFocusSearchIndex
+
+# Declare the re-exported classes explicitly so the imports above are
+# recognised as the package's public API rather than unused names.
+__all__ = [
+    "BasicSearchIndex",
+    "CherryPickTitleSearchIndex",
+    "ContextAwareSearchIndex",
+    "TitleFocusSearchIndex",
+]
diff --git a/search_index.py b/search_index/basic_search_index.py
similarity index 100%
rename from search_index.py
rename to search_index/basic_search_index.py
diff --git a/search_index/cherry_pick_title_search_index.py b/search_index/cherry_pick_title_search_index.py
new file mode 100644
index 0000000000000000000000000000000000000000..1457e52c70dd8ffca7bfde11c595358e86ca4b80
--- /dev/null
+++ b/search_index/cherry_pick_title_search_index.py
@@ -0,0 +1,49 @@
+from .basic_search_index import BasicSearchIndex
+
+
+class CherryPickTitleSearchIndex(BasicSearchIndex):
+    """Return fitting slides, favouring slides with the query in the title.
+
+    ``search`` picks one hit from ``result_list``: first choice is a hit whose
+    title contains both the query and a booster word, second choice is a hit
+    whose title contains the query, last resort is the top-ranked hit.
+    """
+
+    def __init__(self, booster_words=None):
+        """Store the booster words and initialise the underlying index.
+
+        :param booster_words: words that promote a title match; falls back to
+            ``["concept", "definition"]`` when omitted or empty.
+        """
+        self.booster_words = booster_words or ["concept", "definition"]
+        super().__init__()
+
+    def search(self, query, context):
+        """Return the single best hit for ``query``.
+
+        :param query: search term; matched case-insensitively against titles.
+        :param context: passed through unchanged to ``result_list``.
+        :return: the selected hit from ``result_list``.
+        :raises IndexError: presumably, when ``result_list`` yields nothing --
+            the final fallback indexes ``results[0]`` unguarded.
+        """
+        results = self.result_list(query, context)
+        needle = query.lower()
+        # Titles are lower-cased before comparison, so the booster words must
+        # be as well; otherwise a custom word such as "Concept" never matches.
+        boosters = [word.lower() for word in self.booster_words]
+
+        first_title_match = None
+        for result in results:
+            title = result["title"].lower()
+            if needle not in title:
+                continue
+            if any(word in title for word in boosters):
+                # Query and booster word both in the title: best possible hit.
+                return result
+            if first_title_match is None:
+                first_title_match = result
+
+        if first_title_match is not None:
+            return first_title_match
+        return results[0]
diff --git a/context_aware_search_index.py b/search_index/context_aware_search_index.py
similarity index 71%
rename from context_aware_search_index.py
rename to search_index/context_aware_search_index.py
index b6f3f64d391524f3f7650a36f203a5cdff9371cd..9b20216d2ef4cdaa18f781eb445a6b214b9746a0 100644
--- a/context_aware_search_index.py
+++ b/search_index/context_aware_search_index.py
@@ -1,17 +1,19 @@
from whoosh.qparser import QueryParser, OrGroup
-from search_index import BasicSearchIndex
from nltk import word_tokenize
+from .basic_search_index import BasicSearchIndex
+from deprecated import deprecated
+@deprecated(reason="Uses overly complicated query creation. Use TitleFocusSearchIndex instead.")
class ContextAwareSearchIndex(BasicSearchIndex):
- """Return fitting slides, favouring slides with the query contained in the title"""
+ """Return fitting slides based on context and title"""
def result_list(self, query, context):
term = query + "" # Make new object
query = " ".join([token + "^1" for token in word_tokenize(term)])
context_tokens = word_tokenize(context)
- if len(context_tokens):
+ if len(context_tokens) > 0:
individual_context_token_weight = str(round(1 / len(context_tokens), 3))
query += " " + " ".join([token + "^" + individual_context_token_weight for token in context_tokens])
query_parser = QueryParser("content", self.ix.schema, group=OrGroup.factory(0.9))
diff --git a/search_index/title_focus_search_index.py b/search_index/title_focus_search_index.py
new file mode 100644
index 0000000000000000000000000000000000000000..1b452da0522cec0a274d95aa1878756b1d6d1c09
--- /dev/null
+++ b/search_index/title_focus_search_index.py
@@ -0,0 +1,33 @@
+from whoosh.qparser import QueryParser, OrGroup
+from .basic_search_index import BasicSearchIndex
+
+
+class TitleFocusSearchIndex(BasicSearchIndex):
+    """Return fitting slides, boosting terms contained in the title"""
+
+    def result_list(self, query, context):
+        """Search title and content for ``query``, with ``context`` as a weak hint.
+
+        One query string is assembled in which the user's query is applied to
+        the ``title`` field (``^2``) and the ``content`` field (``^1.2``). A
+        non-blank ``context`` adds the same two clauses with ``^0.2`` and
+        ``^0.1``.
+
+        :param query: the user's search terms.
+        :param context: surrounding text; skipped when empty or whitespace.
+        :return: whatever ``self.ix.searcher().search(...)`` yields for the
+            parsed query.
+        """
+        # NOTE(review): both strings are inserted verbatim, so query-syntax
+        # characters in them (e.g. parentheses) reach the parser unescaped.
+        clauses = [
+            f"title:({query})^2",
+            f"content:({query})^1.2",
+        ]
+        if context.strip():
+            clauses.append(f"title:({context})^0.2")
+            clauses.append(f"content:({context})^0.1")
+
+        parser = QueryParser("content", self.ix.schema, group=OrGroup.factory(0.9))
+        parsed = parser.parse(" ".join(clauses))
+        return self.ix.searcher().search(parsed)
diff --git a/title_focus_search_index.py b/title_focus_search_index.py
deleted file mode 100644
index 6eb0f8ec436152470210b8326bac805c49a6531b..0000000000000000000000000000000000000000
--- a/title_focus_search_index.py
+++ /dev/null
@@ -1,41 +0,0 @@
-from nltk import word_tokenize
-from whoosh.qparser import QueryParser, OrGroup
-
-from search_index import BasicSearchIndex
-
-
-class TitleFocusSearchIndex(BasicSearchIndex):
- """Return fitting slides, favouring slides with the query contained in the title"""
-
- def __init__(self, booster_words=None):
- self.booster_words = booster_words or ["concept", "definition"]
- super().__init__()
-
- def search(self, query, context):
- results = self.result_list(query, context)
-
- # Look for result with query and booster word in the title
- for result in results:
- if query.lower() in result["title"].lower() \
- and any([bw in result["title"].lower() for bw in self.booster_words]):
- return result
-
- for result in results:
- if query.lower() in result["title"].lower():
- return result
- return results[0]
-
-
-class IndexBasedTitleFocusSearchIndex(BasicSearchIndex):
- """Return fitting slides, querying for slides with the query contained in the title"""
-
- def result_list(self, query, context):
- user_query = query
- query = "title:({})^2".format(user_query)
- query += " content:({})^1.2".format(user_query)
- if len(context.strip()) > 0:
- query += " title:({})^0.2".format(context)
- query += " content:({})^0.1".format(context)
- query_parser = QueryParser("content", self.ix.schema, group=OrGroup.factory(0.9))
- parsed_query = query_parser.parse(query)
- return self.ix.searcher().search(parsed_query)