From 0e17f9a2a7287876f3d62a016ff638d7fd946fb2 Mon Sep 17 00:00:00 2001
From: felixwelter <felixwelter@gmail.com>
Date: Fri, 4 Sep 2020 12:18:24 +0200
Subject: [PATCH] Add title focused index relying on whoosh

---
 app.py                      |  8 ++++----
 search_index.py             |  7 ++++---
 title_focus_search_index.py | 31 ++++++++++++++++++++++++++++++-
 3 files changed, 38 insertions(+), 8 deletions(-)

diff --git a/app.py b/app.py
index 62b66f8..5007778 100644
--- a/app.py
+++ b/app.py
@@ -6,14 +6,14 @@ import pdfplumber
 from flask import Flask, render_template, request, redirect, send_file, jsonify
 from werkzeug.utils import secure_filename
 
-from title_focus_search_index import TitleFocusSearchIndex
+from title_focus_search_index import IndexBasedTitleFocusSearchIndex
 
 app = Flask(__name__)
 
 SLIDE_DIR = "slides"
 IMAGE_DIR = "img_cache"
 
-Index = TitleFocusSearchIndex
+Index = IndexBasedTitleFocusSearchIndex
 
 
 @app.route('/')
@@ -39,7 +39,7 @@ def upload():
                 index = Index()
                 for i, page in enumerate(pdf.pages):
                     text = page.extract_text()
-                    index.add(str(file_path), i, text)
+                    index.add(str(file_path), i, text, (text or "").split("\n")[0])  # Title = first line; text may be None
                     img_name = str(file_path)[7:] + "_" + str(i) + ".jpg"
                     img_path = os.path.join(IMAGE_DIR, img_name)
                     page.to_image().save(img_path)
@@ -52,7 +52,7 @@ def query():
     try:
         index = Index()
         query = request.form.get("term")
-        context = request.form.get("context")  # TODO: Use context to find better results
+        context = request.form.get("context")
         result = index.search(query, context)
         img_name = result["path"][7:] + "_" + str(result["page"]) + ".jpg"
         return jsonify({
diff --git a/search_index.py b/search_index.py
index 28424f9..a56c39b 100644
--- a/search_index.py
+++ b/search_index.py
@@ -10,7 +10,8 @@ class BasicSearchIndex:
     """Expose relevant functions of Whoosh using a simple interface"""
 
     def __init__(self, index_dir="index"):
-        self.schema = Schema(path=ID(stored=True), page=NUMERIC(stored=True), content=TEXT(stored=True))
+        self.schema = Schema(path=ID(stored=True), page=NUMERIC(stored=True), content=TEXT(stored=True),
+                             title=TEXT(stored=True))
         try:
             self.ix = open_dir(index_dir)
         except EmptyIndexError:
@@ -19,9 +20,9 @@ class BasicSearchIndex:
     def create(self, index_dir):
         self.ix = create_in(index_dir, self.schema)
 
-    def add(self, path, page, content):
+    def add(self, path, page, content, title):
         writer = self.ix.writer()
-        writer.add_document(path=path, page=page, content=content)
+        writer.add_document(path=path, page=page, content=content, title=title)
         writer.commit()
 
     def result_list(self, query, context):
diff --git a/title_focus_search_index.py b/title_focus_search_index.py
index 2674628..6eb0f8e 100644
--- a/title_focus_search_index.py
+++ b/title_focus_search_index.py
@@ -1,12 +1,41 @@
+from nltk import word_tokenize
+from whoosh.qparser import QueryParser, OrGroup
+
 from search_index import BasicSearchIndex
 
 
 class TitleFocusSearchIndex(BasicSearchIndex):
     """Return fitting slides, favouring slides with the query contained in the title"""
 
+    def __init__(self, booster_words=None):
+        self.booster_words = booster_words or ["concept", "definition"]  # words that promote a matching title
+        super().__init__()
+
     def search(self, query, context):
         results = self.result_list(query, context)
+        # Matching is case-insensitive: title, query and booster words are all lower-cased before comparing.
+        # First pass: prefer a title containing the query plus at least one booster word.
         for result in results:
-            if query.lower() in result["content"].split("\n")[0].lower():
+            if query.lower() in result["title"].lower() \
+                    and any(bw.lower() in result["title"].lower() for bw in self.booster_words):
+                return result
+        # Second pass: settle for any title containing the query; otherwise the top-ranked hit is returned.
+        for result in results:
+            if query.lower() in result["title"].lower():
                 return result
         return results[0]
+
+
+class IndexBasedTitleFocusSearchIndex(BasicSearchIndex):
+    """Rank slides with one boosted Whoosh query: title hits outweigh content hits, context terms weigh least."""
+
+    def result_list(self, query, context):
+        """Return the search results for ``query``; a missing or blank ``context`` adds no clauses."""
+        # Boosts: query in title ^2, in content ^1.2; context in title ^0.2, in content ^0.1.
+        clauses = ["title:({})^2".format(query), "content:({})^1.2".format(query)]
+        if context and context.strip():  # callers may pass None, e.g. when the form field is absent
+            clauses += ["title:({})^0.2".format(context), "content:({})^0.1".format(context)]
+        query_parser = QueryParser("content", self.ix.schema, group=OrGroup.factory(0.9))
+        # NOTE(review): the searcher opened below is never closed here - confirm who owns its lifetime.
+        parsed_query = query_parser.parse(" ".join(clauses))
+        return self.ix.searcher().search(parsed_query)
-- 
GitLab