Add title-focused index relying on Whoosh

0e17f9a2 · felixwelter · d8ebf70d · 0e17f9a2 · 0e17f9a2 · 0e17f9a2
Commit 0e17f9a2 authored 4 years ago by felixwelter
--- a/app.py
+++ b/app.py
@@ -6,14 +6,14 @@ import pdfplumber
 from flask import Flask, render_template, request, redirect, send_file, jsonify
 from werkzeug.utils import secure_filename

-from title_focus_search_index import TitleFocusSearchIndex
+from title_focus_search_index import IndexBasedTitleFocusSearchIndex

 app = Flask(__name__)

 SLIDE_DIR = "slides"
 IMAGE_DIR = "img_cache"

-Index = TitleFocusSearchIndex
+Index = IndexBasedTitleFocusSearchIndex


 @app.route('/')
@@ -39,7 +39,7 @@ def upload():
                index = Index()
                for i, page in enumerate(pdf.pages):
                    text = page.extract_text()
-                    index.add(str(file_path), i, text)
+                    index.add(str(file_path), i, text, text.split("\n")[0])  # Assumes title in the first line
                    img_name = str(file_path)[7:] + "_" + str(i) + ".jpg"
                    img_path = os.path.join(IMAGE_DIR, img_name)
                    page.to_image().save(img_path)
@@ -52,7 +52,7 @@ def query():
    try:
        index = Index()
        query = request.form.get("term")
-        context = request.form.get("context")  # TODO: Use context to find better results
+        context = request.form.get("context")
        result = index.search(query, context)
        img_name = result["path"][7:] + "_" + str(result["page"]) + ".jpg"
        return jsonify({

--- a/search_index.py
+++ b/search_index.py
@@ -10,7 +10,8 @@ class BasicSearchIndex:
    """Expose relevant functions of Whoosh using a simple interface"""

    def __init__(self, index_dir="index"):
-        self.schema = Schema(path=ID(stored=True), page=NUMERIC(stored=True), content=TEXT(stored=True))
+        self.schema = Schema(path=ID(stored=True), page=NUMERIC(stored=True), content=TEXT(stored=True),
+                             title=TEXT(stored=True))
        try:
            self.ix = open_dir(index_dir)
        except EmptyIndexError:
@@ -19,9 +20,9 @@ class BasicSearchIndex:
    def create(self, index_dir):
        self.ix = create_in(index_dir, self.schema)

-    def add(self, path, page, content):
+    def add(self, path, page, content, title):
        writer = self.ix.writer()
-        writer.add_document(path=path, page=page, content=content)
+        writer.add_document(path=path, page=page, content=content, title=title)
        writer.commit()

    def result_list(self, query, context):

--- a/title_focus_search_index.py
+++ b/title_focus_search_index.py
+from nltk import word_tokenize
+from whoosh.qparser import QueryParser, OrGroup
+
 from search_index import BasicSearchIndex


 class TitleFocusSearchIndex(BasicSearchIndex):
    """Return fitting slides, favouring slides with the query contained in the title"""

+    def __init__(self, booster_words=None):
+        """Store the booster words and initialise the underlying index.
+
+        :param booster_words: words that make a title match preferred;
+            defaults to ``["concept", "definition"]`` when omitted or empty
+        """
+        # Booster words are compared against the lower-cased title in search(),
+        # so they are expected to be given in lower case.
+        self.booster_words = booster_words or ["concept", "definition"]
+        super().__init__()
+
    def search(self, query, context):
        results = self.result_list(query, context)
+
+        # Look for result with query and booster word in the title
        for result in results:
-            if query.lower() in result["content"].split("\n")[0].lower():
+            if query.lower() in result["title"].lower() \
+                    and any([bw in result["title"].lower() for bw in self.booster_words]):
+                return result
+
+        # Second pass: accept any result whose title contains the query
+        for result in results:
+            if query.lower() in result["title"].lower():
                return result
+        # No title matched: fall back to the first entry of the result list.
+        # NOTE(review): presumably this raises when the result list is empty - confirm.
        return results[0]
+
+
+class IndexBasedTitleFocusSearchIndex(BasicSearchIndex):
+    """Return fitting slides, querying for slides with the query contained in the title"""
+
+    def result_list(self, query, context):
+        """Search title and content for the query, weighting title matches highest.
+
+        The user query is searched in ``title`` (boost 2) and ``content``
+        (boost 1.2). A non-blank ``context`` is added as a weak hint on both
+        fields (boosts 0.2 and 0.1).
+
+        :param query: search term entered by the user
+        :param context: optional surrounding text; ``None`` or blank is ignored
+        :return: the search results for the combined query
+        """
+        user_query = query
+        query = "title:({})^2".format(user_query)
+        query += " content:({})^1.2".format(user_query)
+        # The caller reads context from an optional form field, so it may be
+        # None; treat None the same as a blank string instead of crashing.
+        if context and context.strip():
+            query += " title:({})^0.2".format(context)
+            query += " content:({})^0.1".format(context)
+        # NOTE(review): the raw strings are spliced into the query syntax, so
+        # input containing parentheses may alter the grouping - confirm whether
+        # escaping is needed.
+        query_parser = QueryParser("content", self.ix.schema, group=OrGroup.factory(0.9))
+        parsed_query = query_parser.parse(query)
+        # NOTE(review): the searcher is never closed here; presumably the
+        # returned results still read from it - confirm before adding a close().
+        return self.ix.searcher().search(parsed_query)