From 0e17f9a2a7287876f3d62a016ff638d7fd946fb2 Mon Sep 17 00:00:00 2001 From: felixwelter <felixwelter@gmail.com> Date: Fri, 4 Sep 2020 12:18:24 +0200 Subject: [PATCH] Add title focused index relying on whoosh --- app.py | 8 ++++---- search_index.py | 7 ++++--- title_focus_search_index.py | 31 ++++++++++++++++++++++++++++++- 3 files changed, 38 insertions(+), 8 deletions(-) diff --git a/app.py b/app.py index 62b66f8..5007778 100644 --- a/app.py +++ b/app.py @@ -6,14 +6,14 @@ import pdfplumber from flask import Flask, render_template, request, redirect, send_file, jsonify from werkzeug.utils import secure_filename -from title_focus_search_index import TitleFocusSearchIndex +from title_focus_search_index import IndexBasedTitleFocusSearchIndex app = Flask(__name__) SLIDE_DIR = "slides" IMAGE_DIR = "img_cache" -Index = TitleFocusSearchIndex +Index = IndexBasedTitleFocusSearchIndex @app.route('/') @@ -39,7 +39,7 @@ def upload(): index = Index() for i, page in enumerate(pdf.pages): text = page.extract_text() - index.add(str(file_path), i, text) + index.add(str(file_path), i, text, text.split("\n")[0]) # Assumes title in the first line img_name = str(file_path)[7:] + "_" + str(i) + ".jpg" img_path = os.path.join(IMAGE_DIR, img_name) page.to_image().save(img_path) @@ -52,7 +52,7 @@ def query(): try: index = Index() query = request.form.get("term") - context = request.form.get("context") # TODO: Use context to find better results + context = request.form.get("context") result = index.search(query, context) img_name = result["path"][7:] + "_" + str(result["page"]) + ".jpg" return jsonify({ diff --git a/search_index.py b/search_index.py index 28424f9..a56c39b 100644 --- a/search_index.py +++ b/search_index.py @@ -10,7 +10,8 @@ class BasicSearchIndex: """Expose relevant functions of Whoosh using a simple interface""" def __init__(self, index_dir="index"): - self.schema = Schema(path=ID(stored=True), page=NUMERIC(stored=True), content=TEXT(stored=True)) + self.schema = Schema(path=ID(stored=True), page=NUMERIC(stored=True), content=TEXT(stored=True), + title=TEXT(stored=True)) try: self.ix = open_dir(index_dir) except EmptyIndexError: @@ -19,9 +20,9 @@ class BasicSearchIndex: def create(self, index_dir): self.ix = create_in(index_dir, self.schema) - def add(self, path, page, content): + def add(self, path, page, content, title): writer = self.ix.writer() - writer.add_document(path=path, page=page, content=content) + writer.add_document(path=path, page=page, content=content, title=title) writer.commit() def result_list(self, query, context): diff --git a/title_focus_search_index.py b/title_focus_search_index.py index 2674628..6eb0f8e 100644 --- a/title_focus_search_index.py +++ b/title_focus_search_index.py @@ -1,12 +1,41 @@ +from nltk import word_tokenize +from whoosh.qparser import QueryParser, OrGroup + from search_index import BasicSearchIndex class TitleFocusSearchIndex(BasicSearchIndex): """Return fitting slides, favouring slides with the query contained in the title""" + def __init__(self, booster_words=None): + self.booster_words = booster_words or ["concept", "definition"] + super().__init__() + def search(self, query, context): results = self.result_list(query, context) + + # Look for result with query and booster word in the title for result in results: - if query.lower() in result["content"].split("\n")[0].lower(): + if query.lower() in result["title"].lower() \ + and any([bw in result["title"].lower() for bw in self.booster_words]): + return result + + for result in results: + if query.lower() in result["title"].lower(): return result return results[0] + + +class IndexBasedTitleFocusSearchIndex(BasicSearchIndex): + """Return fitting slides, querying for slides with the query contained in the title""" + + def result_list(self, query, context): + user_query = query + query = "title:({})^2".format(user_query) + query += " content:({})^1.2".format(user_query) + if len(context.strip()) > 0: + query += " title:({})^0.2".format(context) + query += " content:({})^0.1".format(context) + query_parser = QueryParser("content", self.ix.schema, group=OrGroup.factory(0.9)) + parsed_query = query_parser.parse(query) + return self.ix.searcher().search(parsed_query) -- GitLab