Skip to content
Snippets Groups Projects
Commit 0e17f9a2 authored by felixwelter's avatar felixwelter
Browse files

Add title focused index relying on whoosh

parent d8ebf70d
No related branches found
No related tags found
No related merge requests found
......@@ -6,14 +6,14 @@ import pdfplumber
from flask import Flask, render_template, request, redirect, send_file, jsonify
from werkzeug.utils import secure_filename
from title_focus_search_index import TitleFocusSearchIndex
from title_focus_search_index import IndexBasedTitleFocusSearchIndex
app = Flask(__name__)
SLIDE_DIR = "slides"
IMAGE_DIR = "img_cache"
Index = TitleFocusSearchIndex
Index = IndexBasedTitleFocusSearchIndex
@app.route('/')
......@@ -39,7 +39,7 @@ def upload():
index = Index()
for i, page in enumerate(pdf.pages):
text = page.extract_text()
index.add(str(file_path), i, text)
index.add(str(file_path), i, text, text.split("\n")[0]) # Assumes title in the first line
img_name = str(file_path)[7:] + "_" + str(i) + ".jpg"
img_path = os.path.join(IMAGE_DIR, img_name)
page.to_image().save(img_path)
......@@ -52,7 +52,7 @@ def query():
try:
index = Index()
query = request.form.get("term")
context = request.form.get("context") # TODO: Use context to find better results
context = request.form.get("context")
result = index.search(query, context)
img_name = result["path"][7:] + "_" + str(result["page"]) + ".jpg"
return jsonify({
......
......@@ -10,7 +10,8 @@ class BasicSearchIndex:
"""Expose relevant functions of Whoosh using a simple interface"""
def __init__(self, index_dir="index"):
self.schema = Schema(path=ID(stored=True), page=NUMERIC(stored=True), content=TEXT(stored=True))
self.schema = Schema(path=ID(stored=True), page=NUMERIC(stored=True), content=TEXT(stored=True),
title=TEXT(stored=True))
try:
self.ix = open_dir(index_dir)
except EmptyIndexError:
......@@ -19,9 +20,9 @@ class BasicSearchIndex:
def create(self, index_dir):
    """Create an index in ``index_dir`` from this instance's schema and keep it in ``self.ix``."""
    fresh_index = create_in(index_dir, self.schema)
    self.ix = fresh_index
def add(self, path, page, content, title):
    """Add a single document to the index and commit it immediately.

    :param path: value stored in the ``path`` field of the document
    :param page: value stored in the ``page`` field of the document
    :param content: value stored in the ``content`` field of the document
    :param title: value stored in the ``title`` field of the document
    """
    # The writer is used as a context manager so that it is committed on
    # success and cancelled if add_document raises, instead of being left
    # open (and holding the index write lock) after an error.
    with self.ix.writer() as writer:
        writer.add_document(path=path, page=page, content=content, title=title)
def result_list(self, query, context):
......
from nltk import word_tokenize
from whoosh.qparser import QueryParser, OrGroup
from search_index import BasicSearchIndex
class TitleFocusSearchIndex(BasicSearchIndex):
    """Return fitting slides, favouring slides with the query contained in the title"""

    def __init__(self, booster_words=None):
        """Remember the booster words and initialise the underlying index.

        :param booster_words: words that make a matching title even more
            preferable; falls back to ``["concept", "definition"]`` when
            omitted or empty.
        """
        self.booster_words = booster_words or ["concept", "definition"]
        super().__init__()

    def search(self, query, context):
        """Pick the single best result for ``query``.

        Preference order:

        1. the first result whose title contains the query and a booster word,
        2. the first result whose title contains the query,
        3. the first result overall.

        All comparisons are case-insensitive.

        :param query: search term entered by the user
        :param context: additional context, passed through to ``result_list``
        :return: the chosen result
        :raises IndexError: presumably, when the result list is empty (this is
            what indexing an empty list-like result set does) - confirm
            against the result type returned by ``result_list``.
        """
        results = self.result_list(query, context)

        # Lowercase once up front. Booster words are lowercased as well,
        # because they are compared against an already lowercased title.
        needle = query.lower()
        boosters = [word.lower() for word in self.booster_words]

        first_title_match = None
        for result in results:
            title = result["title"].lower()
            if needle not in title:
                continue
            if any(word in title for word in boosters):
                # Best case: query and booster word both appear in the title.
                return result
            if first_title_match is None:
                # Remember the earliest plain title match as a fallback.
                first_title_match = result

        if first_title_match is not None:
            return first_title_match
        return results[0]
class IndexBasedTitleFocusSearchIndex(BasicSearchIndex):
    """Return fitting slides, querying for slides with the query contained in the title"""

    def result_list(self, query, context):
        """Search title and content with different boosts and return the results.

        The user query is matched against the title (boost 2) and the content
        (boost 1.2). If a non-blank context is given, it is matched as well,
        with much smaller boosts (0.2 for the title, 0.1 for the content).

        :param query: search term entered by the user
        :param context: optional context text; ``None``, empty and
            whitespace-only values are ignored
        :return: the results of running the combined query on the index
        """
        clauses = [
            "title:({})^2".format(query),
            "content:({})^1.2".format(query),
        ]
        # Callers may pass None (e.g. a missing form field), so check for
        # that before calling strip().
        if context and context.strip():
            clauses.append("title:({})^0.2".format(context))
            clauses.append("content:({})^0.1".format(context))

        # NOTE(review): query and context are inserted into the query string
        # verbatim, so query-syntax characters in them (e.g. parentheses) are
        # interpreted by the parser - confirm whether that is intended.
        query_parser = QueryParser("content", self.ix.schema, group=OrGroup.factory(0.9))
        parsed_query = query_parser.parse(" ".join(clauses))

        # NOTE(review): the searcher is never closed here; presumably the
        # returned results still need it open - confirm before changing.
        return self.ix.searcher().search(parsed_query)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment