From 0ae7508557811729a4fc909f6f944b6a23d6e310 Mon Sep 17 00:00:00 2001 From: felixwelter <felixwelter@gmail.com> Date: Fri, 4 Sep 2020 12:36:43 +0200 Subject: [PATCH] Refactor; Move index classes into module --- Dockerfile | 2 +- app.py | 4 +- search_index/__init__.py | 4 ++ .../basic_search_index.py | 0 .../cherry_pick_title_search_index.py | 23 +++++++++++ .../context_aware_search_index.py | 8 ++-- search_index/title_focus_search_index.py | 17 ++++++++ title_focus_search_index.py | 41 ------------------- 8 files changed, 52 insertions(+), 47 deletions(-) create mode 100644 search_index/__init__.py rename search_index.py => search_index/basic_search_index.py (100%) create mode 100644 search_index/cherry_pick_title_search_index.py rename context_aware_search_index.py => search_index/context_aware_search_index.py (71%) create mode 100644 search_index/title_focus_search_index.py delete mode 100644 title_focus_search_index.py diff --git a/Dockerfile b/Dockerfile index 2c82b9e..03b465b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,6 @@ FROM python:3.8 -RUN pip install pdfplumber Whoosh Flask nltk +RUN pip install pdfplumber Whoosh Flask nltk Deprecated RUN python -m nltk.downloader punkt RUN apt-get update RUN apt-get install -y libmagickwand-dev ghostscript diff --git a/app.py b/app.py index 5007778..5645317 100644 --- a/app.py +++ b/app.py @@ -6,14 +6,14 @@ import pdfplumber from flask import Flask, render_template, request, redirect, send_file, jsonify from werkzeug.utils import secure_filename -from title_focus_search_index import IndexBasedTitleFocusSearchIndex +from search_index import TitleFocusSearchIndex app = Flask(__name__) SLIDE_DIR = "slides" IMAGE_DIR = "img_cache" -Index = IndexBasedTitleFocusSearchIndex +Index = TitleFocusSearchIndex @app.route('/') diff --git a/search_index/__init__.py b/search_index/__init__.py new file mode 100644 index 0000000..0f7d6aa --- /dev/null +++ b/search_index/__init__.py @@ -0,0 +1,4 @@ +from .basic_search_index import BasicSearchIndex +from .cherry_pick_title_search_index import CherryPickTitleSearchIndex +from .context_aware_search_index import ContextAwareSearchIndex +from .title_focus_search_index import TitleFocusSearchIndex diff --git a/search_index.py b/search_index/basic_search_index.py similarity index 100% rename from search_index.py rename to search_index/basic_search_index.py diff --git a/search_index/cherry_pick_title_search_index.py b/search_index/cherry_pick_title_search_index.py new file mode 100644 index 0000000..1457e52 --- /dev/null +++ b/search_index/cherry_pick_title_search_index.py @@ -0,0 +1,23 @@ +from search_index.basic_search_index import BasicSearchIndex + + +class CherryPickTitleSearchIndex(BasicSearchIndex): + """Return fitting slides, favouring slides with the query contained in the title""" + + def __init__(self, booster_words=None): + self.booster_words = booster_words or ["concept", "definition"] + super().__init__() + + def search(self, query, context): + results = self.result_list(query, context) + + # Look for result with query and booster word in the title + for result in results: + if query.lower() in result["title"].lower() \ + and any([bw in result["title"].lower() for bw in self.booster_words]): + return result + + for result in results: + if query.lower() in result["title"].lower(): + return result + return results[0] \ No newline at end of file diff --git a/context_aware_search_index.py b/search_index/context_aware_search_index.py similarity index 71% rename from context_aware_search_index.py rename to search_index/context_aware_search_index.py index b6f3f64..9b20216 100644 --- a/context_aware_search_index.py +++ b/search_index/context_aware_search_index.py @@ -1,17 +1,19 @@ from whoosh.qparser import QueryParser, OrGroup -from search_index import BasicSearchIndex from nltk import word_tokenize +from .basic_search_index import BasicSearchIndex +from deprecated import deprecated +@deprecated(reason="Uses overly complicated query creation. Use TitleFocusSearchIndex instead.") class ContextAwareSearchIndex(BasicSearchIndex): - """Return fitting slides, favouring slides with the query contained in the title""" + """Return fitting slides based on context and title""" def result_list(self, query, context): term = query + "" # Make new object query = " ".join([token + "^1" for token in word_tokenize(term)]) context_tokens = word_tokenize(context) - if len(context_tokens): + if len(context_tokens) > 0: individual_context_token_weight = str(round(1 / len(context_tokens), 3)) query += " " + " ".join([token + "^" + individual_context_token_weight for token in context_tokens]) query_parser = QueryParser("content", self.ix.schema, group=OrGroup.factory(0.9)) diff --git a/search_index/title_focus_search_index.py b/search_index/title_focus_search_index.py new file mode 100644 index 0000000..1b452da --- /dev/null +++ b/search_index/title_focus_search_index.py @@ -0,0 +1,17 @@ +from whoosh.qparser import QueryParser, OrGroup +from .basic_search_index import BasicSearchIndex + + +class TitleFocusSearchIndex(BasicSearchIndex): + """Return fitting slides, boosting terms contained in the title""" + + def result_list(self, query, context): + user_query = query + query = "title:({})^2".format(user_query) + query += " content:({})^1.2".format(user_query) + if len(context.strip()) > 0: + query += " title:({})^0.2".format(context) + query += " content:({})^0.1".format(context) + query_parser = QueryParser("content", self.ix.schema, group=OrGroup.factory(0.9)) + parsed_query = query_parser.parse(query) + return self.ix.searcher().search(parsed_query) diff --git a/title_focus_search_index.py b/title_focus_search_index.py deleted file mode 100644 index 6eb0f8e..0000000 --- a/title_focus_search_index.py +++ /dev/null @@ -1,41 +0,0 @@ -from nltk import word_tokenize -from whoosh.qparser import QueryParser, OrGroup - -from search_index import BasicSearchIndex - - -class TitleFocusSearchIndex(BasicSearchIndex): - """Return fitting slides, favouring slides with the query contained in the title""" - - def __init__(self, booster_words=None): - self.booster_words = booster_words or ["concept", "definition"] - super().__init__() - - def search(self, query, context): - results = self.result_list(query, context) - - # Look for result with query and booster word in the title - for result in results: - if query.lower() in result["title"].lower() \ - and any([bw in result["title"].lower() for bw in self.booster_words]): - return result - - for result in results: - if query.lower() in result["title"].lower(): - return result - return results[0] - - -class IndexBasedTitleFocusSearchIndex(BasicSearchIndex): - """Return fitting slides, querying for slides with the query contained in the title""" - - def result_list(self, query, context): - user_query = query - query = "title:({})^2".format(user_query) - query += " content:({})^1.2".format(user_query) - if len(context.strip()) > 0: - query += " title:({})^0.2".format(context) - query += " content:({})^0.1".format(context) - query_parser = QueryParser("content", self.ix.schema, group=OrGroup.factory(0.9)) - parsed_query = query_parser.parse(query) - return self.ix.searcher().search(parsed_query) -- GitLab