diff --git a/Dockerfile b/Dockerfile index 3b45dcd5241d297a73ef5f908540c245a6feae77..2c82b9eaf73f037375d7004f9e82ccf206310ffb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,7 @@ FROM python:3.8 -RUN pip install pdfplumber Whoosh Flask +RUN pip install pdfplumber Whoosh Flask nltk +RUN python -m nltk.downloader punkt RUN apt-get update RUN apt-get install -y libmagickwand-dev ghostscript RUN rm /etc/ImageMagick-6/policy.xml diff --git a/app.py b/app.py index 2746bc2c59c88b1bbc70204006c0d76d93cfc94b..62b66f83c80387e3c681cd50440988b2da2f6b14 100644 --- a/app.py +++ b/app.py @@ -53,7 +53,7 @@ def query(): index = Index() query = request.form.get("term") context = request.form.get("context") # TODO: Use context to find better results - result = index.search(query) + result = index.search(query, context) img_name = result["path"][7:] + "_" + str(result["page"]) + ".jpg" return jsonify({ "type": "image", diff --git a/context_aware_search_index.py b/context_aware_search_index.py new file mode 100644 index 0000000000000000000000000000000000000000..b6f3f64d391524f3f7650a36f203a5cdff9371cd --- /dev/null +++ b/context_aware_search_index.py @@ -0,0 +1,18 @@ +from whoosh.qparser import QueryParser, OrGroup + +from search_index import BasicSearchIndex +from nltk import word_tokenize + + +class ContextAwareSearchIndex(BasicSearchIndex): + """Return fitting slides, boosting each query token by 1 and each context token by 1 / (number of context tokens)""" + + def result_list(self, query, context): + term = query + "" # Keep the raw search term; query is rebound to the boosted query string below + query = " ".join([token + "^1" for token in word_tokenize(term)]) + context_tokens = word_tokenize(context) + if len(context_tokens): + individual_context_token_weight = str(round(1 / len(context_tokens), 3)) + query += " " + " ".join([token + "^" + individual_context_token_weight for token in context_tokens]) + query_parser = QueryParser("content", self.ix.schema, group=OrGroup.factory(0.9)) + return self.ix.searcher().search(query_parser.parse(query)) diff --git 
a/search_index.py b/search_index.py index 67b2f6f8578498ec55dac060527f637bfedb5821..28424f917d4ca20c0ca74495f6900dee5a07d08e 100644 --- a/search_index.py +++ b/search_index.py @@ -24,12 +24,12 @@ class BasicSearchIndex: writer.add_document(path=path, page=page, content=content) writer.commit() - def result_list(self, query): + def result_list(self, query, context): query_parser = QueryParser("content", self.ix.schema, group=OrGroup.factory(0.9)) return self.ix.searcher().search(query_parser.parse(query)) - def search(self, query): - return self.result_list(query)[0] + def search(self, query, context): + return self.result_list(query, context)[0] if __name__ == "__main__": diff --git a/title_focus_search_index.py b/title_focus_search_index.py index 72fc131123f378f226d5d3b69d646cfe6bb1159a..26746282da56985156f8eb0ce71ebe70b4675d9b 100644 --- a/title_focus_search_index.py +++ b/title_focus_search_index.py @@ -4,8 +4,8 @@ from search_index import BasicSearchIndex class TitleFocusSearchIndex(BasicSearchIndex): """Return fitting slides, favouring slides with the query contained in the title""" - def search(self, query): - results = self.result_list(query) + def search(self, query, context): + results = self.result_list(query, context) for result in results: if query.lower() in result["content"].split("\n")[0].lower(): return result