From a38f5e974211443aaf1b8c577caad388747d61bb Mon Sep 17 00:00:00 2001 From: felixwelter <felixwelter@gmail.com> Date: Sat, 5 Sep 2020 17:32:39 +0200 Subject: [PATCH] Store file_name instead of file_path of pdf in index --- app.py | 2 +- search_index/basic_search_index.py | 12 ++++++------ slide_indexer/basic_indexer.py | 9 +++++---- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/app.py b/app.py index 29e93d7..1bc3a64 100644 --- a/app.py +++ b/app.py @@ -54,7 +54,7 @@ def query(): query = request.form.get("term") context = request.form.get("context") result = index.search(query, context) - img_name = result["path"].split(os.sep)[-1] + "_" + str(result["page"]) + ".jpg" + img_name = result["file_name"] + "_" + str(result["page"]) + ".jpg" return jsonify({ "type": "image", "path": os.getenv('EXTERNAL_HOST', '<PLEASE_SET_EXTERNAL_HOST_ENV_VAR>') + "/slide/" + img_name diff --git a/search_index/basic_search_index.py b/search_index/basic_search_index.py index 546c8d2..8bf11a4 100644 --- a/search_index/basic_search_index.py +++ b/search_index/basic_search_index.py @@ -10,7 +10,7 @@ class BasicSearchIndex: """Expose relevant functions of Whoosh using a simple interface""" def __init__(self, index_dir="index"): - self.schema = Schema(path=ID(stored=True), page=NUMERIC(stored=True), content=TEXT(stored=True), + self.schema = Schema(file_name=ID(stored=True), page=NUMERIC(stored=True), content=TEXT(stored=True), title=TEXT(stored=True)) try: self.ix = open_dir(index_dir) @@ -20,9 +20,9 @@ class BasicSearchIndex: def create(self, index_dir): self.ix = create_in(index_dir, self.schema) - def add(self, path, page, content, title): + def add(self, file_name, page, content, title): writer = self.ix.writer() - writer.add_document(path=path, page=page, content=content, title=title) + writer.add_document(file_name=file_name, page=page, content=content, title=title) writer.commit() def result_list(self, query, context): @@ -36,8 +36,8 @@ class BasicSearchIndex: if __name__ == "__main__": os.makedirs("index_test") index = BasicSearchIndex("index_test") - index.add(path=u"/world", page=1, content=u"this is a test about the world") - index.add(path=u"/fire", page=2, content=u"i could not imagine the heat or the regression") - index.add(path=u"/dream", page=3, content=u"dreaming is special not only to humans but all animals") + index.add(file_name=u"/world", page=1, content=u"this is a test about the world") + index.add(file_name=u"/fire", page=2, content=u"i could not imagine the heat or the regression") + index.add(file_name=u"/dream", page=3, content=u"dreaming is special not only to humans but all animals") print(index.search("logistic regression")) shutil.rmtree('index_test', ignore_errors=True) diff --git a/slide_indexer/basic_indexer.py b/slide_indexer/basic_indexer.py index 8eac188..d8ac80d 100644 --- a/slide_indexer/basic_indexer.py +++ b/slide_indexer/basic_indexer.py @@ -16,12 +16,13 @@ class BasicIndexer(): def skip_images(self): self.process_images = False - def add_pdf(self, file_path): - pdf = pdfplumber.open(file_path) + def add_pdf(self, pdf_file_path): + pdf = pdfplumber.open(pdf_file_path) + pdf_file_name = pdf_file_path.split(os.sep)[-1] for i, page in enumerate(pdf.pages): text = page.extract_text() - self.index.add(str(file_path), i, text, text.split("\n")[self.title_row]) + self.index.add(pdf_file_name, i, text, text.split("\n")[self.title_row]) if self.process_images: - img_name = file_path.split(os.sep)[-1] + "_" + str(i) + ".jpg" + img_name = pdf_file_name + "_" + str(i) + ".jpg" img_path = os.path.join(self.image_dir, img_name) page.to_image().save(img_path) -- GitLab