From a38f5e974211443aaf1b8c577caad388747d61bb Mon Sep 17 00:00:00 2001
From: felixwelter <felixwelter@gmail.com>
Date: Sat, 5 Sep 2020 17:32:39 +0200
Subject: [PATCH] Store file_name instead of file_path of pdf in index

---
 app.py                             |  2 +-
 search_index/basic_search_index.py | 12 ++++++------
 slide_indexer/basic_indexer.py     |  9 +++++----
 3 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/app.py b/app.py
index 29e93d7..1bc3a64 100644
--- a/app.py
+++ b/app.py
@@ -54,7 +54,7 @@ def query():
         query = request.form.get("term")
         context = request.form.get("context")
         result = index.search(query, context)
-        img_name = result["path"].split(os.sep)[-1] + "_" + str(result["page"]) + ".jpg"
+        img_name = result["file_name"] + "_" + str(result["page"]) + ".jpg"
         return jsonify({
             "type": "image",
             "path": os.getenv('EXTERNAL_HOST', '<PLEASE_SET_EXTERNAL_HOST_ENV_VAR>') + "/slide/" + img_name
diff --git a/search_index/basic_search_index.py b/search_index/basic_search_index.py
index 546c8d2..8bf11a4 100644
--- a/search_index/basic_search_index.py
+++ b/search_index/basic_search_index.py
@@ -10,7 +10,7 @@ class BasicSearchIndex:
     """Expose relevant functions of Whoosh using a simple interface"""
 
     def __init__(self, index_dir="index"):
-        self.schema = Schema(path=ID(stored=True), page=NUMERIC(stored=True), content=TEXT(stored=True),
+        self.schema = Schema(file_name=ID(stored=True), page=NUMERIC(stored=True), content=TEXT(stored=True),
                              title=TEXT(stored=True))
         try:
             self.ix = open_dir(index_dir)
@@ -20,9 +20,9 @@ class BasicSearchIndex:
     def create(self, index_dir):
         self.ix = create_in(index_dir, self.schema)
 
-    def add(self, path, page, content, title):
+    def add(self, file_name, page, content, title):
         writer = self.ix.writer()
-        writer.add_document(path=path, page=page, content=content, title=title)
+        writer.add_document(file_name=file_name, page=page, content=content, title=title)
         writer.commit()
 
     def result_list(self, query, context):
@@ -36,8 +36,8 @@ class BasicSearchIndex:
 if __name__ == "__main__":
     os.makedirs("index_test")
     index = BasicSearchIndex("index_test")
-    index.add(path=u"/world", page=1, content=u"this is a test about the world")
-    index.add(path=u"/fire", page=2, content=u"i could not imagine the heat or the regression")
-    index.add(path=u"/dream", page=3, content=u"dreaming is special not only to humans but all animals")
+    index.add(file_name=u"/world", page=1, content=u"this is a test about the world", title=u"world")
+    index.add(file_name=u"/fire", page=2, content=u"i could not imagine the heat or the regression", title=u"fire")
+    index.add(file_name=u"/dream", page=3, content=u"dreaming is special not only to humans but all animals", title=u"dream")
     print(index.search("logistic regression"))
     shutil.rmtree('index_test', ignore_errors=True)
diff --git a/slide_indexer/basic_indexer.py b/slide_indexer/basic_indexer.py
index 8eac188..d8ac80d 100644
--- a/slide_indexer/basic_indexer.py
+++ b/slide_indexer/basic_indexer.py
@@ -16,12 +16,13 @@ class BasicIndexer():
     def skip_images(self):
         self.process_images = False
 
-    def add_pdf(self, file_path):
-        pdf = pdfplumber.open(file_path)
+    def add_pdf(self, pdf_file_path):
+        pdf = pdfplumber.open(pdf_file_path)
+        pdf_file_name = os.path.basename(pdf_file_path)
         for i, page in enumerate(pdf.pages):
             text = page.extract_text()
-            self.index.add(str(file_path), i, text, text.split("\n")[self.title_row])
+            self.index.add(pdf_file_name, i, text, text.split("\n")[self.title_row])
             if self.process_images:
-                img_name = file_path.split(os.sep)[-1] + "_" + str(i) + ".jpg"
+                img_name = pdf_file_name + "_" + str(i) + ".jpg"
                 img_path = os.path.join(self.image_dir, img_name)
                 page.to_image().save(img_path)
-- 
GitLab