Store the PDF's file_name instead of its file_path in the index

a38f5e97 · felixwelter · 62e861e2 · a38f5e97 · a38f5e97 · a38f5e97
Commit a38f5e97 authored Sep 5, 2020 by felixwelter
--- a/app.py
+++ b/app.py
@@ -54,7 +54,7 @@ def query():
        query = request.form.get("term")
        context = request.form.get("context")
        result = index.search(query, context)
-        img_name = result["path"].split(os.sep)[-1] + "_" + str(result["page"]) + ".jpg"
+        img_name = result["file_name"] + "_" + str(result["page"]) + ".jpg"
        return jsonify({
            "type": "image",
            "path": os.getenv('EXTERNAL_HOST', '<PLEASE_SET_EXTERNAL_HOST_ENV_VAR>') + "/slide/" + img_name

--- a/search_index/basic_search_index.py
+++ b/search_index/basic_search_index.py
@@ -10,7 +10,7 @@ class BasicSearchIndex:
    """Expose relevant functions of Whoosh using a simple interface"""
    def __init__(self, index_dir="index"):
-        self.schema = Schema(path=ID(stored=True), page=NUMERIC(stored=True), content=TEXT(stored=True),
+        self.schema = Schema(file_name=ID(stored=True), page=NUMERIC(stored=True), content=TEXT(stored=True),
                             title=TEXT(stored=True))
        try:
            self.ix = open_dir(index_dir)
@@ -20,9 +20,9 @@ class BasicSearchIndex:
    def create(self, index_dir):
        self.ix = create_in(index_dir, self.schema)
-    def add(self, path, page, content, title):
+    def add(self, file_name, page, content, title):
        writer = self.ix.writer()
-        writer.add_document(path=path, page=page, content=content, title=title)
+        writer.add_document(file_name=file_name, page=page, content=content, title=title)
        writer.commit()
    def result_list(self, query, context):
@@ -36,8 +36,8 @@ class BasicSearchIndex:
 if __name__ == "__main__":
    os.makedirs("index_test")
    index = BasicSearchIndex("index_test")
-    index.add(path=u"/world", page=1, content=u"this is a test about the world")
+    index.add(file_name=u"/world", page=1, content=u"this is a test about the world")
-    index.add(path=u"/fire", page=2, content=u"i could not imagine the heat or the regression")
+    index.add(file_name=u"/fire", page=2, content=u"i could not imagine the heat or the regression")
-    index.add(path=u"/dream", page=3, content=u"dreaming is special not only to humans but all animals")
+    index.add(file_name=u"/dream", page=3, content=u"dreaming is special not only to humans but all animals")
    print(index.search("logistic regression"))
    shutil.rmtree('index_test', ignore_errors=True)
--- a/slide_indexer/basic_indexer.py
+++ b/slide_indexer/basic_indexer.py
@@ -16,12 +16,13 @@ class BasicIndexer():
    def skip_images(self):
        self.process_images = False
-    def add_pdf(self, file_path):
+    def add_pdf(self, pdf_file_path):
-        pdf = pdfplumber.open(file_path)
+        pdf = pdfplumber.open(pdf_file_path)
+        pdf_file_name = pdf_file_path.split(os.sep)[-1]
        for i, page in enumerate(pdf.pages):
            text = page.extract_text()
-            self.index.add(str(file_path), i, text, text.split("\n")[self.title_row])
+            self.index.add(pdf_file_name, i, text, text.split("\n")[self.title_row])
            if self.process_images:
-                img_name = file_path.split(os.sep)[-1] + "_" + str(i) + ".jpg"
+                img_name = pdf_file_name + "_" + str(i) + ".jpg"
                img_path = os.path.join(self.image_dir, img_name)
                page.to_image().save(img_path)