Skip to content
Snippets Groups Projects
Commit a38f5e97 authored by felixwelter
Browse files

Store file_name instead of file_path of pdf in index

parent 62e861e2
No related branches found
No related tags found
No related merge requests found
...@@ -54,7 +54,7 @@ def query(): ...@@ -54,7 +54,7 @@ def query():
query = request.form.get("term") query = request.form.get("term")
context = request.form.get("context") context = request.form.get("context")
result = index.search(query, context) result = index.search(query, context)
img_name = result["path"].split(os.sep)[-1] + "_" + str(result["page"]) + ".jpg" img_name = result["file_name"] + "_" + str(result["page"]) + ".jpg"
return jsonify({ return jsonify({
"type": "image", "type": "image",
"path": os.getenv('EXTERNAL_HOST', '<PLEASE_SET_EXTERNAL_HOST_ENV_VAR>') + "/slide/" + img_name "path": os.getenv('EXTERNAL_HOST', '<PLEASE_SET_EXTERNAL_HOST_ENV_VAR>') + "/slide/" + img_name
......
...@@ -10,7 +10,7 @@ class BasicSearchIndex: ...@@ -10,7 +10,7 @@ class BasicSearchIndex:
"""Expose relevant functions of Whoosh using a simple interface""" """Expose relevant functions of Whoosh using a simple interface"""
def __init__(self, index_dir="index"): def __init__(self, index_dir="index"):
self.schema = Schema(path=ID(stored=True), page=NUMERIC(stored=True), content=TEXT(stored=True), self.schema = Schema(file_name=ID(stored=True), page=NUMERIC(stored=True), content=TEXT(stored=True),
title=TEXT(stored=True)) title=TEXT(stored=True))
try: try:
self.ix = open_dir(index_dir) self.ix = open_dir(index_dir)
...@@ -20,9 +20,9 @@ class BasicSearchIndex: ...@@ -20,9 +20,9 @@ class BasicSearchIndex:
def create(self, index_dir): def create(self, index_dir):
self.ix = create_in(index_dir, self.schema) self.ix = create_in(index_dir, self.schema)
def add(self, path, page, content, title): def add(self, file_name, page, content, title):
writer = self.ix.writer() writer = self.ix.writer()
writer.add_document(path=path, page=page, content=content, title=title) writer.add_document(file_name=file_name, page=page, content=content, title=title)
writer.commit() writer.commit()
def result_list(self, query, context): def result_list(self, query, context):
...@@ -36,8 +36,8 @@ class BasicSearchIndex: ...@@ -36,8 +36,8 @@ class BasicSearchIndex:
if __name__ == "__main__": if __name__ == "__main__":
os.makedirs("index_test") os.makedirs("index_test")
index = BasicSearchIndex("index_test") index = BasicSearchIndex("index_test")
index.add(path=u"/world", page=1, content=u"this is a test about the world") index.add(file_name=u"/world", page=1, content=u"this is a test about the world")
index.add(path=u"/fire", page=2, content=u"i could not imagine the heat or the regression") index.add(file_name=u"/fire", page=2, content=u"i could not imagine the heat or the regression")
index.add(path=u"/dream", page=3, content=u"dreaming is special not only to humans but all animals") index.add(file_name=u"/dream", page=3, content=u"dreaming is special not only to humans but all animals")
print(index.search("logistic regression")) print(index.search("logistic regression"))
shutil.rmtree('index_test', ignore_errors=True) shutil.rmtree('index_test', ignore_errors=True)
...@@ -16,12 +16,13 @@ class BasicIndexer(): ...@@ -16,12 +16,13 @@ class BasicIndexer():
def skip_images(self): def skip_images(self):
self.process_images = False self.process_images = False
def add_pdf(self, file_path): def add_pdf(self, pdf_file_path):
pdf = pdfplumber.open(file_path) pdf = pdfplumber.open(pdf_file_path)
pdf_file_name = pdf_file_path.split(os.sep)[-1]
for i, page in enumerate(pdf.pages): for i, page in enumerate(pdf.pages):
text = page.extract_text() text = page.extract_text()
self.index.add(str(file_path), i, text, text.split("\n")[self.title_row]) self.index.add(pdf_file_name, i, text, text.split("\n")[self.title_row])
if self.process_images: if self.process_images:
img_name = file_path.split(os.sep)[-1] + "_" + str(i) + ".jpg" img_name = pdf_file_name + "_" + str(i) + ".jpg"
img_path = os.path.join(self.image_dir, img_name) img_path = os.path.join(self.image_dir, img_name)
page.to_image().save(img_path) page.to_image().save(img_path)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment