diff --git a/app.py b/app.py index 1d890ef12d254070afee429dd1b87faf832d814c..8b520019825e9503ad347b2f4643dd7e794bbbbd 100644 --- a/app.py +++ b/app.py @@ -7,6 +7,7 @@ from flask import Flask, render_template, request, redirect, send_file, jsonify from werkzeug.utils import secure_filename from search_index import TitleFocusSearchIndex +from slide_indexer import BasicIndexer app = Flask(__name__) @@ -32,6 +33,8 @@ def allowed_file(filename): def upload(): if 'files' in request.files: files = request.files.getlist('files') + index = Index(index_dir=INDEX_DIR) + indexer = BasicIndexer(index, IMAGE_DIR) for i, file in enumerate(files): print(i, file) if file.filename != '': @@ -39,15 +42,9 @@ def upload(): filename = secure_filename(file.filename) file_path = os.path.join(Path(SLIDE_DIR), filename) file.save(file_path) - pdf = pdfplumber.open(file_path) - index = Index(index_dir=INDEX_DIR) - for i, page in enumerate(pdf.pages): - text = page.extract_text() - index.add(str(file_path), i, text, text.split("\n")[0]) # Assumes title in the first line - img_name = str(file_path)[7:] + "_" + str(i) + ".jpg" - img_path = os.path.join(IMAGE_DIR, img_name) - page.to_image().save(img_path) - del index + indexer.add_pdf(file_path) + del indexer + del index return redirect('/') diff --git a/slide_indexer/__init__.py b/slide_indexer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6c582745748df43d8ebe0eb837782d12110bc579 --- /dev/null +++ b/slide_indexer/__init__.py @@ -0,0 +1 @@ +from .basic_indexer import BasicIndexer \ No newline at end of file diff --git a/slide_indexer/basic_indexer.py b/slide_indexer/basic_indexer.py new file mode 100644 index 0000000000000000000000000000000000000000..fbb32013237767cec27a46cd02c2b396d3eaca2c --- /dev/null +++ b/slide_indexer/basic_indexer.py @@ -0,0 +1,20 @@ +import os + +import pdfplumber + + +class BasicIndexer(): + def __init__(self, index, image_dir): + self.index = index + self.image_dir = image_dir + + 
def add_pdf(self, file_path) -> None:
    """Index every page of a PDF and save one rendered image per page.

    For each page, the extracted text is added to ``self.index`` together
    with the file path, the zero-based page number and the first line of
    the text (used as the title). A rendering of the page is written to
    ``self.image_dir`` as ``<name>_<page number>.jpg``.

    Args:
        file_path: Location of the PDF to index (anything accepted by
            ``pdfplumber.open`` and ``str``).
    """
    source = str(file_path)
    # NOTE(review): the slice drops a fixed 7-character prefix, presumably
    # the slide directory name plus separator -- confirm against the caller
    # before changing, since existing image names depend on it.
    image_stem = source[7:]

    # Context manager guarantees the PDF handle is closed even if a page
    # fails to parse or an image cannot be written.
    with pdfplumber.open(file_path) as pdf:
        for page_number, page in enumerate(pdf.pages):
            # Pages without extractable text (e.g. scanned images) may
            # yield nothing; fall back to an empty string so the page is
            # still indexed instead of raising AttributeError.
            text = page.extract_text() or ""
            title = text.split("\n")[0]  # Assumes title in the first line
            self.index.add(source, page_number, text, title)

            img_name = f"{image_stem}_{page_number}.jpg"
            img_path = os.path.join(self.image_dir, img_name)
            page.to_image().save(img_path)