From dedac951d7ac097105402f6f89e1193cda0e3e64 Mon Sep 17 00:00:00 2001 From: felixwelter <felixwelter@gmail.com> Date: Fri, 4 Sep 2020 15:57:05 +0200 Subject: [PATCH] Refactor; extract indexing routine into indexer class --- app.py | 15 ++++++--------- slide_indexer/__init__.py | 1 + slide_indexer/basic_indexer.py | 20 ++++++++++++++++++++ 3 files changed, 27 insertions(+), 9 deletions(-) create mode 100644 slide_indexer/__init__.py create mode 100644 slide_indexer/basic_indexer.py diff --git a/app.py b/app.py index 1d890ef..8b52001 100644 --- a/app.py +++ b/app.py @@ -7,6 +7,7 @@ from flask import Flask, render_template, request, redirect, send_file, jsonify from werkzeug.utils import secure_filename from search_index import TitleFocusSearchIndex +from slide_indexer import BasicIndexer app = Flask(__name__) @@ -32,6 +33,8 @@ def allowed_file(filename): def upload(): if 'files' in request.files: files = request.files.getlist('files') + index = Index(index_dir=INDEX_DIR) + indexer = BasicIndexer(index, IMAGE_DIR) for i, file in enumerate(files): print(i, file) if file.filename != '': @@ -39,15 +42,9 @@ def upload(): filename = secure_filename(file.filename) file_path = os.path.join(Path(SLIDE_DIR), filename) file.save(file_path) - pdf = pdfplumber.open(file_path) - index = Index(index_dir=INDEX_DIR) - for i, page in enumerate(pdf.pages): - text = page.extract_text() - index.add(str(file_path), i, text, text.split("\n")[0]) # Assumes title in the first line - img_name = str(file_path)[7:] + "_" + str(i) + ".jpg" - img_path = os.path.join(IMAGE_DIR, img_name) - page.to_image().save(img_path) - del index + indexer.add_pdf(file_path) + del indexer + del index return redirect('/') diff --git a/slide_indexer/__init__.py b/slide_indexer/__init__.py new file mode 100644 index 0000000..6c58274 --- /dev/null +++ b/slide_indexer/__init__.py @@ -0,0 +1 @@ +from .basic_indexer import BasicIndexer \ No newline at end of file diff --git a/slide_indexer/basic_indexer.py 
"""slide_indexer/basic_indexer.py: index PDF slide decks page by page."""

import os

import pdfplumber


class BasicIndexer:
    """Feed the pages of PDF files into a search index and render page images.

    Attributes:
        index: Object whose ``add`` method is called once per page as
            ``index.add(path, page_number, text, first_line_of_text)``.
        image_dir: Directory that receives one rendered image per page.
    """

    def __init__(self, index, image_dir):
        self.index = index
        self.image_dir = image_dir

    def add_pdf(self, file_path) -> None:
        """Index every page of the PDF at *file_path* and save a page image.

        For each page the extracted text is added to ``self.index`` (using the
        first text line as the title) and the page is rendered to
        ``<image_dir>/<name>_<page number>.jpg``.

        Args:
            file_path: Location of the PDF file to index.
        """
        # NOTE(review): the image name drops the first 7 characters of the
        # path -- presumably the slide-directory prefix; confirm against the
        # caller (and whatever reads the images) before changing this offset.
        image_stem = str(file_path)[7:]

        # The context manager closes the PDF even when a page fails to process,
        # so repeated uploads do not leak open file handles.
        with pdfplumber.open(file_path) as pdf:
            for page_number, page in enumerate(pdf.pages):
                # extract_text() may return None for a page without any text
                # (e.g. an image-only slide); index it with empty text instead
                # of crashing on None.split().
                text = page.extract_text() or ""
                title = text.split("\n")[0]  # Assumes title in the first line
                self.index.add(str(file_path), page_number, text, title)

                image_name = f"{image_stem}_{page_number}.jpg"
                image_path = os.path.join(self.image_dir, image_name)
                page.to_image().save(image_path)