Skip to content
Snippets Groups Projects
Commit abe6a8ff authored by felixwelter
Browse files

Add configuration options to BasicIndexer

parent 8f440b45
Branches
Tags
No related merge requests found
......@@ -4,17 +4,24 @@ import pdfplumber
class BasicIndexer:
    """Add the pages of PDF files to an index and optionally render them.

    For every page of a PDF, the extracted text and a title line are passed
    to ``index.add``; unless image processing is switched off, the page is
    also rendered and saved as a ``.jpg`` file below ``image_dir``.
    """

    def __init__(self, index, image_dir, slide_dir):
        """Store the collaborators and set the default configuration.

        Args:
            index: Object whose ``add(path, page_number, text, title)``
                method is called once per page.
            image_dir: Directory the rendered page images are written to.
            slide_dir: Leading part of each PDF path that is stripped when
                the image file name is derived from the PDF path.
        """
        self.index = index
        self.image_dir = image_dir
        self.slide_dir = slide_dir
        # Pages are rendered by default; skip_images() turns this off.
        self.process_images = True
        # Zero-based line of the page text that is used as the page title.
        self.title_row = 0

    def skip_images(self):
        """Disable rendering of page images for subsequent add_pdf calls."""
        self.process_images = False

    def add_pdf(self, file_path):
        """Add a single pdf file to the index.

        Args:
            file_path: Path of the PDF file; it is converted with ``str``
                before being handed to the index and used for image names.
        """
        # The context manager closes the PDF even if a page fails.
        with pdfplumber.open(file_path) as pdf:
            for page_number, page in enumerate(pdf.pages):
                # NOTE(review): extract_text() appears to return None for
                # pages without text in some pdfplumber versions; treat
                # that like an empty page instead of crashing.
                text = page.extract_text() or ""
                self.index.add(
                    str(file_path), page_number, text, self._title(text)
                )
                if not self.process_images:
                    continue
                img_path = os.path.join(
                    self.image_dir, self._image_name(file_path, page_number)
                )
                # PDFs in sub-directories of slide_dir keep their relative
                # path, so make sure the target directory exists.
                target_dir = os.path.dirname(img_path)
                if target_dir:
                    os.makedirs(target_dir, exist_ok=True)
                page.to_image().save(img_path)

    def _title(self, text):
        """Return line ``title_row`` of *text*, or "" if there is no such line."""
        lines = (text or "").split("\n")
        try:
            return lines[self.title_row]
        except IndexError:
            return ""

    def _image_name(self, file_path, page_number):
        """Return the image file name for one page, relative to image_dir."""
        path = str(file_path)
        prefix = str(self.slide_dir)
        if path.startswith(prefix):
            relative = path[len(prefix):]
        else:
            # The path is not below slide_dir; blindly slicing it would
            # yield an arbitrary fragment, so fall back to the file name.
            relative = os.path.basename(path)
        # A leading separator would make os.path.join discard image_dir.
        relative = relative.lstrip(os.sep + (os.altsep or ""))
        return "{}_{}.jpg".format(relative, page_number)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment