import os


class BasicIndexer:
    """Add the pages of a single PDF file to an index.

    Every page is handed to ``index.add`` together with its extracted text
    and a title taken from one line of that text. Unless image processing
    has been switched off with :meth:`skip_images`, each page is also
    rendered and saved as a ``.jpg`` below ``image_dir``.

    Attributes:
        index: Object exposing ``add(path, page_number, text, title)``.
        image_dir: Directory the rendered page images are written to.
        slide_dir: Directory the PDFs live in; used to derive image names
            relative to it.
        process_images: When false, no page images are rendered.
        title_row: Zero-based line of the page text used as the title.
    """

    def __init__(self, index, image_dir, slide_dir):
        self.index = index
        self.image_dir = image_dir
        self.slide_dir = slide_dir
        self.process_images = True
        self.title_row = 0

    def skip_images(self):
        """Disable rendering of page images for later ``add_pdf`` calls."""
        self.process_images = False

    def add_pdf(self, file_path):
        """Index every page of the PDF at ``file_path``.

        Args:
            file_path: Path of the PDF to read (``str`` or path-like).
        """
        # Imported here so this block does not depend on import lines that
        # are outside the visible source; it is only needed to open a PDF.
        import pdfplumber

        # The context manager closes the underlying file handle even when a
        # page fails to parse or an image cannot be written.
        with pdfplumber.open(file_path) as pdf:
            for page_number, page in enumerate(pdf.pages):
                # Pages without a text layer may yield None instead of "".
                text = page.extract_text() or ""
                self.index.add(
                    str(file_path), page_number, text, self._title_of(text)
                )
                if self.process_images:
                    self._save_image(page, file_path, page_number)

    def _title_of(self, text):
        """Return line ``title_row`` of ``text``, or "" if it has no such line."""
        try:
            return text.split("\n")[self.title_row]
        except IndexError:
            return ""

    def _image_name(self, file_path, page_number):
        """Return the image file name for one page, relative to ``image_dir``.

        The name is the PDF path relative to ``slide_dir`` followed by
        ``_<page_number>.jpg``. It never starts with a path separator, since
        ``os.path.join`` would otherwise drop ``image_dir``. PDFs that are
        not located below ``slide_dir`` fall back to their base name.
        """
        path = str(file_path)
        try:
            relative = os.path.relpath(path, self.slide_dir)
        except ValueError:
            # Empty path, or a different drive on Windows.
            relative = os.pardir
        outside = relative == os.pardir or relative.startswith(
            os.pardir + os.sep
        )
        if outside:
            relative = os.path.basename(path)
        return "{}_{}.jpg".format(relative, page_number)

    def _save_image(self, page, file_path, page_number):
        """Render ``page`` and save it as a ``.jpg`` below ``image_dir``."""
        img_path = os.path.join(
            self.image_dir, self._image_name(file_path, page_number)
        )
        parent = os.path.dirname(img_path)
        if parent:
            # PDFs in sub-folders of slide_dir map to sub-folders here.
            os.makedirs(parent, exist_ok=True)
        page.to_image().save(img_path)