Skip to content
Snippets Groups Projects
Commit e46e1c80 authored by felixwelter
Browse files

Add option to give title row

parent 1310e543
Branches
Tags
No related merge requests found
...@@ -25,6 +25,13 @@ def index(): ...@@ -25,6 +25,13 @@ def index():
return render_template('index.html', ll=ll) return render_template('index.html', ll=ll)
@app.route('/current_index')
def current_index():
    """Return a plain HTML listing of every entry found by a wildcard search.

    Builds an ``Index`` over ``INDEX_DIR``, runs ``search("*", "")`` and
    emits one ``file_name page title`` line per result, separated by
    ``<br>`` tags.

    Returns:
        str: The HTML fragment sent back as the response body.
    """
    # Imported locally so the route does not rely on the module's import block.
    from html import escape

    # Named ``idx`` so the local does not shadow the ``index`` view function.
    idx = Index(index_dir=INDEX_DIR)
    # The stored fields come from indexed documents, so escape them before
    # embedding them in markup; only the <br> separators are trusted HTML.
    return "<br>".join(
        escape("{} {} {}".format(hit["file_name"], hit["page"], hit["title"]))
        for hit in idx.search("*", "")
    )
def allowed_file(filename):
    """Return True when *filename* has an extension and that extension is ``pdf``.

    Only the text after the last dot is examined, and the comparison is
    case-insensitive. Names without a dot are rejected.
    """
    if '.' not in filename:
        return False
    extension = filename.rsplit('.', 1)[1].lower()
    return extension in ["pdf"]
...@@ -39,7 +46,8 @@ def upload(): ...@@ -39,7 +46,8 @@ def upload():
files = request.files.getlist('files') files = request.files.getlist('files')
index = Index(index_dir=INDEX_DIR) index = Index(index_dir=INDEX_DIR)
indexer = BasicIndexer(index, IMAGE_DIR) indexer = BasicIndexer(index, IMAGE_DIR)
indexer.title_row = 2 # TODO: Add mechanism for automatic detection of user configuration if request.form['title_row']:
indexer.title_row = int(request.form['title_row'])
for i, file in enumerate(files): for i, file in enumerate(files):
if file.filename != '': if file.filename != '':
if file and allowed_file(file.filename): if file and allowed_file(file.filename):
......
...@@ -20,6 +20,7 @@ class BasicIndexer(): ...@@ -20,6 +20,7 @@ class BasicIndexer():
pdf_file_name = pdf_file_path.split(os.sep)[-1] pdf_file_name = pdf_file_path.split(os.sep)[-1]
for i, page in enumerate(pdf.pages): for i, page in enumerate(pdf.pages):
text = page.extract_text() text = page.extract_text()
print(self.title_row)
self.index.add(pdf_file_name, i + 1, text, text.split("\n")[self.title_row]) self.index.add(pdf_file_name, i + 1, text, text.split("\n")[self.title_row])
if self.process_images: if self.process_images:
img_name = pdf_file_name + "_" + str(i + 1) + ".jpg" img_name = pdf_file_name + "_" + str(i + 1) + ".jpg"
......
...@@ -6,7 +6,8 @@ ...@@ -6,7 +6,8 @@
<h1>Upload new slide</h1> <h1>Upload new slide</h1>
<form action="upload" enctype="multipart/form-data" method="post"> <form action="upload" enctype="multipart/form-data" method="post">
<input type="file" name="files" multiple=""> <input type="file" name="files" multiple="">
<input type="submit" value="Upload"> <input type="submit" value="Upload"><br>
Title row: <input type="number" name="title_row">
</form> </form>
<ul> <ul>
{% for item in ll %} {% for item in ll %}
...@@ -17,6 +18,9 @@ ...@@ -17,6 +18,9 @@
<form action="reset_index" method="post"> <form action="reset_index" method="post">
<input type="submit" value="Reset Index"> <input type="submit" value="Reset Index">
</form> </form>
<p>
<a href="current_index">View example of extracted titles</a>
</p>
<h1>Query</h1> <h1>Query</h1>
<form action="search" method="post"> <form action="search" method="post">
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment