Skip to content
Snippets Groups Projects
Commit 8f440b45 authored by felixwelter's avatar felixwelter
Browse files

Add further context processing to TitleFocusSearchIndex

parent fcec07d2
No related branches found
No related tags found
No related merge requests found
FROM python:3.8
RUN pip install pdfplumber Whoosh Flask nltk Deprecated
RUN python -m nltk.downloader punkt
RUN pip install pdfplumber Whoosh Flask nltk Deprecated pandas
RUN python -m nltk.downloader punkt && python -m nltk.downloader stopwords
RUN apt-get update
RUN apt-get install -y libmagickwand-dev ghostscript
RUN rm /etc/ImageMagick-6/policy.xml
......
# German stop words: high-frequency function words (articles, pronouns,
# prepositions, auxiliaries) that carry little meaning for search.
# Entries are lowercase. The list must not contain empty strings: a blank
# "stop word" would make consumers match or delete empty tokens.
stopwords = [
    "aber", "alle", "allem", "allen", "aller", "alles", "als", "also", "am", "an",
    "ander", "andere", "anderem", "anderen", "anderer", "anderes", "anderm", "andern", "anderr", "anders",
    "auch", "auf", "aus", "bei", "bin", "bis", "bist", "da", "damit", "dann",
    "der", "den", "des", "dem", "die", "das", "daß",
    "derselbe", "derselben", "denselben", "desselben", "demselben", "dieselbe", "dieselben", "dasselbe",
    "dazu", "dein", "deine", "deinem", "deinen", "deiner", "deines", "denn", "derer", "dessen",
    "dich", "dir", "du", "dies", "diese", "diesem", "diesen", "dieser", "dieses",
    "doch", "dort", "durch",
    "ein", "eine", "einem", "einen", "einer", "eines",
    "einig", "einige", "einigem", "einigen", "einiger", "einiges", "einmal",
    "er", "ihn", "ihm", "es", "etwas",
    "euer", "eure", "eurem", "euren", "eurer", "eures",
    "für", "gegen", "gewesen", "hab", "habe", "haben", "hat", "hatte", "hatten",
    "hier", "hin", "hinter", "ich", "mich", "mir",
    "ihr", "ihre", "ihrem", "ihren", "ihrer", "ihres", "euch",
    "im", "in", "indem", "ins", "ist",
    "jede", "jedem", "jeden", "jeder", "jedes",
    "jene", "jenem", "jenen", "jener", "jenes", "jetzt",
    "kann", "kein", "keine", "keinem", "keinen", "keiner", "keines",
    "können", "könnte", "machen", "man",
    "manche", "manchem", "manchen", "mancher", "manches",
    "mein", "meine", "meinem", "meinen", "meiner", "meines",
    "mit", "muss", "musste", "nach", "nicht", "nichts", "noch", "nun", "nur",
    "ob", "oder", "ohne", "sehr",
    "sein", "seine", "seinem", "seinen", "seiner", "seines",
    "selbst", "sich", "sie", "ihnen", "sind", "so",
    "solche", "solchem", "solchen", "solcher", "solches",
    "soll", "sollte", "sondern", "sonst", "über", "um", "und",
    "uns", "unse", "unsem", "unsen", "unser", "unses", "unter",
    "viel", "vom", "von", "vor", "während",
    "war", "waren", "warst", "was", "weg", "weil", "weiter",
    "welche", "welchem", "welchen", "welcher", "welches",
    "wenn", "werde", "werden", "wie", "wieder", "will", "wir", "wird", "wirst",
    "wo", "wollen", "wollte", "würde", "würden",
    "zu", "zum", "zur", "zwar", "zwischen",
]
from whoosh.qparser import QueryParser, OrGroup
from nltk.corpus import stopwords
from .basic_search_index import BasicSearchIndex
from .german_stopwords import stopwords as german_stopwords
def clean_context(term, context):
    """Strip query terms and stop words from a context string.

    ``context`` is split on single spaces. Every resulting word that equals
    one of the space-separated tokens of ``term``, or an English or German
    stop word from the NLTK ``stopwords`` corpus, is dropped. Matching is
    exact and case-sensitive. The surviving words are re-joined with single
    spaces in their original order.

    Args:
        term: The user's query string; its non-blank tokens are removed
            from the context so they are not counted twice.
        context: Free text surrounding the query.

    Returns:
        The context with all occurrences of query tokens and stop words
        removed.
    """
    # Blank tokens (produced by repeated spaces in ``term``) are never
    # treated as removable words.
    unwanted = {token for token in term.split(" ") if token.strip()}
    for lang in ("english", "german"):
        unwanted.update(stopwords.words(lang))

    # Filter in a single pass so that *every* occurrence is dropped;
    # list.remove() would only delete the first match of each word.
    return " ".join(word for word in context.split(" ") if word not in unwanted)
class TitleFocusSearchIndex(BasicSearchIndex):
......@@ -8,6 +34,7 @@ class TitleFocusSearchIndex(BasicSearchIndex):
def result_list(self, query, context):
user_query = query
context = clean_context(user_query, context)
query = "title:({})^2".format(user_query)
query += " content:({})^1.2".format(user_query)
if len(context.strip()) > 0:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment