# German stop words used when cleaning search context.
#
# Kept as a plain list (callers may index or iterate it) and in its original
# order. The empty-string entries that were present before were artefacts of
# blank lines in the source list and have been removed: an empty string is
# not a word and can break consumers that build patterns or call
# ``str.replace`` with each entry.
stopwords = [
    "aber", "alle", "allem", "allen", "aller", "alles", "als", "also",
    "am", "an", "ander", "andere", "anderem", "anderen", "anderer", "anderes",
    "anderm", "andern", "anderr", "anders", "auch", "auf", "aus", "bei",
    "bin", "bis", "bist", "da", "damit", "dann", "der", "den",
    "des", "dem", "die", "das", "daß", "derselbe", "derselben", "denselben",
    "desselben", "demselben", "dieselbe", "dieselben", "dasselbe", "dazu",
    "dein", "deine", "deinem", "deinen", "deiner", "deines", "denn", "derer",
    "dessen", "dich", "dir", "du", "dies", "diese", "diesem", "diesen",
    "dieser", "dieses", "doch", "dort", "durch", "ein", "eine", "einem",
    "einen", "einer", "eines", "einig", "einige", "einigem", "einigen",
    "einiger", "einiges", "einmal", "er", "ihn", "ihm", "es", "etwas",
    "euer", "eure", "eurem", "euren", "eurer", "eures", "für", "gegen",
    "gewesen", "hab", "habe", "haben", "hat", "hatte", "hatten", "hier",
    "hin", "hinter", "ich", "mich", "mir", "ihr", "ihre", "ihrem",
    "ihren", "ihrer", "ihres", "euch", "im", "in", "indem", "ins",
    "ist", "jede", "jedem", "jeden", "jeder", "jedes", "jene", "jenem",
    "jenen", "jener", "jenes", "jetzt", "kann", "kein", "keine", "keinem",
    "keinen", "keiner", "keines", "können", "könnte", "machen", "man",
    "manche", "manchem", "manchen", "mancher", "manches", "mein", "meine",
    "meinem", "meinen", "meiner", "meines", "mit", "muss", "musste", "nach",
    "nicht", "nichts", "noch", "nun", "nur", "ob", "oder", "ohne",
    "sehr", "sein", "seine", "seinem", "seinen", "seiner", "seines", "selbst",
    "sich", "sie", "ihnen", "sind", "so", "solche", "solchem", "solchen",
    "solcher", "solches", "soll", "sollte", "sondern", "sonst", "über", "um",
    "und", "uns", "unse", "unsem", "unsen", "unser", "unses", "unter",
    "viel", "vom", "von", "vor", "während", "war", "waren", "warst",
    "was", "weg", "weil", "weiter", "welche", "welchem", "welchen", "welcher",
    "welches", "wenn", "werde", "werden", "wie", "wieder", "will", "wir",
    "wird", "wirst", "wo", "wollen", "wollte", "würde", "würden", "zu",
    "zum", "zur", "zwar", "zwischen",
]
def clean_context(term, context, stop_words=None):
    """Strip query terms and stop words from a free-text search context.

    Every whitespace-separated token of ``context`` is dropped when it
    matches, ignoring case, either a token of ``term`` or a stop word.
    All occurrences are removed, not just the first one, and the relative
    order of the surviving tokens is preserved.

    Args:
        term: The user's search query; its tokens are removed from the
            context so they are not searched for twice.
        context: Free text surrounding the query. ``None`` or an empty
            string yields an empty result.
        stop_words: Optional iterable of stop words. When omitted, the
            English and German lists from ``nltk.corpus.stopwords`` (the
            module-level ``stopwords`` import) are used.

    Returns:
        The remaining context tokens joined by single spaces; ``""`` when
        nothing is left.
    """
    if not context:
        return ""

    if stop_words is None:
        # Default vocabulary: NLTK's English and German stop word lists.
        stop_words = (
            word
            for lang in ("english", "german")
            for word in stopwords.words(lang)
        )

    # A set gives one O(1) lookup per context token instead of one list
    # scan per stop word; case-folding makes "The"/"Und" match the
    # lowercase stop word lists.
    blocked = {word.casefold() for word in stop_words}
    blocked.update(token.casefold() for token in (term or "").split())

    # str.split() without an argument also discards empty tokens produced
    # by repeated spaces, tabs or newlines.
    return " ".join(
        token for token in context.split() if token.casefold() not in blocked
    )