From 75810af4bb2412f3cbf60a859b9dab17ab9e4380 Mon Sep 17 00:00:00 2001
From: "Zeit-Altpeter, Lukas" <lukas.zeit-altpeter@uni-hamburg.de>
Date: Tue, 10 May 2022 13:50:32 +0200
Subject: [PATCH] initial commit

---
 main.py          | 88 ++++++++++++++++++++++++++++++++++++++++++++++++
 requirements.txt |  9 +++++
 sample_list.txt  |  3 ++
 3 files changed, 100 insertions(+)
 create mode 100755 main.py
 create mode 100644 requirements.txt
 create mode 100644 sample_list.txt

diff --git a/main.py b/main.py
new file mode 100755
index 0000000..9037657
--- /dev/null
+++ b/main.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python3
+"""Fetch the HTML title of every URL listed in a text file.
+
+Usage: main.py URL_LIST_FILE
+
+The input file holds one URL per line; blank lines are ignored. One title per
+URL is written, in input order, to the file named by ``output_filename``.
+"""
+import sys
+
+import requests
+from bs4 import BeautifulSoup
+from tqdm import tqdm
+
+output_filename = "output.txt"
+
+# Written in place of a title whenever a page cannot be fetched or parsed.
+FAILURE_TITLE = "--- Titel konnte nicht gescraped werden. ---"
+
+# Seconds to wait for a server; without a timeout requests can block forever.
+REQUEST_TIMEOUT = 10
+
+
+def read_urls(path):
+    """Return the stripped, non-blank lines of the UTF-8 text file at *path*.
+
+    Raises:
+        OSError: if the file cannot be opened or read.
+        UnicodeDecodeError: if the file is not valid UTF-8.
+    """
+    with open(path, "r", encoding="utf-8") as f:
+        stripped = (line.strip() for line in f)
+        return [line for line in stripped if line]
+
+
+def fetch_title(url):
+    """Return the text of the title element of the page at *url*.
+
+    A URL without a scheme is requested over plain HTTP. Failing to download
+    the page or to find a non-empty title yields FAILURE_TITLE instead of
+    raising, so one bad URL cannot abort the whole run.
+    """
+    if not url.startswith(("http://", "https://")):
+        url = "http://" + url
+    try:
+        res = requests.get(url, timeout=REQUEST_TIMEOUT)
+        title_tag = BeautifulSoup(res.text, "html.parser").find("title")
+    except Exception:
+        # Deliberately broad: scraping is best-effort per URL. Unlike a bare
+        # ``except:`` this still lets Ctrl-C (KeyboardInterrupt) stop the run.
+        return FAILURE_TITLE
+
+    if title_tag is None:
+        return FAILURE_TITLE
+    # get_text() copes with nested markup, where ``.string`` would be None;
+    # collapsing whitespace keeps every title on exactly one output line.
+    title = " ".join(title_tag.get_text().split())
+    return title or FAILURE_TITLE
+
+
+def main(argv):
+    """Scrape the titles for the URL list named in *argv*.
+
+    Returns the process exit status: 0 on success, 1 on a usage error or
+    when the URL list is missing, unreadable or empty.
+    """
+    if len(argv) != 2:
+        print("Bitte Liste der URLs als Datei übergeben.")
+        return 1
+
+    try:
+        urls = read_urls(argv[1])
+    except (OSError, UnicodeDecodeError):
+        urls = []
+
+    if not urls:
+        print("Keine URLs in Datei gefunden oder Datei nicht lesbar.")
+        return 1
+
+    titles = [fetch_title(url) for url in tqdm(urls)]
+
+    with open(output_filename, "w", encoding="utf-8") as f:
+        f.write("\n".join(titles))
+    return 0
+
+
+if __name__ == '__main__':
+    sys.exit(main(sys.argv))
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..3156a8b
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,9 @@
+beautifulsoup4==4.11.1
+bs4==0.0.1
+certifi==2021.10.8
+charset-normalizer==2.0.12
+idna==3.3
+requests==2.27.1
+soupsieve==2.3.2.post1
+tqdm==4.64.0
+urllib3==1.26.9
diff --git a/sample_list.txt b/sample_list.txt
new file mode 100644
index 0000000..c7f40ba
--- /dev/null
+++ b/sample_list.txt
@@ -0,0 +1,3 @@
+uni-hamburg.de
+uni-jena.de
+diese-seite-existiert-nicht.com
-- 
GitLab