From 4b2e14bf27c3e612f86395aceebd9260dd03a83e Mon Sep 17 00:00:00 2001
From: "Zeit-Altpeter, Lukas" <lukas.zeit-altpeter@uni-hamburg.de>
Date: Tue, 17 May 2022 09:50:01 +0200
Subject: [PATCH] ...

---
 main.py | 24 +++++++++++++++++-------
 1 file changed, 17 insertions(+), 7 deletions(-)

diff --git a/main.py b/main.py
index 9037657..5a16991 100755
--- a/main.py
+++ b/main.py
@@ -1,5 +1,6 @@
 #! python
 import sys
+import os
 
 import requests
 from bs4 import BeautifulSoup
@@ -24,19 +25,28 @@ if __name__ == '__main__':
 
     titles = []
 
+    if os.path.exists(output_filename):
+        os.remove(output_filename)
+
     for i in tqdm(range(len(urls))):
         url = urls[i]
         if not url.startswith(("http://", "https://")):
             url = "http://" + url
 
         try:
-            res = requests.get(url)
-            soup = BeautifulSoup(res.text, 'html.parser')
-            title = soup.find("title").string
+            res = requests.get(url, timeout=2)
+            res.encoding = res.apparent_encoding
+            if 200 <= res.status_code < 300:
+                soup = BeautifulSoup(res.text, 'html.parser')
+                title = soup.find("title").string.strip()
+            else:
+                raise Exception
         except:
             title = "--- Titel konnte nicht gescraped werden. ---"
-
-        titles.append(title)
 
-    with open(output_filename, "w") as f:
-        f.write(("\n").join(titles))
+        titles.append(title+"\n")
+
+        if (i%10==0 and i>0) or i==len(urls)-1:
+            with open(output_filename, "a",encoding="utf-8") as f:
+                f.writelines(titles)
+            titles = []
-- 
GitLab