From 4b2e14bf27c3e612f86395aceebd9260dd03a83e Mon Sep 17 00:00:00 2001
From: "Zeit-Altpeter, Lukas" <lukas.zeit-altpeter@uni-hamburg.de>
Date: Tue, 17 May 2022 09:50:01 +0200
Subject: [PATCH] ...

---
 main.py | 24 +++++++++++++++++-------
 1 file changed, 17 insertions(+), 7 deletions(-)

diff --git a/main.py b/main.py
index 9037657..5a16991 100755
--- a/main.py
+++ b/main.py
@@ -1,5 +1,6 @@
 #! python
 import sys
+import os
 
 import requests
 from bs4 import BeautifulSoup
@@ -24,19 +25,28 @@ if __name__ == '__main__':
     
     titles = []
 
+    if os.path.exists(output_filename):
+        os.remove(output_filename)
+
     for i in tqdm(range(len(urls))):
         url = urls[i]
         if not url.startswith(("http://", "https://")):
             url = "http://" + url
         try:
-            res = requests.get(url)
-            soup = BeautifulSoup(res.text, 'html.parser')
-            title = soup.find("title").string
+            res = requests.get(url, timeout=2)
+            res.encoding = res.apparent_encoding
+            if 200 <= res.status_code < 300:
+                soup = BeautifulSoup(res.text, 'html.parser')
+                title = soup.find("title").string.strip()
+            else:
+                raise Exception
         except:
             title = "--- Titel konnte nicht gescraped werden. ---"
-        
-        titles.append(title)
 
-    with open(output_filename, "w") as f:
-        f.write(("\n").join(titles))
+        titles.append(title+"\n")
+        
+        if (i%10==0 and i>0) or i==len(urls)-1:
+            with open(output_filename, "a",encoding="utf-8") as f:
+                f.writelines(titles)
+                titles = []
             
-- 
GitLab