...

4b2e14bf · Zeit-Altpeter, Lukas · 75810af4 · 4b2e14bf
Commit 4b2e14bf authored 3 years ago by Zeit-Altpeter, Lukas
--- a/main.py
+++ b/main.py
 #! python
 import sys
+import os
 import requests
 from bs4 import BeautifulSoup
@@ -24,19 +25,28 @@ if __name__ == '__main__':
    titles = []
+    if os.path.exists(output_filename):
+        os.remove(output_filename)
    for i in tqdm(range(len(urls))):
        url = urls[i]
        if not url.startswith(("http://", "https://")):
            url = "http://" + url
        try:
-            res = requests.get(url)
+            res = requests.get(url, timeout=2)
+            res.encoding = res.apparent_encoding
+            if 200 <= res.status_code < 300:
                soup = BeautifulSoup(res.text, 'html.parser')
-            title = soup.find("title").string
+                title = soup.find("title").string.strip()
+            else:
+                raise Exception
        except:
            title = "--- Titel konnte nicht gescraped werden. ---"
-        titles.append(title)
+        titles.append(title+"\n")
-    with open(output_filename, "w") as f:
+        if (i%10==0 and i>0) or i==len(urls)-1:
-        f.write(("\n").join(titles))
+            with open(output_filename, "a",encoding="utf-8") as f:
+                f.writelines(titles)
+                titles = []