Skip to content
Snippets Groups Projects
Commit 4b2e14bf authored by Zeit-Altpeter, Lukas
Browse files

...

parent 75810af4
No related branches found
No related tags found
No related merge requests found
#! python #! python
import sys import sys
import os
import requests import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
...@@ -24,19 +25,28 @@ if __name__ == '__main__': ...@@ -24,19 +25,28 @@ if __name__ == '__main__':
titles = [] titles = []
if os.path.exists(output_filename):
os.remove(output_filename)
for i in tqdm(range(len(urls))): for i in tqdm(range(len(urls))):
url = urls[i] url = urls[i]
if not url.startswith(("http://", "https://")): if not url.startswith(("http://", "https://")):
url = "http://" + url url = "http://" + url
try: try:
res = requests.get(url) res = requests.get(url, timeout=2)
res.encoding = res.apparent_encoding
if 200 <= res.status_code < 300:
soup = BeautifulSoup(res.text, 'html.parser') soup = BeautifulSoup(res.text, 'html.parser')
title = soup.find("title").string title = soup.find("title").string.strip()
else:
raise Exception
except: except:
title = "--- Titel konnte nicht gescraped werden. ---" title = "--- Titel konnte nicht gescraped werden. ---"
titles.append(title) titles.append(title+"\n")
with open(output_filename, "w") as f: if (i%10==0 and i>0) or i==len(urls)-1:
f.write(("\n").join(titles)) with open(output_filename, "a",encoding="utf-8") as f:
f.writelines(titles)
titles = []
0% Loading. Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment