Skip to content
Snippets Groups Projects
Commit 4b2e14bf authored by Zeit-Altpeter, Lukas
Browse files

...

parent 75810af4
No related branches found
No related tags found
No related merge requests found
#! python
import sys
import os
import requests
from bs4 import BeautifulSoup
......@@ -24,19 +25,28 @@ if __name__ == '__main__':
# Scrape the <title> of every entry in `urls` and write the titles to
# `output_filename`, one per line, in the same order as `urls`.
# A URL whose title cannot be obtained gets a placeholder line instead, so
# line N of the output always corresponds to URL N.
titles = []

# Start from a clean file: results are appended in batches further down.
if os.path.exists(output_filename):
    os.remove(output_filename)

last_index = len(urls) - 1
for i, url in enumerate(tqdm(urls)):
    # Bare host names are accepted; default them to plain HTTP.
    if not url.startswith(("http://", "https://")):
        url = "http://" + url
    try:
        # A timeout keeps one unresponsive host from stalling the whole run.
        res = requests.get(url, timeout=2)
        # Use the encoding detected from the body rather than the header
        # default when decoding the page.
        res.encoding = res.apparent_encoding
        if not 200 <= res.status_code < 300:
            raise ValueError(f"unexpected HTTP status {res.status_code}")
        soup = BeautifulSoup(res.text, 'html.parser')
        # Collapse all whitespace runs (including embedded newlines) so a
        # title can never span more than one output line. A missing <title>
        # or one without a single string raises AttributeError here and
        # falls through to the placeholder below.
        title = " ".join(soup.find("title").string.split())
    except Exception:
        # Deliberate best effort: any failure for this URL yields the
        # placeholder. `Exception` (not a bare except) keeps Ctrl-C working.
        title = "--- Titel konnte nicht gescraped werden. ---"
    titles.append(title + "\n")

    # Flush to disk every 10 URLs and after the last one, so an interrupted
    # run keeps the results gathered so far.
    if (i % 10 == 0 and i > 0) or i == last_index:
        with open(output_filename, "a", encoding="utf-8") as f:
            f.writelines(titles)
        titles = []
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment