bundesregierung.py
# -*- coding: utf-8 -*-
import datetime
import json

import scrapy
from bs4 import BeautifulSoup
from w3lib.html import remove_tags
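
# Crawl flow implemented below: fetch a csrf token from the csrf service,
# POST paginated queries to the hidden searchJson API, follow every result
# link containing "regierungspressekonferenz", and parse each transcription
# page into one item.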

class BundesregierungSpider(scrapy.Spider):
    # name of the scraper
    name = 'bundesregierung'
    allowed_domains = ['www.bundesregierung.de']
    # csrf-protection endpoint for the hidden API
    start_url_csrf = "https://www.bundesregierung.de/service/csrf"
    # hidden search API
    json_api_url = 'https://www.bundesregierung.de/breg-de/suche/1000408!searchJson'
    # prefix for transcription links
    main_url = "https://www.bundesregierung.de"
    # substring a link must contain to be collected as a transcription
    links_regpk_selector = "regierungspressekonferenz"
    # field to store the csrf token
    csrf = ""

    def start_requests(self):
        """
        Initially request the csrf service to set the csrf-prevention cookie.
        :return: scrapy.Request[]
        """
        yield scrapy.Request(self.start_url_csrf, callback=self.search_with_csrf)

    def search_with_csrf(self, response):
        """
        Retrieve the csrf token from the response and request the first page
        of search results.
        :param response: scrapy.Response
        :return: scrapy.Request[]
        """
        # retrieve the csrf token and store it for all later API requests
        jsonresponse = json.loads(response.text)
        self.csrf = jsonresponse["token"]
        # create the request for the first page of search results
        yield self.search_request(page=164)

    def search_request(self, page):
        """Build a POST request against the hidden search API for one results page."""
        body = json.dumps({"search": {"query": "", "zipCodeCityQuery": "",
                                      "sortOrder": "sortDate asc", "page": page},
                           "filters": []})
        headers = {'Accept': 'application/json, text/javascript, */*; q=0.01',
                   'Content-Type': 'application/json',
                   'X-CSRF-TOKEN': self.csrf}
        return scrapy.Request(self.json_api_url, method="POST", body=body,
                              headers=headers, callback=self.follow_search_results)
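
    # For reference, the searchJson response is consumed as if it had the
    # shape sketched below; only the fields actually accessed in
    # follow_search_results are assumed here:
    #   {"result": {"page": <int>, "pageCount": <int>,
    #               "items": [{"payload": "<html fragment with an <a href>>"}]}}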

    def follow_search_results(self, response):
        """
        Create a request for every transcription in the current search result
        and request the next page of results if one exists.
        :param response: scrapy.Response
        :return: scrapy.Request[]
        """
        # create a json object from the response for the queried items
        jsonresponse = json.loads(response.text)
        if "result" in jsonresponse:
            # create a request for every transcription in this result page
            if "items" in jsonresponse["result"]:
                for item in jsonresponse["result"]["items"]:
                    if "payload" in item:
                        # payload is an html fragment, so build a DOM to select the href
                        # (BeautifulSoup is used here, but parsel would work as well)
                        soup = BeautifulSoup(item['payload'].replace('\n', ''), 'lxml')
                        link = self.main_url + soup.find_all('a', href=True)[0].get('href')
                        if self.links_regpk_selector in link:
                            yield scrapy.Request(link)
            # request the next page of search results if one exists
            if ("pageCount" in jsonresponse["result"]) and ("page" in jsonresponse["result"]):
                if jsonresponse["result"]["page"] < jsonresponse["result"]["pageCount"]:
                    yield self.search_request(jsonresponse["result"]["page"] + 1)

    def parse(self, response):
        """
        Scrape a transcription page and extract the wanted properties.
        :param response: scrapy.Response
        :return: dict (href: string, shorttext: string, fulltext: string[], date: datetime)
        """
        # xpath selector for the short text
        regpk_shorttext = response.xpath('//div[@class="bpa-short-text"]//p').extract_first()
        # remove_tags is used because /text() has a bug in this case (will be reported to parsel)
        regpk_shorttext = remove_tags(regpk_shorttext)
        # xpath selector for the transcription paragraphs
        regpk_fulltext = response.xpath('//div[@class="bpa-richtext"]/div[@class="bpa-richtext"]//p/text()').extract()
        # xpath selector for the date
        regpk_time_extracted = response.xpath('//span[@class="bpa-time"]/time/@datetime').extract_first()
        # parse the ISO-8601 timestamp into a datetime object
        regpk_time = datetime.datetime.strptime(regpk_time_extracted, '%Y-%m-%dT%H:%M:%SZ')
        # return the scraped fields
        yield {"href": response.url, "shorttext": regpk_shorttext,
               "fulltext": regpk_fulltext, "date": regpk_time}