bundesregierung.py
# -*- coding: utf-8 -*-
import datetime
import json

import scrapy
from bs4 import BeautifulSoup
from w3lib.html import remove_tags
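
# Crawl flow implemented below: fetch a csrf token from the csrf service,
# POST paginated queries to the hidden searchJson API, follow every result
# link containing "regierungspressekonferenz", and parse each transcription
# page into one item.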

class BundesregierungSpider(scrapy.Spider):
    # name of the scraper
    name = 'bundesregierung'
    allowed_domains = ['www.bundesregierung.de']
    # csrf-protection endpoint for the hidden API
    start_url_csrf = "https://www.bundesregierung.de/service/csrf"
    # hidden search API
    json_api_url = 'https://www.bundesregierung.de/breg-de/suche/1000408!searchJson'
    # prefix for transcription links
    main_url = "https://www.bundesregierung.de"
    # substring a link must contain to be collected as a transcription
    links_regpk_selector = "regierungspressekonferenz"
    # field to store the csrf token
    csrf = ""

    def start_requests(self):
        """
        Initially request the csrf service to set the csrf-prevention cookie.
        :return: scrapy.Request[]
        """
        yield scrapy.Request(self.start_url_csrf, callback=self.search_with_csrf)

    def search_with_csrf(self, response):
        """
        Retrieve the csrf token from the response and request the first page
        of search results.
        :param response: scrapy.Response
        :return: scrapy.Request[]
        """
        # retrieve the csrf token and store it for all later API requests
        jsonresponse = json.loads(response.text)
        self.csrf = jsonresponse["token"]
        # create the request for the first page of search results
        yield self.search_request(page=164)

    def search_request(self, page):
        """Build a POST request against the hidden search API for one results page."""
        body = json.dumps({"search": {"query": "", "zipCodeCityQuery": "",
                                      "sortOrder": "sortDate asc", "page": page},
                           "filters": []})
        headers = {'Accept': 'application/json, text/javascript, */*; q=0.01',
                   'Content-Type': 'application/json',
                   'X-CSRF-TOKEN': self.csrf}
        return scrapy.Request(self.json_api_url, method="POST", body=body,
                              headers=headers, callback=self.follow_search_results)
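
    # For reference, the searchJson response is consumed as if it had the
    # shape sketched below; only the fields actually accessed in
    # follow_search_results are assumed here:
    #   {"result": {"page": <int>, "pageCount": <int>,
    #               "items": [{"payload": "<html fragment with an <a href>>"}]}}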

    def follow_search_results(self, response):
        """
        Create a request for every transcription in the current search result
        and request the next page of results if one exists.
        :param response: scrapy.Response
        :return: scrapy.Request[]
        """
        # create a json object from the response for the queried items
        jsonresponse = json.loads(response.text)
        if "result" in jsonresponse:
            # create a request for every transcription in this result page
            if "items" in jsonresponse["result"]:
                for item in jsonresponse["result"]["items"]:
                    if "payload" in item:
                        # payload is an html fragment, so build a DOM to select the href
                        # (BeautifulSoup is used here, but parsel would work as well)
                        soup = BeautifulSoup(item['payload'].replace('\n', ''), 'lxml')
                        link = self.main_url + soup.find_all('a', href=True)[0].get('href')
                        if self.links_regpk_selector in link:
                            yield scrapy.Request(link)
            # request the next page of search results if one exists
            if ("pageCount" in jsonresponse["result"]) and ("page" in jsonresponse["result"]):
                if jsonresponse["result"]["page"] < jsonresponse["result"]["pageCount"]:
                    yield self.search_request(jsonresponse["result"]["page"] + 1)

    def parse(self, response):
        """
        Scrape a transcription page and extract the wanted properties.
        :param response: scrapy.Response
        :return: dict (href: string, shorttext: string, fulltext: string[], date: datetime)
        """
        # xpath selector for the short text
        regpk_shorttext = response.xpath('//div[@class="bpa-short-text"]//p').extract_first()
        # remove_tags is used because /text() has a bug in this case (will be reported to parsel)
        regpk_shorttext = remove_tags(regpk_shorttext)
        # xpath selector for the transcription paragraphs
        regpk_fulltext = response.xpath('//div[@class="bpa-richtext"]/div[@class="bpa-richtext"]//p/text()').extract()
        # xpath selector for the date
        regpk_time_extracted = response.xpath('//span[@class="bpa-time"]/time/@datetime').extract_first()
        # parse the ISO-8601 timestamp into a datetime object
        regpk_time = datetime.datetime.strptime(regpk_time_extracted, '%Y-%m-%dT%H:%M:%SZ')
        # return the scraped fields
        yield {"href": response.url, "shorttext": regpk_shorttext,
               "fulltext": regpk_fulltext, "date": regpk_time}