bundesregierung.py

    # -*- coding: utf-8 -*-
    import scrapy
    from w3lib.html import remove_tags  # remove_tags lives in w3lib; scrapy.utils.markup is a deprecated alias
    import json
    from bs4 import BeautifulSoup
    import datetime
    
    class BundesregierungSpider(scrapy.Spider):
        # name of scraper
        name = 'bundesregierung'
    
        allowed_domains = ['www.bundesregierung.de']
    
        # csrf-protection for hidden-api
        start_url_csrf = "https://www.bundesregierung.de/service/csrf"
    
        # hidden-api link
        json_api_url = 'https://www.bundesregierung.de/breg-de/suche/1000408!searchJson'
    
    # prefix for transcription links
        main_url = "https://www.bundesregierung.de"
    
    # substring a link must contain to be collected as a transcription
        links_regpk_selector = "regierungspressekonferenz"
    
        # field to save csrf-token
        csrf = ""
    
        def start_requests(self):
            """
            Make an initial request to the csrf service to set the csrf-prevention cookie.
            :return:
            """
            yield scrapy.Request(self.start_url_csrf, callback=self.search_with_csrf)
    
        def search_with_csrf(self, response):
            """
            Retrieve the csrf token from the response and request the first page of search results
            :param response: scrapy.response
            :return: scrapy.request[]
            """
    
            # Retrieve csrf-token and save it
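            # (the service answers with a JSON document containing at least
            # {"token": "<value>"}; only that field is used here)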
            jsonresponse = json.loads(response.text)
            self.csrf = jsonresponse["token"]
    
            # Create the first request for retrieving search results
            # (crawling starts at page 164 here; follow_search_results then
            # pages forward from there)
            yield scrapy.Request(self.json_api_url,
                                 method="POST",
                                 body=self._search_body(164),
                                 headers={'Accept': 'application/json, text/javascript, */*; q=0.01',
                                          'Content-Type': 'application/json',
                                          'X-CSRF-TOKEN': self.csrf},
                                 callback=self.follow_search_results)
    
        def follow_search_results(self, response):
            """
            Create requests for all transcriptions in the current search result
            and request the next page of search results if one exists
            :param response: scrapy.response
            :return: scrapy.request[]
            """
            # create json object with response for queried items
            jsonresponse = json.loads(response.text)
    
            if "result" in jsonresponse:
                # Create all requests for retrieving the transcriptions
                if "items" in jsonresponse["result"]:
                    for item in jsonresponse["result"]["items"]:
                        if "payload" in item:
                            # payload is html-encoded, so we build a DOM to select the href;
                            # BeautifulSoup is used here, but parsel would work as well
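                            # a parsel-based equivalent would be, e.g.:
                            #   link = self.main_url + scrapy.Selector(text=item['payload']).xpath('//a/@href').get()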
                            soup = BeautifulSoup(item['payload'].replace('\n', ''), 'lxml')
                            link = self.main_url + soup.find_all('a', href=True)[0].get('href')
                            if self.links_regpk_selector in link:
                                yield scrapy.Request(link)
    
                # Create further requests for retrieving search results if possible
                if ("pageCount" in jsonresponse["result"]) and ("page" in jsonresponse["result"]):
                    if jsonresponse["result"]["page"] < jsonresponse["result"]["pageCount"]: #and jsonresponse["result"]["page"] < 3:
                        yield scrapy.Request(self.json_api_url, method="POST",
                                             body='{{"search":{{"query":"","zipCodeCityQuery":"","sortOrder":"sortDate asc","page":{}}},"filters":[]}}'.format(
                                                 str(jsonresponse["result"]["page"]+1)), headers={'Accept': 'application/json, text/javascript, */*; q=0.01',
                                                      'Content-Type': 'application/json', 'X-CSRF-TOKEN': self.csrf}, callback=self.follow_search_results)
    
        def parse(self, response):
            """
            Scrape a transcription page and extract the wanted properties
            :param response: scrapy.response
            :return: dict (href: string, shorttext: string, fulltext: string[], date: datetime)
            """
    
            # xpath selector for shorttext
            regpk_shorttext = response.xpath('//div[@class="bpa-short-text"]//p').get()
            regpk_shorttext = remove_tags(regpk_shorttext)  # remove_tags is used because /text() misbehaves here (to be reported to parsel)
    
            # xpath selector for transcription chunks
            regpk_fulltext = response.xpath('//div[@class="bpa-richtext"]/div[@class="bpa-richtext"]//p/text()').getall()
    
            # xpath selector for date
            regpk_time_extracted = response.xpath('//span[@class="bpa-time"]/time/@datetime').get()
            # parse the ISO-8601 timestamp into a datetime object
            regpk_time = datetime.datetime.strptime(regpk_time_extracted, '%Y-%m-%dT%H:%M:%SZ')
    
            # return scraped fields
            yield {"href": response.url, "shorttext": regpk_shorttext, "fulltext": regpk_fulltext, "date": regpk_time}