Commit 555fe7b7 authored by Gallenkamp

known working version

parent 54b489da
.scrapy/
.idea/
**/__pycache__/**
result.json
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class RegpkSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Request, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class RegpkDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
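
These two classes are the unmodified Scrapy project templates; they only take effect if they are enabled in the project settings. A minimal sketch of how that would look, assuming the project package is named regpk (inferred from the Regpk* class names; the priority values 543 are the arbitrary example numbers used in the Scrapy docs):

# settings.py (sketch)
SPIDER_MIDDLEWARES = {
    'regpk.middlewares.RegpkSpiderMiddleware': 543,
}
DOWNLOADER_MIDDLEWARES = {
    'regpk.middlewares.RegpkDownloaderMiddleware': 543,
}
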
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


class RegpkPipeline(object):
    def process_item(self, item, spider):
        return item
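
As the comment above says, the pipeline only runs once it is registered in ITEM_PIPELINES. A minimal sketch, again assuming the package name regpk; the pipeline currently passes items through unchanged, so nothing is lost while it stays unregistered:

# settings.py (sketch; 300 is the conventional example priority from the Scrapy docs)
ITEM_PIPELINES = {
    'regpk.pipelines.RegpkPipeline': 300,
}
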
@@ -29,8 +29,8 @@ ROBOTSTXT_OBEY = True
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
CONCURRENT_REQUESTS_PER_DOMAIN = 1
#CONCURRENT_REQUESTS = 1
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

@@ -89,5 +89,9 @@ HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

FEED_FORMAT = 'json'
FEED_URI = 'result.json'
FEED_EXPORT_ENCODING = 'utf-8'

COOKIES_DEBUG = True
RETRY_TIMES = 0
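
With FEED_FORMAT, FEED_URI and FEED_EXPORT_ENCODING set as above, every item the spiders yield is exported to result.json as UTF-8 JSON. A minimal sketch of driving a crawl programmatically with these project settings (the usual alternative is the scrapy crawl CLI; the file name run_spider.py is illustrative, not part of the commit):

# run_spider.py (sketch; run from the project root so settings.py and SPIDER_MODULES are found)
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())  # loads settings.py, including the FEED_* options
process.crawl('bundesregierung')                  # spider name as declared on the spider class
process.start()                                   # blocks until the crawl is done; items land in result.json
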
# -*- coding: utf-8 -*-
import scrapy
import datetime, locale
from scrapy.utils.markup import remove_tags


class ArchivBundesregierungSpider(scrapy.Spider):
    # name of the scraper
    name = 'archiv_bundesregierung'
    # area of scraping
    allowed_domains = ['archiv.bundesregierung.de']
    # prefix for transcription links
    main_url = "https://archiv.bundesregierung.de"
    # start at a specific page
    start_url = "https://archiv.bundesregierung.de/archiv-de/dokumente/69986!search?page=720"
    # string that must be contained in a link for the transcription to be collected
    links_regpk_selector = "regierungspressekonferenz"

    def start_requests(self):
        """
        Initially request the first archive page and process it in parse_search_items
        :return: scrapy.request[]
        """
        yield scrapy.Request(self.start_url, callback=self.parse_search_items)

    def parse_search_items(self, response):
        """
        Create requests for the parsed search items and retrieve further search pages
        :param response: scrapy.response
        :return: scrapy.request[]
        """
        # look for the next-page button
        next_page_container = response.xpath('//li[@class="forward"]/a/@href')
        if next_page_container:
            # if a next-page button is present, request the next page with the retrieved url
            url_postfix = next_page_container.extract_first()
            yield scrapy.Request("{}{}".format(self.main_url, url_postfix), callback=self.parse_search_items)
        # create requests for all search items matching the regpk selector
        search_items_container = response.xpath('//li/h3/a/@href').extract()
        for link in search_items_container:
            if self.links_regpk_selector in link:
                yield scrapy.Request("{}{}".format(self.main_url, link))
    def parse(self, response):
        """
        Scrape a transcription page and extract the wanted properties
        :param response: scrapy.response
        :return: dict (href: string, shorttext: string, fulltext: string[], date: string)
        """
        # xpath selector for the short text
        regpk_shorttext = response.xpath('//div[@class="abstract"]').extract_first()
        regpk_shorttext = remove_tags(regpk_shorttext)  # have to use remove_tags since /text() has a bug in this case -> will be reported to parsel
        regpk_shorttext = regpk_shorttext.replace(u'\xa0', ' ').strip()  # cleaning
        # xpath selector for the transcription chunks
        regpk_fulltext = response.xpath('//div[@class="basepage_pages"]/*[not(self::div)]//text()').extract()
        # list comprehension to clean the text of artifacts and empty entries
        regpk_fulltext = [item.replace(u'\xa0', ' ') for item in regpk_fulltext if not "" == item.strip()]
        # xpath selector for the date
        regpk_time_extracted = response.xpath('//p[@class="date"]/text()').extract_first()
        # parse the date string with the German locale  FIXME: do only once?
        locale.setlocale(locale.LC_ALL, "german")
        regpk_time = datetime.datetime.strptime(regpk_time_extracted.split(",")[1].strip(), '%d. %B %Y')
        # return the scraped fields
        yield {"href": response.url, "shorttext": regpk_shorttext, "fulltext": regpk_fulltext, "date": regpk_time.strftime("%Y-%m-%d")}
# -*- coding: utf-8 -*-
import scrapy
from scrapy.utils.markup import remove_tags
import json
from bs4 import BeautifulSoup
import datetime


class BundesregierungSpider(scrapy.Spider):
    # name of the scraper
    name = 'bundesregierung'
    allowed_domains = ['www.bundesregierung.de']
    # csrf protection for the hidden api
    start_url_csrf = "https://www.bundesregierung.de/service/csrf"
    # hidden api link
    json_api_url = 'https://www.bundesregierung.de/breg-de/suche/1000408!searchJson'
    # prefix for transcription links
    main_url = "https://www.bundesregierung.de"
    # string that must be contained in a link for the transcription to be collected
    links_regpk_selector = "regierungspressekonferenz"
    # field to save the csrf token
    csrf = ""

    def start_requests(self):
        """
        Initially request the csrf service to set the csrf-prevention cookie.
        :return: scrapy.request[]
        """
        yield scrapy.Request(self.start_url_csrf, callback=self.search_with_csrf)

    def search_with_csrf(self, response):
        """
        Retrieve the csrf token from the page
        :param response: scrapy.response
        :return: scrapy.request[]
        """
        # retrieve the csrf token and save it
        jsonresponse = json.loads(response.body_as_unicode())
        self.csrf = jsonresponse["token"]
        # create the request for retrieving search results
        yield scrapy.Request(self.json_api_url,
                             method="POST",
                             body='{{"search":{{"query":"","zipCodeCityQuery":"","sortOrder":"sortDate asc","page":{}}},"filters":[]}}'.format(str(164)),
                             headers={'Accept': 'application/json, text/javascript, */*; q=0.01', 'Content-Type': 'application/json', 'X-CSRF-TOKEN': self.csrf},
                             callback=self.follow_search_results)

    def follow_search_results(self, response):
        """
        Create all requests for retrieving the transcriptions from the current search result and
        create further requests for retrieving search results if possible
        :param response: scrapy.response
        :return: scrapy.request[]
        """
        # create a json object from the response for the queried items
        jsonresponse = json.loads(response.body_as_unicode())
        if "result" in jsonresponse:
            # create all requests for retrieving the transcriptions
            if "items" in jsonresponse["result"]:
                for item in jsonresponse["result"]["items"]:
                    if "payload" in item:
                        # the payload is html encoded, so we have to build a DOM to select the href
                        # BeautifulSoup is used here, but this could be done with parsel as well
                        soup = BeautifulSoup(item['payload'].replace('\n', ''), 'lxml')
                        link = self.main_url + soup.find_all('a', href=True)[0].get('href')
                        if self.links_regpk_selector in link:
                            yield scrapy.Request(link)
            # create further requests for retrieving search results if possible
            if ("pageCount" in jsonresponse["result"]) and ("page" in jsonresponse["result"]):
                if jsonresponse["result"]["page"] < jsonresponse["result"]["pageCount"]:  # and jsonresponse["result"]["page"] < 3:
                    yield scrapy.Request(self.json_api_url, method="POST",
                                         body='{{"search":{{"query":"","zipCodeCityQuery":"","sortOrder":"sortDate asc","page":{}}},"filters":[]}}'.format(
                                             str(jsonresponse["result"]["page"] + 1)),
                                         headers={'Accept': 'application/json, text/javascript, */*; q=0.01',
                                                  'Content-Type': 'application/json', 'X-CSRF-TOKEN': self.csrf},
                                         callback=self.follow_search_results)

    def parse(self, response):
        """
        Scrape a transcription page and extract the wanted properties
        :param response: scrapy.response
        :return: dict (href: string, shorttext: string, fulltext: string[], date: datetime)
        """
        # xpath selector for the short text
        regpk_shorttext = response.xpath('//div[@class="bpa-short-text"]//p').extract_first()
        regpk_shorttext = remove_tags(regpk_shorttext)  # have to use remove_tags since /text() has a bug in this case -> will be reported to parsel
        # xpath selector for the transcription chunks
        regpk_fulltext = response.xpath('//div[@class="bpa-richtext"]/div[@class="bpa-richtext"]//p/text()').extract()
        # xpath selector for the date
        regpk_time_extracted = response.xpath('//span[@class="bpa-time"]/time/@datetime').extract_first()
        # parse the ISO timestamp into a datetime object
        regpk_time = datetime.datetime.strptime(regpk_time_extracted, '%Y-%m-%dT%H:%M:%SZ')
        # return the scraped fields
        yield {"href": response.url, "shorttext": regpk_shorttext, "fulltext": regpk_fulltext, "date": regpk_time}
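
The two search requests above build their JSON body with str.format and doubled braces, which is easy to get wrong. A hedged sketch of the same payload produced by json.dumps instead (build_search_body is an illustrative helper, not part of the original code; it assumes the API does not care about key order):

import json

def build_search_body(page):
    # same payload as the hand-written format string, serialized by the json module
    return json.dumps({
        "search": {"query": "", "zipCodeCityQuery": "", "sortOrder": "sortDate asc", "page": page},
        "filters": [],
    })

# e.g. body=build_search_body(jsonresponse["result"]["page"] + 1)
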