diff --git a/.gitignore b/.gitignore
index 6190c839b940fddde4b63b738115ab5db425eb6f..377ddfed2d3db8f230ae088e9966c3eb0725867e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 .scrapy/
 .idea/
 **/__pycache__/**
+result.json
diff --git a/regpk/middlewares.py b/regpk/middlewares.py
deleted file mode 100644
index 0849ab4d8f9374b7cde92f43ea0b581bc6046682..0000000000000000000000000000000000000000
--- a/regpk/middlewares.py
+++ /dev/null
@@ -1,103 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Define here the models for your spider middleware
-#
-# See documentation in:
-# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
-
-from scrapy import signals
-
-
-class RegpkSpiderMiddleware(object):
-    # Not all methods need to be defined. If a method is not defined,
-    # scrapy acts as if the spider middleware does not modify the
-    # passed objects.
-
-    @classmethod
-    def from_crawler(cls, crawler):
-        # This method is used by Scrapy to create your spiders.
-        s = cls()
-        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
-        return s
-
-    def process_spider_input(self, response, spider):
-        # Called for each response that goes through the spider
-        # middleware and into the spider.
-
-        # Should return None or raise an exception.
-        return None
-
-    def process_spider_output(self, response, result, spider):
-        # Called with the results returned from the Spider, after
-        # it has processed the response.
-
-        # Must return an iterable of Request, dict or Item objects.
-        for i in result:
-            yield i
-
-    def process_spider_exception(self, response, exception, spider):
-        # Called when a spider or process_spider_input() method
-        # (from other spider middleware) raises an exception.
-
-        # Should return either None or an iterable of Request, dict
-        # or Item objects.
-        pass
-
-    def process_start_requests(self, start_requests, spider):
-        # Called with the start requests of the spider, and works
-        # similarly to the process_spider_output() method, except
-        # that it doesn’t have a response associated.
-
-        # Must return only requests (not items).
-        for r in start_requests:
-            yield r
-
-    def spider_opened(self, spider):
-        spider.logger.info('Spider opened: %s' % spider.name)
-
-
-class RegpkDownloaderMiddleware(object):
-    # Not all methods need to be defined. If a method is not defined,
-    # scrapy acts as if the downloader middleware does not modify the
-    # passed objects.
-
-    @classmethod
-    def from_crawler(cls, crawler):
-        # This method is used by Scrapy to create your spiders.
-        s = cls()
-        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
-        return s
-
-    def process_request(self, request, spider):
-        # Called for each request that goes through the downloader
-        # middleware.
-
-        # Must either:
-        # - return None: continue processing this request
-        # - or return a Response object
-        # - or return a Request object
-        # - or raise IgnoreRequest: process_exception() methods of
-        #   installed downloader middleware will be called
-        return None
-
-    def process_response(self, request, response, spider):
-        # Called with the response returned from the downloader.
-
-        # Must either;
-        # - return a Response object
-        # - return a Request object
-        # - or raise IgnoreRequest
-        return response
-
-    def process_exception(self, request, exception, spider):
-        # Called when a download handler or a process_request()
-        # (from other downloader middleware) raises an exception.
-
-        # Must either:
-        # - return None: continue processing this exception
-        # - return a Response object: stops process_exception() chain
-        # - return a Request object: stops process_exception() chain
-        pass
-
-    def spider_opened(self, spider):
-        spider.logger.info('Spider opened: %s' % spider.name)
diff --git a/regpk/pipelines.py b/regpk/pipelines.py
deleted file mode 100644
index bebb2957c96e644d5227530555f16e2699b43bee..0000000000000000000000000000000000000000
--- a/regpk/pipelines.py
+++ /dev/null
@@ -1,11 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Define your item pipelines here
-#
-# Don't forget to add your pipeline to the ITEM_PIPELINES setting
-# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
-
-
-class RegpkPipeline(object):
-    def process_item(self, item, spider):
-        return item
diff --git a/regpk/settings.py b/regpk/settings.py
index 77d86cec76ede4698a26a0d67dc73313ec5ee7cd..fd0976d0000590c8cdc616f998121b1f22c3f1c4 100644
--- a/regpk/settings.py
+++ b/regpk/settings.py
@@ -29,8 +29,8 @@ ROBOTSTXT_OBEY = True
 # See also autothrottle settings and docs
 DOWNLOAD_DELAY = 3
 # The download delay setting will honor only one of:
-#CONCURRENT_REQUESTS_PER_DOMAIN = 16
-CONCURRENT_REQUESTS_PER_IP = 1
+CONCURRENT_REQUESTS_PER_DOMAIN = 1
+#CONCURRENT_REQUESTS = 1

 # Disable cookies (enabled by default)
 #COOKIES_ENABLED = False
@@ -89,5 +89,9 @@ HTTPCACHE_EXPIRATION_SECS = 0
 #HTTPCACHE_IGNORE_HTTP_CODES = []
 HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

+FEED_FORMAT = 'json'
+FEED_URI = 'result.json'
+FEED_EXPORT_ENCODING = 'utf-8'
+
 COOKIES_DEBUG = True
 RETRY_TIMES = 0
\ No newline at end of file
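Note: with the feed settings added above, a crawl writes its items to result.json, which the .gitignore change at the top now keeps out of version control. Besides the usual `scrapy crawl archiv_bundesregierung` from the command line, the crawl can be started programmatically; this is only a sketch and assumes it is run from the project root so that get_project_settings() picks up regpk/settings.py:

```python
# sketch: run one of the project's spiders with the project settings applied
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

if __name__ == "__main__":
    process = CrawlerProcess(get_project_settings())
    process.crawl("archiv_bundesregierung")  # or "bundesregierung"
    process.start()  # blocks until the crawl finishes; FEED_URI sends items to result.json
```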
diff --git a/regpk/spiders/archiv_bundesregierung.py b/regpk/spiders/archiv_bundesregierung.py
index c8a4a84c03a37e73121215a480e9435326237dfd..959ee5ac738d663e14c3d40915a30be36281cc92 100644
--- a/regpk/spiders/archiv_bundesregierung.py
+++ b/regpk/spiders/archiv_bundesregierung.py
@@ -1,11 +1,69 @@
 # -*- coding: utf-8 -*-
 import scrapy
-
+import datetime, locale
+from scrapy.utils.markup import remove_tags

 class ArchivBundesregierungSpider(scrapy.Spider):
+    # name of scraper
     name = 'archiv_bundesregierung'
+    # domains the spider may scrape
     allowed_domains = ['archiv.bundesregierung.de']
-    start_urls = ['http://archiv.bundesregierung.de']
+    # prefix for transcription links
+    main_url = "https://archiv.bundesregierung.de"
+    # start at a specific archive search page
+    start_url = "https://archiv.bundesregierung.de/archiv-de/dokumente/69986!search?page=720"
+    # string a link must contain to be collected as a transcription
+    links_regpk_selector = "regierungspressekonferenz"
+
+    def start_requests(self):
+        """
+        Initially request the first archive page and process it in parse_search_items
+        :return: scrapy.request[]
+        """
+        yield scrapy.Request(self.start_url, callback=self.parse_search_items)
+
+    def parse_search_items(self, response):
+        """
+        Create requests for the parsed search items and retrieve further search result pages
+        :param response: scrapy.response
+        :return: scrapy.request[]
+        """
+        # look for the next-page button
+        next_page_container = response.xpath('//li[@class="forward"]/a/@href')
+
+        if next_page_container:
+            # if a next-page button is present, request the next page with the retrieved url
+            url_postfix = next_page_container.extract_first()
+            yield scrapy.Request("{}{}".format(self.main_url, url_postfix), callback=self.parse_search_items)
+
+        # create requests for all search items matching links_regpk_selector
+        search_items_container = response.xpath('//li/h3/a/@href').extract()
+        for link in search_items_container:
+            if self.links_regpk_selector in link:
+                yield scrapy.Request("{}{}".format(self.main_url, link))

     def parse(self, response):
-        pass
+        """
+        Scrape transcriptions page and get the wanted properties
+        :param response:
+        :return: dict (href: string, shorttext: string, fulltext: string[], date: string "YYYY-MM-DD")
+        """
+
+        # xpath selector for shorttext
+        regpk_shorttext = response.xpath('//div[@class="abstract"]').extract_first()
+        regpk_shorttext = remove_tags(regpk_shorttext)  # have to use remove_tags since /text() has a bug in this case -> will be reported to parsel
+        regpk_shorttext = regpk_shorttext.replace(u'\xa0', ' ').strip()  # cleaning
+
+        # xpath selector for transcription chunks
+        regpk_fulltext = response.xpath('//div[@class="basepage_pages"]/*[not(self::div)]//text()').extract()
+        # using list comprehension to clean text from artifacts and empty entries
+        regpk_fulltext = [item.replace(u'\xa0', ' ') for item in regpk_fulltext if not "" == item.strip()]
+
+        # xpath selector for date
+        regpk_time_extracted = response.xpath('//p[@class="date"]/text()').extract_first()
+        # parse date with strptime and German locale FIXME: set the locale only once?
+        locale.setlocale(locale.LC_ALL, "german")
+        regpk_time = datetime.datetime.strptime(regpk_time_extracted.split(",")[1].strip(), '%d. %B %Y')
+
+        # return scraped fields
+        yield {"href": response.url, "shorttext": regpk_shorttext, "fulltext": regpk_fulltext, "date": regpk_time.strftime("%Y-%m-%d")}
\ No newline at end of file
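Note: the date parsing above relies on locale.setlocale(locale.LC_ALL, "german"), which only resolves where the platform knows the locale under that name (typically Windows; on Linux it is usually "de_DE.UTF-8"). A locale-independent alternative is to map the German month names by hand. The sketch below assumes the archive's date strings look like "Montag, 16. Dezember 2019"; the helper name is hypothetical and not part of the patch:

```python
import datetime
import re

# hypothetical helper: parse German dates without touching the process-wide locale
GERMAN_MONTHS = {
    "Januar": 1, "Februar": 2, "März": 3, "April": 4, "Mai": 5, "Juni": 6,
    "Juli": 7, "August": 8, "September": 9, "Oktober": 10, "November": 11, "Dezember": 12,
}

def parse_german_date(text):
    """Turn e.g. 'Montag, 16. Dezember 2019' into datetime.date(2019, 12, 16)."""
    match = re.search(r"(\d{1,2})\.\s*(\w+)\s+(\d{4})", text)
    if match is None:
        return None
    day, month_name, year = match.groups()
    return datetime.date(int(year), GERMAN_MONTHS[month_name], int(day))
```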
diff --git a/regpk/spiders/bundesregierung.py b/regpk/spiders/bundesregierung.py
index 6f61b772f5be89de548e2d4de178ee685f5714b5..b6bf0a1414c991e992c97b1e53190e96816a6716 100644
--- a/regpk/spiders/bundesregierung.py
+++ b/regpk/spiders/bundesregierung.py
@@ -1,67 +1,104 @@
 # -*- coding: utf-8 -*-
 import scrapy
+from scrapy.utils.markup import remove_tags
 import json
 from bs4 import BeautifulSoup
+import datetime


 class BundesregierungSpider(scrapy.Spider):
+    # name of scraper
     name = 'bundesregierung'
+
     allowed_domains = ['www.bundesregierung.de']
-    #start_urls_regex = 'https://www.bundesregierung.de/breg-de/aktuelles/pressekonferenzen?page=%d&sort=sortDate%%20asc'
+
+    # csrf-protection endpoint for the hidden API
     start_url_csrf = "https://www.bundesregierung.de/service/csrf"
+
+    # hidden search API endpoint
     json_api_url = 'https://www.bundesregierung.de/breg-de/suche/1000408!searchJson'
+
+    # prefix for transcription links
     main_url = "https://www.bundesregierung.de"
+
+    # string a link must contain to be collected as a transcription
     links_regpk_selector = "regierungspressekonferenz"
+
+    # field to store the csrf-token
     csrf = ""

     def start_requests(self):
+        """
+        Initially make request to csrf-service to set the csrf-prevention-cookie.
+        :return: scrapy.request[]
+        """
         yield scrapy.Request(self.start_url_csrf, callback=self.search_with_csrf)

     def search_with_csrf(self, response):
-        '''
-
-        :param response:
-        :return:
-        '''
+        """
+        Retrieve the csrf-token and issue the initial search request
+        :param response: scrapy.response
+        :return: scrapy.request[]
+        """
+        # Retrieve csrf-token and save it
         jsonresponse = json.loads(response.body_as_unicode())
-        print(jsonresponse)
         self.csrf = jsonresponse["token"]
-        for i in range(0, 1):
-            #FIXME Make JsonRequest
-            yield scrapy.Request(self.json_api_url,
+
+        # Create the initial request for retrieving search results
+        yield scrapy.Request(self.json_api_url,
                              method="POST",
-                             body='{"search":{"query":"","zipCodeCityQuery":"","sortOrder":"sortDate asc","page":%d},"filters":[]}' % i,
+                             body='{{"search":{{"query":"","zipCodeCityQuery":"","sortOrder":"sortDate asc","page":{}}},"filters":[]}}'.format(str(164)),
                              headers={'Accept': 'application/json, text/javascript, */*; q=0.01',
                                       'Content-Type': 'application/json', 'X-CSRF-TOKEN': self.csrf},
                              callback=self.follow_search_results)

     def follow_search_results(self, response):
-        '''
-
-        :return:
-        '''
+        """
+        Create requests for all transcriptions in the current search result and
+        create a further request for the next search result page if possible
+        :param response: scrapy.response
+        :return: scrapy.request[]
+        """
+        # parse the json response with the queried items
         jsonresponse = json.loads(response.body_as_unicode())
+
         if "result" in jsonresponse:
+            # Create all requests for retrieving the transcriptions
            if "items" in jsonresponse["result"]:
                 for item in jsonresponse["result"]["items"]:
                     if "payload" in item:
-                        #yield scrapy.Request('')
+                        # payload is an html snippet, so we have to build a DOM to select the href
+                        # BeautifulSoup is used here, but this could also be done with parsel
                         soup = BeautifulSoup(item['payload'].replace('\n', ''), 'lxml')
                         link = self.main_url + soup.find_all('a', href=True)[0].get('href')
                         if self.links_regpk_selector in link:
                             yield scrapy.Request(link)
-                            break
+            # Create a further request for the next search result page if possible
+            if ("pageCount" in jsonresponse["result"]) and ("page" in jsonresponse["result"]):
+                if jsonresponse["result"]["page"] < jsonresponse["result"]["pageCount"]: #and jsonresponse["result"]["page"] < 3:
+                    yield scrapy.Request(self.json_api_url, method="POST",
+                                         body='{{"search":{{"query":"","zipCodeCityQuery":"","sortOrder":"sortDate asc","page":{}}},"filters":[]}}'.format(
+                                             str(jsonresponse["result"]["page"]+1)), headers={'Accept': 'application/json, text/javascript, */*; q=0.01',
+                                         'Content-Type': 'application/json', 'X-CSRF-TOKEN': self.csrf}, callback=self.follow_search_results)

     def parse(self, response):
-        regpks = []
+        """
+        Scrape transcriptions page and get the wanted properties
+        :param response:
+        :return: dict (href: string, shorttext: string, fulltext: string[], date: datetime)
+        """

-        regpk_shorttext = response.xpath('//div[@class="bpa-short-text"]/*').extract_first()
+        # xpath selector for shorttext
+        regpk_shorttext = response.xpath('//div[@class="bpa-short-text"]//p').extract_first()
+        regpk_shorttext = remove_tags(regpk_shorttext)  # have to use remove_tags since /text() has a bug in this case -> will be reported to parsel

-        regpk_fulltext_container = response.xpath('//div[@class="bpa-container"]/div/div[@class="bpa-richtext"]/div[@class="bpa-richtext"]')
-        regpk_fulltext = regpk_fulltext_container.xpath('//p/text()').extract()
+        # xpath selector for transcription chunks
+        regpk_fulltext = response.xpath('//div[@class="bpa-richtext"]/div[@class="bpa-richtext"]//p/text()').extract()

-        regpk_time = response.xpath('//span[@class="bpa-time"]/time/@datetime').extract_first()
+        # xpath selector for date
+        regpk_time_extracted = response.xpath('//span[@class="bpa-time"]/time/@datetime').extract_first()
+        # parse the ISO-formatted timestamp
+        regpk_time = datetime.datetime.strptime(regpk_time_extracted, '%Y-%m-%dT%H:%M:%SZ')

-        print((regpk_shorttext, regpk_fulltext, regpk_time))
-        #search_results = response.css('a[class="bpa-teaser"]')
-        pass
+        # return scraped fields
+        yield {"href": response.url, "shorttext": regpk_shorttext, "fulltext": regpk_fulltext, "date": regpk_time}
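Note: the removed code carried a "#FIXME Make JsonRequest" comment, and the new requests still build the JSON body by hand with str.format. On Scrapy 1.8+ the same POST can be expressed with scrapy.http.JsonRequest, which serializes the payload to the body and sets the Content-Type header itself. A sketch, not part of the patch; the helper name _search_request is hypothetical:

```python
from scrapy.http import JsonRequest  # available since Scrapy 1.8

def _search_request(self, page):
    """Build one POST against the hidden search API for the given result page."""
    payload = {
        "search": {"query": "", "zipCodeCityQuery": "", "sortOrder": "sortDate asc", "page": page},
        "filters": [],
    }
    # JsonRequest defaults to POST when `data` is given and serializes it to the request body
    return JsonRequest(
        self.json_api_url,
        data=payload,
        headers={"X-CSRF-TOKEN": self.csrf},
        callback=self.follow_search_results,
    )
```

Both search_with_csrf() and follow_search_results() could then yield self._search_request(page) instead of duplicating the body string.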
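Note: as the comment in follow_search_results() says, the payload handling could also be done with parsel, which ships with Scrapy and would drop the extra bs4/lxml dependency. A rough equivalent of the link extraction, under the same assumption that the first anchor in the payload is the wanted one:

```python
from parsel import Selector

def extract_first_href(payload):
    """Return the href of the first <a> in an HTML fragment, or None if there is none."""
    return Selector(text=payload.replace("\n", "")).css("a::attr(href)").get()
```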