Commit 555fe7b7 authored by Gallenkamp

known working version

parent 54b489da
.scrapy/
.idea/
**/__pycache__/**
result.json
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class RegpkSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Request, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class RegpkDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
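
These two classes are the unmodified Scrapy project templates; they only take effect if they are enabled in the project settings. A minimal sketch of how that would look, assuming the project package is named regpk (inferred from the Regpk* class names; the priority values 543 are the arbitrary example numbers used in the Scrapy docs):

# settings.py (sketch)
SPIDER_MIDDLEWARES = {
    'regpk.middlewares.RegpkSpiderMiddleware': 543,
}
DOWNLOADER_MIDDLEWARES = {
    'regpk.middlewares.RegpkDownloaderMiddleware': 543,
}
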
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


class RegpkPipeline(object):
    def process_item(self, item, spider):
        return item
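
As the comment above says, the pipeline only runs once it is registered in ITEM_PIPELINES. A minimal sketch, again assuming the package name regpk; the pipeline currently passes items through unchanged, so nothing is lost while it stays unregistered:

# settings.py (sketch; 300 is the conventional example priority from the Scrapy docs)
ITEM_PIPELINES = {
    'regpk.pipelines.RegpkPipeline': 300,
}
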
@@ -29,8 +29,8 @@ ROBOTSTXT_OBEY = True
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
CONCURRENT_REQUESTS_PER_DOMAIN = 1
#CONCURRENT_REQUESTS = 1
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

@@ -89,5 +89,9 @@ HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

FEED_FORMAT = 'json'
FEED_URI = 'result.json'
FEED_EXPORT_ENCODING = 'utf-8'

COOKIES_DEBUG = True
RETRY_TIMES = 0
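
With FEED_FORMAT, FEED_URI and FEED_EXPORT_ENCODING set as above, every item the spiders yield is exported to result.json as UTF-8 JSON. A minimal sketch of driving a crawl programmatically with these project settings (the usual alternative is the scrapy crawl CLI; the file name run_spider.py is illustrative, not part of the commit):

# run_spider.py (sketch; run from the project root so settings.py and SPIDER_MODULES are found)
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())  # loads settings.py, including the FEED_* options
process.crawl('bundesregierung')                  # spider name as declared on the spider class
process.start()                                   # blocks until the crawl is done; items land in result.json
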
# -*- coding: utf-8 -*-
import scrapy
import datetime, locale
from scrapy.utils.markup import remove_tags


class ArchivBundesregierungSpider(scrapy.Spider):
    # name of the scraper
    name = 'archiv_bundesregierung'
    # area of scraping
    allowed_domains = ['archiv.bundesregierung.de']
    # prefix for transcription links
    main_url = "https://archiv.bundesregierung.de"
    # start at a specific page
    start_url = "https://archiv.bundesregierung.de/archiv-de/dokumente/69986!search?page=720"
    # string that must be contained in a link for the transcription to be collected
    links_regpk_selector = "regierungspressekonferenz"

    def start_requests(self):
        """
        Initially request the first archive page and process it in parse_search_items
        :return: scrapy.request[]
        """
        yield scrapy.Request(self.start_url, callback=self.parse_search_items)

    def parse_search_items(self, response):
        """
        Create requests for the parsed search items and retrieve further search pages
        :param response: scrapy.response
        :return: scrapy.request[]
        """
        # look for the next-page button
        next_page_container = response.xpath('//li[@class="forward"]/a/@href')
        if next_page_container:
            # if a next-page button is present, request the next page with the retrieved url
            url_postfix = next_page_container.extract_first()
            yield scrapy.Request("{}{}".format(self.main_url, url_postfix), callback=self.parse_search_items)
        # create requests for all search items matching the regpk selector
        search_items_container = response.xpath('//li/h3/a/@href').extract()
        for link in search_items_container:
            if self.links_regpk_selector in link:
                yield scrapy.Request("{}{}".format(self.main_url, link))
    def parse(self, response):
        """
        Scrape a transcription page and extract the wanted properties
        :param response: scrapy.response
        :return: dict (href: string, shorttext: string, fulltext: string[], date: string)
        """
        # xpath selector for the short text
        regpk_shorttext = response.xpath('//div[@class="abstract"]').extract_first()
        regpk_shorttext = remove_tags(regpk_shorttext)  # have to use remove_tags since /text() has a bug in this case -> will be reported to parsel
        regpk_shorttext = regpk_shorttext.replace(u'\xa0', ' ').strip()  # cleaning
        # xpath selector for the transcription chunks
        regpk_fulltext = response.xpath('//div[@class="basepage_pages"]/*[not(self::div)]//text()').extract()
        # list comprehension to clean the text of artifacts and empty entries
        regpk_fulltext = [item.replace(u'\xa0', ' ') for item in regpk_fulltext if not "" == item.strip()]
        # xpath selector for the date
        regpk_time_extracted = response.xpath('//p[@class="date"]/text()').extract_first()
        # parse the date string with the German locale  FIXME: do only once?
        locale.setlocale(locale.LC_ALL, "german")
        regpk_time = datetime.datetime.strptime(regpk_time_extracted.split(",")[1].strip(), '%d. %B %Y')
        # return the scraped fields
        yield {"href": response.url, "shorttext": regpk_shorttext, "fulltext": regpk_fulltext, "date": regpk_time.strftime("%Y-%m-%d")}
# -*- coding: utf-8 -*-
import scrapy
from scrapy.utils.markup import remove_tags
import json
from bs4 import BeautifulSoup
import datetime


class BundesregierungSpider(scrapy.Spider):
    # name of the scraper
    name = 'bundesregierung'
    allowed_domains = ['www.bundesregierung.de']
    # csrf protection for the hidden api
    start_url_csrf = "https://www.bundesregierung.de/service/csrf"
    # hidden api link
    json_api_url = 'https://www.bundesregierung.de/breg-de/suche/1000408!searchJson'
    # prefix for transcription links
    main_url = "https://www.bundesregierung.de"
    # string that must be contained in a link for the transcription to be collected
    links_regpk_selector = "regierungspressekonferenz"
    # field to save the csrf token
    csrf = ""

    def start_requests(self):
        """
        Initially request the csrf service to set the csrf-prevention cookie.
        :return: scrapy.request[]
        """
        yield scrapy.Request(self.start_url_csrf, callback=self.search_with_csrf)

    def search_with_csrf(self, response):
        """
        Retrieve the csrf token from the page
        :param response: scrapy.response
        :return: scrapy.request[]
        """
        # retrieve the csrf token and save it
        jsonresponse = json.loads(response.body_as_unicode())
        self.csrf = jsonresponse["token"]
        # create the request for retrieving search results
        yield scrapy.Request(self.json_api_url,
                             method="POST",
                             body='{{"search":{{"query":"","zipCodeCityQuery":"","sortOrder":"sortDate asc","page":{}}},"filters":[]}}'.format(str(164)),
                             headers={'Accept': 'application/json, text/javascript, */*; q=0.01', 'Content-Type': 'application/json', 'X-CSRF-TOKEN': self.csrf},
                             callback=self.follow_search_results)

    def follow_search_results(self, response):
        """
        Create all requests for retrieving the transcriptions from the current search result and
        create further requests for retrieving search results if possible
        :param response: scrapy.response
        :return: scrapy.request[]
        """
        # create a json object from the response for the queried items
        jsonresponse = json.loads(response.body_as_unicode())
        if "result" in jsonresponse:
            # create all requests for retrieving the transcriptions
            if "items" in jsonresponse["result"]:
                for item in jsonresponse["result"]["items"]:
                    if "payload" in item:
                        # the payload is html encoded, so we have to build a DOM to select the href
                        # BeautifulSoup is used here, but this could be done with parsel as well
                        soup = BeautifulSoup(item['payload'].replace('\n', ''), 'lxml')
                        link = self.main_url + soup.find_all('a', href=True)[0].get('href')
                        if self.links_regpk_selector in link:
                            yield scrapy.Request(link)
            # create further requests for retrieving search results if possible
            if ("pageCount" in jsonresponse["result"]) and ("page" in jsonresponse["result"]):
                if jsonresponse["result"]["page"] < jsonresponse["result"]["pageCount"]:  # and jsonresponse["result"]["page"] < 3:
                    yield scrapy.Request(self.json_api_url, method="POST",
                                         body='{{"search":{{"query":"","zipCodeCityQuery":"","sortOrder":"sortDate asc","page":{}}},"filters":[]}}'.format(
                                             str(jsonresponse["result"]["page"] + 1)),
                                         headers={'Accept': 'application/json, text/javascript, */*; q=0.01',
                                                  'Content-Type': 'application/json', 'X-CSRF-TOKEN': self.csrf},
                                         callback=self.follow_search_results)

    def parse(self, response):
        """
        Scrape a transcription page and extract the wanted properties
        :param response: scrapy.response
        :return: dict (href: string, shorttext: string, fulltext: string[], date: datetime)
        """
        # xpath selector for the short text
        regpk_shorttext = response.xpath('//div[@class="bpa-short-text"]//p').extract_first()
        regpk_shorttext = remove_tags(regpk_shorttext)  # have to use remove_tags since /text() has a bug in this case -> will be reported to parsel
        # xpath selector for the transcription chunks
        regpk_fulltext = response.xpath('//div[@class="bpa-richtext"]/div[@class="bpa-richtext"]//p/text()').extract()
        # xpath selector for the date
        regpk_time_extracted = response.xpath('//span[@class="bpa-time"]/time/@datetime').extract_first()
        # parse the ISO timestamp into a datetime object
        regpk_time = datetime.datetime.strptime(regpk_time_extracted, '%Y-%m-%dT%H:%M:%SZ')
        # return the scraped fields
        yield {"href": response.url, "shorttext": regpk_shorttext, "fulltext": regpk_fulltext, "date": regpk_time}
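
The two search requests above build their JSON body with str.format and doubled braces, which is easy to get wrong. A hedged sketch of the same payload produced by json.dumps instead (build_search_body is an illustrative helper, not part of the original code; it assumes the API does not care about key order):

import json

def build_search_body(page):
    # same payload as the hand-written format string, serialized by the json module
    return json.dumps({
        "search": {"query": "", "zipCodeCityQuery": "", "sortOrder": "sortDate asc", "page": page},
        "filters": [],
    })

# e.g. body=build_search_body(jsonresponse["result"]["page"] + 1)
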