Commit f3e14800 authored by Gallenkamp

First working draft

.idea/
*.pyc
cache/
data.csv
immobilienscout24/
result.json
LICENSE
MIT License
Copyright (c) 2016 Scrapy project
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
# Usage
TODO
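
A minimal sketch of the intended workflow, inferred from the other files in this commit rather than documented by the author: run `main.py` in a Python 3 environment with Scrapy and pandas installed. It starts the `immobilienscout24` spider via `ImmoScrapingProcessWatcher`; scraped exposes are deduplicated into `data.csv` and exported to `result.json`.

```python
# equivalent to main.py
from immoscraper.run_scraper import ImmoScrapingProcessWatcher

scraper = ImmoScrapingProcessWatcher()
scraper.run_spiders()
```

The Stata snippets at the end of the repository show how to load the results (`import delimited`) or launch the scraper from Stata (`python script`).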
# -*- coding: utf-8 -*-
import os

import pandas as pd


class StoragePipeline(object):
    # Note: it would be better to use a database for this task, or to run drop_duplicates
    # once after the scraper has finished, but this is a simple proof of concept.
    def process_item(self, item, spider):
        df_new_item = pd.DataFrame.from_records([item], index='id')
        if os.path.isfile('data.csv'):
            df_existing = pd.read_csv('data.csv', index_col=0)
        else:
            df_new_item.to_csv('data.csv')
            return item
        try:
            # verify_integrity raises ValueError if the expose id is already in the CSV
            df_new = pd.concat([df_existing, df_new_item], join='inner', verify_integrity=True)
            df_new.to_csv('data.csv')
        except ValueError:
            print("Item already there!")
        return item
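

# A minimal sketch of the alternative mentioned in the note above (assumed, not part of the
# original pipeline): collect items in memory and deduplicate once when the spider closes.
# The class name is illustrative only; it would have to be registered in ITEM_PIPELINES to be used.
class BatchedStoragePipeline(object):
    def open_spider(self, spider):
        self.rows = []

    def process_item(self, item, spider):
        self.rows.append(dict(item))
        return item

    def close_spider(self, spider):
        df = pd.DataFrame.from_records(self.rows, index='id')
        # keep the first occurrence of each expose id (deduplicate on the index,
        # since drop_duplicates only looks at columns)
        df = df[~df.index.duplicated(keep='first')]
        df.to_csv('data.csv')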
# -*- coding: utf-8 -*-
import os
# Scrapy settings for the immoscraper project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'immobot'
SPIDER_MODULES = ['immoscraper.immoscraper.spiders']
NEWSPIDER_MODULE = 'immoscraper.immoscraper.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'quotesbot (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 10
# The download delay setting will honor only one of:
CONCURRENT_REQUESTS_PER_DOMAIN = 1
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'quotesbot.middlewares.MyCustomSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'quotesbot.middlewares.MyCustomDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'immoscraper.immoscraper.pipelines.StoragePipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 0
# cache directory two levels above the settings module, i.e. the cache/ folder listed in .gitignore
HTTPCACHE_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', '..', 'cache')
HTTPCACHE_IGNORE_HTTP_CODES = [500, 502, 503, 504, 408]
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
FEED_URI = 'result.json'
FEED_FORMAT = 'json'
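# Note (assumed upgrade path, not used by this project): on Scrapy 2.1+ the two settings above
# are superseded by the FEEDS setting, e.g.
# FEEDS = {'result.json': {'format': 'json'}}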
# -*- coding: utf-8 -*-
import scrapy
import json
import urllib.parse
class Immobilienscout24(scrapy.Spider):
    name = "immobilienscout24"
    # settings for the JSON request
    custom_settings = {
        'JOBDIR': "immobilienscout24",
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'
    }
    header_json = {
        'accept': 'application/json; charset=utf-8',
        'accept-language': 'de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7',
        'content-type': 'application/json; charset=utf-8',
        'x-requested-with': 'XMLHttpRequest'
    }
    # immobilienscout24 search endpoint URL
    search_url = 'https://www.immobilienscout24.de/Suche'

    def start_requests(self):
        """
        Create the initial requests through the hidden search API of immobilienscout24.
        :return: initial requests
        """
        # set the page number
        pagenumber = 1
        # immobilienscout24 search endpoint path segments
        params = {
            'country': 'de',
            'city': 'berlin',
            'county': 'berlin',
            'search': 'wohnung-mieten'
        }
        # immobilienscout24 search parameters
        additional_params = {
            'price': '-700',
            'livingspace': '50.0',
            'sorting': '2',
        }
        # make sure pagenumber is sanitized
        if pagenumber >= 1:
            additional_params['pagenumber'] = pagenumber
        # attach the search parameters to the URL if any exist
        additional_params_url = "?{}".format(urllib.parse.urlencode(additional_params)) if additional_params else ""
        # retrieve the search results for the given page and search parameters
        yield scrapy.Request(url="{}/{}/{}/{}/{}{}".format(self.search_url, params['country'], params['city'],
                                                           params['county'], params['search'], additional_params_url),
                             method='POST',
                             callback=self.parse_search_result,
                             headers=self.header_json,
                             meta={
                                 'params': params,
                                 'additional_params': additional_params,
                                 'dont_cache': True  # don't cache, to get current results
                             },
                             dont_filter=True)  # don't filter, to retrieve current results
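
    # For illustration only (not executed; values taken from the defaults above): the first request
    # built in start_requests points at roughly
    #   https://www.immobilienscout24.de/Suche/de/berlin/berlin/wohnung-mieten?price=-700&livingspace=50.0&sorting=2&pagenumber=1
    # i.e. the path segments come from `params` and the query string from `additional_params`.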
    def parse_search_result(self, response):
        """
        Process search results: retrieve exposes and request further search result pages.
        :param response: response from the previous request
        :return: further requests
        """
        # parse the JSON response into a python dict
        jsonresponse = json.loads(response.body_as_unicode())
        # select only the search results
        items = jsonresponse['searchResponseModel']['resultlist.resultlist']['resultlistEntries'][0]['resultlistEntry']
        # select the maximum number of search pages
        max_pages = jsonresponse['searchResponseModel']['resultlist.resultlist']['paging']['numberOfPages']
        # visit each search result separately
        for item in items:
            expose_id = item["@id"]
            metadata = {'id': item["@id"],
                        'publishDate': item["@publishDate"],
                        'creation': item["@creation"],
                        'modification': item["@modification"]
                        }
            # retrieve the expose for each id in the search result
            yield scrapy.Request(url="https://www.immobilienscout24.de/expose/{}".format(expose_id),
                                 callback=self.parse_expose,
                                 meta={
                                     'dont_cache': False,  # cache the result
                                     'metadata': metadata
                                 })  # the duplicate filter stays active, so exposes are not visited twice
        # request the next page if one exists
        if response.meta["additional_params"]["pagenumber"] < max_pages:
            params = response.meta["params"]
            response.meta["additional_params"]["pagenumber"] += 1
            additional_params_url = "?{}".format(urllib.parse.urlencode(response.meta["additional_params"]))
            yield scrapy.Request(url="{}/{}/{}/{}/{}{}".format(self.search_url, params['country'], params['city'],
                                                               params['county'], params['search'], additional_params_url),
                                 method='POST',
                                 headers=self.header_json,
                                 callback=self.parse_search_result,
                                 meta={
                                     'params': response.meta["params"],
                                     'additional_params': response.meta["additional_params"],
                                     'dont_cache': True  # don't cache, to get current results
                                 },
                                 dont_filter=True)
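
    # Rough shape of the search JSON, reconstructed only from the keys accessed above
    # (abbreviated; not an official schema):
    # {
    #   "searchResponseModel": {
    #     "resultlist.resultlist": {
    #       "paging": {"numberOfPages": ...},
    #       "resultlistEntries": [
    #         {"resultlistEntry": [
    #           {"@id": ..., "@publishDate": ..., "@creation": ..., "@modification": ...},
    #           ...
    #         ]}
    #       ]
    #     }
    #   }
    # }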
    def parse_expose(self, response):
        # get the content of the javascript tags
        for script_element in response.xpath('//script/text()').getall():
            # split the content on the 'keyValues = ' string
            splitted_script_element = script_element.split("keyValues = ")
            # if the 'keyValues = ' string exists
            if len(splitted_script_element) > 1:
                # then convert the javascript object literal into a dict
                expose = json.loads(splitted_script_element[1].split("}")[0] + "}")
                # add the metadata to the expose data
                for key_name in response.meta["metadata"]:
                    expose[key_name] = response.meta["metadata"][key_name]
                expose['id'] = int(expose['id'])
                # and return the expose
                yield expose
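
    # For illustration (hypothetical values): a matching <script> element contains something like
    #   keyValues = {"obj_scoutId":"12345678","obj_baseRent":"650.0", ...};
    # Cutting at the first "}" after splitting on "keyValues = " works because the object literal
    # is flat (no nested braces), so json.loads() yields a plain dict of expose attributes.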
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
from immoscraper.immoscraper.spiders.Immobilienscout24 import Immobilienscout24
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
import os
class ImmoScrapingProcessWatcher:
    def __init__(self):
        settings_file_path = 'immoscraper.immoscraper.settings'  # the settings module path as seen from the project root, i.e. from main.py
        os.environ.setdefault('SCRAPY_SETTINGS_MODULE', settings_file_path)
        self.process = CrawlerProcess(get_project_settings())
        self.spider = Immobilienscout24  # the spider to crawl

    def run_spiders(self):
        self.process.crawl(self.spider)
        self.process.start()  # the script blocks here until the crawling is finished
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = immoscraper.immoscraper.settings
[deploy]
#url = http://localhost:6800/
project = immoscraper
from immoscraper.run_scraper import ImmoScrapingProcessWatcher
scraper = ImmoScrapingProcessWatcher()
scraper.run_spiders()
import delimited using C:/Users/bax1489/Projekte/immo/data.csv
python script C:/Users/bax1489/Projekte/immo/main.py