Scarpy 起始url 自定义代理 自定义去重规则

时间:2023-03-09 03:48:33
Scarpy 起始url 自定义代理 自定义去重规则
- start_urls

    - 内部原理
"""
scrapy引擎来爬虫中去起始的URL:
1. 调用start_requests并获取返回值
2. v = iter(返回值)
3.
req1 = 执行 v.__next__()
req2 = 执行 v.__next__()
req3 = 执行 v.__next__()
...
4.req全部放到调度器中
""" - 编写
class Pc1Spider(scrapy.Spider):
name = 'pc1'
allowed_domains = ['chouti.com']
start_urls = ["https://dig.chouti.com"] #定制:可以去redis中获取 def start_requests(self):
         # 添加内置代理
          """
          import os
          os.environ["HTTPS_PROXY"] = "https://root:password@192.168.1.3:9999/"
          os.environ["HTTP_PROXY"] = "123.11.12.13"           """
 
         # 方式一:
# for url in self.start_urls:
# yield Request(url=url)
# 方式二:
# req_list = []
# for url in self.start_urls:
# req_list.append(Request(url=url))
# return req_list def parse(self,response):
pass

自定义代理

import base64
import random
from six.moves.urllib.parse import unquote, urlunparse try:
from urllib2 import _parse_proxy
except ImportError:
from urllib.request import _parse_proxy from scrapy.utils.python import to_bytes class AqlProxyMiddleware(object): def _basic_auth_header(self, username, password):
user_pass = to_bytes(
'%s:%s' % (unquote(username), unquote(password)),
encoding='latin-1')
return base64.b64encode(user_pass) def process_request(self, request, spider): PROXIES = [
"http://root:password@192.168.1.3:9991/",
"http://root:password@192.168.1.3:9991/",
"http://root:password@192.168.1.3:9991/",
"http://root:password@192.168.1.3:9991/",
"http://root:password@192.168.1.3:9991/", ] url = random.choice(PROXIES)
orig_type = ""
proxt_type, user, password, hostport = _parse_proxy(url)
proxt_url = urlunparse((proxt_type or orig_type, hostport,'','','','')) if user:
creds = self._basic_auth_header(user,password)
else:
creds = None request.meta['proxy'] = proxt_url
if creds :
request.headers['Proxy-Authorization'] = b'Basic ' + creds

settings

# -*- coding: utf- -*-

# Scrapy settings for step8_king project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html # . 爬虫名称
BOT_NAME = 'step8_king' # . 爬虫应用路径
SPIDER_MODULES = ['step8_king.spiders']
NEWSPIDER_MODULE = 'step8_king.spiders' # Crawl responsibly by identifying yourself (and your website) on the user-agent
# . 客户端 user-agent请求头
# USER_AGENT = 'step8_king (+http://www.yourdomain.com)' # Obey robots.txt rules
# . 禁止爬虫配置
# ROBOTSTXT_OBEY = False # Configure maximum concurrent requests performed by Scrapy (default: )
# . 并发请求数
# CONCURRENT_REQUESTS = # Configure a delay for requests for the same website (default: )
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# . 延迟下载秒数
# DOWNLOAD_DELAY = # The download delay setting will honor only one of:
# . 单域名访问并发数,并且延迟下次秒数也应用在每个域名
# CONCURRENT_REQUESTS_PER_DOMAIN =
# 单IP访问并发数,如果有值则忽略:CONCURRENT_REQUESTS_PER_DOMAIN,并且延迟下次秒数也应用在每个IP
# CONCURRENT_REQUESTS_PER_IP = # Disable cookies (enabled by default)
# . 是否支持cookie,cookiejar进行操作cookie
# COOKIES_ENABLED = True
# COOKIES_DEBUG = True # Disable Telnet Console (enabled by default)
# . Telnet用于查看当前爬虫的信息,操作爬虫等...
# 使用telnet ip port ,然后通过命令操作
# TELNETCONSOLE_ENABLED = True
# TELNETCONSOLE_HOST = '127.0.0.1'
# TELNETCONSOLE_PORT = [,] # . 默认请求头
# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
# } # Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
# . 定义pipeline处理请求
# ITEM_PIPELINES = {
# 'step8_king.pipelines.JsonPipeline': ,
# 'step8_king.pipelines.FilePipeline': ,
# } # . 自定义扩展,基于信号进行调用
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
# EXTENSIONS = {
# # 'step8_king.extensions.MyExtension': ,
# } # . 爬虫允许的最大深度,可以通过meta查看当前深度;0表示无深度
# DEPTH_LIMIT = # . 爬取时,0表示深度优先Lifo(默认);1表示广度优先FiFo # 后进先出,深度优先
# DEPTH_PRIORITY =
# SCHEDULER_DISK_QUEUE = 'scrapy.squeue.PickleLifoDiskQueue'
# SCHEDULER_MEMORY_QUEUE = 'scrapy.squeue.LifoMemoryQueue'
# 先进先出,广度优先 # DEPTH_PRIORITY =
# SCHEDULER_DISK_QUEUE = 'scrapy.squeue.PickleFifoDiskQueue'
# SCHEDULER_MEMORY_QUEUE = 'scrapy.squeue.FifoMemoryQueue' # . 调度器队列
# SCHEDULER = 'scrapy.core.scheduler.Scheduler'
# from scrapy.core.scheduler import Scheduler # . 访问URL去重
# DUPEFILTER_CLASS = 'step8_king.duplication.RepeatUrl' # Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html """
. 自动限速算法
from scrapy.contrib.throttle import AutoThrottle
自动限速设置
. 获取最小延迟 DOWNLOAD_DELAY
. 获取最大延迟 AUTOTHROTTLE_MAX_DELAY
. 设置初始下载延迟 AUTOTHROTTLE_START_DELAY
. 当请求下载完成后,获取其"连接"时间 latency,即:请求连接到接受到响应头之间的时间
. 用于计算的... AUTOTHROTTLE_TARGET_CONCURRENCY
target_delay = latency / self.target_concurrency
new_delay = (slot.delay + target_delay) / 2.0 # 表示上一次的延迟时间
new_delay = max(target_delay, new_delay)
new_delay = min(max(self.mindelay, new_delay), self.maxdelay)
slot.delay = new_delay
""" # 开始自动限速
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# 初始下载延迟
# AUTOTHROTTLE_START_DELAY =
# The maximum download delay to be set in case of high latencies
# 最大下载延迟
# AUTOTHROTTLE_MAX_DELAY =
# The average number of requests Scrapy should be sending in parallel to each remote server
# 平均每秒并发数
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 # Enable showing throttling stats for every response received:
# 是否显示
# AUTOTHROTTLE_DEBUG = True # Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings """
. 启用缓存
目的用于将已经发送的请求或相应缓存下来,以便以后使用 from scrapy.downloadermiddlewares.httpcache import HttpCacheMiddleware
from scrapy.extensions.httpcache import DummyPolicy
from scrapy.extensions.httpcache import FilesystemCacheStorage
"""
# 是否启用缓存策略
# HTTPCACHE_ENABLED = True # 缓存策略:所有请求均缓存,下次在请求直接访问原来的缓存即可
# HTTPCACHE_POLICY = "scrapy.extensions.httpcache.DummyPolicy"
# 缓存策略:根据Http响应头:Cache-Control、Last-Modified 等进行缓存的策略
# HTTPCACHE_POLICY = "scrapy.extensions.httpcache.RFC2616Policy" # 缓存超时时间
# HTTPCACHE_EXPIRATION_SECS = # 缓存保存路径
# HTTPCACHE_DIR = 'httpcache' # 缓存忽略的Http状态码
# HTTPCACHE_IGNORE_HTTP_CODES = [] # 缓存存储的插件
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' """
. 代理,需要在环境变量中设置
from scrapy.contrib.downloadermiddleware.httpproxy import HttpProxyMiddleware 方式一:使用默认
os.environ
{
http_proxy:http://root:woshiniba@192.168.11.11:9999/
https_proxy:http://192.168.11.11:9999/
}
方式二:使用自定义下载中间件 def to_bytes(text, encoding=None, errors='strict'):
if isinstance(text, bytes):
return text
if not isinstance(text, six.string_types):
raise TypeError('to_bytes must receive a unicode, str or bytes '
'object, got %s' % type(text).__name__)
if encoding is None:
encoding = 'utf-8'
return text.encode(encoding, errors) class ProxyMiddleware(object):
def process_request(self, request, spider):
PROXIES = [
{'ip_port': '111.11.228.75:80', 'user_pass': ''},
{'ip_port': '120.198.243.22:80', 'user_pass': ''},
{'ip_port': '111.8.60.9:8123', 'user_pass': ''},
{'ip_port': '101.71.27.120:80', 'user_pass': ''},
{'ip_port': '122.96.59.104:80', 'user_pass': ''},
{'ip_port': '122.224.249.122:8088', 'user_pass': ''},
]
proxy = random.choice(PROXIES)
if proxy['user_pass'] is not None:
request.meta['proxy'] = to_bytes("http://%s" % proxy['ip_port'])
encoded_user_pass = base64.encodestring(to_bytes(proxy['user_pass']))
request.headers['Proxy-Authorization'] = to_bytes('Basic ' + encoded_user_pass)
print "**************ProxyMiddleware have pass************" + proxy['ip_port']
else:
print "**************ProxyMiddleware no pass************" + proxy['ip_port']
request.meta['proxy'] = to_bytes("http://%s" % proxy['ip_port']) DOWNLOADER_MIDDLEWARES = {
'step8_king.middlewares.ProxyMiddleware': ,
} """ """
. Https访问
Https访问时有两种情况:
. 要爬取网站使用的可信任证书(默认支持)
DOWNLOADER_HTTPCLIENTFACTORY = "scrapy.core.downloader.webclient.ScrapyHTTPClientFactory"
DOWNLOADER_CLIENTCONTEXTFACTORY = "scrapy.core.downloader.contextfactory.ScrapyClientContextFactory" . 要爬取网站使用的自定义证书
DOWNLOADER_HTTPCLIENTFACTORY = "scrapy.core.downloader.webclient.ScrapyHTTPClientFactory"
DOWNLOADER_CLIENTCONTEXTFACTORY = "step8_king.https.MySSLFactory" # https.py
from scrapy.core.downloader.contextfactory import ScrapyClientContextFactory
from twisted.internet.ssl import (optionsForClientTLS, CertificateOptions, PrivateCertificate) class MySSLFactory(ScrapyClientContextFactory):
def getCertificateOptions(self):
from OpenSSL import crypto
v1 = crypto.load_privatekey(crypto.FILETYPE_PEM, open('/Users/wupeiqi/client.key.unsecure', mode='r').read())
v2 = crypto.load_certificate(crypto.FILETYPE_PEM, open('/Users/wupeiqi/client.pem', mode='r').read())
return CertificateOptions(
privateKey=v1, # pKey对象
certificate=v2, # X509对象
verify=False,
method=getattr(self, 'method', getattr(self, '_ssl_method', None))
)
其他:
相关类
scrapy.core.downloader.handlers.http.HttpDownloadHandler
scrapy.core.downloader.webclient.ScrapyHTTPClientFactory
scrapy.core.downloader.contextfactory.ScrapyClientContextFactory
相关配置
DOWNLOADER_HTTPCLIENTFACTORY
DOWNLOADER_CLIENTCONTEXTFACTORY """ """
. 爬虫中间件
class SpiderMiddleware(object): def process_spider_input(self,response, spider):
'''
下载完成,执行,然后交给parse处理
:param response:
:param spider:
:return:
'''
pass def process_spider_output(self,response, result, spider):
'''
spider处理完成,返回时调用
:param response:
:param result:
:param spider:
:return: 必须返回包含 Request 或 Item 对象的可迭代对象(iterable)
'''
return result def process_spider_exception(self,response, exception, spider):
'''
异常调用
:param response:
:param exception:
:param spider:
:return: None,继续交给后续中间件处理异常;含 Response 或 Item 的可迭代对象(iterable),交给调度器或pipeline
'''
return None def process_start_requests(self,start_requests, spider):
'''
爬虫启动时调用
:param start_requests:
:param spider:
:return: 包含 Request 对象的可迭代对象
'''
return start_requests 内置爬虫中间件:
'scrapy.contrib.spidermiddleware.httperror.HttpErrorMiddleware': ,
'scrapy.contrib.spidermiddleware.offsite.OffsiteMiddleware': ,
'scrapy.contrib.spidermiddleware.referer.RefererMiddleware': ,
'scrapy.contrib.spidermiddleware.urllength.UrlLengthMiddleware': ,
'scrapy.contrib.spidermiddleware.depth.DepthMiddleware': , """
# from scrapy.contrib.spidermiddleware.referer import RefererMiddleware
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
# 'step8_king.middlewares.SpiderMiddleware': ,
} """
. 下载中间件
class DownMiddleware1(object):
def process_request(self, request, spider):
'''
请求需要被下载时,经过所有下载器中间件的process_request调用
:param request:
:param spider:
:return:
None,继续后续中间件去下载;
Response对象,停止process_request的执行,开始执行process_response
Request对象,停止中间件的执行,将Request重新调度器
raise IgnoreRequest异常,停止process_request的执行,开始执行process_exception
'''
pass def process_response(self, request, response, spider):
'''
spider处理完成,返回时调用
:param response:
:param result:
:param spider:
:return:
Response 对象:转交给其他中间件process_response
Request 对象:停止中间件,request会被重新调度下载
raise IgnoreRequest 异常:调用Request.errback
'''
print('response1')
return response def process_exception(self, request, exception, spider):
'''
当下载处理器(download handler)或 process_request() (下载中间件)抛出异常
:param response:
:param exception:
:param spider:
:return:
None:继续交给后续中间件处理异常;
Response对象:停止后续process_exception方法
Request对象:停止中间件,request将会被重新调用下载
'''
return None 默认下载中间件
{
'scrapy.contrib.downloadermiddleware.robotstxt.RobotsTxtMiddleware': ,
'scrapy.contrib.downloadermiddleware.httpauth.HttpAuthMiddleware': ,
'scrapy.contrib.downloadermiddleware.downloadtimeout.DownloadTimeoutMiddleware': ,
'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': ,
'scrapy.contrib.downloadermiddleware.retry.RetryMiddleware': ,
'scrapy.contrib.downloadermiddleware.defaultheaders.DefaultHeadersMiddleware': ,
'scrapy.contrib.downloadermiddleware.redirect.MetaRefreshMiddleware': ,
'scrapy.contrib.downloadermiddleware.httpcompression.HttpCompressionMiddleware': ,
'scrapy.contrib.downloadermiddleware.redirect.RedirectMiddleware': ,
'scrapy.contrib.downloadermiddleware.cookies.CookiesMiddleware': ,
'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware': ,
'scrapy.contrib.downloadermiddleware.chunked.ChunkedTransferMiddleware': ,
'scrapy.contrib.downloadermiddleware.stats.DownloaderStats': ,
'scrapy.contrib.downloadermiddleware.httpcache.HttpCacheMiddleware': ,
} """
# from scrapy.contrib.downloadermiddleware.httpauth import HttpAuthMiddleware
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
# 'step8_king.middlewares.DownMiddleware1': ,
# 'step8_king.middlewares.DownMiddleware2': ,
# }

scrapy-redis 自定义去重规则

############### xxx.py  ######

from scrapy_redis.dupefilter import RFPDupeFilter
from scrapy_redis.connection import get_redis_from_settings
from scrapy_redis import defaults class RdisDupeFilter(RFPDupeFilter): @classmethod
def from_settings(cls, settings): server = get_redis_from_settings(settings) key = defaults.DUPEFILTER_KEY % {'timestamp':'myScrapy'}
debug = settings.getbool('DUPEFILTER_DEBUG')
return cls(server, key=key, debug=debug) ########################### settings.py ##########
# ######################### scrapy redis连接 ##############
REDIS_HOST = "129.28.96.43" #主机名
REDIS_PORT = 6379 #端口
REDIS_PARAMS = {'password':"beta"}
REDIS_ENCODEING = "utf-8" #redis编码类型 # REDIS_URL = 'redis://user:pwd@hostname:9001' #连接URL 优先上面配置 DUPEFILTER_KEY = 'dupefilter:%(timestamp)s' # DUPEFLITER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
DUPEFLITER_CLASS = 'myscrapy.xxx.RedisDupeFilter'