scrapy实现自动抓取51job并分别保存到redis，mongo和mysql数据库中

项目简介

利用scrapy抓取51job上的python招聘信息，关键词为“python”，范围：全国

利用redis的set数据类型保存抓取过的url，实现避免重复抓取；

利用脚本实现每隔一段时间，网站更新后自动抓取；

利用mongo和mysql，分别保存抓取结果。

主要内容

网站分析

进入51job后，输入关键字python，搜索范围改为全国，通过分析得到该网页为静态网页

搜索后生成的url即为开始抓取的url：https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,1.html

明确抓取字段

编写items.py文件，明确要抓取的字段

import scrapy

class QcItem(scrapy.Item):
    """Container for one job posting scraped from 51job.

    Every attribute below is a plain ``scrapy.Field`` with no extra
    metadata; the trailing comment states what the field holds.
    """

    source = scrapy.Field()         # data source
    utc_time = scrapy.Field()       # time of the crawl
    work_position = scrapy.Field()  # job title
    name_company = scrapy.Field()   # company name
    work_place = scrapy.Field()     # work location
    salary = scrapy.Field()         # salary range
    publish_time = scrapy.Field()   # publish time of the posting
    content = scrapy.Field()        # job details
    contact = scrapy.Field()        # contact information

编写爬虫文件

来到爬虫文件后，考虑给每一个请求添加一个请求头信息，因此，在下载中间件中添加请求头中间件

class QcSpiderMiddleware(object):
    """Middleware that gives each outgoing request a random User-Agent."""

    def process_request(self, request, spider):
        """Set the ``User-Agent`` header of ``request`` to a random entry of ``ua``.

        :param request: the outgoing request; its headers are modified in place
        :param spider: the spider that issued the request (not used here)
        :return: None
        """
        # `ua` is defined outside this class; it is presumably a sequence
        # of User-Agent strings -- confirm against the module that owns it.
        request.headers['User-Agent'] = random.choice(ua)

添加请求头后，来到爬虫文件，编写parse函数，解析数据：

class QcSpider(scrapy.Spider):
    """Spider that walks the 51job search-result page for the keyword "python"."""

    name = 'qc'
    # allowed_domains = ['51job.com']

    # Search-result URL the crawl starts from.
    start_urls = ['https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,1.html']

    def parse(self, response):
        """Read the listing rows and request the detail page of each job.

        For every ``div.el`` node that carries a detail link, a ``QcItem``
        is filled with the listing-level fields and handed to
        ``parse_detail`` through ``meta["item"]``.

        :param response: response of the search-result page
        :return: generator of ``scrapy.Request`` objects, one per job row
        """
        # Listing-level fields and the XPath (relative to a row) of each.
        listing_xpaths = (
            ('work_position', './p/span/a/@title'),
            ('name_company', './span[@class="t2"]/a/text()'),
            ('work_place', './span[@class="t3"]/text()'),
            ('salary', './span[@class="t4"]/text()'),
            ('publish_time', './span[@class="t5"]/text()'),
        )

        for row in response.xpath('//div[@class="el"]'):
            link = row.xpath('./p/span/a/@href')
            # The author observed that the first few rows on the page are
            # not job postings and have no detail link; such rows are skipped.
            if not link:
                continue

            item = QcItem()
            for field, path in listing_xpaths:
                item[field] = row.xpath(path).extract_first()

            # The detail page supplies the remaining fields of the item.
            yield scrapy.Request(
                link.extract_first(),
                callback=self.parse_detail,
                meta={"item": item},
            )

在开始解析详情页数据之前，在下载中间件中，搭建redis，利用redis的set数据类型，将每一个详情页的链接添加到数据库中；

实现避免重复抓取，如果详情页的url在redis中，则忽略该次请求。

class QcRedisMiddleware(object):
    """Skip detail pages that were requested before, using a Redis set.

    The MD5 digest of each detail-page URL is added to the ``qc_url`` set;
    a request whose digest is already in the set is ignored.
    """

    def __init__(self):
        # Redis connection: localhost:6379, database 1.
        self.redis = redis.StrictRedis(host='localhost', port=6379, db=1)

    def process_request(self, request, spider):
        """Record detail-page URLs and drop the ones already recorded.

        :param request: the outgoing request
        :param spider: the spider that issued the request (not used here)
        :return: None when the request may proceed
        :raises IgnoreRequest: when the detail-page URL is already in the set
        """
        url = request.url
        # Only detail pages are de-duplicated; other requests pass through.
        if not url.startswith("https://jobs.51job.com/"):
            return

        # Store a fixed-length digest rather than the full URL.
        digest = hashlib.md5(url.encode()).hexdigest()

        # A falsy result means the digest was already a member of the set.
        newly_added = self.redis.sadd('qc_url', digest)
        if newly_added:
            return
        raise IgnoreRequest

继续来到爬虫文件中，编写详情页数据解析的内容。

   def parse_detail(self, response):
        """Complete the item with detail-page data and emit it.

        :param response: response of a job detail page; ``response.meta['item']``
            holds the item that was partially filled from the listing page
        :return: generator yielding the completed item
        """
        item = response.meta['item']

        # string(.) flattens the whole job-description block into text.
        content = response.xpath('//div[@class="bmsg job_msg inbox"]').xpath('string(.)').extract()
        # Contact information.
        contact = response.xpath('//div[@class="bmsg inbox"]/p/text()').extract()

        # The description contains spaces and line breaks; strip every
        # whitespace character. The pattern is a raw string because '\s' in
        # a plain literal is an invalid escape sequence.
        item['content'] = re.sub(r'\s', '', ''.join(content))
        item['contact'] = ''.join(contact).strip()

        yield item

此时，所要解析的数据均解析完毕，接下来就是将解析的数据进行保存。

数据保存

编写pipelines.py文件，保存item。利用mongo和mysql两种方式分别保存数据。

# -*- coding: utf-8 -*-

# Define your item pipelines here

#

# Don't forget to add your pipeline to the ITEM_PIPELINES setting

# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import json

import pymongo

import pymysql

from datetime import datetime

class QcPipeline(object):
    """Stamp every item with its origin and the time it was crawled."""

    def process_item(self, item, spider):
        """Add ``source`` and ``utc_time`` to the item and pass it on.

        :param item: the scraped item; modified in place
        :param spider: the spider that produced the item; its ``name``
            becomes the item's ``source``
        :return: the same item
        """
        origin = spider.name
        item['source'] = origin

        # Current UTC time, stored in its string form.
        crawled_at = str(datetime.utcnow())
        item['utc_time'] = crawled_at

        return item

class QcJsonPipeline(object):
    """Append every item to ``qc.json`` as one JSON object per line."""

    def open_spider(self, spider):
        """Open ``qc.json`` for appending (UTF-8) when the spider starts."""
        self.file = open('qc.json', 'a', encoding='utf-8')

    def process_item(self, item, spider):
        """Write the item as a single JSON line and pass it on.

        :param item: the scraped item
        :param spider: the running spider (not used here)
        :return: the same item
        """
        record = dict(item)
        # ensure_ascii=False keeps non-ASCII text readable in the file.
        line = json.dumps(record, ensure_ascii=False)
        self.file.write("{}\n".format(line))
        return item

    def close_spider(self, spider):
        """Close the output file when the spider finishes."""
        self.file.close()

class QcMongoPipeline(object):
    """Store every item as a document in MongoDB (database ``qc``, collection ``qc``)."""

    def open_spider(self, spider):
        """Create the MongoDB client and select the target collection."""
        # Client for the MongoDB server at localhost:27017.
        self.client = pymongo.MongoClient(host='localhost', port=27017)
        # Database "qc", collection "qc".
        self.collection = self.client['qc']['qc']

    def process_item(self, item, spider):
        """Insert a plain-dict copy of the item and pass the item on.

        :param item: the scraped item
        :param spider: the running spider (not used here)
        :return: the same item
        """
        # insert_one replaces Collection.insert, which was deprecated in
        # PyMongo 3.0 and removed in PyMongo 4.0.
        self.collection.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        """Close the MongoDB client when the spider finishes."""
        self.client.close()

class QcMysqlPipeline(object):
    """Persist every item as a row of the MySQL table ``qc``."""

    def open_spider(self, spider):
        """Open the MySQL connection and a cursor when the spider starts."""
        settings = dict(
            host='localhost',
            port=3306,
            database='qc',
            user='z',
            password='',
            charset='utf8',
        )
        self.conn = pymysql.connect(**settings)
        # One cursor is reused for every insert.
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert the item into table ``qc``, commit, and pass the item on.

        :param item: the scraped item; all nine fields are read
        :param spider: the running spider (not used here)
        :return: the same item
        """
        # Item keys, listed in the order of the columns named in the statement.
        item_keys = (
            'source', 'utc_time', 'work_position',
            'name_company', 'work_place', 'salary', 'publish_time',
            'content', 'contact',
        )
        statement = (
            "insert into qc(source, utcTime, workName, "
            "company, workPosition, salary, publishTime, "
            "content, contact)"
            "values (%s, %s, %s, %s, %s, %s, %s, %s, %s)"
        )
        row = [item[key] for key in item_keys]

        # Values are passed separately so the driver does the quoting.
        self.cursor.execute(statement, row)
        # Commit after every row.
        self.conn.commit()
        return item

    def close_spider(self, spider):
        """Close the cursor, then the connection, when the spider finishes."""
        self.cursor.close()
        self.conn.close()

    # Table definition expected by the insert statement above:
    #
    # create table qc
    # (
    #     id INT unsigned PRIMARY KEY auto_increment NOT NULL,
    #     source VARCHAR(20) DEFAULT "",
    #     utcTime DATETIME DEFAULT "1111-11-11 11:11:11",
    #     workName VARCHAR(40) DEFAULT "",
    #     company VARCHAR(40) DEFAULT "",
    #     workPosition VARCHAR(40) DEFAULT "",
    #     salary VARCHAR(40) DEFAULT "",
    #     publishTime VARCHAR(20) DEFAULT "",
    #     content TEXT(1024),
    #     contact VARCHAR(40) DEFAULT ""
    # );

自动抓取

最后实现自动爬取，单独编写一个脚本文件，隔一段时间自动抓取。

import os
import time

from scrapy import cmdline

# Single-run alternative, left disabled:
# cmdline.execute("scrapy crawl qc".split())

# Seconds to wait between the end of one crawl and the start of the next.
CRAWL_INTERVAL_SECONDS = 20

# Re-run the spider forever so that new postings are picked up automatically.
while True:
    # Each crawl is started through the shell as a separate process.
    os.system("scrapy crawl qc")
    time.sleep(CRAWL_INTERVAL_SECONDS)

完整代码

参见：https://github.com/zInPython/qiancheng