Python——Scrapy爬取链家网站所有房源信息

用 Scrapy 爬取链家全国各城市以下五类房源的信息：二手房、新房、租房、商业办公、小区。

路径：

Python——Scrapy爬取链家网站所有房源信息

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items

#

# See documentation in:

# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy

class LianItem(scrapy.Item):
    """Placeholder item left over from the project template; declares no fields."""

class ErShouFangItem(scrapy.Item):
    """Fields recorded for one second-hand (resale) housing listing."""

    province = scrapy.Field()           # province
    city = scrapy.Field()               # city
    total_price = scrapy.Field()        # total price
    single_price = scrapy.Field()       # unit price
    room_info = scrapy.Field()          # floor
    region = scrapy.Field()             # location of the residence
    direction = scrapy.Field()          # orientation and decoration status
    area = scrapy.Field()               # built-up area
    house_struct = scrapy.Field()       # building type
    huxing = scrapy.Field()             # room layout
    buy_time = scrapy.Field()           # purchase time
    ershou_detail_url = scrapy.Field()  # detail page URL

class NewHouseItem(scrapy.Item):
    """Fields recorded for one new-development listing."""

    province = scrapy.Field()             # province
    city = scrapy.Field()                 # city
    title = scrapy.Field()                # title
    region = scrapy.Field()               # location
    room_info = scrapy.Field()            # room information
    area = scrapy.Field()                 # built-up area
    price = scrapy.Field()                # price
    newHouse_detail_url = scrapy.Field()  # detail page

class RentHouseItem(scrapy.Item):
    """Fields recorded for one rental listing."""

    province = scrapy.Field()         # province
    city = scrapy.Field()             # city
    title = scrapy.Field()            # title
    price = scrapy.Field()            # price
    house_info = scrapy.Field()       # layout, orientation, area, rental mode
    pub_time = scrapy.Field()         # publication time
    in_time = scrapy.Field()          # move-in time
    lease = scrapy.Field()            # lease term
    floor = scrapy.Field()            # floor
    lift = scrapy.Field()             # elevator
    carport = scrapy.Field()          # parking space
    use_water = scrapy.Field()        # water supply
    use_electricity = scrapy.Field()  # electricity supply
    use_gas = scrapy.Field()          # gas supply
    rent_detail_url = scrapy.Field()  # detail page URL

class OfficeHouseItem(scrapy.Item):
    """Fields recorded for one commercial-office listing."""

    province = scrapy.Field()           # province
    city = scrapy.Field()               # city
    title = scrapy.Field()              # title
    price = scrapy.Field()              # price
    num = scrapy.Field()                # quantity
    area = scrapy.Field()               # area
    office_detail_url = scrapy.Field()  # detail page URL

class XiaoquHouseItem(scrapy.Item):
    """Fields recorded for one residential community (xiaoqu)."""

    province = scrapy.Field()           # province
    city = scrapy.Field()               # city
    title = scrapy.Field()              # title
    region = scrapy.Field()             # district
    single_price = scrapy.Field()       # unit price
    build_time = scrapy.Field()         # construction year
    house_struct = scrapy.Field()       # building type
    service_fees = scrapy.Field()       # property management fee
    service_company = scrapy.Field()    # property management company
    build_company = scrapy.Field()      # developer
    building_nums = scrapy.Field()      # number of buildings
    house_nums = scrapy.Field()         # total number of homes
    xiaoqu_detail_url = scrapy.Field()  # detail page URL

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here

#

# Don't forget to add your pipeline to the ITEM_PIPELINES setting

# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

from scrapy.exporters import JsonLinesItemExporter

from lian.items import ErShouFangItem, NewHouseItem,RentHouseItem,OfficeHouseItem,XiaoquHouseItem  # 已经导入成功，不用管

class LianPipeline(object):
    """Write each scraped item to a JSON-lines file chosen by its item class.

    One file/exporter pair exists per listing category:
    ershoufang.json, newhouse.json, renthouse.json, officehouse.json and
    xiaoquhouse.json, all created in the current working directory.
    """

    def __init__(self):
        # Files are opened in binary mode ('wb') and handed to the exporters.
        # ensure_ascii=False is forwarded to the exporter so non-ASCII text is
        # written as-is rather than as \uXXXX escapes.
        self.ershoufang_fp = open('ershoufang.json', 'wb')
        self.ershoufang_exporter = JsonLinesItemExporter(self.ershoufang_fp, ensure_ascii=False)

        self.newhouse_fp = open('newhouse.json', 'wb')
        self.newhouse_exporter = JsonLinesItemExporter(self.newhouse_fp, ensure_ascii=False)

        self.renthouse_fp = open('renthouse.json', 'wb')
        self.renthouse_exporter = JsonLinesItemExporter(self.renthouse_fp, ensure_ascii=False)

        self.officehouse_fp = open('officehouse.json', 'wb')
        self.officehouse_exporter = JsonLinesItemExporter(self.officehouse_fp, ensure_ascii=False)

        self.xiaoquhouse_fp = open('xiaoquhouse.json', 'wb')
        self.xiaoquhouse_exporter = JsonLinesItemExporter(self.xiaoquhouse_fp, ensure_ascii=False)

    def process_item(self, item, spider):
        """Export ``item`` with the exporter matching its class and return it.

        Items of a class not listed below are returned untouched. Previously
        they fell through to the xiaoqu exporter and polluted xiaoquhouse.json.
        """
        routes = (
            (ErShouFangItem, self.ershoufang_exporter),
            (NewHouseItem, self.newhouse_exporter),
            (RentHouseItem, self.renthouse_exporter),
            (OfficeHouseItem, self.officehouse_exporter),
            (XiaoquHouseItem, self.xiaoquhouse_exporter),
        )
        for item_cls, exporter in routes:
            if isinstance(item, item_cls):
                exporter.export_item(item)
                break
        return item

    def close_spider(self, spider):
        """Close every output file, including officehouse.json.

        The office file used to stay open because its close call was
        commented out (and misspelled as ``closed()``).
        """
        for fp in (
            self.ershoufang_fp,
            self.newhouse_fp,
            self.renthouse_fp,
            self.officehouse_fp,
            self.xiaoquhouse_fp,
        ):
            fp.close()

lian_spider.py

# -*- coding: utf-8 -*-

import scrapy

import re

from lian.items import ErShouFangItem,NewHouseItem,RentHouseItem,OfficeHouseItem,XiaoquHouseItem # 已经导入成功，不用管

class LianSpiderSpider(scrapy.Spider):

    name = 'lian_spider'

    allowed_domains = ['lianjia.com']

    start_urls = ['https://www.lianjia.com/city/']

    headers = {

        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',

        'Cookie': 'select_city=510700; lianjia_uuid=8bd3d017-2c99-49a5-826e-986f56ce99b9; _smt_uid=5cd3cd13.44c49764; UM_distinctid=16a9b59145a158-0442ba7704d667-3b654406-c0000-16a9b59146011e; _jzqckmp=1; _ga=GA1.2.822868133.1557384475; _gid=GA1.2.801531476.1557384475; all-lj=ed5a77c9e9ec3809d0c1321ec78803ae; lianjia_ssid=50fd11a7-d48c-4dde-b281-287224c40487; TY_SESSION_ID=ae45e1a4-b6d9-46bb-81c8-7cff32931953; Hm_lvt_9152f8221cb6243a53c83b956842be8a=1557384618,1557389971,1557392984,1557446598; _jzqc=1; _jzqy=1.1557384468.1557446599.1.jzqsr=baidu|jzqct=%E9%93%BE%E5%AE%B6.-; _qzjc=1; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2216a9b5916632a6-01ac8dcdbbb8a7-3b654406-786432-16a9b59166452e%22%2C%22%24device_id%22%3A%2216a9b5916632a6-01ac8dcdbbb8a7-3b654406-786432-16a9b59166452e%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D; _jzqa=1.1500973956232310800.1557384468.1557451920.1557454945.6; _jzqx=1.1557451920.1557454945.2.jzqsr=mianyang%2Elianjia%2Ecom|jzqct=/ershoufang/pag1/.jzqsr=mianyang%2Elianjia%2Ecom|jzqct=/ershoufang/; CNZZDATA1255604082=609852050-1557381958-https%253A%252F%252Fwww.baidu.com%252F%7C1557455869; CNZZDATA1254525948=1645681089-1557382543-https%253A%252F%252Fwww.baidu.com%252F%7C1557458144; CNZZDATA1255633284=262578687-1557381275-https%253A%252F%252Fwww.baidu.com%252F%7C1557458627; Hm_lpvt_9152f8221cb6243a53c83b956842be8a=1557459240; _qzja=1.677427564.1557384472885.1557451920228.1557454945305.1557459200351.1557459240226.0.0.0.62.6; _qzjb=1.1557454945305.13.0.0.0; _qzjto=33.3.0; _jzqb=1.13.10.1557454945.1'

    }

    # 每个城市

    def parse(self, response):
        """Walk the city index page and request each listed city's home page.

        Yields one ``scrapy.Request`` per city, handled by
        ``parse_rent_type``. ``meta['city_name']`` carries the tuple
        ``(province, city)``.
        """
        for province_li in response.xpath('//div[@class="city_list_section"]/ul/li'):
            province = province_li.xpath('.//div[@class="city_list_tit c_b"]/text()').extract_first()
            for city_li in province_li.xpath('.//div[@class="city_province"]/ul/li'):
                city = city_li.xpath('./a/text()').extract_first()
                city_link = city_li.xpath('./a/@href').extract_first()
                if city_link is None:
                    # No link to follow for this entry; a Request needs a URL.
                    continue
                yield scrapy.Request(
                    # urljoin leaves absolute links unchanged and resolves
                    # relative ones against the index page.
                    url=response.urljoin(city_link),
                    headers=self.headers,
                    callback=self.parse_rent_type,
                    meta={'city_name': (province, city)},
                )

        # Overseas listings: the overseas section shown at the top of every city page is identical, so it only needs to be crawled once

        # yield scrapy.Request(

        #     url='https://i.lianjia.com/us',

        #     headers=self.headers,

        #     callback=self.parse_haiwai

        # )

    # 类型（二手房，新房，租房，商业办公，小区）

    def parse_rent_type(self, response):
        """Dispatch from a city home page to each listing category.

        Reads ``(province, city_name)`` from ``meta['city_name']`` and scans
        the navigation bar. Categories handled: second-hand (二手房), new
        homes (新房), rentals (租房), commercial offices (商业办公) and
        communities (小区). Navigation entries without an href are skipped.

        Yields ``scrapy.Request`` objects. Second-hand and rental categories
        request pages 1-100; the others request a single page.
        """
        province, city_name = response.meta.get('city_name')
        for nav_li in response.xpath('//div[@class="nav typeUserInfo"]/ul/li'):
            category = nav_li.xpath('./a/text()').extract_first()
            link = nav_li.xpath('./a/@href').extract_first()
            if link is None:
                # Nothing to follow; concatenating onto None would raise.
                continue

            if category == '二手房':
                for page in range(1, 101):
                    yield scrapy.Request(
                        url=link + '/pg{}/'.format(page),
                        headers=self.headers,
                        callback=self.parse_ershoufang,
                        meta={'city_name': (province, city_name, page)},
                    )
            elif category == '新房':
                # Page count is hard to determine here, so only the first
                # page is requested.
                yield scrapy.Request(
                    url=link + '/loupan/',
                    headers=self.headers,
                    callback=self.parse_xinfang,
                    meta={'city_name': (province, city_name)},
                )
            elif category == '租房':
                for page in range(1, 101):
                    # The URL format (including the page suffix) is kept
                    # unchanged because parse_zufang receives it via meta.
                    url = link + '/pg{}/'.format(page)
                    yield scrapy.Request(
                        url=url,
                        headers=self.headers,
                        callback=self.parse_zufang,
                        meta={'city_name': (url, province, city_name, page)},
                    )
            elif category == '商业办公':
                # TODO: this URL is redirected, so only one page is crawled.
                yield scrapy.Request(
                    url=link + "/xzl/rent/mlist",
                    headers=self.headers,
                    callback=self.parse_shangyebangong,
                    meta={'city_name': (province, city_name)},
                )
            elif category == '小区':
                # Page count is hard to determine here, so only the first
                # page is requested.
                yield scrapy.Request(
                    url=link,
                    headers=self.headers,
                    callback=self.parse_xiaoqu,
                    meta={'city_name': (province, city_name)},
                )

    # 获取二手房主页item+

    def parse_ershoufang(self, response):
        """Request the detail page of every second-hand listing on a list page.

        ``meta['city_name']`` holds ``(province, city_name, page)``; each
        yielded request forwards ``(detail_url, province, city_name, page)``
        to ``parse_ershoufang_detail``.
        """
        province, city_name, page = response.meta.get('city_name')
        for entry in response.xpath('//ul[@class="sellListContent"]/li'):
            detail_url = entry.xpath('.//div[@class="title"]/a/@href').extract_first()
            # Some list entries carry no detail link; those are ignored.
            if detail_url is not None:
                yield scrapy.Request(
                    url=detail_url,
                    headers=self.headers,
                    callback=self.parse_ershoufang_detail,
                    meta={'city_name': (detail_url, province, city_name, page)},
                )

    # 二手房item详情页

    def parse_ershoufang_detail(self, response):
        """Build an ``ErShouFangItem`` from a second-hand listing detail page.

        ``meta['city_name']`` holds ``(detail_url, province, city_name, page)``.
        Two-part fields (prices, room info, region, direction) are joined
        from whichever parts are present. A missing node no longer raises
        ``TypeError`` or appends the literal text "None"; the field is
        ``None`` when nothing was extracted.

        Yields a single ``ErShouFangItem``. The title is only printed because
        the item class declares no title field.
        """
        ershou_detail_link, province, city_name, page = response.meta.get('city_name')

        def first(query):
            # First text match for an XPath query, or None.
            return response.xpath(query).extract_first()

        def combine(head, tail, sep=''):
            # Join the parts that exist; None when both are missing.
            present = [part for part in (head, tail) if part is not None]
            return sep.join(present) if present else None

        title = first('//div[@class="sellDetailHeader"]//div[@class="title"]/h1/text()')

        price_unit = first('//div[@class="price "]/span[@class="unit"]/span/text()')
        if price_unit is not None:
            price_unit = price_unit.strip()
        total_price = combine(
            first('//div[@class="price "]/span[@class="total"]/text()'),
            price_unit,
        )
        single_price = combine(
            first('//span[@class="unitPriceValue"]/text()'),
            first('//span[@class="unitPriceValue"]/i/text()'),
        )
        room_info = combine(
            first('//div[@class="room"]/div[1]/text()'),
            first('//div[@class="room"]/div[2]/text()'),
            sep='-',
        )
        region = combine(
            first('//div[@class="areaName"]/span[@class="info"]/a[1]/text()'),
            first('//div[@class="areaName"]/span[@class="info"]/a[2]/text()'),
            sep='-',
        )
        direction = combine(
            first('//div[@class="type"]/div[1]/text()'),
            first('//div[@class="type"]/div[2]/text()'),
            sep='-',
        )
        area = first('//div[@class="area"]/div[1]/text()')
        house_struct = first('//div[@class="area"]/div[2]/text()')
        huxing = first('//div[@class="introContent"]/div[1]/div[2]/ul/li[1]/text()')
        buy_time = first('//div[@class="transaction"]/div[2]/ul/li[3]/span[2]/text()')

        print("***第{}页*** 城市：{}   二手房   标题：{} 总价：{} 单价：{} 楼层：{} 住宅位置：{} 房屋朝向：{} 建筑面积：{} 建筑类型：{} 房屋户型：{} 购买时间：{}".format(page, city_name, title, total_price, single_price, room_info, region, direction, area, house_struct, huxing, buy_time))

        yield ErShouFangItem(
            province=province,
            city=city_name,
            total_price=total_price,
            single_price=single_price,
            room_info=room_info,
            region=region,
            direction=direction,
            area=area,
            house_struct=house_struct,
            huxing=huxing,
            buy_time=buy_time,
            ershou_detail_url=ershou_detail_link,
        )

    # 新房楼盘主页

    def parse_xinfang(self, response):
        """Yield one ``NewHouseItem`` per development on a new-homes list page.

        ``meta['city_name']`` holds ``(province, city_name)``.

        The detail URL is resolved against the page being parsed. It used to
        be prefixed with the Beijing host for every city, and a missing href
        produced a URL ending in "None"; a missing href now gives ``None``.
        The price is joined from the parts that exist instead of raising or
        appending "None".
        """
        province, city_name = response.meta.get('city_name')
        for block in response.xpath('//ul[@class="resblock-list-wrapper"]/li'):
            title = block.xpath('./a[@class="resblock-img-wrapper "]/@title').extract_first()

            region = ''.join(
                text.replace('\n', '').strip(' ')
                for text in block.xpath('.//div[@class="resblock-location"]//text()').extract()
            )
            room_info = ''.join(
                text.strip(' ')
                for text in block.xpath('.//a[@class="resblock-room"]/span//text()').extract()
            )
            # NOTE(review): this reads the same "main-price" node as the price
            # below, so ``area`` probably holds price text — confirm the
            # intended selector against the live page before changing it.
            area = ''.join(
                text.strip(' ')
                for text in block.xpath('.//div[@class="main-price"]/span//text()').extract()
            )

            # Price value plus its unit; the unit is stripped of whitespace.
            price_value = block.xpath('.//div[@class="main-price"]/span[1]/text()').extract_first()
            price_unit = block.xpath('.//div[@class="main-price"]/span[2]/text()').extract_first()
            if price_unit is not None:
                price_unit = price_unit.strip()
            price_parts = [part for part in (price_value, price_unit) if part is not None]
            price = ''.join(price_parts) if price_parts else None

            href = block.xpath('./a[@class="resblock-img-wrapper "]/@href').extract_first()
            newhouse_detail_url = response.urljoin(href) if href is not None else None

            print("城市：{}   新房  {}  {}".format(city_name, title, newhouse_detail_url))

            yield NewHouseItem(
                province=province,
                city=city_name,
                title=title,
                region=region,
                room_info=room_info,
                area=area,
                price=price,
                newHouse_detail_url=newhouse_detail_url,
            )

    # 租房首页

    def parse_zufang(self, response):
        """Request the detail page of every rental listing on a list page.

        ``meta['city_name']`` holds ``(request_url, province, city_name, page)``.
        Detail links are resolved against the page being parsed. The site
        root used to be recovered from the request URL with a regex, which
        raised ``IndexError`` when the URL did not contain the exact
        "/zufang//pgN/" form. Entries without a link are skipped; previously
        they produced a URL ending in "None".

        Each yielded request forwards
        ``(detail_url, province, city_name, page)`` to ``parse_zufang_detail``.
        """
        _request_url, province, city_name, page = response.meta.get('city_name')
        for entry in response.xpath('//div[@class="content__list"]/div'):
            href = entry.xpath('./a[@class="content__list--item--aside"]/@href').extract_first()
            if href is None:
                # Some entries carry no detail link.
                continue
            zufang_detail_link = response.urljoin(href)
            yield scrapy.Request(
                url=zufang_detail_link,
                headers=self.headers,
                callback=self.parse_zufang_detail,
                meta={'city_name': (zufang_detail_link, province, city_name, page)},
            )

    # 租房信息详情

    def parse_zufang_detail(self, response):
        """Build a ``RentHouseItem`` from a rental detail page.

        ``meta['city_name']`` holds ``(detail_url, province, city_name, page)``.
        The facility fields are read from fixed positions in the
        "content__article__info" list, so they depend on the page keeping
        that order.

        Yields a single ``RentHouseItem`` and then prints a progress line.
        The text-joining loop no longer reuses the page-number variable, so
        the progress line reports the real page.
        """
        zufang_detail_link, province, city_name, page = response.meta.get('city_name')

        def info_item(position):
            # Text of the N-th <li> in the article info list, or None.
            return response.xpath(
                '//div[@class="content__article__info"]/ul/li[{}]/text()'.format(position)
            ).extract_first()

        title = response.xpath('//div[@class="content clear w1150"]/p/text()').extract_first()
        price = response.xpath('//div[@class="content__aside fr"]/p/span/text()').extract_first()

        house_info = ''.join(
            piece.replace('\n', '/').strip(' ')
            for piece in response.xpath('//ul[@class="content__aside__list"]/p//text()').extract()
        )

        # Publication date: first YYYY-M-D pattern in the subtitle, if any.
        subtitle = str(response.xpath('string(//div[@class="content__subtitle"])').extract_first())
        date_match = re.search(r'\d{4}-\d{1,2}-\d{1,2}', subtitle)
        pub_time = date_match.group(0) if date_match else None

        in_time = info_item(3)           # move-in time
        lease = info_item(5)             # lease term
        floor = info_item(8)             # floor
        lift = info_item(9)              # elevator
        carport = info_item(11)          # parking space
        use_water = info_item(12)        # water supply
        use_electricity = info_item(14)  # electricity supply
        use_gas = info_item(15)          # gas supply

        yield RentHouseItem(
            province=province,
            city=city_name,
            title=title,
            price=price,
            house_info=house_info,
            pub_time=pub_time,
            in_time=in_time,
            lease=lease,
            floor=floor,
            lift=lift,
            carport=carport,
            use_water=use_water,
            use_electricity=use_electricity,
            use_gas=use_gas,
            rent_detail_url=zufang_detail_link,
        )

        print("***第{}页*** 城市：{}   租房   {}   {}".format(page, city_name, title, price))

    # 海外房源信息

    # def parse_haiwai(self,response):

    #     items = response.xpath('//*[@id="env"]/div[4]/div/div[2]')

    #     for i in items:

    #         title = i.xpath('.//div[class="titles"]/a/div/text()').extract_first()

    #         price = i.xpath('.//span[@class="fr"]/text()').extract_first()

    #         print("城市：美国   标题：{}   价格：{}".format(title,price))

    # 商业办公主页item详情

    def parse_shangyebangong(self, response):
        """Yield one ``OfficeHouseItem`` per row of a commercial-office list page.

        ``meta['city_name']`` holds ``(province, city_name)``.

        The detail URL is read from the row's own ``href`` and extracted to a
        string. It used to query the whole response and store an unextracted
        selector list in the item. The link is resolved against the page URL;
        it is ``None`` when the row has no href.
        """
        province, city_name = response.meta.get('city_name')
        for row in response.xpath('//div[@class="result__ul"]/a'):
            href = row.xpath('./@href').extract_first()
            office_detail_url = response.urljoin(href) if href is not None else None
            title = row.xpath('./div/p[@class="result__li-title"]/text()').extract_first()
            area = row.xpath('./div/p[@class="result__li-features"]/text()').extract_first()
            nums = row.xpath('./div/p[@class="result__li-other"]/text()').extract_first()
            price = row.xpath('./div/p[@class="result__li-price"]/span/text()').extract_first()

            yield OfficeHouseItem(
                province=province,
                city=city_name,
                title=title,
                price=price,
                num=nums,
                area=area,
                office_detail_url=office_detail_url,
            )

            print("城市：{}   商业办公   标题：{}   面积：{}   数量：{}   价格：{}   url:{}".format(city_name, title, area, nums, price, office_detail_url))

    # 小区主页item

    def parse_xiaoqu(self, response):
        """Request the detail page of every community on a list page.

        ``meta['city_name']`` holds ``(province, city_name)``; each yielded
        request forwards ``(detail_url, province, city_name)`` to
        ``parse_xiaoqu_detail``.
        """
        province, city_name = response.meta.get('city_name')
        for entry in response.xpath('//ul[@class="listContent"]/li'):
            detail_url = entry.xpath('.//a[@class="img"]/@href').extract_first()
            # Entries without a detail link are ignored.
            if detail_url is not None:
                yield scrapy.Request(
                    url=detail_url,
                    headers=self.headers,
                    callback=self.parse_xiaoqu_detail,
                    meta={'city_name': (detail_url, province, city_name)},
                )

    # 小区item详情

    def parse_xiaoqu_detail(self, response):
        """Build a ``XiaoquHouseItem`` from a community detail page.

        ``meta['city_name']`` holds ``(detail_url, province, city_name)``.
        Some communities have no construction-year row. When the first info
        row contains no digits it is treated as the building-type row, and
        every later field is read one row earlier.

        Yields a single ``XiaoquHouseItem`` and then prints a summary line.
        """
        xiaoqu_detail_link, province, city_name = response.meta.get('city_name')

        def info_value(row):
            # Value text of the given row of the "xiaoquInfo" block, or None.
            return response.xpath(
                '//div[@class="xiaoquInfo"]/div[{}]/span[2]/text()'.format(row)
            ).extract_first()

        title = response.xpath('//h1[@class="detailTitle"]/text()').extract_first()
        region = response.xpath('//div[@class="detailDesc"]/text()').extract_first()
        single_price = response.xpath('//span[@class="xiaoquUnitPrice"]/text()').extract_first()

        # Decide whether row 1 is the construction year (it then has digits).
        first_row = str(info_value(1)).strip()
        if re.search('[0-9]+', first_row):
            build_time, shift = first_row, 1
        else:
            build_time, shift = None, 0

        house_struct = info_value(1 + shift)
        service_fees = info_value(2 + shift)
        service_company = info_value(3 + shift)
        build_company = info_value(4 + shift)
        building_nums = info_value(5 + shift)
        house_nums = info_value(6 + shift)

        yield XiaoquHouseItem(
            province=province,
            city=city_name,
            title=title,
            region=region,
            single_price=single_price,
            build_time=build_time,
            house_struct=house_struct,
            service_fees=service_fees,
            service_company=service_company,
            build_company=build_company,
            building_nums=building_nums,
            house_nums=house_nums,
            xiaoqu_detail_url=xiaoqu_detail_link,
        )

        print("省份：{} 城市：{}   小区   {}   {}   {}   {}   {}   {}   {}".format(province, city_name, build_time, house_struct, service_fees, service_company, build_company, building_nums, house_nums))

settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for lian project

#

# For simplicity, this file contains only settings considered important or

# commonly used. You can find more settings consulting the documentation:

#

#     https://doc.scrapy.org/en/latest/topics/settings.html

#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html

#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

# Project name and the package where spiders live / are generated.

BOT_NAME = 'lian'

SPIDER_MODULES = ['lian.spiders']

NEWSPIDER_MODULE = 'lian.spiders'

# Only log messages at WARNING level or above.

LOG_LEVEL = "WARNING"

# Crawl responsibly by identifying yourself (and your website) on the user-agent

# NOTE(review): this is a desktop Chrome string, and the spider also passes its
# own 'User-Agent' header on every request — confirm which one should apply.

USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36'

# Obey robots.txt rules

ROBOTSTXT_OBEY = True

# NOTE(review): per the Scrapy settings documentation, False should let
# truncated/broken responses through instead of failing them — confirm this
# is intended.

DOWNLOAD_FAIL_ON_DATALOSS = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)

#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)

# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay

# See also autothrottle settings and docs

#DOWNLOAD_DELAY = 3

# The download delay setting will honor only one of:

#CONCURRENT_REQUESTS_PER_DOMAIN = 16

#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)

#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)

#TELNETCONSOLE_ENABLED = False

# Override the default request headers:

#DEFAULT_REQUEST_HEADERS = {

#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',

#   'Accept-Language': 'en',

#}

# Enable or disable spider middlewares

# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html

#SPIDER_MIDDLEWARES = {

#    'lian.middlewares.LianSpiderMiddleware': 543,

#}

# Enable or disable downloader middlewares

# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html

#DOWNLOADER_MIDDLEWARES = {

#    'lian.middlewares.LianDownloaderMiddleware': 543,

#}

# Enable or disable extensions

# See https://doc.scrapy.org/en/latest/topics/extensions.html

#EXTENSIONS = {

#    'scrapy.extensions.telnet.TelnetConsole': None,

#}

# Configure item pipelines

# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html

# Enable LianPipeline; 300 is the value Scrapy uses to order pipelines.
ITEM_PIPELINES = {

   'lian.pipelines.LianPipeline': 300,

}

# Enable and configure the AutoThrottle extension (disabled by default)

# See https://doc.scrapy.org/en/latest/topics/autothrottle.html

#AUTOTHROTTLE_ENABLED = True

# The initial download delay

#AUTOTHROTTLE_START_DELAY = 5

# The maximum download delay to be set in case of high latencies

#AUTOTHROTTLE_MAX_DELAY = 60

# The average number of requests Scrapy should be sending in parallel to

# each remote server

#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0

# Enable showing throttling stats for every response received:

#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)

# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings

#HTTPCACHE_ENABLED = True

#HTTPCACHE_EXPIRATION_SECS = 0

#HTTPCACHE_DIR = 'httpcache'

#HTTPCACHE_IGNORE_HTTP_CODES = []

#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

结果：

Python——Scrapy爬取链家网站所有房源信息

秒客网

Python——Scrapy爬取链家网站所有房源信息

相关文章