20. Scrapy Daily Practice

Date: 2021-07-05 19:26:08

1. Create the Scrapy project:

scrapy startproject tutorial
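startproject generates a project skeleton roughly like this (standard Scrapy layout):

tutorial/
    scrapy.cfg            # deploy configuration
    tutorial/
        __init__.py
        items.py          # item definitions
        middlewares.py    # spider / downloader middlewares
        pipelines.py      # item pipelines
        settings.py       # project settings
        spiders/
            __init__.py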

2. Create the spider:

cd tutorial

scrapy genspider quotes quotes.toscrape.com
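genspider creates tutorial/spiders/quotes.py. The generated skeleton looks roughly like this (exact contents depend on the Scrapy version); step 3 fills in the parse logic:

# -*- coding: utf-8 -*-
import scrapy


class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        pass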


3. Write the code:

quotes.py

___________________________________________________________________________

# -*- coding: utf-8 -*-
import scrapy
from tutorial.items import TutorialItem


class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        quotes = response.css('.quote')
        for quote in quotes:
            item = TutorialItem()
            # quote text
            item['text'] = quote.css('.text::text').extract_first()
            # author
            item['author'] = quote.css('.author::text').extract_first()
            # tags (extract_first keeps only the first tag)
            item['tags'] = quote.css('.tags .tag::text').extract_first()
            yield item
        # next page: follow it only if there is one
        next_page = response.css('.pager .next a::attr("href")').extract_first()
        if next_page:
            url = response.urljoin(next_page)
            yield scrapy.Request(url=url, callback=self.parse)
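The CSS selectors used in parse can be tried out first with scrapy shell. A quick session against the first page might look like this (the returned values are whatever the live site serves and may change):

scrapy shell 'http://quotes.toscrape.com/'
>>> quote = response.css('.quote')[0]
>>> quote.css('.text::text').extract_first()
'“The world as we have created it is a process of our thinking. ...”'
>>> quote.css('.author::text').extract_first()
'Albert Einstein'
>>> quote.css('.tags .tag::text').extract()
['change', 'deep-thoughts', 'thinking', 'world']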

items.py
________________________________________________________________________

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class TutorialItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    text = scrapy.Field()
    author = scrapy.Field()
    tags = scrapy.Field()
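TutorialItem behaves like a dict with a fixed set of keys, which is why the spider can assign item['text'] and the pipeline can read the fields back. A minimal check with hypothetical values:

>>> from tutorial.items import TutorialItem
>>> item = TutorialItem(author='Albert Einstein', tags='change')
>>> item['text'] = 'The world as we have created it ...'
>>> dict(item)
{'author': 'Albert Einstein', 'tags': 'change', 'text': 'The world as we have created it ...'}
>>> item['birthday'] = '1879'    # raises KeyError: the field is not declared in TutorialItem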
pipelines.py

_________________________________________________________________________
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.exceptions import DropItem
import pymysql


class TutorialPipeline(object):
    # A length-limiting variant, kept here for reference:
    # def __init__(self):
    #     self.limit = 50
    #
    # def process_item(self, item, spider):
    #     if item['text']:
    #         if len(item['text']) > self.limit:
    #             item['text'] = item['text'][0:self.limit].rstrip() + '...'
    #         return item
    #     else:
    #         raise DropItem('Missing Text')

    def open_spider(self, spider):
        # open the MySQL connection once, when the spider starts
        self.my_conn = pymysql.connect(
            host='192.168.113.129',
            port=3306,
            database='datas',
            user='root',
            password='',
            charset='utf8'
        )
        self.my_cursor = self.my_conn.cursor()

    def process_item(self, item, spider):
        # insert one row per scraped item
        insert_sql = "insert into quotes(author,tags,text) values(%s,%s,%s)"
        self.my_cursor.execute(insert_sql, [item['author'], item['tags'], item['text']])
        return item

    def close_spider(self, spider):
        # commit everything and release the connection when the spider closes
        self.my_conn.commit()
        self.my_cursor.close()
        self.my_conn.close()
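The pipeline assumes a quotes table already exists in the datas database. The column names come from the insert statement above; the column types in this one-off helper are only an assumption:

# create_table.py -- run once before crawling; column types are assumptions
import pymysql

conn = pymysql.connect(host='192.168.113.129', port=3306, database='datas',
                       user='root', password='', charset='utf8')
try:
    with conn.cursor() as cursor:
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS quotes (
                id INT AUTO_INCREMENT PRIMARY KEY,
                author VARCHAR(255),
                tags VARCHAR(255),
                text TEXT
            ) DEFAULT CHARSET=utf8
        """)
    conn.commit()
finally:
    conn.close()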

settings.py
___________________________________________________________________________
# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Enable the pipeline (values range from 0 to 1000; lower numbers run first)
ITEM_PIPELINES = {
    'tutorial.pipelines.TutorialPipeline': 200,
}
Once the code is configured, run the crawl.

To save the output to a file, pass -o with the desired format:

scrapy crawl quotes -o quotes.xml
scrapy crawl quotes -o quotes.csv
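Passing -o every time can also be avoided by configuring the export in settings.py. In recent Scrapy versions (2.1+) this is done with the FEEDS setting; older versions use FEED_FORMAT / FEED_URI instead:

# settings.py -- export to both files on every crawl (Scrapy >= 2.1)
FEEDS = {
    'quotes.csv': {'format': 'csv'},
    'quotes.xml': {'format': 'xml'},
}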