1. Spider程序:
1 import scrapy, json
2 from UnsplashImageSpider.items import ImageItem
3
class UnsplashImageSpider(scrapy.Spider):
    """Crawl Unsplash's JSON API page by page, yielding one ImageItem per photo."""
    # Spider name used by `scrapy crawl`
    name = 'unsplash_image'
    allowed_domains = ['unsplash.com']
    # First API page (12 photos per page)
    start_urls = ['https://unsplash.com/napi/photos?page=1&per_page=12']

    def __init__(self, *args, **kwargs):
        # Forward to the base class so Scrapy's standard spider
        # initialization (name/kwargs handling) still runs.
        super().__init__(*args, **kwargs)
        self.page_index = 1

    def parse(self, response):
        """Parse one API response and schedule a request for the next page.

        The endpoint returns a JSON array; each element describes a photo
        with an ``id`` and a ``links.download`` URL.
        """
        photo_list = json.loads(response.text)
        for photo in photo_list:
            item = ImageItem()
            item['image_id'] = photo['id']
            item['download'] = photo['links']['download']
            yield item

        # NOTE(review): there is no stop condition — the spider keeps paging
        # until interrupted or until the API stops returning data; confirm
        # this is intended before running unattended.
        self.page_index += 1
        next_link = ('https://unsplash.com/napi/photos?page='
                     + str(self.page_index) + '&per_page=12')
        yield scrapy.Request(next_link, callback=self.parse)
2. 在Pipeline中使用urllib.request包直接下载图片:
import os
from urllib.request import *
2
class UnsplashimagespiderPipeline(object):
    """Download each image with urllib and save it as images/<image_id>.jpg."""

    def process_item(self, item, spider):
        # Each item represents one image to download.
        print('----------' + item['image_id'])
        # ?force=true makes Unsplash serve the raw file instead of an HTML page.
        real_url = item['download'] + "?force=true"
        # Make sure the target directory exists before writing
        # (otherwise the open() below fails on the first run).
        os.makedirs('images', exist_ok=True)
        try:
            # Open the URL and read the image bytes.
            with urlopen(real_url) as result:
                data = result.read()
            # 'wb' (not 'wb+'): we only write, never read back.
            with open('images/' + item['image_id'] + '.jpg', 'wb') as f:
                f.write(data)
        except OSError as e:
            # URLError/HTTPError and file errors are all OSError subclasses.
            # The original bare `except` used '...' % value with no %s
            # placeholder, which itself raised TypeError; report properly.
            print('下载图片出现错误: %s (%s)' % (item['image_id'], e))
        # Scrapy contract: return the item so later pipelines still see it.
        return item