1. Spider程序:
1 import scrapy, json
2 from UnsplashImageSpider.items import ImageItem
3
class UnsplashImageSpider(scrapy.Spider):
    """Crawl Unsplash's JSON API page by page, yielding one ImageItem per photo."""
    # Spider name used by `scrapy crawl`
    name = 'unsplash_image'
    allowed_domains = ['unsplash.com']
    # First API page (12 photos per page)
    start_urls = ['https://unsplash.com/napi/photos?page=1&per_page=12']

    def __init__(self, *args, **kwargs):
        # Forward to the base class so Scrapy's standard spider
        # initialization (name/kwargs handling) still runs.
        super().__init__(*args, **kwargs)
        self.page_index = 1

    def parse(self, response):
        """Parse one API response and schedule a request for the next page.

        The endpoint returns a JSON array; each element describes a photo
        with an ``id`` and a ``links.download`` URL.
        """
        photo_list = json.loads(response.text)
        for photo in photo_list:
            item = ImageItem()
            item['image_id'] = photo['id']
            item['download'] = photo['links']['download']
            yield item

        # NOTE(review): there is no stop condition — the spider keeps paging
        # until interrupted or until the API stops returning data; confirm
        # this is intended before running unattended.
        self.page_index += 1
        next_link = ('https://unsplash.com/napi/photos?page='
                     + str(self.page_index) + '&per_page=12')
        yield scrapy.Request(next_link, callback=self.parse)
2. 在Pipeline中使用urllib.request包直接下载图片:
import os
from urllib.request import *
2
class UnsplashimagespiderPipeline(object):
    """Download each image with urllib and save it as images/<image_id>.jpg."""

    def process_item(self, item, spider):
        # Each item represents one image to download.
        print('----------' + item['image_id'])
        # ?force=true makes Unsplash serve the raw file instead of an HTML page.
        real_url = item['download'] + "?force=true"
        # Make sure the target directory exists before writing
        # (otherwise the open() below fails on the first run).
        os.makedirs('images', exist_ok=True)
        try:
            # Open the URL and read the image bytes.
            with urlopen(real_url) as result:
                data = result.read()
            # 'wb' (not 'wb+'): we only write, never read back.
            with open('images/' + item['image_id'] + '.jpg', 'wb') as f:
                f.write(data)
        except OSError as e:
            # URLError/HTTPError and file errors are all OSError subclasses.
            # The original bare `except` used '...' % value with no %s
            # placeholder, which itself raised TypeError; report properly.
            print('下载图片出现错误: %s (%s)' % (item['image_id'], e))
        # Scrapy contract: return the item so later pipelines still see it.
        return item