爬虫爬取淘女郎某模特照片

时间:2021-08-20 06:25:06

 

使用模块:

import urllib2
import urllib
import chardet
import time
import os
from bs4 import BeautifulSoup


urllib,urllib2 用来获取网站html数据
chardet 自动检测网站编码方式,避免有些网站没有声明编码
time 延时抓取速度
os 创建保存图片的文件路径
BeautifulSoup 从网站源码中筛选想要的内容,模特名字,图片url

 源码:

爬虫爬取淘女郎某模特照片
 1 # -*- coding: utf-8 -*-
2
3
4 import urllib2
5 import urllib
6 import chardet
7 import time
8 import os
9 from bs4 import BeautifulSoup
10
11
12 #要抓取的url
13 url = "https://mm.taobao.com/self/aiShow.htm?spm=719.7763510.1998643336.9.9sZTM1&userId=10378945"
14 #模拟浏览器头部信息
15 my_header = {
16 "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
17 "referer": "https://mm.taobao.com/search_tstar_model.htm?spm=5679.126488.640745.2.H0hEXn&style=&place=city%3A%E5%B9%BF%E5%B7%9E",
18 "GET": "log.mmstat.com",
19 }
20
# Fetch the raw page source.
def get_html(url, my_header):
    """Return the raw (undecoded) HTML bytes of *url*.

    :param url: page URL to fetch.
    :param my_header: dict of HTTP request headers to send.
    :raises urllib2.URLError: on network failure.
    """
    req = urllib2.Request(url, headers=my_header)
    resp = urllib2.urlopen(req)
    try:
        return resp.read()
    finally:
        # Original leaked the response object; close it explicitly.
        resp.close()
26
27
# Auto-detect the page's character encoding. Not strictly needed for this
# particular site, but kept as a practice exercise (per the original author).
def automatic_code(arg):
    """Return the codec name to use for decoding *arg* (raw HTML bytes).

    Uses chardet to guess the encoding. GB2312 is a subset of GBK, so the
    GB family is widened to "gbk" to avoid decode errors on characters
    outside the GB2312 range.

    The original returned None for anything other than three exact,
    case-sensitive matches (e.g. "ascii" or "UTF-8" fell through),
    which made the caller's .decode(None) crash; now the detected
    encoding is returned as-is, with "utf-8" as a last-resort fallback.
    """
    code = chardet.detect(arg)
    encoding = (code.get("encoding") or "utf-8").lower()
    if encoding in ("gb2312", "gbk"):
        return "gbk"
    return encoding
37
38 #使用Beautifulsoup 解析网站源码,提取模特名字,跟图片url
39 def get_url(html):
40
41 soup = BeautifulSoup(html, "html.parser")
42 name = soup.dd.a.string
43 try:
44 path = os.path.join(os.path.dirname(__file__), name)
45 os.mkdir(path) #创建保存图片的文件夹
46 except Exception as e:
47 print e, "目录已存在"
48 list = soup.select("p > img")
49 print len(list)
50
51 num = 0
52 for line in list:
53 url = line.get("src")
54 get_url = "http:" + url
55
56 # urllib urlretrieve方法下载图片并保存到文件
57 urllib.urlretrieve(get_url, path + "\\%s%d.jpg" % (name, num))
58 print "正在下载 " + str(get_url)
59 num += 1
60 time.sleep(1)
61
62
if __name__ == "__main__":
    # Fetch the page, normalize it to UTF-8, then parse and download.
    # (Original rebound the names `get_html`/`get_url` to the results of
    # calling those functions, clobbering the functions' global names.)
    raw_html = get_html(url, my_header)
    page_encoding = automatic_code(raw_html)
    utf8_html = raw_html.decode(page_encoding).encode("utf-8")
    get_url(utf8_html)
View Code