爬虫爬取淘女郎某模特照片

时间:2021-08-20 06:25:06

 

使用模块:

import urllib2
import urllib
import chardet
import time
import os
from bs4 import BeautifulSoup


urllib,urllib2 用来获取网站html数据
chardet 自动检测网站编码方式,避免有些网站没有声明编码
time 延时抓取速度
os 创建保存图片的文件路径
BeautifulSoup 从网站源码中筛选想要的内容,模特名字,图片url

 源码:

爬虫爬取淘女郎某模特照片
 1 # -*- coding: utf-8 -*-
2
3
4 import urllib2
5 import urllib
6 import chardet
7 import time
8 import os
9 from bs4 import BeautifulSoup
10
11
12 #要抓取的url
13 url = "https://mm.taobao.com/self/aiShow.htm?spm=719.7763510.1998643336.9.9sZTM1&userId=10378945"
14 #模拟浏览器头部信息
15 my_header = {
16 "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
17 "referer": "https://mm.taobao.com/search_tstar_model.htm?spm=5679.126488.640745.2.H0hEXn&style=&place=city%3A%E5%B9%BF%E5%B7%9E",
18 "GET": "log.mmstat.com",
19 }
20
# Fetch the raw page source.
def get_html(url, my_header):
    """Return the raw (undecoded) HTML bytes of *url*.

    :param url: page URL to fetch.
    :param my_header: dict of HTTP request headers to send.
    :raises urllib2.URLError: on network failure.
    """
    req = urllib2.Request(url, headers=my_header)
    resp = urllib2.urlopen(req)
    try:
        return resp.read()
    finally:
        # Original leaked the response object; close it explicitly.
        resp.close()
26
27
# Auto-detect the page's character encoding. Not strictly needed for this
# particular site, but kept as a practice exercise (per the original author).
def automatic_code(arg):
    """Return the codec name to use for decoding *arg* (raw HTML bytes).

    Uses chardet to guess the encoding. GB2312 is a subset of GBK, so the
    GB family is widened to "gbk" to avoid decode errors on characters
    outside the GB2312 range.

    The original returned None for anything other than three exact,
    case-sensitive matches (e.g. "ascii" or "UTF-8" fell through),
    which made the caller's .decode(None) crash; now the detected
    encoding is returned as-is, with "utf-8" as a last-resort fallback.
    """
    code = chardet.detect(arg)
    encoding = (code.get("encoding") or "utf-8").lower()
    if encoding in ("gb2312", "gbk"):
        return "gbk"
    return encoding
37
38 #使用Beautifulsoup 解析网站源码,提取模特名字,跟图片url
39 def get_url(html):
40
41 soup = BeautifulSoup(html, "html.parser")
42 name = soup.dd.a.string
43 try:
44 path = os.path.join(os.path.dirname(__file__), name)
45 os.mkdir(path) #创建保存图片的文件夹
46 except Exception as e:
47 print e, "目录已存在"
48 list = soup.select("p > img")
49 print len(list)
50
51 num = 0
52 for line in list:
53 url = line.get("src")
54 get_url = "http:" + url
55
56 # urllib urlretrieve方法下载图片并保存到文件
57 urllib.urlretrieve(get_url, path + "\\%s%d.jpg" % (name, num))
58 print "正在下载 " + str(get_url)
59 num += 1
60 time.sleep(1)
61
62
if __name__ == "__main__":
    # Fetch the page, normalize it to UTF-8, then parse and download.
    # (Original rebound the names `get_html`/`get_url` to the results of
    # calling those functions, clobbering the functions' global names.)
    raw_html = get_html(url, my_header)
    page_encoding = automatic_code(raw_html)
    utf8_html = raw_html.decode(page_encoding).encode("utf-8")
    get_url(utf8_html)
View Code