Python 爬图 Hello World

时间:2023-03-09 22:50:49
python 爬图 helloworld

最近发现 吾志 上用户的头像都很个性,另外,对于没有把日记设为私密的用户,最后一天的日记是公开的,谁都可以查看。

所以,如果每天把所有可查看的日记爬一遍,那么~~ 哈哈

以前对爬虫只是了解一点点,没有真的玩过。既然今晚兴致来了,那就随便学一下咯~

参考 http://blog.****.net/pleasecallmewhy/article/details/8925978

参考 http://cuiqingcai.com/1052.html

 #coding=utf-8
import os
import urllib
import urllib2
import re
try:
    import cookielib
except ImportError:
    # The module was renamed to http.cookiejar in Python 3; keep the old
    # name bound so this import does not fail there.
    import http.cookiejar as cookielib


def mkdir(path):
    """Create directory *path* (including parents) if it does not exist.

    Surrounding whitespace and trailing backslashes are removed from
    *path* first. An empty result is treated as the current directory and
    nothing is created.

    Returns the cleaned path.
    """
    # Strip whitespace on both sides.
    path = path.strip()
    # Strip trailing "\" characters.
    path = path.rstrip("\\")

    if path and not os.path.exists(path):
        try:
            os.makedirs(path)
        except OSError:
            # Something else may have created the directory between the
            # existence check and makedirs; only re-raise real failures.
            if not os.path.isdir(path):
                raise

    return path


def save_file(path, file_name, data):
    """Write *data* to the file *file_name* inside directory *path*.

    Does nothing when *data* is None. The directory is created when
    missing. The file is opened in binary mode and overwritten if it
    already exists.
    """
    if data is None:
        return

    # Use the cleaned path returned by mkdir so the file is written into
    # the directory that was actually created.
    directory = mkdir(path)
    with open(os.path.join(directory, file_name), "wb") as f:
        f.write(data)


user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36'
headers = {'User-Agent': user_agent}
values = {}
# urllib2 issues a POST whenever the request data is not None, even for an
# empty string, so only encode a body when there are form fields to send.
data = urllib.urlencode(values) if values else None


def getHtml(url):
    """Fetch *url* and return the raw response body.

    The request carries the module-level ``headers`` and ``data`` and uses
    a 10 second timeout. Errors raised by urllib2 propagate to the caller.
    """
    req = urllib2.Request(url, data, headers)
    page = urllib2.urlopen(req, timeout=10)
    try:
        return page.read()
    finally:
        # Close the response even when reading fails.
        page.close()


def get_file(url):
    """Download *url* and return its content, or None on failure.

    Failures are reported on stdout instead of being raised so that one
    bad link does not abort a whole crawl.
    """
    try:
        opener = urllib2.build_opener()
        opener.addheaders = [('User-Agent', 'Mozilla/5.0')]
        # Also makes this opener the default for later urllib2.urlopen calls.
        urllib2.install_opener(opener)
        # Same timeout as getHtml so a stalled server cannot hang the crawl.
        operate = opener.open(urllib2.Request(url), timeout=10)
        try:
            return operate.read()
        finally:
            operate.close()
    except Exception as e:
        # Exception, not BaseException: Ctrl-C and SystemExit must still
        # be able to stop the program.
        print('download failed for %s: %s' % (url, e))
        return None


def getImg(html):
    """Download every ``.jpg`` image referenced in *html*.

    Image URLs are taken from ``src="....jpg" alt=`` attributes. Each one
    is saved in the current directory as ``<index>.jpg``, numbered from 0
    in order of appearance; a failed download leaves its number unused.

    Returns the number of image URLs found.
    """
    imglist = re.findall(r'src="(.+?\.jpg)" alt=', html)
    for index, imgurl in enumerate(imglist):
        save_file('.', '%s.jpg' % index, get_file(imgurl))
    return len(imglist)


if __name__ == "__main__":
    html = getHtml("https://wuzhi.me/last")
    print(getImg(html))

十分简陋,哈哈~