Python Crawler Series 2: Scraping Girl Pictures from 糗百成人 (requests + regex)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import requests
from requests.exceptions import RequestException
import re
import os
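# Fetch a page with a browser-like User-Agent; on failure, retry
# up to retry_times times before giving up.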
def get_url_content(url, retry_times=2):
print 'Downloading: ', url
try:
send_headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Connection': 'keep-alive'
}
        html_content = requests.get(url, headers=send_headers).content.decode('gbk').encode('utf-8')  # decode GBK, re-encode as UTF-8 to fix garbled Chinese
    except RequestException as e:
        html_content = None
        print "retry times:", retry_times
        if retry_times > 0:
            # retry only when the server reported a 5xx error
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return get_url_content(url, retry_times - 1)
return html_content
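# Download one image into save_path, skipping files that already exist.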
def download_pic(save_path, pic_url):
    pic_name_reg = r'[^\/]+$'  # take the file name from the end of the src URL
    pic_name = re.findall(pic_name_reg, pic_url)[0]
    if not os.path.exists(save_path + pic_name):
        r = requests.get(pic_url)
        if r.status_code == 200:
            with open(save_path + pic_name, 'wb') as f:
                f.write(r.content)
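# Create the directory if it does not exist yet and return its path.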
def mkdir(mkdir_path):
path = mkdir_path.strip()
    if not os.path.exists(path):
        os.makedirs(path)
return path
if __name__ == "__main__":
save_path = mkdir("/meizi/")
    url_list = ['/%d' % i for i in range(1, 806)]  # NOTE: the site's base URL is missing from the original listing
for index, url in enumerate(url_list):
htm_content = get_url_content(url)
if htm_content:
pic_save_path = mkdir(save_path + str(index + 1) + "/")
            src_reg = r'<img alt="(.*?)" src="(.*?)" style=".*?" />'  # match every image's alt text and src on the page
            for pic_alt, pic_src in re.findall(src_reg, htm_content, re.S):
                print 'image caption:', pic_alt, pic_src
download_pic(pic_save_path, pic_src)
        print 'Page ' + str(index + 1) + ' done crawling. Take the pictures and enjoy!'
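To check what src_reg captures without requesting the live site, here is a minimal sketch run against a made-up HTML fragment (the tag layout and the example.com URLs are assumptions for illustration, not taken from the real pages):

# -*- coding: utf-8 -*-
import re

# hypothetical fragment shaped like the <img alt=... src=... style=... /> tags the pattern targets
sample_html = '''
<img alt="first caption" src="http://example.com/pics/001.jpg" style="width:100%;" />
<img alt="second caption" src="http://example.com/pics/002.jpg" style="width:100%;" />
'''
src_reg = r'<img alt="(.*?)" src="(.*?)" style=".*?" />'
for pic_alt, pic_src in re.findall(src_reg, sample_html, re.S):
    print pic_alt, pic_src
# prints:
# first caption http://example.com/pics/001.jpg
# second caption http://example.com/pics/002.jpg

The non-greedy (.*?) groups matter here: with re.S, a greedy (.*) would swallow everything between the first alt=" and the last closing quote in the document, collapsing all images into one bogus match.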