Python Crawler Series 2: Scraping girl pictures from 糗百成人 (requests + regex)

Posted: 2025-05-12 08:24:36
```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import re

import requests
from requests.exceptions import RequestException


def get_url_content(url, retry_times=2):
    """Fetch a page, retrying up to retry_times on server (5xx) errors."""
    print 'Downloading:', url
    html_content = None
    try:
        send_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Connection': 'keep-alive',
        }
        resp = requests.get(url, headers=send_headers)
        resp.raise_for_status()  # turn HTTP 4xx/5xx into a RequestException so we can retry
        # The site serves GBK; decode it and re-encode as UTF-8 to avoid mojibake.
        html_content = resp.content.decode('gbk').encode('utf-8')
    except RequestException as e:
        print 'retry times:', retry_times
        # Only retry server-side (5xx) errors; client errors will not fix themselves.
        if retry_times > 0 and e.response is not None \
                and 500 <= e.response.status_code < 600:
            html_content = get_url_content(url, retry_times - 1)
    return html_content


def download_pic(save_path, pic_url):
    """Save one picture, skipping files that already exist on disk."""
    pic_name_reg = r'[^\/]+$'  # the file name is everything after the last '/'
    pic_name = re.findall(pic_name_reg, pic_url)[0]
    if not os.path.exists(save_path + pic_name):
        r = requests.get(pic_url)
        if r.status_code == 200:
            with open(save_path + pic_name, 'wb') as f:
                f.write(r.content)


def mkdir(mkdir_path):
    """Create the directory if it does not exist and return its path."""
    path = mkdir_path.strip()
    if not os.path.exists(path):
        os.makedirs(path)
    return path


if __name__ == "__main__":
    save_path = mkdir("/meizi/")
    # The base URL was lost from the original post; the pattern below assumes
    # the site named in the title -- adjust it to the real page URL scheme.
    url_list = ['http://www.qiubaichengren.com/%d.html' % i for i in range(1, 806)]
    for index, url in enumerate(url_list):
        htm_content = get_url_content(url)
        if htm_content:
            pic_save_path = mkdir(save_path + str(index + 1) + "/")
            # Match every <img>'s alt text and src URL on the page.
            src_reg = r'<img alt="(.*)" src="(.*)" style=".*?" />'
            for pic_alt, pic_src in re.findall(src_reg, htm_content):
                print 'Image caption:', pic_alt, pic_src
                download_pic(pic_save_path, pic_src)
        print 'Page %d done.' % (index + 1)
```
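
Two details of the script are easy to trip over: the GBK re-encoding and the two-group `re.findall`. Here is a small, self-contained sketch of both (not from the original post; the HTML snippet and URL are invented for illustration):

```python
# -*- coding: utf-8 -*-
import re

# 1) Mojibake fix: bytes from a GBK page must be decoded as GBK first;
#    treating them as UTF-8 directly produces garbage.
gbk_bytes = u'图片'.encode('gbk')                     # what a GBK server sends
utf8_bytes = gbk_bytes.decode('gbk').encode('utf-8')  # what the script works with
print utf8_bytes.decode('utf-8')                      # -> 图片

# 2) With two capture groups, re.findall returns a list of (alt, src)
#    tuples, which is exactly what the main loop unpacks.
html = '<img alt="demo" src="http://img.example.com/a/001.jpg" style="width:100%" />'
src_reg = r'<img alt="(.*)" src="(.*)" style=".*?" />'
print re.findall(src_reg, html)  # -> [('demo', 'http://img.example.com/a/001.jpg')]
```

Note that the greedy `(.*)` groups only behave well when each `<img>` tag sits alone; if several tags can appear on one line, the non-greedy `(.*?)` form would be safer.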