Python爬虫urllib2笔记(三)之使用正则表达式提取百度贴吧网页中的楼主发的图片

时间:2022-01-07 15:15:17

使用正则表达式提取百度贴吧网页中的楼主发的图片


# -*- coding:utf-8 -*-
import re #正则模块
import urllib2
import urllib

#使用正则表达式提取百度贴吧网页中的楼主发的图片

def get_content(url):
    """Download *url* and return the raw response body as a string.

    The response object is closed even when read() raises, so a failed
    read no longer leaks the underlying socket (the original only closed
    on the success path).
    """
    html = urllib.urlopen(url)
    try:
        return html.read()
    finally:
        html.close()

def get_imgs(info, save_dir='F:\\data\\pachong\\pic2\\'):
    """Extract the thread-starter's images from *info* and save them.

    Matched markup looks like:
        <img class="BDE_Image" src="http://imgsrc.baidu.com/.../xxx.jpg"
             size="65807" height="600" width="425">

    :param info: HTML source of one Tieba thread page.
    :param save_dir: directory prefix for the numbered output files;
        defaults to the original hard-coded path so existing callers
        are unaffected.
    """
    # The "BDE_Image" class marks pictures posted in the thread body, so
    # avatars and signature images are never matched. `.+?` keeps the
    # match non-greedy up to the first `.jpg`.
    regex = r'class="BDE_Image" src="(.+?\.jpg)"'
    pat = re.compile(regex)
    images_code = re.findall(pat, info)
    for i, img_url in enumerate(images_code):
        print(img_url)
        urllib.urlretrieve(img_url, '%s%s.jpg' % (save_dir, i))


# Example thread kept for reference:
# info = get_content("http://tieba.baidu.com/p/4311459540")
info = get_content("http://tieba.baidu.com/p/4364768066")
# get_imgs() saves the files as a side effect and returns None, so the
# original `print get_imgs(info)` only ever printed "None".
get_imgs(info)

# -*- coding:utf-8 -*-
import re
import urllib2
import urllib

def get_content(url):
    """Fetch *url* via urllib2 and return the response body.

    The original never closed the response; close it in a finally block
    so the socket is released even when read() fails.
    """
    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        response.close()

def get_pages(content):
    """Return the thread's total page count, as a string, parsed from HTML.

    Tieba renders the page total as ``<span class="red">N</span>``.

    :param content: HTML source of the first thread page.
    :raises ValueError: when the page-count marker is missing, instead of
        the opaque AttributeError the original raised on ``None.groups()``.
    """
    reg = re.compile(r'<span class="red">(\d+)</span>')
    match = reg.search(content)
    if match is None:
        raise ValueError('page-count marker <span class="red">N</span> not found')
    pages = match.group(1)
    print("本帖子共%d页 " % int(pages))
    return pages

def get_imgurls(content):
    """Collect the thread-starter image URLs found in one page of HTML.

    Uses the module-level ``num`` (current page index, set by the driver
    loop) only for the progress message. Returns a list of URL strings.
    """
    global num
    # Primary layout: class attribute directly followed by src.
    urllist = re.findall(r'class="BDE_Image" src="(.+?\.jpg)"', content)
    if not urllist:
        # Fallback layout: some pages emit height="..." between class and src.
        urllist = re.findall(
            r'class="BDE_Image" height=".+?" src="(.+?\.jpg)"', content)
    print("第%d页一共%d张图片" % (num + 1, len(urllist)))
    return urllist
def get_imgs(urllist):
    """Download every URL in *urllist* to a numbered jpg in the cwd.

    Uses the module-level ``item`` as a running file counter shared across
    pages, and records failed downloads in the module-level ``retre_sec``
    dict (url -> sequence number) so retre_images_sec() can retry them.
    """
    global item
    global retre_sec
    retre_sec.clear()  # only keep failures from the current page
    for i, img_url in enumerate(urllist):
        try:
            urllib.urlretrieve(img_url, '%03d.jpg' % item)
            print("保存%03d图片成功。%d" % (item, i))
        except Exception:
            # Was a bare `except:`, which also swallowed KeyboardInterrupt
            # and SystemExit; Exception still covers every download error.
            retre_sec[img_url] = item
            print("保存%03d图片失败。进行第二次保存。%d" % (item, i))
        finally:
            item += 1

def retre_images_sec(urldict):
    """Second-chance download pass for the images get_imgs() failed on.

    :param urldict: mapping of image URL -> sequence number recorded by
        get_imgs(); each entry is retried once.
    """
    if not urldict:
        print("第二次无图片需要保存。")
        return
    print("现在开始第二次保存,共%03d张图片" % len(urldict))
    for url, seq in urldict.items():
        try:
            urllib.urlretrieve(url, '%03d.jpg' % seq)
            print("第二次保存%03d图片成功。" % seq)
        except Exception:
            # Bug fix: the failure message used the global ``item`` (the
            # overall counter) instead of this image's own number. Also
            # narrowed the bare `except:` to Exception.
            print("第二次保存%03d图片失败" % seq)
            print("图片地址为 %s" % url)

# Running file counter and retry map shared with get_imgs()/retre_images_sec().
item = 1
retre_sec = dict()

url = 'http://tieba.baidu.com/p/2982230467'
url += '?see_lz=1'  # see_lz=1: show thread-starter posts only

content = get_content(url)
pages = int(get_pages(content))  # total page count (starter-only view)

for num in range(pages):
    # Bug fix: the original did `url += '&pn=%d'`, so the query string grew
    # every iteration ('...&pn=1&pn=2&pn=3'). Build each page URL from the
    # unchanged base instead.
    page_url = url + '&pn=%d' % (num + 1)
    content = get_content(page_url)       # HTML source of this page
    img_list = get_imgurls(content)       # image URLs on this page
    get_imgs(img_list)                    # save them; failures land in retre_sec
    # Bug fix: retre_images_sec() was defined but never called, so the
    # promised "second save" of failed downloads never happened.
    retre_images_sec(retre_sec)