urlController.py
import bsController
from urllib import request


class SpiderMain(object):
def __init__(self):
self.header = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
'Accept-Encoding': 'none',
'Accept-Language': 'en-US,en;q=0.8',
'Connection': 'keep-alive'}
        self.bsManage = bsController.bsManage()

    def getUrl(self, rootUrl):
for i in range(1,500):
url = rootUrl+'%s' %i+'.html'
req = request.Request(url)
for h in self.header:
req.add_header(h, self.header[h])
try:
                resp = request.urlopen(req)
                html = resp.read()
                resp.close()
                # print(html)
                self.bsManage.getPageUrl(html, i)
except request.URLError as e:
if hasattr(e, 'code'):
print('Error code:',e.code)
elif hasattr(e, 'reason'):
                    print('Reason:', e.reason)


if __name__ == '__main__':
rootUrl = 'http://www.meitulu.com/item/'
obj_root = SpiderMain()
obj_root.getUrl(rootUrl)
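For reference, a minimal sketch of driving bsManage directly for a single item page; the item id 100 and the stripped-down header here are only illustrative, not taken from the real site index:

from urllib import request
import bsController

manager = bsController.bsManage()
url = 'http://www.meitulu.com/item/100.html'  # example item id, for illustration only
req = request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
html = request.urlopen(req).read()
manager.getPageUrl(html, 100)  # images end up under img/100/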
bsController.py
from bs4 import BeautifulSoup
from urllib import request
import os


class bsManage:
def __init__(self):
self.pageUrl = 'http://www.meitulu.com/item/'
self.header = {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
'Accept-Encoding': 'none',
'Accept-Language': 'en-US,en;q=0.8',
            'Connection': 'keep-alive'}

    # html is the HTML source of the page that was fetched
    # i is the item number in i_x.html
def getPageUrl(self,html,i):
soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')
        # Grab the last pagination link on the page
lastUrl = soup.find_all('div', {'id': 'pages'})[0].find_all('a')[-2]['href']
# print(html)
# print(lastUrl)
        # Extract the number of the last page
        # The href has the form 'http://www.meitulu.com/item/<i>_<lastPage>.html',
        # so take the part between the last '_' and the trailing '.html'
        lastPage = int(lastUrl.rsplit('_', 1)[-1][:-5])
        # Create the image folders
if not os.path.exists('img'):
os.mkdir('img')
path = 'img/%s' %i
if not os.path.exists(path):
os.mkdir(path)
        # Crawl the first page first, because its URL format is different
        # Collect the links of the images we need (a list of <img> tags)
links = soup.find_all('img',class_='content_img')
for link in links:
name = str(link['src'])[-21:]
data = request.urlopen(link['src']).read()
            with open('img/%s/' % i + name, 'wb') as img:
                img.write(data)
        # print('%d finished' % i)
        # str = self.pageUrl + '%s' % i + '.html'
        # print(str)
        # Each item has lastPage sub-pages in total
for j in range(2,lastPage+1):
            # Rebuild the URL to get the next sub-page
url = self.pageUrl + '%s_%s' %(i,j) + '.html'
self.saveImgWithUrl(url,i)
        print('%d finished crawling' % i)

    def saveImgWithUrl(self, url, i):
req = request.Request(url)
for h in self.header:
req.add_header(h, self.header[h])
try:
html = request.urlopen(req).read()
soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')
            # Collect the links of the images we need (a list of <img> tags)
links = soup.find_all('img', class_='content_img')
for link in links:
name = str(link['src'])[-21:]
data = request.urlopen(link['src']).read()
                with open('img/%s/' % i + name, 'wb') as img:
                    img.write(data)
except request.URLError as e:
if hasattr(e, 'code'):
print('Error code:', e.code)
elif hasattr(e, 'reason'):
print('Reason:', e.reason)
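For clarity, a small self-contained example of how the last-page number is parsed from the final pagination link in getPageUrl; the href below is made up but follows the same item/<i>_<page>.html pattern used above:

# Made-up href in the shape produced by the pagination block
lastUrl = 'http://www.meitulu.com/item/123_10.html'
# Take the part between the last '_' and the trailing '.html'
lastPage = int(lastUrl.rsplit('_', 1)[-1][:-5])
print(lastPage)  # -> 10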