import urllib.request
from time import sleep
import requests
from lxml import etree

try:
    def all_links(url, page):
        # if "" in url:  # stop condition; the marker substring was lost from the original post
        #     print("done")
        #     return None
        url = url + str(page) + ".html"
        response = requests.get(url)
        print(url, response.status_code)
        html = etree.HTML(response.content.decode('gbk'))
        # fetch the images and save them
        # (the attribute predicate of the first div was lost from the original post)
        imgs = html.xpath('.//div[@]//div[@class="ui-module"]//img/@src')
        for img in imgs:
            file_name = img.split('/')[-1]
            first = img.split('/')[0]
            if first != 'http:' and first != 'https:':
                print("bad image URL: " + img)
            else:
                dir_path = "/www/spider/images/"
                try:
                    file_content = requests.get(img)
                    if file_content.status_code != 200:
                        print(img, "download failed")
                    else:
                        # urllib.request.urlretrieve(img, dir_path + file_name)
                        with open(dir_path + file_name, "wb") as f:
                            f.write(file_content.content)
                        print("saved image " + dir_path + file_name + " successfully")
                except Exception as ee:
                    print(str(ee))
        # links = html.xpath('.//div[@class="page"]//a[contains(text(),"下一页")]/@href')
        # print(links)
        # if len(links) < 1:
        #     pass
        # else:
        sleep(1)
        host = '/'  # the real host URL was removed from the original post
        next_page = page + 1
        all_links(host, next_page)
    for i in range(1, 991):
        all_links("/", i)  # the original hardcoded page 354 here and ignored the loop variable
except Exception as e:
    print(str(e))
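
Note: all_links recurses into the next page unconditionally, so every page adds a stack frame; once Python's default recursion limit (roughly 1000 calls deep) is exceeded, the crawl dies with a RecursionError, which the outer except catches. The variant below follows the page's 下一页 ("next page") link instead of counting page numbers.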
The loop version:
import urllib.request
from time import sleep
import requests
from lxml import etree

try:
    def all_links(url):
        # stop condition; the marker substring was lost from the original post,
        # and "" is a substring of every URL, so as written this always returns immediately
        if "" in url:
            print("done")
            return None
        response = requests.get(url)
        print(url, response.status_code)
        html = etree.HTML(response.content.decode('gbk'))
        # fetch the images and save them
        # (the attribute predicate of the first div was lost from the original post)
        imgs = html.xpath('.//div[@]//div[@class="ui-module"]//img/@src')
        for img in imgs:
            file_name = img.split('/')[-1]
            first = img.split('/')[0]
            if first != 'http:' and first != 'https:':
                print("bad image URL: " + img)
            else:
                dir_path = "d:\\www\\spider\\images\\"
                urllib.request.urlretrieve(img, dir_path + file_name)
                print("saved image " + dir_path + file_name + " successfully")
        # "下一页" is the link text for "next page"
        links = html.xpath('.//div[@class="page"]//a[contains(text(),"下一页")]/@href')
        print(links)
        if len(links) < 1:
            pass
        else:
            sleep(5)
            host = '/'  # the real host URL was removed from the original post
            new_url = host + links[0]
            all_links(new_url)

    all_links("/")
except Exception as e:
    print(str(e))
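
Both versions above still recurse once per page, so the recursion-limit caveat applies to each. A while loop keeps the call stack flat however many pages are crawled; below is a minimal iterative sketch under the same assumptions as the code above (the name crawl and the parameters start_url and host are mine, not from the original post):

import requests
from lxml import etree
from time import sleep

def crawl(start_url, host):
    # Follow the 下一页 ("next page") link in a while loop instead of recursing,
    # so crawl depth is not capped by the interpreter's recursion limit.
    url = start_url
    while url:
        response = requests.get(url)
        html = etree.HTML(response.content.decode('gbk'))
        for img in html.xpath('.//div[@class="ui-module"]//img/@src'):
            pass  # save each image exactly as in the versions above
        links = html.xpath('.//div[@class="page"]//a[contains(text(),"下一页")]/@href')
        url = host + links[0] if links else None  # no next-page link: stop
        sleep(1)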