Scraping streamer avatars, names, and popularity from a Huya Live category page
Today I learned Python 3 web scraping. With some spare time in class, I wrote a scraper of my own, following the teacher's approach: it crawls a Huya Live category page and collects each streamer's avatar, name, and popularity.
HuYaCateScrapy.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
@ author: happy_code
@ contact: happy_code@foxmail.com
@ software: scraper
@ desc: Scrape streamer names and popularity from a Huya Live category page,
        download the avatars, and name each file after the streamer and popularity
"""
import os
import re

import requests


class Spider:
    # target url
    url = ""
    # save directory
    myrobot = "D:/scinfo/"
    # regex capturing avatar url, streamer name, and popularity
    part = r'<span class="txt">\s*' \
           r'\s*<span class="avatar fl">\s*' \
           r'\s*<img.*data-original="(.*?)".*>\s*' \
           r'\s*<i.*>(.*?)</i>\s*' \
           r'\s*</span>\s*' \
           r'\s*<span.*><i.*></i><i class="js-num">(.*?)</i></span>\s*' \
           r'\s*</span>'

    def __init__(self, url):
        self.url = url

    # fetch the page source
    def gethtml(self):
        res = requests.get(self.url)
        res.encoding = "UTF-8"
        return res.text

    # extract (avatar url, name, popularity) tuples
    def gethtmlinfo(self):
        html = self.gethtml()
        all = re.findall(self.part, html)
        return all

    # download an image into myrobot, with an optional custom file name;
    # retry up to 3 times so the download (hopefully) succeeds
    def downloadimg(self, url, name=None):
        ok = 0
        for i in range(3):
            try:
                if name is not None:
                    path = self.myrobot + name + "." + url.split('.')[-1]
                else:
                    path = self.myrobot + url.split('/')[-1]
                url = url.replace('\\', '')
                r = requests.get(url, timeout=30)
                r.raise_for_status()
                r.encoding = r.apparent_encoding
                if not os.path.exists(self.myrobot):
                    os.makedirs(self.myrobot)
                if not os.path.exists(path):
                    with open(path, 'wb') as f:
                        f.write(r.content)
                    print(path + ' saved successfully')
                else:
                    print('file already exists')
                ok = 1
            except Exception:
                print("exception, retrying")
                continue
            if ok == 1:
                break

    # save every avatar, one file per streamer, named "<name>-<popularity>"
    def saveinfo(self, data):
        for i in data:
            self.downloadimg(i[0], i[1] + "-" + str(i[2]))


if __name__ == "__main__":
    # url of the LoL category page
    s = Spider("https://www.huya.com/g/lol")
    data = s.gethtmlinfo()
    s.saveinfo(data)
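For reference, gethtmlinfo() just returns whatever re.findall produced: a list of 3-tuples in the order of the capture groups, i.e. (avatar URL, streamer name, popularity). A minimal sketch of inspecting it, assuming the Spider class above (the sample values in the comment are made up, not real Huya data):

s = Spider("https://www.huya.com/g/lol")
for avatar_url, name, popularity in s.gethtmlinfo():
    # e.g. ('https://.../avatar.jpg', 'SomeStreamer', '123.4万')
    print(name, popularity, avatar_url)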
To run it against another category, just pass that category page's URL in main and adjust the save path.
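For example, a quick sketch (the category slug below is hypothetical; copy the real one from the address bar of the Huya category page you want):

s = Spider("https://www.huya.com/g/another-category")  # hypothetical slug
s.myrobot = "D:/huya/other/"  # any local directory you like
s.saveinfo(s.gethtmlinfo())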
The results are as follows: