Logging in to Weibo with cookies + the requests package, and using XPath to scrape a target user's profile info, posts, and their comments

Date: 2023-03-08 16:13:55

Purpose of this post: explain how to scrape Weibo content by logging in with the requests package + cookies, then parsing the pages with lxml's XPath syntax to extract the target content.

Required Python packages: requests and lxml, both installable with pip.

  XPath (XML Path Language) is a language for locating parts of an XML document. It is based on XML's tree structure and provides the ability to find nodes in that tree. XPath also works on HTML.

  XPath is a small query language; here we introduce it in combination with a Python crawler.

    A quick overview of how to use XPath:

     Step 1: install the lxml library.

     Step 2: from lxml import etree

     Step 3: Selector = etree.HTML(page source)

     Step 4: Selector.xpath(an XPath expression)

    For the details of XPath parsing, see the code below or this blog post: http://cighao.com/2016/03/01/introduction-of-xPath/
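
To make the four steps concrete, here is a minimal, self-contained sketch; the HTML snippet in it is invented purely for illustration and is not taken from Weibo:

from lxml import etree

# step 3: build a parse tree from the page source (a made-up snippet here)
page_source = '<html><body><div class="tip2"><span class="tc">Posts[42]</span></div></body></html>'
selector = etree.HTML(page_source)
# step 4: query the tree with an XPath expression
texts = selector.xpath("//span[@class='tc']/text()")
print(texts)  # ['Posts[42]']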

Back to the main topic. Here is the scraping approach from the start. First, find the Weibo URL; to make scraping easier we use the mobile (3G) version of Weibo at https://weibo.cn/, whose login page is https://passport.weibo.cn/signin/login. Press F12 to open the developer tools, click the Network tab at the top, then enter your account and password to log in. After logging in you will see many requests appear; find the m.weibo.cn entry and click it. As shown in the figure, the three red boxes mark what we need.

    [Figure: the m.weibo.cn request in the developer tools Network panel, with Request URL, Cookie, and User-Agent boxed in red]

Request URL is the address we just requested.

Cookie holds the credential information saved when we logged into our account; we will use it in a moment to log in from the crawler.

User-Agent is the header information of the browser we are using; it is used to disguise our crawler and lower the risk of being banned.
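
Before the full script, here is a minimal sketch of how these three pieces fit into a single request; the cookie value is a placeholder that you must replace with the Cookie string copied from your own developer tools:

import requests
from lxml import etree

cookie = {"Cookie": "~~~~~~~~~~"}  # placeholder: paste your own Cookie string here
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'}

# passing cookies logs the request in; passing headers disguises the crawler as a normal browser
html = requests.get('https://weibo.cn/', cookies=cookie, headers=header).content
print(etree.HTML(html).xpath("//title/text()"))  # with a valid cookie this should be the title of your logged-in home page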

Now for the full script:

#coding=utf-8
import re
import time
import string
import os
import pickle
import requests
from lxml import etree
import traceback
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

class weibo:
    cookie = {"Cookie": "~~~~~~~~~~"}  # replace this with your own cookie string
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'}  # the browser header information

    def __init__(self, user_id, filter=0):
        self.user_id = user_id  # user id, the number we have to supply; e.g. the id of the user nicknamed "Dear-迪丽热巴" is 1669879400
        self.filter = filter  # 0 or 1, default 0: scrape all of the user's posts; 1: scrape only original posts
        self.userName = ''  # user nickname, e.g. "Dear-迪丽热巴"
        self.weiboNum = 0  # total number of the user's posts
        self.weiboNum2 = 0  # number of posts actually scraped
        self.following = 0  # number of accounts the user follows
        self.followers = 0  # number of followers
        self.weibos = []  # post contents
        self.num_zan = []  # likes per post
        self.num_forwarding = []  # reposts per post
        self.num_comment = []  # comments per post
        self.weibo_detail_urls = []  # pickle.load(open("weibourl1.pkl", "r"))  # comment-page URL of each post
        self.weibourls = []  # pickle.load(open('weibourl2.pkl', 'r'))  # link of each post, used to resume an interrupted crawl

    def getUserName(self):
        try:
            url = 'http://weibo.cn/%d/info' % (self.user_id)
            html = requests.get(url, cookies=weibo.cookie, headers=weibo.header).content  # the cookies and headers here are what perform the login
            selector = etree.HTML(html)
            userName = selector.xpath("//title/text()")[0]
            self.userName = userName[:-3]
            #print 'Nickname: ' + self.userName
        except Exception, e:
            print "Error: ", e
            traceback.print_exc()

The code above logs in to Weibo with the requests package + cookies. The next step is to find the id of the user we want to scrape and then use XPath to parse the pages and extract the information. The code is as follows:

    def getUserInfo(self):
        try:
            url = 'http://weibo.cn/u/%d?filter=%d&page=1' % (self.user_id, self.filter)
            html = requests.get(url, cookies=weibo.cookie, headers=weibo.header).content
            selector = etree.HTML(html)
            pattern = r"\d+\.?\d*"
            str_wb = selector.xpath("//div[@class='tip2']/span[@class='tc']/text()")
            guid = re.findall(pattern, str_wb[0], re.S | re.M)
            for value in guid:
                num_wb = int(value)
                break
            self.weiboNum = num_wb
            print 'Number of posts: ' + str(self.weiboNum)
            str_gz = selector.xpath("//div[@class='tip2']/a/text()")[0]
            guid = re.findall(pattern, str_gz, re.M)
            self.following = int(guid[0])
            print 'Following: ' + str(self.following)
            str_fs = selector.xpath("//div[@class='tip2']/a/text()")[1]
            guid = re.findall(pattern, str_fs, re.M)
            self.followers = int(guid[0])
            print 'Followers: ' + str(self.followers)
        except Exception, e:
            print "Error: ", e
            traceback.print_exc()

    def getWeiboInfo(self):
        try:
            url = 'http://weibo.cn/u/%d?filter=%d&page=1' % (self.user_id, self.filter)
            html = requests.get(url, cookies=weibo.cookie, headers=weibo.header).content
            selector = etree.HTML(html)
            if selector.xpath('//input[@name="mp"]') == []:
                pageNum = 1
            else:
                pageNum = int(selector.xpath('//input[@name="mp"]')[0].attrib['value'])
            pattern = r"\d+\.?\d*"
            f = open("./%s.txt" % self.user_id, "wb")
            for page in range(1, pageNum + 1):
                if page % 10 == 0:
                    print('[ATTEMPTING] rest for 5 minutes to cheat weibo site, avoid being banned.')
                    time.sleep(60 * 5)
                url2 = 'http://weibo.cn/u/%d?filter=%d&page=%d' % (self.user_id, self.filter, page)
                html2 = requests.get(url2, cookies=weibo.cookie, headers=weibo.header).content
                selector2 = etree.HTML(html2)
                info = selector2.xpath("//div[@class='c']")
                if len(info) > 3:
                    for i in range(0, len(info) - 2):
                        detail = info[i].xpath("@id")[0]
                        url3 = 'http://weibo.cn/comment/{}?uid={}&rl=0'.format(detail.split('_')[-1], self.user_id)
                        if url3 not in self.weibo_detail_urls:
                            self.weiboNum2 = self.weiboNum2 + 1
                            #print self.weibo_detail_urls
                            str_t = info[i].xpath("div/span[@class='ctt']")
                            weibos = str_t[0].xpath('string(.)')
                            self.weibos.append(weibos)
                            #print 'Post content: ' + weibos + '***' + 'No.%s' % self.weiboNum2
                            str_zan = info[i].xpath("div/a/text()")[-4]
                            guid = re.findall(pattern, str_zan, re.M)
                            num_zan = int(guid[0])
                            self.num_zan.append(num_zan)
                            #print 'Likes: ' + str(num_zan)
                            forwarding = info[i].xpath("div/a/text()")[-3]
                            guid = re.findall(pattern, forwarding, re.M)
                            num_forwarding = int(guid[0])
                            self.num_forwarding.append(num_forwarding)
                            #print 'Reposts: ' + str(num_forwarding)
                            comment = info[i].xpath("div/a/text()")[-2]
                            guid = re.findall(pattern, comment, re.M)
                            num_comment = int(guid[0])
                            self.num_comment.append(num_comment)
                            #print 'Comments: ' + str(num_comment)
                            self.weibo_detail_urls.append(url3)
                            text = str(self.weiboNum2) + ':' + weibos + '\t' + 'Likes: ' + str(num_zan) + '\t' + 'Reposts: ' + str(num_forwarding) + '\t' + 'Comments: ' + str(num_comment) + '\n'
                            f.write(text)
                            pickle.dump(self.weibo_detail_urls, open("weibourl1.pkl", "w"))
                        else:
                            print url3 + ' has already been crawled, skipping'
            if self.filter == 0:
                print 'Scraped ' + str(self.weiboNum2) + ' posts in total'
            else:
                print str(self.weiboNum) + ' posts in total, of which ' + str(self.weiboNum2) + ' are original'
        except Exception, e:
            print "Error: ", e
            traceback.print_exc()

    def get_weibo_detail_comment(self):
        weibo_comments_save_path = './weibo/{}.txt'.format(self.user_id)
        with open(weibo_comments_save_path, 'a') as f:
            for i, url in enumerate(self.weibo_detail_urls):
                if url not in self.weibourls:
                    self.weibourls.append(url)
                    pickle.dump(self.weibourls, open("weibourl2.pkl", "w"))
                    print('solving weibo detail from {}'.format(url))
                    html_detail = requests.get(url, cookies=weibo.cookie, headers=weibo.header).content
                    selector = etree.HTML(html_detail)
                    str1 = 'id="pagelist"'
                    if str1 in html_detail:
                        all_comment_pages = selector.xpath('//*[@id="pagelist"]/form/div/input[1]/@value')[0]
                    else:
                        all_comment_pages = 1
                    print('\nThis is a post by {}:'.format(self.userName))
                    #print('Post content: {}'.format(self.weibos[i]))
                    #print('Its comments follow:\n\n')
                    f.writelines('Post content: {}'.format(self.weibos[i]) + '\n')
                    f.writelines('Its comments follow:\n')
                    for page in range(1, int(all_comment_pages) + 1):
                        if page % 10 == 0:
                            print('[ATTEMPTING] rest for 5 minutes to cheat weibo site, avoid being banned.')
                            time.sleep(60 * 5)
                        detail_comment_url = url + '&page=' + str(page)
                        try:
                            html_detail_page = requests.get(detail_comment_url, cookies=weibo.cookie, headers=weibo.header).content
                            selector = etree.HTML(html_detail_page)
                            comment_div_element = selector.xpath('//div[starts-with(@id, "C_")]')
                            for child in comment_div_element:
                                single_comment_user_name = child.xpath('a[1]/text()')[0]
                                if child.xpath('span[1][count(*)=0]'):
                                    single_comment_content = child.xpath('span[1][count(*)=0]/text()')[0]
                                else:
                                    span_element = child.xpath('span[1]')[0]
                                    at_user_name = span_element.xpath('a/text()')[0]
                                    at_user_name = '$' + at_user_name.split('@')[-1] + '$'
                                    single_comment_content = span_element.xpath('text()')
                                    single_comment_content.insert(1, at_user_name)
                                    single_comment_content = ' '.join(single_comment_content)
                                full_single_comment = '<' + single_comment_user_name + '>' + ': ' + single_comment_content
                                #print(full_single_comment)
                                f.writelines(full_single_comment + '\n')
                            #f.writelines('F\n')
                        except etree.XMLSyntaxError as e:
                            print('user id {} all done!'.format(self.user_id))
                            print('all weibo content and comments saved into {}'.format(weibo_comments_save_path))
                            f.writelines('F\n')
                else:
                    print url + ' has already been crawled, skipping'

    def writeTxt(self):
        try:
            if self.filter == 1:
                resultHeader = '\n\nOriginal posts:\n'
            else:
                resultHeader = '\n\nPosts:\n'
            result = 'User info\nNickname: ' + self.userName + '\nUser id: ' + str(self.user_id) + '\nNumber of posts: ' + str(self.weiboNum) + '\nFollowing: ' + str(self.following) + '\nFollowers: ' + str(self.followers) + resultHeader
            if os.path.isdir('weibo') == False:
                os.mkdir('weibo')
            f = open("./%s.txt" % self.user_id, "wb")
            f.write(result)
            f.close()
        except Exception, e:
            print "Error: ", e
            traceback.print_exc()

    def start(self):
        try:
            weibo.getUserName(self)
            weibo.getUserInfo(self)
            weibo.writeTxt(self)
            weibo.getWeiboInfo(self)
            weibo.get_weibo_detail_comment(self)
            print 'Scraping finished'
            print '==========================================================================='
        except Exception, e:
            print "Error: ", e

user_id = ~~~~~~~~~~~~~  # replace with any valid user id (except the crawler account's own id)
filter = 0  # 0 scrapes all posts (original + reposted), 1 scrapes only original posts
#open('./weibourl1.pkl', 'w')
#open('./weibourl2.pkl', 'w')
wb = weibo(user_id, filter)  # create a weibo instance wb from the weibo class
wb.start()  # scrape the user's posts
print 'Nickname: ' + wb.userName
print 'Total posts: ' + str(wb.weiboNum)
print 'Following: ' + str(wb.following)
print 'Followers: ' + str(wb.followers)
print 'Latest post: ' + wb.weibos[0]  # with filter=1 this is the latest original post; if the user has 0 posts, i.e. len(wb.weibos)==0, this print raises an error, and likewise below
print 'Likes on the latest post: ' + str(wb.num_zan[0])
print 'Reposts of the latest post: ' + str(wb.num_forwarding[0])
print 'Comments on the latest post: ' + str(wb.num_comment[0])
wb.writeTxt()  # writeTxt() only writes the info to a file; rewrite it to suit your own needs

Together, these two pieces of code scrape a Weibo user's nickname, post count, and each post's repost, comment, and like counts, as well as the posts themselves and the comments under them.
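
One extra note: the commented-out pickle.load calls in __init__, together with the weibourl1.pkl and weibourl2.pkl files dumped during crawling, are what allow an interrupted crawl to be resumed. A minimal sketch of how one might load them, assuming the two .pkl files were left behind by a previous run and the snippet is placed after wb = weibo(user_id, filter) but before wb.start():

import os
import pickle

# restore progress from a previous run, if any
if os.path.exists('weibourl1.pkl'):
    wb.weibo_detail_urls = pickle.load(open('weibourl1.pkl', 'r'))  # comment-page URLs already collected
if os.path.exists('weibourl2.pkl'):
    wb.weibourls = pickle.load(open('weibourl2.pkl', 'r'))  # posts whose comments were already saved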

Reference: http://cighao.com/2016/03/01/introduction-of-xPath/