Logging in to Weibo with cookies + the requests package, and using XPath to scrape a target user's profile info, posts, and their comments

Date: 2023-03-08 16:13:55

Purpose of this post: explain how to scrape Weibo content by logging in with the requests package + cookies, then parsing the pages with lxml's XPath syntax to extract the target content.

Required Python packages: requests and lxml, both installable with pip.

  XPath (XML Path Language) is a language for locating parts of an XML document. It is based on XML's tree structure and provides the ability to find nodes in that tree. XPath also works on HTML.

  XPath is a small query language; here we introduce it in combination with a Python crawler.

    A quick overview of how to use XPath:

     Step 1: install the lxml library.

     Step 2: from lxml import etree

     Step 3: Selector = etree.HTML(page source)

     Step 4: Selector.xpath(an XPath expression)

    For the details of XPath parsing, see the code below or this blog post: http://cighao.com/2016/03/01/introduction-of-xPath/
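
To make the four steps concrete, here is a minimal, self-contained sketch; the HTML snippet in it is invented purely for illustration and is not taken from Weibo:

from lxml import etree

# step 3: build a parse tree from the page source (a made-up snippet here)
page_source = '<html><body><div class="tip2"><span class="tc">Posts[42]</span></div></body></html>'
selector = etree.HTML(page_source)
# step 4: query the tree with an XPath expression
texts = selector.xpath("//span[@class='tc']/text()")
print(texts)  # ['Posts[42]']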

Back to the main topic. Here is the scraping approach from the start. First, find the Weibo URL; to make scraping easier we use the mobile (3G) version of Weibo at https://weibo.cn/, whose login page is https://passport.weibo.cn/signin/login. Press F12 to open the developer tools, click the Network tab at the top, then enter your account and password to log in. After logging in you will see many requests appear; find the m.weibo.cn entry and click it. As shown in the figure, the three red boxes mark what we need.

    [Figure: the m.weibo.cn request in the developer tools Network panel, with Request URL, Cookie, and User-Agent boxed in red]

Request URL is the address we just requested.

Cookie holds the credential information saved when we logged into our account; we will use it in a moment to log in from the crawler.

User-Agent is the header information of the browser we are using; it is used to disguise our crawler and lower the risk of being banned.
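
Before the full script, here is a minimal sketch of how these three pieces fit into a single request; the cookie value is a placeholder that you must replace with the Cookie string copied from your own developer tools:

import requests
from lxml import etree

cookie = {"Cookie": "~~~~~~~~~~"}  # placeholder: paste your own Cookie string here
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'}

# passing cookies logs the request in; passing headers disguises the crawler as a normal browser
html = requests.get('https://weibo.cn/', cookies=cookie, headers=header).content
print(etree.HTML(html).xpath("//title/text()"))  # with a valid cookie this should be the title of your logged-in home page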

Now for the full script:

#coding=utf-8
import re
import time
import string
import os
import pickle
import requests
from lxml import etree
import traceback
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

class weibo:
    cookie = {"Cookie": "~~~~~~~~~~"}  # replace this with your own cookie string
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'}  # the browser header information

    def __init__(self, user_id, filter=0):
        self.user_id = user_id  # user id, the number we have to supply; e.g. the id of the user nicknamed "Dear-迪丽热巴" is 1669879400
        self.filter = filter  # 0 or 1, default 0: scrape all of the user's posts; 1: scrape only original posts
        self.userName = ''  # user nickname, e.g. "Dear-迪丽热巴"
        self.weiboNum = 0  # total number of the user's posts
        self.weiboNum2 = 0  # number of posts actually scraped
        self.following = 0  # number of accounts the user follows
        self.followers = 0  # number of followers
        self.weibos = []  # post contents
        self.num_zan = []  # likes per post
        self.num_forwarding = []  # reposts per post
        self.num_comment = []  # comments per post
        self.weibo_detail_urls = []  # pickle.load(open("weibourl1.pkl", "r"))  # comment-page URL of each post
        self.weibourls = []  # pickle.load(open('weibourl2.pkl', 'r'))  # link of each post, used to resume an interrupted crawl

    def getUserName(self):
        try:
            url = 'http://weibo.cn/%d/info' % (self.user_id)
            html = requests.get(url, cookies=weibo.cookie, headers=weibo.header).content  # the cookies and headers here are what perform the login
            selector = etree.HTML(html)
            userName = selector.xpath("//title/text()")[0]
            self.userName = userName[:-3]
            #print 'Nickname: ' + self.userName
        except Exception, e:
            print "Error: ", e
            traceback.print_exc()

The code above logs in to Weibo with the requests package + cookies. The next step is to find the id of the user we want to scrape and then use XPath to parse the pages and extract the information. The code is as follows:

    def getUserInfo(self):
        try:
            url = 'http://weibo.cn/u/%d?filter=%d&page=1' % (self.user_id, self.filter)
            html = requests.get(url, cookies=weibo.cookie, headers=weibo.header).content
            selector = etree.HTML(html)
            pattern = r"\d+\.?\d*"
            str_wb = selector.xpath("//div[@class='tip2']/span[@class='tc']/text()")
            guid = re.findall(pattern, str_wb[0], re.S | re.M)
            for value in guid:
                num_wb = int(value)
                break
            self.weiboNum = num_wb
            print 'Number of posts: ' + str(self.weiboNum)
            str_gz = selector.xpath("//div[@class='tip2']/a/text()")[0]
            guid = re.findall(pattern, str_gz, re.M)
            self.following = int(guid[0])
            print 'Following: ' + str(self.following)
            str_fs = selector.xpath("//div[@class='tip2']/a/text()")[1]
            guid = re.findall(pattern, str_fs, re.M)
            self.followers = int(guid[0])
            print 'Followers: ' + str(self.followers)
        except Exception, e:
            print "Error: ", e
            traceback.print_exc()

    def getWeiboInfo(self):
        try:
            url = 'http://weibo.cn/u/%d?filter=%d&page=1' % (self.user_id, self.filter)
            html = requests.get(url, cookies=weibo.cookie, headers=weibo.header).content
            selector = etree.HTML(html)
            if selector.xpath('//input[@name="mp"]') == []:
                pageNum = 1
            else:
                pageNum = int(selector.xpath('//input[@name="mp"]')[0].attrib['value'])
            pattern = r"\d+\.?\d*"
            f = open("./%s.txt" % self.user_id, "wb")
            for page in range(1, pageNum + 1):
                if page % 10 == 0:
                    print('[ATTEMPTING] rest for 5 minutes to cheat weibo site, avoid being banned.')
                    time.sleep(60 * 5)
                url2 = 'http://weibo.cn/u/%d?filter=%d&page=%d' % (self.user_id, self.filter, page)
                html2 = requests.get(url2, cookies=weibo.cookie, headers=weibo.header).content
                selector2 = etree.HTML(html2)
                info = selector2.xpath("//div[@class='c']")
                if len(info) > 3:
                    for i in range(0, len(info) - 2):
                        detail = info[i].xpath("@id")[0]
                        url3 = 'http://weibo.cn/comment/{}?uid={}&rl=0'.format(detail.split('_')[-1], self.user_id)
                        if url3 not in self.weibo_detail_urls:
                            self.weiboNum2 = self.weiboNum2 + 1
                            #print self.weibo_detail_urls
                            str_t = info[i].xpath("div/span[@class='ctt']")
                            weibos = str_t[0].xpath('string(.)')
                            self.weibos.append(weibos)
                            #print 'Post content: ' + weibos + '***' + 'No.%s' % self.weiboNum2
                            str_zan = info[i].xpath("div/a/text()")[-4]
                            guid = re.findall(pattern, str_zan, re.M)
                            num_zan = int(guid[0])
                            self.num_zan.append(num_zan)
                            #print 'Likes: ' + str(num_zan)
                            forwarding = info[i].xpath("div/a/text()")[-3]
                            guid = re.findall(pattern, forwarding, re.M)
                            num_forwarding = int(guid[0])
                            self.num_forwarding.append(num_forwarding)
                            #print 'Reposts: ' + str(num_forwarding)
                            comment = info[i].xpath("div/a/text()")[-2]
                            guid = re.findall(pattern, comment, re.M)
                            num_comment = int(guid[0])
                            self.num_comment.append(num_comment)
                            #print 'Comments: ' + str(num_comment)
                            self.weibo_detail_urls.append(url3)
                            text = str(self.weiboNum2) + ':' + weibos + '\t' + 'Likes: ' + str(num_zan) + '\t' + 'Reposts: ' + str(num_forwarding) + '\t' + 'Comments: ' + str(num_comment) + '\n'
                            f.write(text)
                            pickle.dump(self.weibo_detail_urls, open("weibourl1.pkl", "w"))
                        else:
                            print url3 + ' has already been crawled, skipping'
            if self.filter == 0:
                print 'Scraped ' + str(self.weiboNum2) + ' posts in total'
            else:
                print str(self.weiboNum) + ' posts in total, of which ' + str(self.weiboNum2) + ' are original'
        except Exception, e:
            print "Error: ", e
            traceback.print_exc()

    def get_weibo_detail_comment(self):
        weibo_comments_save_path = './weibo/{}.txt'.format(self.user_id)
        with open(weibo_comments_save_path, 'a') as f:
            for i, url in enumerate(self.weibo_detail_urls):
                if url not in self.weibourls:
                    self.weibourls.append(url)
                    pickle.dump(self.weibourls, open("weibourl2.pkl", "w"))
                    print('solving weibo detail from {}'.format(url))
                    html_detail = requests.get(url, cookies=weibo.cookie, headers=weibo.header).content
                    selector = etree.HTML(html_detail)
                    str1 = 'id="pagelist"'
                    if str1 in html_detail:
                        all_comment_pages = selector.xpath('//*[@id="pagelist"]/form/div/input[1]/@value')[0]
                    else:
                        all_comment_pages = 1
                    print('\nThis is a post by {}:'.format(self.userName))
                    #print('Post content: {}'.format(self.weibos[i]))
                    #print('Its comments follow:\n\n')
                    f.writelines('Post content: {}'.format(self.weibos[i]) + '\n')
                    f.writelines('Its comments follow:\n')
                    for page in range(1, int(all_comment_pages) + 1):
                        if page % 10 == 0:
                            print('[ATTEMPTING] rest for 5 minutes to cheat weibo site, avoid being banned.')
                            time.sleep(60 * 5)
                        detail_comment_url = url + '&page=' + str(page)
                        try:
                            html_detail_page = requests.get(detail_comment_url, cookies=weibo.cookie, headers=weibo.header).content
                            selector = etree.HTML(html_detail_page)
                            comment_div_element = selector.xpath('//div[starts-with(@id, "C_")]')
                            for child in comment_div_element:
                                single_comment_user_name = child.xpath('a[1]/text()')[0]
                                if child.xpath('span[1][count(*)=0]'):
                                    single_comment_content = child.xpath('span[1][count(*)=0]/text()')[0]
                                else:
                                    span_element = child.xpath('span[1]')[0]
                                    at_user_name = span_element.xpath('a/text()')[0]
                                    at_user_name = '$' + at_user_name.split('@')[-1] + '$'
                                    single_comment_content = span_element.xpath('text()')
                                    single_comment_content.insert(1, at_user_name)
                                    single_comment_content = ' '.join(single_comment_content)
                                full_single_comment = '<' + single_comment_user_name + '>' + ': ' + single_comment_content
                                #print(full_single_comment)
                                f.writelines(full_single_comment + '\n')
                            #f.writelines('F\n')
                        except etree.XMLSyntaxError as e:
                            print('user id {} all done!'.format(self.user_id))
                            print('all weibo content and comments saved into {}'.format(weibo_comments_save_path))
                            f.writelines('F\n')
                else:
                    print url + ' has already been crawled, skipping'

    def writeTxt(self):
        try:
            if self.filter == 1:
                resultHeader = '\n\nOriginal posts:\n'
            else:
                resultHeader = '\n\nPosts:\n'
            result = 'User info\nNickname: ' + self.userName + '\nUser id: ' + str(self.user_id) + '\nNumber of posts: ' + str(self.weiboNum) + '\nFollowing: ' + str(self.following) + '\nFollowers: ' + str(self.followers) + resultHeader
            if os.path.isdir('weibo') == False:
                os.mkdir('weibo')
            f = open("./%s.txt" % self.user_id, "wb")
            f.write(result)
            f.close()
        except Exception, e:
            print "Error: ", e
            traceback.print_exc()

    def start(self):
        try:
            weibo.getUserName(self)
            weibo.getUserInfo(self)
            weibo.writeTxt(self)
            weibo.getWeiboInfo(self)
            weibo.get_weibo_detail_comment(self)
            print 'Scraping finished'
            print '==========================================================================='
        except Exception, e:
            print "Error: ", e

user_id = ~~~~~~~~~~~~~  # replace with any valid user id (except the crawler account's own id)
filter = 0  # 0 scrapes all posts (original + reposted), 1 scrapes only original posts
#open('./weibourl1.pkl', 'w')
#open('./weibourl2.pkl', 'w')
wb = weibo(user_id, filter)  # create a weibo instance wb from the weibo class
wb.start()  # scrape the user's posts
print 'Nickname: ' + wb.userName
print 'Total posts: ' + str(wb.weiboNum)
print 'Following: ' + str(wb.following)
print 'Followers: ' + str(wb.followers)
print 'Latest post: ' + wb.weibos[0]  # with filter=1 this is the latest original post; if the user has 0 posts, i.e. len(wb.weibos)==0, this print raises an error, and likewise below
print 'Likes on the latest post: ' + str(wb.num_zan[0])
print 'Reposts of the latest post: ' + str(wb.num_forwarding[0])
print 'Comments on the latest post: ' + str(wb.num_comment[0])
wb.writeTxt()  # writeTxt() only writes the info to a file; rewrite it to suit your own needs

Together, these two pieces of code scrape a Weibo user's nickname, post count, and each post's repost, comment, and like counts, as well as the posts themselves and the comments under them.
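
One extra note: the commented-out pickle.load calls in __init__, together with the weibourl1.pkl and weibourl2.pkl files dumped during crawling, are what allow an interrupted crawl to be resumed. A minimal sketch of how one might load them, assuming the two .pkl files were left behind by a previous run and the snippet is placed after wb = weibo(user_id, filter) but before wb.start():

import os
import pickle

# restore progress from a previous run, if any
if os.path.exists('weibourl1.pkl'):
    wb.weibo_detail_urls = pickle.load(open('weibourl1.pkl', 'r'))  # comment-page URLs already collected
if os.path.exists('weibourl2.pkl'):
    wb.weibourls = pickle.load(open('weibourl2.pkl', 'r'))  # posts whose comments were already saved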

Reference: http://cighao.com/2016/03/01/introduction-of-xPath/