BS4爬取糗百

# -*- coding: cp936 -*-

import urllib,urllib2

from bs4 import BeautifulSoup

user_agent='Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0'

headers={

    'User-Agent':user_agent

    }

url='https://www.qiushibaike.com'

#----------------------

def getcode(url):

    req=urllib2.Request(url,headers=headers)

    html=urllib2.urlopen(req).read()

    soup=BeautifulSoup(html,'html.parser')

    content=soup.select('.content')[0].stripped_strings

    print '内容：'

    for i in content:

        print i

def getinfo(x):

    q=0

    for m in range(x):

        print '获取第%s页'%str(m+1)

        murl='https://www.qiushibaike.com/text'+'/page/'+str(m+1)

        req=urllib2.Request(murl,headers=headers)

        html=urllib2.urlopen(req,timeout=60).read().decode('utf-8')

        #创建beautiful对象

        #创建本地HTML对象：soup=BeautifulSoup(open('index.html'))

        soup=BeautifulSoup(html,'html.parser')

        #打印HTML内容 print soup.prettify()

        authors=soup.select('h2')

        article=soup.select('.contentHerf')

        jokes=soup.select('.stats-vote > .number')

        comment_num=soup.select('.stats-comments > .qiushi_comments > .number')

        #获取界面上的文字

        ##print '内容：',article[0].span.string.strip()

        ##获取链接文字

        n=0

        for a in article:

            print '获取第%s页第%s条'%(str(m+1),str(n+1))

            print '作者：',authors[n].string.strip()

            print '好笑数：',jokes[n].string

            article_url=article[n]['href']

            article_url=url+article_url

            #print article_url

            getcode(article_url)

            print '评论数：',comment_num[n].string

            n+=1

            q+=1

            print '\n'

    print '总共%s条数据'%q

nu=raw_input('获取多少页：')

nu=int(nu)

#设置重新获取数据次数

tries=1

try:

    getinfo(nu)

except:

    print u'爬取数据出错。。。'
相关文章