# -*- coding: utf-8 -*-
# A simple Python crawler for Qiushibaike (糗事百科) hot pages.
# Date: 2023-01-06 18:35:24
import urllib2
from bs4 import BeautifulSoup
import thread
import time
class qiushibaike:
    """docstring for ClassName"""
    def __init__(self):
        self.page = 1   #下载了的页数
        self.pages = [] #保存已下载的html
        self.enable = False #标志位
        self.url='http://m.qiushibaike.com/hot/page/'
    # 用于加载新的页面
    def LoadPage(self):
        # 如果用户未输入quit则一直运行
        while self.enable:
            # 如果pages数组中的内容小于5个
            if len(self.pages) < 5:
                try:
                    # 获取新的页面,加入到数组中
                    url=self.url+str(self.page)
                    newPage = self.GetHtml(url)
                    self.page += 1
                    self.pages.append(newPage)
                except:
                    print '无法链接糗事百科!'
            else:
                time.sleep(1)
    
    def ParseHtml(self,html):
        items=self.GetContenBlock(html)
        for item in items:
            content=self.ParseContent(item)
            try:
                print u"作者",content['author'],u"时间:",content["time"]
                print content["content"]
                print '------------------------------------------'
            except:
                print u'尼玛这样都有错啊!'
            
    #用来获取html
    def GetHtml(self,url):
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = { 'User-Agent' : user_agent } 
        req = urllib2.Request(url,None,headers)  
        response = urllib2.urlopen(req)  
        html = response.read()
        return html
    def GetContenBlock(self,html):
        soup = BeautifulSoup(str(html))
        items=soup.findAll('div',{'class':'article block untagged mb15'})
        return items;

    def ParseContent(self,item):
        #soup = BeautifulSoup(str(item))
        content=item.find('div',{'class':'content'})
        result={}
        if content!=None:
            try:
                result["content"]=content.text.strip()
                result["time"]=content.get("title").strip()
            except:
                result["content"]=None
                result["time"]=None
        else:
            result["content"]=None
            result["time"]=None
        #author=item.find('div',{'class':'author clearfix'})
        #if author!=None:
        #    result["author"]=author.findAll('a')[1].string
        #else:
        #    result["author"]=None
        result['author']=self.ParseAuthor(item)
        return result;

    def ParseAuthor(self,item):
        try:
            #soup = BeautifulSoup(str(item))
            item=item.find('div',{'class':'author clearfix'})
            if item!=None:
                return item.findAll('a')[1].text
            else:
                return None;
        except:
            return None;
        
    def Start(self):
        self.enable = True
        page = self.page
        print u'正在加载中请稍候......'
        # 新建一个线程在后台加载页面并存储
        thread.start_new_thread(self.LoadPage,())
        #----------- 加载处理糗事百科 -----------
        while self.enable:
            # 如果self的page数组中存有元素
            if self.pages:
                nowPage = self.pages[0]
                del self.pages[0]
                self.ParseHtml(nowPage)
                page += 1

print u"""
---------------------------------------
   程序:糗百爬虫
   版本:0.1
   作者:zz
   日期:2013-05-15
   语言:Python 2.7
   功能:按下回车依次浏览今日的糗百热点
---------------------------------------
"""


print u'请按下回车浏览今日的糗百内容:'
raw_input(' ')
myModel = qiushibaike()
myModel.Start()