简单爬虫入门:Python 爬虫之糗事百科

时间:2022-09-14 18:34:49

本文为转载。由于糗事百科页面改版,原博客代码中的正则表达式已不再适用,我对正则表达式稍作了改进;同时删除了对含有图片的笑话的判断(这部分代码我始终没能找出错在哪里,一怒之下就把这个功能删掉了)。

原博客网址:http://cuiqingcai.com/990.html

原博客对于每个函数以及变量都有详细的解释,我在这里就不一一赘述了。

运行结果附在了代码下方

__author__ = 'CQC'
# -*- coding: utf-8 -*-
import urllib
import urllib2
import re
import thread
import time


class QBSK(object):
    """Interactive console reader for the "hot" pages of qiushibaike.com.

    Pages are downloaded over HTTP, stories are extracted with a regular
    expression, and parsed pages are buffered in ``self.stories`` while the
    user steps through them one story per line of input.

    The class targets Python 2 (``urllib2``, ``raw_input``); statements are
    written in a form that Python 3 also parses.

    Attributes:
        pageindex: Number of the next page to download (starts at 1).
        user_agent: Value sent in the ``User-Agent`` request header.
        headers: Header dict passed to every request.
        stories: Buffer of parsed pages; each element is the list of stories
            of one page.
        enable: Whether the reading session is still running.
    """

    # One match per story. Groups, in order: text inside <h2> of the author
    # block, HTML of the "content" div, first "number" field, second "number"
    # field, and the "main-text" div inside the "cmtMain" div.
    # NOTE(review): a story without a "cmtMain" div will not match on its
    # own -- confirm against the current page layout.
    _STORY_RE = re.compile(
        '<div class="author clearfix">.*?<h2>(.*?)</h2>'
        '.*?"content">(.*?)</div>'
        '.*?number">(.*?)</'
        '.*?number">(.*?)</'
        '.*?<div class="cmtMain">.*?"main-text">(.*?)</div>',
        re.S)

    # Markup inside the story body that is turned into a line break.
    _TAG_RE = re.compile('<br/>|<span>|</span>')

    def __init__(self):
        self.pageindex = 1
        self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        self.headers = {'User-Agent': self.user_agent}
        # Buffered pages; each element holds the stories of one page.
        self.stories = []
        # Whether the program should keep running.
        self.enable = False

    def getPage(self, pageindex):
        """Download one page and return its HTML decoded as UTF-8.

        Args:
            pageindex: Page number appended to the "hot" listing URL.

        Returns:
            The decoded page source, or ``None`` when the request fails with
            ``urllib2.URLError`` (the failure reason is printed if present).
        """
        url = 'http://www.qiushibaike.com/hot/page/' + str(pageindex)
        try:
            request = urllib2.Request(url, headers=self.headers)
            response = urllib2.urlopen(request)
            try:
                return response.read().decode('utf-8')
            finally:
                # Always release the connection, even if read/decode fails.
                response.close()
        except urllib2.URLError as e:
            if hasattr(e, "reason"):
                print(u"连接糗事百科失败 %s" % (e.reason,))
            return None

    def _parseStories(self, pagecode):
        """Extract ``[author, text, votes]`` lists from a page's HTML."""
        pagestories = []
        for item in self._STORY_RE.findall(pagecode):
            # <br/>, <span> and </span> each become a newline.
            text = self._TAG_RE.sub("\n", item[1])
            pagestories.append(
                [item[0].strip(), text.strip(), item[2].strip()])
        return pagestories

    def getPageItems(self, pageindex):
        """Download page ``pageindex`` and return its parsed stories.

        Returns:
            A list of ``[author, text, votes]`` lists (possibly empty), or
            ``None`` when the page could not be downloaded.
        """
        pagecode = self.getPage(pageindex)
        if not pagecode:
            print(u"页面加载失败")
            return None
        return self._parseStories(pagecode)

    def loadPage(self):
        """Fetch the next page into the buffer if fewer than two are queued.

        Does nothing while the session is disabled. ``pageindex`` only
        advances when the fetched page yielded at least one story.
        """
        if not self.enable or len(self.stories) >= 2:
            return
        pagestories = self.getPageItems(self.pageindex)
        if pagestories:
            self.stories.append(pagestories)
            self.pageindex += 1

    def getOneStory(self, pagestories, page):
        """Print the stories of one page, one per line of user input.

        Before each story a line is read from standard input; ``Q`` (any
        case) or end-of-input disables the session and returns immediately.

        Args:
            pagestories: List of ``[author, text, votes]`` lists to show.
            page: Page counter printed in front of each story.
        """
        for story in pagestories:
            try:
                command = raw_input()
            except EOFError:
                # stdin was closed: treat it like an explicit quit.
                command = "Q"
            if command.strip().upper() == "Q":
                self.enable = False
                return
            # Top up the look-ahead buffer while the user is reading.
            self.loadPage()
            print(u"第%s页\t发布人:%s\t内容%s\t赞数:%s"
                  % (page, story[0], story[1], story[2]))

    def start(self):
        """Run the interactive session until the user quits or pages run out."""
        print(u"在读取糗事百科,按回车读取下一条,按Q退出")
        self.enable = True
        nowpage = 0
        while self.enable:
            if not self.stories:
                self.loadPage()
                if not self.stories:
                    # Nothing could be loaded or parsed: stop instead of
                    # spinning forever on an empty buffer.
                    print(u"没有可显示的段子,程序结束")
                    self.enable = False
                    break
            current = self.stories.pop(0)
            nowpage += 1
            self.getOneStory(current, nowpage)


# Script entry point: build the spider and start the interactive session.
spider = QBSK()
spider.start()