python静态网页爬虫之xpath(简单的博客更新提醒功能)

时间:2023-03-09 04:29:21
python静态网页爬虫之xpath(简单的博客更新提醒功能)

直接上代码:

#!/usr/bin/env python3
#antuor:Alan
#-*- coding: utf-8 -*- import requests
from lxml import etree
import datetime,time
import os class xxoohelper(object): #易读
def __init__(self):
self.url = 'http://www.cnblogs.com/alan-babyblog/' #初始化
def getSource(self):
html = requests.get(self.url).content #content比text好用,一个返回的是byte,一个返回的是str
return html
def getContent(self,html): #先大后小
selector = etree.HTML(html)
title = selector.xpath('//div[1]/div[2]/a/text()')[0].strip() #从列表提取文本
content = selector.xpath('//div[1]/div[2]/div[1]/div/div[1]/div[3]/div/text()')[0].strip()
post_time = selector.xpath('//div[1]/div[2]/div[1]/div/div[1]/div[5]/text()')[0].strip()
send_text = title+content+post_time #类型是str
return send_text
def tosave(self,text):
with open('myblog.txt','a') as f:
f.write(('{0}\n').format(text)) #换行
def tocheck(self,data):
if not os.path.exists('myblog.txt'): #判断是否存在文件
return True
else:
with open ('myblog.txt','r') as f:
existblog = f.readlines()
#print(data+'\n')
if data +'\n' in existblog: #判断是否已经纪录过内容
return False
else:
return True
if __name__ == '__main__': #程序入口
helper = xxoohelper() #实例化
while True : #while循环不断监控页面
source = helper.getSource()
content = helper.getContent(source)
if helper.tocheck(content):
post_time = str(datetime.datetime.now())
print(post_time,'有新内容\n',content)
helper.tosave(content)
else:
print('扫描中......')
pass
time.sleep(30)