# -*- coding: utf-8 -*-
# 爬取字段 (scraped field): spider_text
#
# 时间 (time): 2023-03-09 06:27:59
# 爬取字段 (scraped field): spider_text

__author__ = 'sus'
import urllib
import urllib2
import re

def getPage(url, encoding='utf-8'):
    """Fetch *url* and return its body decoded as text.

    Args:
        url: Address to request; passed unchanged to ``urllib2.Request``.
        encoding: Codec used to decode the response bytes. Defaults to
            ``'utf-8'``, the codec this function previously hard-coded.

    Returns:
        The decoded response body.

    Raises:
        UnicodeDecodeError: If the body is not valid in *encoding*.
        Any error raised by ``urllib2.urlopen`` (e.g. ``urllib2.URLError``)
        is propagated unchanged.
    """
    request = urllib2.Request(url)
    response = urllib2.urlopen(request)
    try:
        raw = response.read()
    finally:
        # Always release the underlying connection, even when read() raises.
        response.close()
    return raw.decode(encoding)

def getTitle(page):
    """Return the text of the first news.bistu.edu.cn link in *page*.

    Scans *page* for an ``<a href="http://news.bistu.edu.cn...">`` anchor
    and returns whatever lies between that tag and the next ``</a>``.
    Only the first such anchor is reported.

    Args:
        page: HTML source text to search.

    Returns:
        The inner text of the first matching anchor, or ``None`` when the
        page contains no matching anchor.
    """
    # Dots in the host name are escaped so they match a literal '.' only;
    # re.S lets '.' span newlines because anchors may wrap across lines.
    pattern = re.compile(
        r'<a href="http://news\.bistu\.edu\.cn.*?>(.*?)</a>', re.S)
    match = pattern.search(page)
    if match is None:
        return None
    return match.group(1)

# Download the BISTU home page, then print the title that getTitle()
# extracts from it.
page = getPage("http://www.bistu.edu.cn")
print(getTitle(page))