抓取oschina上面的代码分享python块区下的标题和对应URL

# -*- coding=utf-8 -*-

import requests,re

from lxml import etree

import sys

# Python 2-only workaround: reload the sys module so setdefaultencoding can be
# called, then make UTF-8 the default codec for implicit str/unicode conversions.
reload(sys)

sys.setdefaultencoding( "utf-8" )

"""

目标：抓取oschina上面的代码分享python块区下的 标题和对应URL

"""

class spiders_oschina:
    """Scrape titles and URLs from the Python section of oschina's code-share list.

    Typical use: call get_page() to obtain every listing-page URL, fetch each
    one with get_html_obj(), and hand the parsed tree to get_result(), which
    appends the title/URL pairs to ``res.txt``.
    """

    def __init__(self):
        print(u'开始运行')

    def get_html_obj(self, url='http://www.oschina.net/code/list?lang=python&catalog=&show=time&sort=&p=1'):
        """Fetch *url* and return the response body parsed by ``etree.HTML``.

        The default URL is the first page of the Python code listing.
        """
        # Send a browser-like User-Agent header with the request.
        tou = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.110 Safari/537.36'}
        return etree.HTML(requests.get(url, headers=tou).content)

    def get_page(self):
        """Return the URLs of all listing pages, in page order.

        The total page count is read from the pager on the first listing page.
        Raises IndexError if the pager element is not found, and ValueError if
        its text is not an integer.
        """
        pager_xpath = '//*[@id="OSC_Content"]/div[1]/ul/li[11]/a/text()'
        base_url = 'http://www.oschina.net/code/list?lang=python&catalog=&show=time&sort=&p='

        first_page = self.get_html_obj()
        page_count = int(first_page.xpath(pager_xpath)[0])
        return [base_url + str(page_no) for page_no in range(1, page_count + 1)]

    def get_result(self, obj):
        """Append every title and its matching URL found in *obj* to ``res.txt``.

        *obj* must provide an ``xpath(expr)`` method returning a list of
        strings (e.g. the tree returned by get_html_obj()). Nothing is returned.
        """
        # Local import so this block does not depend on the file's import list.
        import io

        title_xpath = '//*[@id="OSC_Content"]/div[1]/div[3]/ul/li/h3/a/text()'  # link text
        link_xpath = '//*[@id="OSC_Content"]/div[1]/div[3]/ul/li/h3/a/@href'    # link target

        titles = obj.xpath(title_xpath)
        links = obj.xpath(link_xpath)

        # Pair each title with the URL at the same position. The previous
        # version reset its index on every iteration, so every title was
        # written with the first URL.
        entries = [
            title + u' ——>对应的URL是：' + link + u'\n \n --------------------------\n'
            for title, link in zip(titles, links)
        ]

        # Write UTF-8 explicitly and let the context manager close the file
        # even if the write fails.
        with io.open('res.txt', 'a', encoding='utf-8') as out:
            out.write(u''.join(entries))

if __name__ == "__main__":
    # Crawl every listing page in order and report progress after each one.
    spider = spiders_oschina()
    for page_no, page_url in enumerate(spider.get_page(), 1):
        tree = spider.get_html_obj(page_url)
        spider.get_result(tree)
        print(u'第%d页爬取完成' % page_no)

貌似生成的URL有问题，待优化……

秒客网

抓取oschina上面的代码分享python块区下的标题和对应URL

相关文章

抓取oschina上面的代码分享python块区下的 标题和对应URL

相关文章

抓取oschina上面的代码分享python块区下的标题和对应URL