pyspider使用

#!/usr/bin/env python

# -*- encoding: utf-8 -*-

# Created on 2018-11-08 22:33:55

# Project: qsbk

from pyspider.libs.base_handler import *

from lxml import html

from urlparse import urljoin

import datetime

class Handler(BaseHandler):
    """pyspider handler that scrapes the entry list of the qiushibaike.com front page.

    ``on_start`` queues the front page and ``index_page`` converts every entry
    found there into a dict with ``name``, ``info``, ``crawldate`` and ``url``.
    """

    crawl_config = {
    }

    def __init__(self):
        # Entry point of the crawl; also the base used to resolve relative links.
        self.start_url = 'https://www.qiushibaike.com/'

    @every(minutes=24 * 60)
    def on_start(self):
        """Queue the front page for fetching, with ``index_page`` as callback.

        The ``@every`` decorator asks pyspider to re-run this once every 24 hours.
        """
        self.crawl(self.start_url, callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Extract all entries listed on the fetched front page.

        Args:
            response: pyspider response object; only ``response.content``
                (the raw body, decoded here as UTF-8) is used.

        Returns:
            dict: ``{'data': [item, ...]}`` where each item has the keys
            ``name`` (author text, ``None`` when no author link is present),
            ``info`` (entry text), ``crawldate`` (local time formatted as
            ``YYYY-MM-DD HH:MM:SS``) and ``url`` (absolute entry link).

        Raises:
            IndexError: if the page has no ``div#content-left`` container.
                Failing here (instead of returning an empty list) keeps the
                task marked as failed rather than storing an empty result.
        """
        root = html.fromstring(response.content.decode('utf-8'))

        containers = root.xpath("//div[@id='content-left']")
        if not containers:
            raise IndexError("no div[@id='content-left'] found in the response")

        items = []
        for entry in containers[0].xpath("./div"):
            hrefs = entry.xpath("./a[@class='contentHerf']/@href")
            bodies = entry.xpath(".//div[@class='content']/span[1]")
            if not hrefs or not bodies:
                # Not a regular entry (no permalink or no text): skip it so a
                # single odd <div> does not abort the whole page.
                continue

            authors = entry.xpath(
                ".//div[@class='author clearfix']"
                "/a[contains(@onclick,'web-list-author-text')]/h2/text()")
            # The author is optional: keep the entry even without an author link.
            name = authors[0] if authors else None

            # string(.) flattens the span and all nested nodes into one string.
            info = bodies[0].xpath('string(.)')

            items.append({
                'name': name.strip() if name else name,
                'info': info.strip() if info else info,
                'crawldate': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                'url': urljoin(self.start_url, hrefs[0]),
            })

        return {'data': items}
相关文章