Running a Scrapy spider in a Celery task (Django project)

Time: 2022-08-20 19:17:39

I'm trying to run Scrapy (spider/crawl) from a Django project, as a Celery task triggered from the admin interface. Here is my code, and the error appears when I try to call the task from a Python shell.


djangoproject:

-monapp:        

   -tasks.py
   -spider.py
   -myspider.py
   -models.py
   .....

tasks.py:

  from djcelery import celery
  from demoapp.spider import *
  from demoapp.myspider import *

  @celery.task
  def add(x, y):
      return x + y

  @celery.task
  def scra():
      # run the spider in a separate process and collect the scraped items
      result_queue = Queue()
      crawler = CrawlerWorker(MySpider(), result_queue)
      crawler.start()
      return "success"

spider.py:

        from scrapy import project, signals
        from scrapy.settings import Settings
        from scrapy.crawler import Crawler
        from scrapy.xlib.pydispatch import dispatcher
        from multiprocessing import Queue
        import multiprocessing

        class CrawlerWorker(multiprocessing.Process):

            def __init__(self, spider, result_queue):
                multiprocessing.Process.__init__(self)
                self.result_queue = result_queue
                self.crawler = Crawler(Settings())
                if not hasattr(project, 'crawler'):
                    self.crawler.install()
                self.crawler.configure()

                self.items = []
                self.spider = spider
                # collect each item as the engine signals it has been scraped
                dispatcher.connect(self._item_passed, signals.item_passed)

            def _item_passed(self, item):
                self.items.append(item)

            def run(self):
                self.crawler.crawl(self.spider)
                self.crawler.start()
                self.crawler.stop()
                # hand the collected items back to the parent process
                self.result_queue.put(self.items)
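
For completeness, the items collected by CrawlerWorker end up on the result queue; a caller would read them back roughly like this (a sketch, not part of the original code; note that scra() above starts the worker but never drains the queue):

    result_queue = Queue()
    worker = CrawlerWorker(MySpider(), result_queue)
    worker.start()
    items = result_queue.get()  # blocks until run() puts the item list
    worker.join()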

myspider.py:

        from scrapy.selector import HtmlXPathSelector
        from scrapy.contrib.spiders import CrawlSpider, Rule
        from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
        from scrapy.item import Item, Field

        class TorentItem(Item):
            title = Field()
            desc = Field()

        class MySpider(CrawlSpider):
            name = 'job'
            allowed_domains = ['tanitjobs.com']
            start_urls = ['http://tanitjobs.com/browse-by-category/Nurse/']
            rules = (
                Rule(SgmlLinkExtractor(allow=('page=*',),
                                       restrict_xpaths=('//div[@class="pageNavigation"]',),
                                       unique=True),
                     callback='parse_item', follow=True),
            )

            def parse_item(self, response):
                hxs = HtmlXPathSelector(response)
                items = hxs.select('//div[@class="offre"]/div[@class="detail"]')
                scraped_items = []
                for item in items:
                    scraped_item = TorentItem()
                    scraped_item['title'] = item.select('a/strong/text()').extract()
                    scraped_item['desc'] = item.select('./div[@class="descriptionjob"]/text()').extract()
                    scraped_items.append(scraped_item)
                # return outside the loop so all items on the page are kept
                return scraped_items

1 solution

#1

I got mine to work from the shell using a Django management command. Below is my code snippet; feel free to modify it to fit your needs.


from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import signals
from scrapy.utils.project import get_project_settings

from django.core.management.base import BaseCommand

from myspiderproject.spiders.myspider import MySpider

class ReactorControl:
    def __init__(self):
        self.crawlers_running = 0

    def add_crawler(self):
        self.crawlers_running += 1

    def remove_crawler(self):
        self.crawlers_running -= 1
        if self.crawlers_running == 0:
            reactor.stop()

def setup_crawler(domain):
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    crawler.signals.connect(reactor_control.remove_crawler, signal=signals.spider_closed)

    spider = MySpider(domain=domain)
    crawler.crawl(spider)
    reactor_control.add_crawler()
    crawler.start()

reactor_control = ReactorControl()

class Command(BaseCommand):
    help = 'Crawls the site'

    def handle(self, *args, **options):
        setup_crawler('somedomain.com')
        reactor.run()  # the script blocks here until the spider_closed signal is sent
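
For reference, Django discovers such commands under an app's management/commands/ directory. The file name and task wrapper below are assumed examples (not part of the answer's code) showing how the command could then be triggered from Celery:

# hypothetical: save the command above as monapp/management/commands/crawl_site.py,
# then delegate to it from a Celery task
from django.core.management import call_command
from djcelery import celery

@celery.task
def run_crawl():
    # blocks until the reactor stops, i.e. until the spider closes
    call_command('crawl_site')
    return "success"

Note that a Twisted reactor cannot be restarted within the same process, so workers running Scrapy this way are often configured to recycle after each task (e.g. CELERYD_MAX_TASKS_PER_CHILD = 1).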

Hope this helps.

