from scrapy.contrib.spiders import Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

rules = [
    # Follow only the "next article" link, restricted to this blog's detail pages.
    Rule(SgmlLinkExtractor(allow=('/u012150179/article/details',),
                           restrict_xpaths=('//li[@class="next_article"]',)),
         callback='parse_item',
         follow=True),
]

def parse_item(self, response):
    # print "parse_item>>>>>>"
    item = CsdnblogcrawlspiderItem()
    blog_url = str(response.url)
    # Extract the article title from the page header, then store both fields UTF-8 encoded.
    blog_name = response.xpath('//div[@id="article_details"]/div/h1/span/a/text()').extract()
    item['blog_name'] = [n.encode('utf-8') for n in blog_name]
    item['blog_url'] = blog_url.encode('utf-8')
    return item
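The callback above fills an item class defined elsewhere in the project. A minimal sketch of the items.py that this implies (the two field names are inferred from the keys assigned in parse_item; the comments are assumptions, as this file is not shown in the original):

from scrapy.item import Item, Field

class CsdnblogcrawlspiderItem(Item):
    blog_name = Field()  # article title text, stored UTF-8 encoded
    blog_url = Field()   # URL of the article page

Note that rules and parse_item must live inside a CrawlSpider subclass. With follow=True, the crawler keeps following each page's "next article" link and runs parse_item on every URL that matches the rule, so the spider walks the whole chain of posts from its start URL.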