NodeJs_1 爬取某网站首页博客的爬虫Demo

时间:2022-02-07 07:49:24
刚开始学 JS，在 https://github.com/alsotang/node-lessons/tree/master/lesson3 教程的基础上，将所抓取标题对应的文章也一并进行抓取。由于异步事件模型的原因，如果直接在 for 循环中对每篇文章执行 get 请求，这些请求几乎会同时发出，
访问间隔太短，服务器会报出 503 错误。暂时不了解有什么通用的手段可以避免，因此利用 events 库的事件（信号）来模拟实现同步（逐篇）抓取，源码如下。
 
// Crawler demo: an Express server on port 3000 whose `GET /` returns the
// topics listed on the cnodejs.org front page (title, link, article text).
//
// Articles are fetched strictly one at a time: each finished download emits
// 'childEvnts', whose handler stores the article and requests the next one.
// Firing all requests at once from a for-loop made the site answer 503.

var superagent = require('superagent');
var cheerio = require('cheerio');
var events = require('events');
var app = require('express')();

var emitter = new events.EventEmitter();
var srcHost = 'https://cnodejs.org';

var items = [];         // Crawled articles; doubles as the cache served to later requests.
var titleAry = [];      // Cheerio selection of the title links on the index page.
var indexPage = null;   // Cheerio instance loaded with the index page.
var curIndex = 0;       // Index (into titleAry) of the article currently being fetched.
var isCrawling = false; // True while a crawl is in flight, so requests never start a second one.

/**
 * Ends the current crawl and wakes every request waiting on 'End'.
 *
 * @param {Error} [err] - Set when the crawl failed. Partial results are
 *   discarded so that the next request starts a fresh crawl instead of
 *   being served an incomplete cached list.
 */
function finishCrawl(err) {
  isCrawling = false;
  if (err) {
    console.log(err.message);
    items = [];
  }
  emitter.emit('End', err);
}

/**
 * Downloads one article page and hands the response to the 'childEvnts'
 * handler. A failed download aborts the whole crawl.
 *
 * @param {string} url - Absolute URL of the article page.
 */
function ownGet(url) {
  superagent.get(url).end(function (err, articleData) {
    if (err) {
      finishCrawl(err);
      return;
    }
    emitter.emit('childEvnts', articleData);
  });
}

/**
 * Starts a fresh crawl: loads the index page, collects the title links and
 * requests the first article. The remaining articles are chained through
 * the 'childEvnts' handler.
 */
function startCrawl() {
  isCrawling = true;
  items = [];
  curIndex = 0;

  superagent.get(srcHost).end(function (err, data) {
    if (err) {
      finishCrawl(err);
      return;
    }

    var $ = cheerio.load(data.text);
    indexPage = $;
    titleAry = $('#topic_list .topic_title');

    // Nothing to fetch: answer with the empty list rather than requesting
    // `srcHost + undefined`.
    if (titleAry.length === 0) {
      finishCrawl();
      return;
    }

    ownGet(srcHost + $(titleAry[0]).attr('href'));
  });
}

// One article has been downloaded: record it, then either finish the crawl
// or request the next article in the list.
emitter.on('childEvnts', function (data) {
  var $element = indexPage(titleAry[curIndex]);
  var $ = cheerio.load(data.text);
  var curContext = $('#main .markdown-text');

  items.push({
    title: $element.attr('title'),
    href: $element.attr('href'),
    context: $(curContext[0]).text()
  });
  console.log(curIndex, items.length);

  if (items.length === titleAry.length) {
    finishCrawl();
    return;
  }

  curIndex++;
  var nextHref = indexPage(titleAry[curIndex]).attr('href');
  console.log(nextHref);
  ownGet(srcHost + nextHref);
});

// GET / : respond with the crawled articles. The first request triggers the
// crawl; requests arriving while it runs wait for the same crawl to finish.
app.get('/', function (req, res) {
  if (!isCrawling && items.length !== 0) {
    res.send(items);
    return;
  }

  // `once` removes the listener after it fires, so a response is never sent
  // twice and listeners do not pile up across requests.
  emitter.once('End', function (err) {
    if (err) {
      res.status(502).send({ error: err.message });
      return;
    }
    res.send(items);
  });

  if (!isCrawling) {
    startCrawl();
  }
});

app.listen(3000, function () {
  console.log("start listen port 3000");
});