Node.js 的 http 模块与 cheerio 模块实现小爬虫

时间:2022-05-05 15:13:36

代码:

 var http = require("http");

 var cheerio = require("cheerio");

 var url = 'http://www.imooc.com/learn/348';

 // Download the course page, then parse and print its chapter/video outline.
 http.get(url, function (res) {
   // A non-200 response (redirect, error page, ...) does not contain the
   // course markup, so report it instead of parsing it.
   if (res.statusCode !== 200) {
     console.log("获取课程数据出错!", 'HTTP ' + res.statusCode);
     res.resume(); // consume the body so the connection can be released
     return;
   }

   // Let the stream decode UTF-8. Concatenating raw Buffer chunks with `+=`
   // corrupts any multi-byte character that straddles a chunk boundary.
   res.setEncoding('utf8');

   var html = '';

   res.on('data', function (chunk) {
     html += chunk;
   });

   res.on('end', function () {
     var courseData = filterChapters(html);

     printCourseInfo(courseData);
     console.log(courseData);
   });
 }).on('error', function (err) {
   // Include the underlying reason so failures can be diagnosed.
   console.log("获取课程数据出错!", err.message);
 });

 /**
  * Extract the chapter/video structure from the course page markup.
  *
  * @param {string} html - HTML source of the course page.
  * @returns {Array<{chapterTitle: string, videos: Array<{title: string, id: (string|undefined)}>}>}
  *   One entry per `.chapter` element, in document order. `id` is the part of
  *   the video link's href that follows "video/" (undefined when the href does
  *   not contain that segment).
  */
 function filterChapters(html) {
   var $ = cheerio.load(html);
   var lineBreaks = /[\r\n]/g;
   var courseData = [];

   // Every chapter on the page.
   $('.chapter').each(function (chapterIndex, chapterElement) {
     var chapter = $(chapterElement);
     var chapterData = {
       chapterTitle: chapter.find('h3 strong').text().replace(lineBreaks, "").trim(),
       videos: []
     };

     chapter.find(".video").children('li').each(function (videoIndex, videoElement) {
       var video = $(videoElement).find('.J-media-item');
       var href = video.attr('href');

       // A list item without a media link has no href; skip it rather than
       // throwing on `undefined.split(...)` and aborting the whole crawl.
       if (!href) {
         return;
       }

       chapterData.videos.push({
         title: video.text().replace(lineBreaks, "").trim(),
         id: href.split('video/')[1]
       });
     });

     courseData.push(chapterData);
   });

   return courseData;
 }

 /**
  * Print the course outline to stdout: each chapter title followed by a blank
  * line, then one " [id]title" line per video in that chapter.
  *
  * @param {Array<{chapterTitle: string, videos: Array<{title: string, id: (string|undefined)}>}>} courseData
  *   Structure produced by filterChapters.
  */
 function printCourseInfo(courseData) {
   courseData.forEach(function (chapter) {
     console.log(chapter.chapterTitle + '\n');

     chapter.videos.forEach(function (video) {
       console.log(' [' + video.id + ']' + video.title);
     });
   });
 }

运行:

nodejs  .http模块, cheerio模块 实现 小爬虫.

----------------------------------------------------------------------

参考链接: