HtmlAgilityPack组件

时间:2023-03-09 15:05:01
HtmlAgilityPack组件

HtmlAgilityPack 组件用于解析 HTML 字符串,一个典型的应用场景是网页爬虫。

示例程序

using Common.Tools;
using Datebase.Entity;
using HtmlAgilityPack;
using Http.Extension;
using ServiceStack.Orm.Extension.Imples;
using ServiceStack.Orm.Extension.Interface;
using ServiceStack.OrmLite;
using System;
using System.Collections.Generic;
using System.Configuration;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks; namespace WebSpider
{
/// <summary>
/// Console crawler sample: downloads a singer ranking page, follows each
/// singer link found on it, and inserts one <c>T_Singer</c> row per singer page.
/// </summary>
class Program
{
    /// <summary>
    /// Serial number stored when the rank cannot be read from the page.
    /// </summary>
    private const int UnknownSerialNumber = -1;

    /// <summary>
    /// ORM client shared by all crawl tasks, created from the "mssql" connection
    /// string with the SQL Server dialect provider.
    /// </summary>
    // NOTE(review): Insert is called concurrently from several tasks — confirm
    // that this IOrmClient implementation is thread-safe.
    public static IOrmClient dbClient = new OrmClient(ConfigurationManager.ConnectionStrings["mssql"].ConnectionString, SqlServerDialect.Provider);

    /// <summary>
    /// Entry point: starts the crawl tasks, blocks until all of them finish,
    /// prints a completion message and waits for a key press.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        List<Task> tasks = FetchSinger();
        Task.WaitAll(tasks.ToArray());
        Console.WriteLine("歌手信息抓取完毕!");
        Console.ReadLine();
    }

    /// <summary>
    /// Downloads the ranking page and starts one task for the top-10 list plus
    /// one task per ranking table (ranks 11-100 according to the original
    /// author's notes).
    /// </summary>
    /// <returns>
    /// The started tasks; empty when the page could not be downloaded or
    /// contains none of the expected nodes.
    /// </returns>
    private static List<Task> FetchSinger()
    {
        List<Task> tasks = new List<Task>();
        HttpResult result = HttpCore.Send(new HttpItem()
        {
            URL = "http://mp3.sogou.com/static_new/topsinger_remen.html",
            Method = MethodType.GET
        });

        // Nothing to parse: LoadHtml would throw on a null string.
        if (result == null || string.IsNullOrEmpty(result.Html))
        {
            return tasks;
        }

        HtmlDocument document = new HtmlDocument();
        document.LoadHtml(result.Html);
        var rootNode = document.DocumentNode;

        // Singers ranked 1-10.
        var top10Nodes = rootNode.SelectNodes("//div[@id='right2']/ul[@class='singerlist2']/li/a");
        if (top10Nodes != null)
        {
            tasks.Add(Task.Factory.StartNew(
                state => CrawlTopTen((HtmlNodeCollection)state), top10Nodes));
        }

        // Singers ranked 11-100: one task per captured table.
        // SelectNodes returns null (not an empty collection) when nothing matches.
        var tbNodes = rootNode.SelectNodes("//table[@class='indextable']");
        if (tbNodes != null)
        {
            foreach (var tableNode in tbNodes)
            {
                tasks.Add(Task.Factory.StartNew(
                    state => CrawlRankingTable((HtmlNode)state), tableNode));
            }
        }

        return tasks;
    }

    /// <summary>
    /// Processes the anchors of the top-10 list: reads the rank from the
    /// <c>strong.singertop10</c> child and crawls the linked singer page.
    /// </summary>
    /// <param name="singerNodes">Anchor nodes of the top-10 list.</param>
    private static void CrawlTopTen(HtmlNodeCollection singerNodes)
    {
        foreach (var hrefNode in singerNodes)
        {
            // Node holding the rank text; anchors without it are skipped.
            var noNode = hrefNode.SelectSingleNode("./strong[@class='singertop10']");
            if (noNode == null)
            {
                continue;
            }

            // int.TryParse resets the out value to 0 on failure, so the
            // sentinel has to be applied explicitly.
            int sNo;
            if (!int.TryParse(noNode.InnerText.Replace("Top", "").Trim(), out sNo))
            {
                sNo = UnknownSerialNumber;
            }

            // Singer link.
            var link = hrefNode.GetAttributeValue("href", "");
            if (!string.IsNullOrEmpty(link))
            {
                SingerDetail(sNo, link);
            }
        }
    }

    /// <summary>
    /// Processes one ranking table: for every anchor under <c>tbody/tr/td</c>
    /// reads the rank from the cell two siblings before the anchor's parent and
    /// crawls the linked singer page.
    /// </summary>
    /// <param name="tbNode">The table node to scan.</param>
    private static void CrawlRankingTable(HtmlNode tbNode)
    {
        var hrefNodes = tbNode.SelectNodes("./tbody/tr/td/a");
        if (hrefNodes == null)
        {
            return;
        }

        foreach (var href in hrefNodes)
        {
            // Rank: walk two siblings back from the anchor's parent cell,
            // checking each step because any of them may be missing.
            int sNo = UnknownSerialNumber;
            var parentCell = href.ParentNode;
            var previous = parentCell != null ? parentCell.PreviousSibling : null;
            var rankCell = previous != null ? previous.PreviousSibling : null;
            if (rankCell != null
                && !int.TryParse(rankCell.InnerText.Trim().TrimEnd('.'), out sNo))
            {
                sNo = UnknownSerialNumber;
            }

            var link = href.GetAttributeValue("href", "");
            if (!string.IsNullOrEmpty(link))
            {
                SingerDetail(sNo, link);
            }
        }
    }

    /// <summary>
    /// Downloads a singer's detail page, extracts the profile fields and
    /// inserts them as a new <c>T_Singer</c> record.
    /// </summary>
    /// <param name="sNo">Serial number (rank) of the singer.</param>
    /// <param name="link">URL of the singer's detail page.</param>
    private static void SingerDetail(int sNo, string link)
    {
        var linkResult = HttpCore.Send(new HttpItem()
        {
            URL = link,
            Method = MethodType.GET
        });
        if (linkResult == null || string.IsNullOrEmpty(linkResult.Html))
        {
            return;
        }

        T_Singer user = new T_Singer();
        user.ID = Utility.GenerateId();
        user.SerialNumber = sNo;
        user.IsApprove = true;
        user.CreateBy = "admin";
        user.CreateDate = DateTime.Now;
        user.ModifyBy = "admin";
        user.ModifyDate = DateTime.Now;

        HtmlDocument linkDoc = new HtmlDocument();
        linkDoc.LoadHtml(linkResult.Html);

        // Real name / nickname (both taken from the same title node).
        var name = linkDoc.DocumentNode.SelectSingleNode("//div[@class='song_tit']");
        if (name != null)
        {
            user.RealName = user.NickName = name.InnerText.Trim().Replace("<br>", System.Environment.NewLine);
        }

        // All li elements that carry the personal details.
        var lis = linkDoc.DocumentNode.SelectNodes("//ul[@class='song_detail']/li");
        // Nationality.
        user.Nationality = Search(lis, "国籍");
        // Birthplace.
        user.Birthplace = Search(lis, "出生地");
        // Birthday: only assigned when a parsable date is present.
        DateTime bDay;
        if (TryParseBirthday(Search(lis, "出生日期"), out bDay))
        {
            user.Birthday = bDay;
        }
        // Constellation.
        user.Constellation = Search(lis, "星座");

        // Brief introduction: prefer the long description, fall back to the short one.
        var selfDescNode = linkDoc.GetElementbyId("desc_long");
        selfDescNode = selfDescNode ?? linkDoc.GetElementbyId("desc_short");
        if (selfDescNode != null)
        {
            user.BriefIntroduction = selfDescNode.InnerText.Replace("<br>", "").Trim();
        }

        dbClient.Insert(user);
    }

    /// <summary>
    /// Extracts the first "year年month月day日" date from <paramref name="text"/>
    /// and parses it.
    /// </summary>
    /// <param name="text">Raw birthday text taken from the page.</param>
    /// <param name="birthday">The parsed date when the method returns true.</param>
    /// <returns>True when a date was found and parsed; otherwise false.</returns>
    private static bool TryParseBirthday(string text, out DateTime birthday)
    {
        birthday = default(DateTime);
        if (string.IsNullOrEmpty(text))
        {
            return false;
        }

        // Regex.Match never returns null; Success is the real "found" signal.
        var match = Regex.Match(text, @"\d{0,4}年\d{1,2}月\d{1,2}日");
        if (!match.Success)
        {
            return false;
        }

        // "1980年5月3日" -> "1980-5-3".
        var parts = match.Value.Split(new string[] { "年", "月", "日" }, StringSplitOptions.RemoveEmptyEntries);
        var normalized = string.Join("-", parts);

        // Invariant culture so the result does not depend on the machine's locale.
        return DateTime.TryParse(
            normalized,
            System.Globalization.CultureInfo.InvariantCulture,
            System.Globalization.DateTimeStyles.None,
            out birthday);
    }

    /// <summary>
    /// Finds the first node whose first child's text starts with
    /// <paramref name="key"/> and that has a <c>span</c> child, and returns
    /// that span's trimmed text.
    /// </summary>
    /// <param name="nodes">Candidate nodes; may be null.</param>
    /// <param name="key">Label prefix to look for.</param>
    /// <returns>The span text, or an empty string when nothing matches.</returns>
    private static string Search(HtmlNodeCollection nodes, string key)
    {
        if (nodes == null)
        {
            return string.Empty;
        }

        foreach (var node in nodes)
        {
            // An element without children has a null FirstChild.
            var label = node.FirstChild;
            if (label == null || !label.InnerText.Trim().StartsWith(key, StringComparison.Ordinal))
            {
                continue;
            }

            var spanNode = node.SelectSingleNode("./span");
            if (spanNode != null)
            {
                return spanNode.InnerText.Trim().Replace("<br>", System.Environment.NewLine);
            }
        }

        return string.Empty;
    }
}
}