最近lz在写抓取工商公示系统(http://www.gsxt.gov.cn/index.html)的爬虫,其中的难点就在于怎么过极验验证码,搞得我不要不要的!如下:
简直是各种坑,被搞得死去活来,最后还是解决了。现在回到主题!
我们不是要抓工商公示系统的数据吗?所以我们先建两个实体:BaseInfo(基本信息)和LegInfo(股东信息)
/// <summary>
/// Basic registration information of one company as scraped from the
/// company detail page. All scraped values are kept as raw strings.
/// </summary>
public partial class BaseInfo
{
    public BaseInfo()
    {
    }

    /// <summary>
    /// Creates an empty instance. The <paramref name="html"/> argument is
    /// accepted but not used by this constructor.
    /// </summary>
    /// <param name="html">Ignored.</param>
    public BaseInfo(string html)
    {
    }

    // ---- Model ----

    /// <summary>Record identifier (presumably the database key; confirm against the mapping).</summary>
    public int Id { get; set; }

    /// <summary>
    /// Date of establishment.
    /// NOTE(review): the name suggests "approval date" while <see cref="EstDate"/> is
    /// documented as the approval date; the two descriptions may be swapped. Confirm.
    /// </summary>
    public string ApprDate { get; set; }

    /// <summary>Full company name.</summary>
    public string EntName { get; set; }

    /// <summary>Company type.</summary>
    public string EntType { get; set; }

    /// <summary>Registered address (domicile).</summary>
    public string Dom { get; set; }

    /// <summary>
    /// Approval date.
    /// NOTE(review): see the remark on <see cref="ApprDate"/>; confirm which is which.
    /// </summary>
    public string EstDate { get; set; }

    /// <summary>Legal representative.</summary>
    public string Lerep { get; set; }

    /// <summary>Start of the business term.</summary>
    public string OpFrom { get; set; }

    /// <summary>End of the business term.</summary>
    public string OpTo { get; set; }

    /// <summary>Business scope.</summary>
    public string OpScope { get; set; }

    /// <summary>Registration number.</summary>
    public string RegNo { get; set; }

    /// <summary>Registration authority.</summary>
    public string RegOrg { get; set; }

    /// <summary>Registration status.</summary>
    public string RegState { get; set; }

    /// <summary>Registered capital.</summary>
    public string RegCap { get; set; }

    /// <summary>Industry sector.</summary>
    public string IndcodeNameLv2 { get; set; }

    /// <summary>Province.</summary>
    public string Province { get; set; }

    /// <summary>City.</summary>
    public string City { get; set; }

    /// <summary>Website address.</summary>
    public string Weburl { get; set; }

    /// <summary>Rating.</summary>
    public string Rating { get; set; }

    /// <summary>Identifier of the related <see cref="CompanyInfo"/> (presumably a foreign key; confirm).</summary>
    public int CompanyInfoId { get; set; }

    // ---- Navigation properties ----

    /// <summary>Navigation property: the company this record belongs to.</summary>
    public virtual CompanyInfo CompanyInfo { get; set; }
}

/// <summary>
/// One shareholder (investor) entry of a company.
/// </summary>
public partial class LegInfo : SpiderModel
{
    public LegInfo()
    {
    }

    // ---- Model ----

    /// <summary>Record identifier (presumably the database key; confirm against the mapping).</summary>
    public int Id { get; set; }

    /// <summary>Value taken from the <c>bLicNo</c> field of the shareholder JSON.</summary>
    public string BlicNo { get; set; }

    /// <summary>Value taken from the <c>blicType_CN</c> field of the shareholder JSON.</summary>
    public string BlicType { get; set; }

    /// <summary>Value taken from the <c>invId</c> field of the shareholder JSON.</summary>
    public string ItemId { get; set; }

    /// <summary>Value taken from the <c>inv</c> field of the shareholder JSON.</summary>
    public string Inv { get; set; }

    /// <summary>Value taken from the <c>invType_CN</c> field of the shareholder JSON.</summary>
    public string InvType { get; set; }

    /// <summary>Identifier of the related <see cref="CompanyInfo"/> (presumably a foreign key; confirm).</summary>
    public int CompanyInfoId { get; set; }

    // NOTE(review): the three properties below are not populated by any code
    // shown alongside this class; their meaning needs to be confirmed.
    public string CreateTimeStr { get; set; }

    public string MoneyRange { get; set; }

    public string Renjiao { get; set; }

    // ---- Navigation properties ----

    /// <summary>Navigation property: the company.</summary>
    public virtual CompanyInfo CompanyInfo { get; set; }
}
先破解验证码,获取需要查询的公司的URL,然后抓取公司详情页的HTML(过程略);关键代码有两个方法:GetGsxtInfo(获取基本信息)和GetLegInfoes(获取股东信息)
如下:
/// <summary>
/// Fetches a company detail page and extracts the basic registration data from it.
/// </summary>
/// <param name="url">Address of the company detail page to request.</param>
/// <param name="html">Receives the page content returned by <c>GetHtml</c>.</param>
/// <returns>
/// A populated <see cref="BaseInfo"/>, or <c>null</c> when the text extracted for the
/// element with class <c>nameBoxColor</c> is empty.
/// </returns>
public static BaseInfo GetGsxtInfo(string url, out string html)
{
    HttpItem item = new HttpItem()
    {
        URL = url,// URL (required)
        Method = "get",
        Referer = "http://www.gsxt.gov.cn/corp-query-homepage.html",
        // NOTE(review): the Timeout value is missing in the source text; restore it before compiling.
        Timeout = 
    };
    html = GetHtml(item);
    string companyName = GetXpathNode(html, "//h1[@class=\"fullName\"]");
    string companyNo = GetXpathNode(html, "//*[@class=\"nameBoxColor\"]");
    // An empty registration number is treated as "page did not contain a company".
    if (companyNo != "")
    {
        //CompanyInfo companyInfo = new CompanyInfo();
        //companyInfo.CompanyName = companyName;
        //companyInfo.CompanyNo = companyNo;
        //companyInfo.State = 1;
        //companyInfo.AddTime = DateTime.Now;
        //companyInfo.NextTime = DateTime.Now;
        //companyInfo.BaseInfos = new List<BaseInfo>();
        var baseInfo = new BaseInfo();
        baseInfo.EntName = companyName;
        baseInfo.RegNo = companyNo;
        // The remaining fields are addressed by position (span[n] / dl[n]), so they
        // depend on the exact layout of the detail page.
        baseInfo.ApprDate = GetXpathNode(html, "//*[@class=\"companyDetail clearfix\"]/span[4]/span[1]");
        baseInfo.RegState = GetXpathNode(html, "//*[@class=\"companyStatus\"]");
        baseInfo.EntType = GetXpathNode(html, "//div[@class=\"overview\"]/dl[3]/dd[1]");
        baseInfo.Lerep = GetXpathNode(html, "//div[@class=\"overview\"]/dl[4]/dd[1]");
        baseInfo.RegCap = GetXpathNode(html, "//div[@class=\"overview\"]/dl[5]/dd[1]");
        baseInfo.OpFrom = GetXpathNode(html, "//div[@class=\"overview\"]/dl[7]/dd[1]");
        baseInfo.OpTo = GetXpathNode(html, "//div[@class=\"overview\"]/dl[8]/dd[1]");
        baseInfo.RegOrg = GetXpathNode(html, "//*[@class=\"companyDetail clearfix\"]/span[3]/span[1]");
        baseInfo.EstDate = GetXpathNode(html, "//div[@class=\"overview\"]/dl[10]/dd[1]");
        baseInfo.Dom = GetXpathNode(html, "//div[@class=\"overview\"]/dl[12]/dd[1]");
        baseInfo.OpScope = GetXpathNode(html, "//div[@class=\"overview\"]/dl[13]/dd");
        return baseInfo;
    }
    else
    {
        return null;
    }
}

/// <summary>
/// Shareholder information: requests one page of the shareholder list and appends
/// every entry to <c>reflegInfos</c>, then calls itself for the next page while the
/// reported <c>totalPage</c> is greater than <c>draw</c>.
/// </summary>
/// <param name="html">Company detail page content; the shareholder URL is read from its <c>var shareholderUrl = "..."</c> assignment.</param>
/// <param name="reflegInfos">List that receives the parsed <see cref="LegInfo"/> items.</param>
/// <param name="draw">Value sent as the <c>draw</c> form field and compared against <c>totalPage</c>.</param>
/// <param name="start">Value sent as the <c>start</c> form field.</param>
// NOTE(review): the method signature is missing in the source text. The recursive
// call at the end of the body has the shape GetLegInfoes(html, ref reflegInfos, draw, start);
// the return type, modifiers and any default values must be restored by hand.
, ) {
    string url = string.Format("http://www.gsxt.gov.cn{0}", GetFirstInnerText(html, "var shareholderUrl = \"", "\""));
    //HttpHelper http = new HttpHelper();
    HttpItem item = new HttpItem()
    {
        URL = url,// URL (required)
        Method = "post",
        Referer = "http://www.gsxt.gov.cn/corp-query-search-1.html",
        Postdata = string.Format("draw={0}&start={1}&length=5", draw, start),
        ContentType = "application/x-www-form-urlencoded",
        // NOTE(review): the Timeout value is missing in the source text; restore it before compiling.
        Timeout = 
    };
    string rhtml = GetHtml(item);
    // An empty response ends the paging silently.
    if (rhtml.Equals(""))
    {
        return;
    }
    var legInfoesListPage = JObject.Parse(rhtml);
    var legInfoesListList = legInfoesListPage["data"].ToList();
    // Delete the data already stored in the database
    //if (draw == 1)
    //{
    //    if (legInfoesListList.Count > 0)
    //    {
    //        foreach (var leginfo in companyInfo.LegInfos.ToList())
    //        {
    //            companyInfo.LegInfos.Remove(leginfo);
    //        }
    //    }
    //}
    // Add the entries of this page
    foreach (var legInfoes in legInfoesListList)
    {
        reflegInfos.Add(new LegInfo
        {
            BlicNo = GetText(legInfoes["bLicNo"].ToString().Replace("\"", "")),
            BlicType = legInfoes["blicType_CN"].ToString().Replace("\"", ""),
            ItemId = legInfoes["invId"].ToString().Replace("\"", ""),
            Inv = legInfoes["inv"].ToString().Replace("\"", ""),
            InvType = GetText(legInfoes["invType_CN"].ToString().Replace("\"", ""))
        });
    }
    // Next page
    if (int.Parse(legInfoesListPage["totalPage"].ToString()) > draw) // fetch the data of the next page
    {
        draw++;
        // NOTE(review): the increment operand is missing in the source text. The request
        // above sends length=5, so the page size is the likely value; confirm.
        start += ;
        Console.WriteLine(string.Format("查询股东信息第{0}页", draw));
        GetLegInfoes(html, ref reflegInfos, draw, start);
    }
}
到这里只是为了完成任务而写的代码;如果想让代码更加优美,就需要用IOC的模式来重构它。
先创建父类:
/// <summary>
/// Base class for entities that fill themselves from scraped content, either a
/// JSON token or an HTML page. It also hosts the shared text-extraction helpers.
/// </summary>
public class SpiderModel
{
    // Regex instances are immutable and safe to share, so they are built once
    // instead of on every call. Patterns and options match the inline originals.
    private static readonly Regex TabPattern =
        new Regex("\\t", RegexOptions.Multiline | RegexOptions.IgnoreCase);

    private static readonly Regex ParagraphOrBreakTagPattern =
        new Regex(@"<(p|br)[^<]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase);

    private static readonly Regex ParagraphMarkerPattern =
        new Regex("\\[p]", RegexOptions.Multiline | RegexOptions.IgnoreCase);

    private static readonly Regex BreakMarkerPattern =
        new Regex("\\[br]", RegexOptions.Multiline | RegexOptions.IgnoreCase);

    public SpiderModel()
    {
    }

    /// <summary>
    /// Creates the instance and immediately populates it from <paramref name="token"/>.
    /// </summary>
    /// <param name="token">JSON token handed to <see cref="ToObje(JToken)"/>.</param>
    /// <remarks>
    /// This calls a virtual method from the constructor, so an override runs before
    /// the derived class's own constructor body has executed.
    /// </remarks>
    public SpiderModel(JToken token)
    {
        ToObje(token);
    }

    /// <summary>
    /// Populates this instance from a JSON token. The base implementation does nothing.
    /// </summary>
    /// <param name="token">JSON token describing the entity.</param>
    public virtual void ToObje(JToken token)
    {
    }

    /// <summary>
    /// Populates an entity from an HTML page. The base implementation ignores
    /// <paramref name="html"/> and returns a new, empty <see cref="SpiderModel"/>.
    /// </summary>
    /// <param name="html">Page content.</param>
    /// <returns>The populated entity.</returns>
    public virtual SpiderModel ToObje(string html)
    {
        return new SpiderModel();
    }

    /// <summary>
    /// Gets the cleaned text of the first node matching an XPath expression.
    /// </summary>
    /// <param name="html">HTML to search.</param>
    /// <param name="xpath">XPath expression selecting the node.</param>
    /// <returns>
    /// The node's inner HTML with tab characters removed, passed through
    /// <c>TextRemover.RemoveHTML</c> and <c>TextRemover.RemoveWhiteSpace</c> and trimmed;
    /// an empty string when nothing matches, when an argument is null or empty,
    /// or when parsing fails.
    /// </returns>
    public static string GetXpathNode(string html, string xpath)
    {
        // Null/empty input previously reached the same empty result through a
        // thrown-and-swallowed exception or a failed lookup; answer it directly.
        if (string.IsNullOrEmpty(html) || string.IsNullOrEmpty(xpath))
        {
            return string.Empty;
        }

        string result = string.Empty;
        try
        {
            HtmlDocument htmlDoc = new HtmlDocument();
            htmlDoc.LoadHtml(html);
            HtmlNode node = htmlDoc.DocumentNode.SelectSingleNode(xpath);
            if (node != null)
            {
                result = node.InnerHtml;
                result = TabPattern.Replace(result, string.Empty);
                result = TextRemover.RemoveHTML(result);              // strip HTML tags
                result = TextRemover.RemoveWhiteSpace(result).Trim(); // strip whitespace
            }
        }
        catch (Exception)
        {
            // Deliberate best-effort: a malformed page or an invalid XPath yields
            // whatever was extracted so far (normally an empty string) instead of
            // aborting the crawl.
        }

        return result;
    }

    /// <summary>
    /// Normalises an HTML fragment to text and runs it through
    /// <c>RegexHelper.RegexFilter</c> with the pattern <c>([a-zA-Z0-9]+)</c>.
    /// </summary>
    /// <param name="result">HTML fragment to clean.</param>
    /// <returns>
    /// The value returned by <c>RegexHelper.RegexFilter</c> (presumably the
    /// alphanumeric part of the text; confirm against that helper).
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="result"/> is null.</exception>
    public static string GetText(string result)
    {
        // Turn <p ...> / <br ...> tags into markers, then markers into line breaks,
        // so the breaks survive the tag stripping below.
        result = ParagraphOrBreakTagPattern.Replace(result, "[$1]");
        result = ParagraphMarkerPattern.Replace(result, "\r\n\r\n");
        result = BreakMarkerPattern.Replace(result, "\r\n");
        result = TabPattern.Replace(result, " ");
        result = TextRemover.RemoveHTML(result); // strip HTML tags
        result = result.Replace("+", "");
        result = result.Trim();
        result = RegexHelper.RegexFilter(result.Replace("\"", ""), "([a-zA-Z0-9]+)", false, RegexOptions.None);
        return result;
    }
}
然后实体BaseInfo(基本信息)和LegInfo(股东信息)继承自SpiderModel
然后给BaseInfo(基本信息)和LegInfo(股东信息)重写函数
BaseInfo:
/// <summary>
/// Fills this instance from the HTML of a company detail page.
/// </summary>
/// <param name="html">Content of the company detail page.</param>
/// <returns>
/// This instance. It is returned unchanged when the text extracted for the
/// element with class <c>nameBoxColor</c> is empty.
/// </returns>
public override SpiderModel ToObje(string html)
{
    string fullName = GetXpathNode(html, "//h1[@class=\"fullName\"]");
    string registrationNo = GetXpathNode(html, "//*[@class=\"nameBoxColor\"]");

    // No registration number: nothing on this page to map.
    if (registrationNo == "")
    {
        return this;
    }

    // Header area of the page.
    EntName = fullName;
    RegNo = registrationNo;
    ApprDate = GetXpathNode(html, "//*[@class=\"companyDetail clearfix\"]/span[4]/span[1]");
    RegState = GetXpathNode(html, "//*[@class=\"companyStatus\"]");

    // "overview" definition lists, addressed by position.
    EntType = GetXpathNode(html, "//div[@class=\"overview\"]/dl[3]/dd[1]");
    Lerep = GetXpathNode(html, "//div[@class=\"overview\"]/dl[4]/dd[1]");
    RegCap = GetXpathNode(html, "//div[@class=\"overview\"]/dl[5]/dd[1]");
    OpFrom = GetXpathNode(html, "//div[@class=\"overview\"]/dl[7]/dd[1]");
    OpTo = GetXpathNode(html, "//div[@class=\"overview\"]/dl[8]/dd[1]");
    RegOrg = GetXpathNode(html, "//*[@class=\"companyDetail clearfix\"]/span[3]/span[1]");
    EstDate = GetXpathNode(html, "//div[@class=\"overview\"]/dl[10]/dd[1]");
    Dom = GetXpathNode(html, "//div[@class=\"overview\"]/dl[12]/dd[1]");
    OpScope = GetXpathNode(html, "//div[@class=\"overview\"]/dl[13]/dd");

    return this;
}
LegInfo:
/// <summary>
/// Fills this shareholder entry from one element of the shareholder JSON list.
/// </summary>
/// <param name="token">
/// JSON object that may carry the fields <c>bLicNo</c>, <c>blicType_CN</c>,
/// <c>invId</c>, <c>inv</c> and <c>invType_CN</c>. A field that is absent is
/// treated as an empty string, so one incomplete record does not abort a whole page.
/// </param>
/// <exception cref="System.ArgumentNullException"><paramref name="token"/> is null.</exception>
public override void ToObje(JToken token)
{
    if (token == null)
    {
        throw new System.ArgumentNullException("token");
    }

    // bLicNo and invType_CN are additionally passed through GetText.
    BlicNo = GetText(ReadField(token, "bLicNo"));
    BlicType = ReadField(token, "blicType_CN");
    ItemId = ReadField(token, "invId");
    Inv = ReadField(token, "inv");
    InvType = GetText(ReadField(token, "invType_CN"));
}

/// <summary>
/// Reads one field of <paramref name="token"/> as text with every double quote
/// removed; returns an empty string when the field is absent.
/// </summary>
private static string ReadField(JToken token, string key)
{
    JToken field = token[key];
    return field == null ? string.Empty : field.ToString().Replace("\"", "");
}
最后要一个IOC管理类
/// <summary>
/// Pairs one HTTP request (<see cref="Item"/>) with the model that should be
/// populated from its response (<see cref="SpiderModel"/>), and keeps the
/// downloaded content in <see cref="Html"/>.
/// </summary>
public class SpiderManage
{
    /// <summary>Creates a manager for a request; <see cref="SpiderModel"/> is left unset.</summary>
    /// <param name="item">Request to execute.</param>
    public SpiderManage(HttpItem item)
    {
        this.Item = item;
    }

    /// <summary>Creates a manager for a request together with the model to populate.</summary>
    /// <param name="item">Request to execute.</param>
    /// <param name="spiderModel">Model that will parse the response.</param>
    public SpiderManage(HttpItem item,SpiderModel spiderModel)
    {
        this.Item = item;
        this.SpiderModel = spiderModel;
    }

    /// <summary>Content of the last successful response; set to an empty string when <see cref="GetHtml"/> gives up.</summary>
    public string Html { get; set; }

    /// <summary>Request description passed to <c>HttpHelper.GetHtml</c>.</summary>
    public HttpItem Item { get; set; }

    /// <summary>Not assigned or read by any member of this class.</summary>
    public List<SpiderModel> SpiderModelList{ get; set; }

    /// <summary>Model whose <c>ToObje(string)</c> is invoked by <see cref="GetOjb"/>.</summary>
    public SpiderModel SpiderModel { get; set; }

    /// <summary>
    /// Executes <see cref="Item"/> repeatedly until a response with status 200 arrives
    /// whose body is not the "invalid link" redirect script, and stores it in <see cref="Html"/>.
    /// </summary>
    /// <returns>The response content, or an empty string when every attempt failed.</returns>
    public virtual string GetHtml()
    {
        // NOTE(review): the loop header is missing in the source text. Only "; ) {" and the
        // "i--" below survive, which suggests a counter named i bounding the number of
        // attempts; the initial value and the condition must be restored by hand.
        ; ) {
            i--;
            HttpHelper http = new HttpHelper();
            HttpResult result;
            // NOTE(review): the lock object is created anew on every pass, so no two
            // callers ever share it and the lock provides no mutual exclusion.
            object oj = new object();
            lock (oj)
            {
                // NOTE(review): the Sleep argument is missing in the source text.
                Thread.Sleep();
                result = http.GetHtml(Item);
            }
            if (result.StatusCode == System.Net.HttpStatusCode.OK)
            {
                string rhtml = result.Html;
                // The site answers an expired link with this redirect script; treat it as a failure and retry.
                if (!rhtml.Equals("<script>window.location.href='/index/invalidLink'</script>"))
                {
                    Html = result.Html;
                    return Html;
                }
            }
        }
        Html = "";
        return Html;
    }

    /// <summary>
    /// Lets <see cref="SpiderModel"/> parse the downloaded <see cref="Html"/>.
    /// </summary>
    /// <returns>The value returned by the model's <c>ToObje(string)</c>.</returns>
    // NOTE(review): SpiderModel is not set by the single-argument constructor, so it
    // can still be null here unless the caller assigned it.
    public SpiderModel GetOjb()
    {
        return SpiderModel.ToObje(Html);
    }

    /// <summary>Placeholder; the body contains only comments.</summary>
    public void toList()
    {
        // SpiderModelList
        //SpiderModel
    }
}
最后面我就只要 List<SpiderManage> sManageList用于保存对象就可以了
重新改写前面的GetGsxtInfo和GetLegInfoes函数。
/// <summary>
/// Collects every <see cref="SpiderManage"/> created by the method below.
/// </summary>
public static List<SpiderManage> sManageList = new List<SpiderManage>();

//////////////////////////////////////////////
// Requests the first page of the shareholder list and adds its SpiderManage to
// sManageList; for every further page up to the reported totalPage it only adds a
// prepared, not yet executed SpiderManage.
// NOTE(review): the method signature is missing in the source text. The body reads
// html, draw and start, and the commented-out call at the end has the shape
// GetLegInfoes(html, ref reflegInfos, draw, start); restore the signature by hand.
, ) {
    string url = string.Format("http://www.gsxt.gov.cn{0}", GetFirstInnerText(html, "var shareholderUrl = \"", "\""));
    HttpItem item = new HttpItem()
    {
        URL = url,// URL (required)
        Method = "post",
        Referer = "http://www.gsxt.gov.cn/corp-query-search-1.html",
        Postdata = string.Format("draw={0}&start={1}&length=5", draw, start),
        ContentType = "application/x-www-form-urlencoded",
        // NOTE(review): the Timeout value is missing in the source text; restore it before compiling.
        Timeout = 
    };
    SpiderManage sManage = new SpiderManage(item);
    sManage.SpiderModel = new LegInfo();
    sManage.GetHtml();
    string rhtml = sManage.Html;
    // An empty response means the download failed; nothing is queued.
    if (rhtml.Equals(""))
    {
        return;
    }
    // The first response is parsed here only to read totalPage below.
    var legInfoesListPage = JObject.Parse(rhtml);
    sManageList.Add(sManage);
    //var legInfoesListList = legInfoesListPage["data"].ToList();
    // Delete the data already stored in the database
    //add
    //foreach (var legInfoes in legInfoesListList)
    // {
    // reflegInfos.Add(new LegInfo { BlicNo = GetText(legInfoes["bLicNo"].ToString().Replace("\"", "")), BlicType = legInfoes["blicType_CN"].ToString().Replace("\"", ""), ItemId = legInfoes["invId"].ToString().Replace("\"", ""), Inv = legInfoes["inv"].ToString().Replace("\"", ""), InvType = GetText(legInfoes["invType_CN"].ToString().Replace("\"", "")) });
    // }
    // Next page
    while (int.Parse(legInfoesListPage["totalPage"].ToString()) > draw) // queue the request for the next page
    {
        draw++;
        // NOTE(review): the increment operand is missing in the source text. The request
        // sends length=5, so the page size is the likely value; confirm.
        start += ;
        sManageList.Add(new SpiderManage(new HttpItem()
        {
            URL = url,// URL (required)
            Method = "post",
            Referer = "http://www.gsxt.gov.cn/corp-query-search-1.html",
            Postdata = string.Format("draw={0}&start={1}&length=5", draw, start),
            ContentType = "application/x-www-form-urlencoded",
            // NOTE(review): the Timeout value is missing in the source text; restore it before compiling.
            Timeout = 
        }, new LegInfo()));
        //Console.WriteLine(string.Format("查询股东信息第{0}页", draw));
        //GetLegInfoes(html, ref reflegInfos, draw, start);
    }
}
后面怎么用就不讨论了,只要是把sManageList拿过去调度分配抓取就可以了