基于webmagic的java网页爬虫,抓取网页指定节点,然后使用dom4j分析xml数据

时间:2023-01-14 08:12:30

1、WebMagic 是一个非常好用的 Java 网页爬虫框架,功能丰富而强大:既可以使用类似 jQuery 的 CSS 选择器选取节点,也可以通过 XPath 抓取指定节点;抓取到数据后,还可以对数据做进一步分析。

更详细的请看官方网站,传送门:http://git.oschina.net/flashsword20/webmagic

2、下面给出一个具体的实例,引入 webmagic、dom4j 和 commons-lang 这几个依赖后即可直接运行。

import java.io.ByteArrayInputStream;
import java.util.List;

import org.apache.commons.lang.StringUtils;
import org.dom4j.Document;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;

//import com.ee.weixin.spider.model.Inventory;

/**
* 网页抓取器。
* @author yinlei
* 2014-3-5 下午4:27:51
*/
public class SteelHomePageProcessor implements PageProcessor {
private Site site = Site.me().setDomain("www.steelhome.cn").setCharset("gb2312");

@Override
public void process(Page page) {
List<String> links = page.getHtml().links().regex("http://www\\.steelhome\\.cn/biz/biz_search\\.php\\?view\\=search_xh\\&page\\=\\d+").all();
page.addTargetRequests(links);
page.putField("content", page.getHtml().xpath("//tbody").all());
}

@Override
public Site getSite() {
return site;
}

@SuppressWarnings("unchecked")
public static void main(String[] args) {
//String str = "<a href=\"\" tjminghao')\"=\"\"><img class=\"img_164453\"";
////String pattern = "w+'\\)\"=\"\"";
//String pattern = "[A-Za-z0-9]+'\\)\"=\"\"";
//System.out.println(pattern);
//boolean result = str.matches(pattern);
//System.out.println(result);
//
//str = str.replaceAll(pattern, "");
//System.out.println(str);
//
//Pattern regex = Pattern.compile("[A-Za-z0-9]+'\\)\"=\"\"");
//
//Matcher matcher = regex.matcher(str);
//
//while (matcher.find()) {
//System.out.println(matcher.group());
//}

String html = "http://www.steelhome.cn/biz/biz_search.php?view=search_xh&page=1";
//boolean result = html.matches("http://www\\.steelhome\\.cn/biz/biz_search\\.php\\?view\\=search_xh\\&page\\=\\d+");
//System.out.println(result);

Spider spider = Spider.create(new SteelHomePageProcessor());

FilePipeline filePipeline = new FilePipeline("c:\\steelhome");
spider.addPipeline(filePipeline);
ResultItems resultItems = spider.get(html);
List<String> bodyList = resultItems.get("content");

System.out.println(bodyList.size());

String tbody = bodyList.get(12);
System.out.println(tbody);


tbody = tbody.replace(" ", " ").replaceAll("[A-Za-z0-9]+'\\)\"=\"\"", "");//这个是因为,页面中有一些错误,通过正则表达式将其删去

try {
SAXReader saxReader = new SAXReader();
ByteArrayInputStream inputStream = new ByteArrayInputStream(tbody.getBytes());
Document document = saxReader.read(inputStream);
Element rootElement = document.getRootElement();
List<Element> trList = rootElement.elements("tr");

//List<Inventory> inventoryList = new ArrayList<Inventory>();
int omit = 0;
for (Element trElement : trList) {
if (omit++ == 0) {
continue;
}
//Inventory inventory = new Inventory();
//inventoryList.add(inventory);

List<Element> tdList = trElement.elements("td");

if (tdList == null) {
continue;
}

int j = 1;
for (Element tdElement : tdList) {
if (j == 1) {//checkbox 省略
j++;
continue;
} else if (j == 2) {//公司 联系人 电话
String value = tdElement.elementTextTrim("a");//公司简称
System.out.println(value);
Element tbodyElement = (Element)tdElement.element("div").element("table").element("tbody");
List<Element> elements = tbodyElement.elements("tr");

if (elements.size() == 2) {
Element company = elements.get(0).element("td").element("strong");//公司
System.out.println(company.getTextTrim());

Element contactElement = elements.get(1).element("td");
String contactPerson = contactElement.getTextTrim();
System.out.println(contactPerson);
String[] contacts = StringUtils.split(contactPerson, ":");
String contact = contacts[1].replace("电 话", "").trim();//联系人
//inventory.setContact(contact);
String telephone = contacts[2].replace("传 真", "").trim();//电话
//inventory.setTelephone(telephone);
}
} else if (j == 3) {//空行省略
j++;
continue;
} else if (j == 4) {//品名
String value = tdElement.elementTextTrim("a");
System.out.println(value);
} else if (j == 5) {//材质
String value = tdElement.getTextTrim();
System.out.println(value);
} else if (j == 6) {//规格
String value = tdElement.getTextTrim();
System.out.println(value);
} else if (j == 7) {//价格
String value = tdElement.getTextTrim();
System.out.println(value);
} else if (j == 8) {//数量
System.out.println(tdElement.getTextTrim());
} else if (j == 9) {//钢厂
System.out.println(tdElement.getTextTrim());
} else if (j == 10) {//地址/交货点
System.out.println(tdElement.getTextTrim());
}
j++;
}

// if (tdList.size() == 9) {
//
// } else if (tdList.size() == 1) {
// List<Element> elements = tdList.get(0).element("div").elements("span");
// String contact = elements.get(1).getTextTrim().substring(4).trim();
// inventory.setContact(contact);
// System.out.println(contact);
//
// String tele = elements.get(2).getTextTrim().substring(5).trim();
// String[] phones = StringUtils.split(tele, " ");
// if (phones.length == 2) {
// inventory.setMobilephone(phones[0]);
// inventory.setTelephone(phones[1]);
// } else {
// inventory.setMobilephone(tele);
// }
//
// String address = elements.get(3).getTextTrim().substring(5).trim();
// inventory.setAddress(address);
// }
}
} catch (Exception e) {
e.printStackTrace();
}
}

}

上面只是一个 demo,代码写得比较乱,但都是可以运行的。