HTMLParser解析HTML标签的实例

时间:2022-11-10 18:13:30
/**
 * Example of filtering HTML tags with the HTMLParser library.
 *
 * <p>Loads a page, narrows the node list step by step with a chain of
 * filters, prints the first matching link and reports the elapsed time.
 */
public class HtmlParserTest {

    /**
     * Runs the example: builds a parser for the target URL, applies the
     * filter chain and prints the text and address of the first link found.
     *
     * @param args command-line arguments (not used)
     * @throws ParserException propagated from the HTMLParser calls made here
     */
    public static void main(String[] args) throws ParserException {
        long first = System.currentTimeMillis();

        // Create the parser for the target page.
        String path_url = "http://mobile.zol.com.cn/";
        Parser myParser = new Parser(path_url);
        // Set the encoding the parser uses for the page.
        myParser.setEncoding("GBK");

        // Tag and attribute names used by the filters below.
        String div = "div";
        String className = "class";
        String classValue = "category_nav";
        String href = "href";

        // Alternatives to the filter built below:
        //   - match by tag name only:
        //       NodeFilter filter = new TagNameFilter(div);
        //   - keep several attribute filters in an array:
        //       NodeFilter[] nodeFilters = new NodeFilter[1];
        //       nodeFilters[0] = new AndFilter(new TagNameFilter(div),
        //               new HasAttributeFilter(className, classValue));
        NodeFilter nodeFilter = new AndFilter(
                new TagNameFilter(div),
                new HasAttributeFilter(className, classValue));

        // Run the query against the page.
        NodeList nodeList = myParser.extractAllNodesThatMatch(nodeFilter);

        // Filters can be applied repeatedly to narrow the result. When
        // filtering an existing NodeList, the second argument is passed as true.
        nodeList = nodeList.extractAllNodesThatMatch(new TagNameFilter("dl"), true);
        nodeList = nodeList.extractAllNodesThatMatch(new TagNameFilter("dt"), true);
        nodeList = nodeList.extractAllNodesThatMatch(
                new AndFilter(new TagNameFilter("a"), new HasAttributeFilter(href)), true);

        // Convert the result to an array of nodes.
        Node[] node = nodeList.toNodeArray();
        if (node.length == 0) {
            // Nothing matched (e.g. the page layout changed): report it
            // instead of failing with ArrayIndexOutOfBoundsException on node[0].
            System.out.println("未找到匹配的链接");
        } else {
            LinkTag firstLink = (LinkTag) node[0];
            System.out.println("链接名称:" + firstLink.getLinkText());
            System.out.println("链接地址:" + firstLink.getLink());
        }

        // Report the elapsed time in seconds.
        long now = System.currentTimeMillis();
        double time = (double) (now - first) / 1000;
        System.out.println("消耗时间:" + time);
    }

}