HTMLParser解析HTML标签的实例

/**
 * Example of extracting a link from a web page with the HTMLParser library.
 *
 * <p>The program fetches a fixed URL, narrows the document down through a
 * chain of node filters, prints the text and address of the first matching
 * link, and finally prints the elapsed wall-clock time in seconds.
 */
public class HtmlParserTest {

    /**
     * Runs the example.
     *
     * @param args command-line arguments (unused)
     * @throws ParserException propagated from the HTMLParser calls made while
     *         creating the parser, setting the encoding, or extracting nodes
     */
    public static void main(String[] args) throws ParserException {
        long first = System.currentTimeMillis();

        // Create the parser for the target page.
        String pathUrl = "http://mobile.zol.com.cn/";
        Parser myParser = new Parser(pathUrl);
        // Set the character encoding used to decode the page.
        myParser.setEncoding("GBK");

        // Tag and attribute names used by the filters below.
        String div = "div";
        String className = "class";
        String classValue = "category_nav";
        String href = "href";

        // Match <div> elements whose class attribute equals "category_nav".
        NodeFilter nodeFilter =
                new AndFilter(new TagNameFilter(div), new HasAttributeFilter(className, classValue));

        // Run the filter against the whole document.
        NodeList nodeList = myParser.extractAllNodesThatMatch(nodeFilter);

        // Filters can be applied repeatedly to a NodeList. The second argument
        // is passed as true so the search descends into child nodes
        // (the "recursive" flag in the HTMLParser API).
        nodeList = nodeList.extractAllNodesThatMatch(new TagNameFilter("dl"), true);
        nodeList = nodeList.extractAllNodesThatMatch(new TagNameFilter("dt"), true);
        nodeList = nodeList.extractAllNodesThatMatch(
                new AndFilter(new TagNameFilter("a"), new HasAttributeFilter(href)), true);

        // Convert the result to an array and report the first link, if any.
        Node[] node = nodeList.toNodeArray();
        if (node.length == 0) {
            // Guard against an empty result (e.g. the page layout changed);
            // indexing node[0] here would throw ArrayIndexOutOfBoundsException.
            System.out.println("未找到匹配的链接");
        } else {
            LinkTag firstLink = (LinkTag) node[0];
            System.out.println("链接名称：" + firstLink.getLinkText());
            System.out.println("链接地址：" + firstLink.getLink());
        }

        // Report the elapsed time in seconds.
        long now = System.currentTimeMillis();
        double time = (double) (now - first) / 1000;
        System.out.println("消耗时间:" + time);
    }

}
秒客网

HTMLParser解析HTML标签的实例

相关文章