// Java版正文抽取：基于链接文字比

package cn.tdt.crawl.jdbc;

import java.util.regex.Matcher;

import java.util.regex.Pattern;

import org.jsoup.Jsoup;

import org.jsoup.nodes.Document;

import org.jsoup.nodes.Element;

import org.jsoup.select.Elements;

/**
 * Extracts the title and main text of an HTML page using a link/text-ratio
 * heuristic.
 *
 * <p>The page is first stripped of doctype, comments, scripts and styles. The
 * body is then walked recursively and every child element is dropped when it
 * has no text, contains no punctuation, or has more links per character of
 * text than {@link #LINK_TEXT_RATIO} allows. Whatever text is left in the body
 * is returned as the content.
 */
public class HtmlExtract {

    /** Maximum allowed number of links per character of text for a node to be kept. */
    private static final double LINK_TEXT_RATIO = 0.25;

    /** Punctuation marks (ASCII and full-width) counted by {@link #calcSign(Element)}. */
    private static final String SIGN_CHARS = ",;.\"'?。:，";

    /** Characters at which a page title is cut to drop a trailing site name. */
    private static final char[] TITLE_SEPARATORS = { '_', '|' };

    private static final Pattern DOCTYPE_PATTERN = Pattern.compile("(?is)<!DOCTYPE.*?>");

    private static final Pattern COMMENT_PATTERN = Pattern.compile("(?is)<!--.*?-->");

    private static final Pattern SCRIPT_PATTERN = Pattern.compile("(?is)<script.*?>.*?</script>");

    private static final Pattern STYLE_PATTERN = Pattern.compile("(?is)<style.*?>.*?</style>");

    /** Matches the title element; tolerates attributes and line breaks inside the title. */
    private static final Pattern TITLE_PATTERN = Pattern.compile(
            "<title\\b[^>]*>(.*?)</title>",
            Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /** Matches an author marker followed, on the same line, by a date. */
    private static final Pattern AUTHOR_PATTERN = Pattern.compile(
            "作者.+(\\d{4}[-|年]\\d{1,2}[-|月]\\d{1,2}[日]?)",
            Pattern.CASE_INSENSITIVE);

    /**
     * Removes the doctype declaration, HTML comments, script blocks and style
     * blocks from the given markup.
     *
     * @param str raw HTML; may be {@code null}
     * @return the filtered markup, or an empty string for {@code null}/empty input
     */
    public static String filterContent(String str) {
        // Value check instead of the former reference comparison (str == "").
        if (str == null || str.isEmpty()) {
            return "";
        }
        String filtered = DOCTYPE_PATTERN.matcher(str).replaceAll("");
        filtered = COMMENT_PATTERN.matcher(filtered).replaceAll("");
        filtered = SCRIPT_PATTERN.matcher(filtered).replaceAll("");
        filtered = STYLE_PATTERN.matcher(filtered).replaceAll("");
        return filtered;
    }

    /**
     * Counts the anchor elements carrying an {@code href} attribute that are
     * selected by {@code a[href]} on the node.
     *
     * @param node element to inspect
     * @return number of matching links
     */
    public static int calcLinks(Element node) {
        Elements links = node.select("a[href]");
        return links.size();
    }

    /**
     * Returns the length of the node's text.
     *
     * @param node element to inspect
     * @return the text length, or {@code 1 + LINK_TEXT_RATIO} when the text is
     *         empty so that callers can divide by the result safely
     */
    public static double calcWords(Element node) {
        int length = node.text().length();
        if (length == 0) {
            return 1 + LINK_TEXT_RATIO;
        }
        return length;
    }

    /**
     * Counts the punctuation marks in the node's text.
     *
     * <p>Characters are counted directly. The previous split-based version
     * treated "." as a regex wildcard, ignored trailing punctuation and
     * overwrote the running total on every iteration instead of adding to it.
     *
     * @param node element to inspect
     * @return total number of punctuation characters from {@link #SIGN_CHARS}
     */
    public static int calcSign(Element node) {
        String text = node.text();
        int total = 0;
        for (int idx = 0; idx < text.length(); idx++) {
            if (SIGN_CHARS.indexOf(text.charAt(idx)) >= 0) {
                total++;
            }
        }
        return total;
    }

    /**
     * Recursively prunes noise from the given node, in place.
     *
     * <p>A child element is removed when it has no text, contains no
     * punctuation, or its links-per-character ratio exceeds
     * {@link #LINK_TEXT_RATIO}; surviving children are pruned recursively.
     * Anchor elements are returned untouched.
     *
     * @param node root of the subtree to prune; may be {@code null}
     * @return the same node instance that was passed in
     */
    public static Element drawCon(Element node) {
        // Compare tag names by value; anchors are not descended into.
        if (node == null || "a".equalsIgnoreCase(node.tagName())) {
            return node;
        }
        // Iterates over the Elements list obtained up front while detaching
        // children from the parent, as the original loop did.
        for (Element child : node.children()) {
            if (!child.hasText()) {
                child.remove();
                continue;
            }
            double cellRatio = calcLinks(child) / calcWords(child);
            boolean noPunctuation = calcSign(child) < 1;
            if (noPunctuation || cellRatio > LINK_TEXT_RATIO) {
                child.remove();
            } else {
                drawCon(child);
            }
        }
        return node;
    }

    /**
     * Extracts the page title from the markup's title element and drops
     * anything from the first {@code _} or {@code |} separator onwards.
     *
     * @param str HTML markup
     * @return {@code null} for {@code null}/empty input, an empty string when
     *         no title element is found, otherwise the shortened title
     */
    private String drawTitle(String str) {
        if (str == null || str.length() < 1) {
            return null;
        }
        Matcher mc = TITLE_PATTERN.matcher(str);
        if (!mc.find()) {
            return "";
        }
        return cutAtFirstSeparator(mc.group(1).trim());
    }

    /**
     * Cuts the title at the earliest separator that is not its first
     * character. Using the earliest position avoids indexing a string that has
     * already been shortened by another separator.
     */
    private static String cutAtFirstSeparator(String title) {
        int cut = -1;
        for (char separator : TITLE_SEPARATORS) {
            int pos = title.indexOf(separator);
            if (pos > 0 && (cut < 0 || pos < cut)) {
                cut = pos;
            }
        }
        if (cut > 0) {
            return title.substring(0, cut).trim();
        }
        return title;
    }

    /**
     * Extracts the publication date and the text preceding it from an author
     * line. Currently not called by {@link #extract(String)}.
     *
     * @param str HTML markup
     * @return a two-element array (index 0: date, index 1: matched text before
     *         the date, including the author marker), or {@code null} when the
     *         input is {@code null}/empty or nothing matches
     */
    private String[] drawAuthor(String str) {
        if (str == null || str.length() < 1) {
            return null;
        }
        Matcher mc = AUTHOR_PATTERN.matcher(str);
        if (!mc.find()) {
            return null;
        }
        String matched = mc.group().trim();
        String date = mc.group(1);
        String[] author = new String[2];
        author[0] = date;
        author[1] = matched.substring(0, matched.indexOf(date));
        return author;
    }

    /**
     * Extracts title and main content from an HTML page.
     *
     * @param str raw HTML of the page
     * @return a two-element array: index 0 is the title as returned by
     *         {@code drawTitle} ({@code null} when the input is empty), index 1
     *         is the text left in the body after pruning
     */
    public String[] extract(String str) {
        String filtered = filterContent(str);
        Document doc = Jsoup.parse(filtered);
        Element bodyNode = doc.select("body").first();

        String title = drawTitle(filtered);
        // Guard against a document without a body element.
        String content = bodyNode == null ? "" : drawCon(bodyNode).text();

        String[] arr = new String[2];
        arr[0] = title;
        arr[1] = content;
        return arr;
    }

    public static void main(String[] args) {
    }

}
秒客网

java版正文抽取基于文字连接比

相关文章

java版 正文抽取 基于文字连接比

相关文章

java版正文抽取基于文字连接比