Jsoup爬取全国行政区域信息并入库

时间:2022-10-31 09:19:21

1 数据来源

* 国家统计局(点击打开链接):http://www.stats.gov.cn/tjsj/tjbz/xzqhdm/201703/t20170310_1471429.html

2 Jsoup解析

// Fetch the page over HTTP; the original author notes that Jsoup.parse() did not seem to return the data.
        // NOTE(review): the request is sent as POST, exactly as before; a plain GET may be the more
        // appropriate verb for this page - confirm before changing.
        Document document = Jsoup.connect("http://www.stats.gov.cn/tjsj/tjbz/xzqhdm/201703/t20170310_1471429.html").post();
        // The selector and the span positions below come from inspecting the page's DOM in a browser.
        Elements select = document.select(".MsoNormal");
        List<Area> areas = new ArrayList<>();
        // Code of the most recent province-level row; it becomes the parent of the rows that follow it.
        Integer parentCode = null;
        for (Element e : select) {
            Elements span = e.select("span");
            int size = span.size();
            if (size == 3) {
                // Province or municipality row: code in span 0, name in span 2.
                String codeText = span.get(0).text().trim();
                String nameText = span.get(2).text().trim();
                // String.trim() alone leaves characters behind that make Integer.parseInt fail
                // (per the original author's note), so StringUtil.trim is applied as well.
                parentCode = Integer.parseInt(StringUtil.trim(codeText));
                System.out.println(codeText + "|");
                Area area = new Area();
                area.setCode(parentCode);
                area.setName(nameText);
                System.out.println(nameText + "|");
                areas.add(area);
            } else if (size == 4) {
                // Row below a province: code in span 1, name in span 3, parent is the last province seen.
                String codeText = span.get(1).text().trim();
                String nameText = span.get(3).text().trim();
                Area area = new Area();
                area.setParentCode(parentCode);
                area.setCode(Integer.parseInt(StringUtil.trim(codeText)));
                System.out.println(codeText + "|");
                area.setName(nameText);
                System.out.println(nameText + "|");
                areas.add(area);
            }
            // Rows with any other span count carry no parsed data. They are skipped so that no
            // empty Area (without code or name) is collected and later written to the database.
        }
        System.out.println("总共解析到"+areas.size()+"个地区数据");
        /*
         * Persist the parsed areas in a single transaction. Table creation and the
         * connection setup (MyTest.getConnection()) are not shown here.
         */
        // try-with-resources closes the statement and the connection even when an insert fails.
        try (java.sql.Connection connection = MyTest.getConnection()) {
            connection.setAutoCommit(false);
            try (PreparedStatement preparedStatement = connection.prepareStatement("insert into area values(?,?,?)")) {
                for (Area a : areas) {
                    preparedStatement.setInt(1, a.getCode());
                    preparedStatement.setString(2, a.getName());
                    // Province rows have no parent; passing the SQL type keeps the null portable
                    // across drivers that reject an untyped null.
                    preparedStatement.setObject(3, a.getParentCode(), java.sql.Types.INTEGER);
                    preparedStatement.addBatch();
                }
                // Send all rows together instead of one statement execution per area.
                preparedStatement.executeBatch();
                connection.commit();
            } catch (java.sql.SQLException | RuntimeException ex) {
                // Undo the partial insert; keep the original failure as the one that propagates.
                try {
                    connection.rollback();
                } catch (java.sql.SQLException rollbackFailure) {
                    ex.addSuppressed(rollbackFailure);
                }
                throw ex;
            }
        }
/**
 * Removes every non-ASCII character from {@code s}, keeping the ASCII characters in their
 * original order.
 *
 * <p>ASCII whitespace is kept as-is; callers that also want it gone should apply
 * {@link String#trim()} themselves. The filter works on characters rather than encoded
 * bytes, so the result does not depend on the platform default charset.
 *
 * @param s the text to filter; must not be {@code null}
 * @return {@code s} without any character above U+007F
 * @throws NullPointerException if {@code s} is {@code null}
 */
public static String trim(String s){
        StringBuilder ascii = new StringBuilder(s.length());
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            // Anything outside 7-bit ASCII (for example a no-break or full-width space) is dropped.
            if (c < 0x80) {
                ascii.append(c);
            }
        }
        return ascii.toString();
    }