springboot2.1入门系列十二 springboot使用jsoup爬取全国的省市数据

时间:2024-03-22 13:31:46

国家统计局“统计用区划和城乡划分代码”的网址为 http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/ ，通过爬取该网址可以获取最新的全国省、市、区、乡镇、村各级数据。

本文使用 springboot 结合 jsoup 实现一个简单的爬虫，爬取全国的省、市两级数据；县、乡镇、村等其他层级的数据可以由同学们自行练习。

一、数据库表

-- Province table: stores the provinces saved by the crawler.
-- Uses utf8mb4 instead of utf8: in MySQL, "utf8" is an alias for the 3-byte
-- utf8mb3 encoding, which cannot store characters outside the Basic
-- Multilingual Plane.
create table province
(
   id                   int not null auto_increment,  -- surrogate primary key
   name                 varchar(64),                  -- province name (link text on the year page)
   code                 varchar(32),                  -- code derived from the province page file name
   primary key (id)
)
ENGINE = InnoDB
DEFAULT CHARSET = utf8mb4;

-- City table: stores the cities saved by the crawler, each linked to a province row.
-- Uses utf8mb4 instead of utf8: in MySQL, "utf8" is an alias for the 3-byte
-- utf8mb3 encoding, which cannot store characters outside the Basic
-- Multilingual Plane.
create table city
(
   id                   int not null auto_increment,  -- surrogate primary key
   province_id          int,                          -- province.id of the owning province (no FK constraint declared)
   name                 varchar(64),                  -- city name (link text on the province page)
   code                 varchar(32),                  -- code derived from the city page file name
   primary key (id)
)
ENGINE = InnoDB
DEFAULT CHARSET = utf8mb4;

二、创建工程demo012

pom.xml的内容为

<project xmlns="http://maven.apache.org/POM/4.0.0"
    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <!-- Maven module for the demo012 crawler example. -->
    <modelVersion>4.0.0</modelVersion>
    <!-- Inherits from the demo-springboot2 parent; groupId and version of this
         module are not declared here, so they come from the parent. -->
    <parent>
        <groupId>com.yinww</groupId>
        <artifactId>demo-springboot2</artifactId>
        <version>0.0.1-SNAPSHOT</version>
    </parent>
    <artifactId>demo012</artifactId>

    <dependencies>
        <!-- Spring Boot web starter. No version is given here; presumably it is
             managed by the parent POM (confirm against demo-springboot2). -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <!-- MyBatis integration for Spring Boot, pinned explicitly to 1.3.2. -->
        <dependency>
            <groupId>org.mybatis.spring.boot</groupId>
            <artifactId>mybatis-spring-boot-starter</artifactId>
            <version>1.3.2</version>
        </dependency>
        <!-- MySQL JDBC driver; runtime scope, so it is not on the compile classpath. -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <scope>runtime</scope>
        </dependency>
        <!-- jsoup HTML parser used by the crawler, pinned explicitly to 1.11.3. -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.3</version>
        </dependency>
    </dependencies>
</project>

三、Java类

主类

package com.yinww.demo.springboot2.demo012;

import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;

/**
 * Entry point of the demo012 application.
 */
@SpringBootApplication
public class Demo012Application {

    /**
     * Boots the Spring application context using this class as the primary source.
     *
     * @param commandLineArgs arguments passed through to {@link SpringApplication#run}
     */
    public static void main(String[] commandLineArgs) {
        SpringApplication.run(Demo012Application.class, commandLineArgs);
    }

}

工具类:

/**
 * Small HTTP helper that downloads a page with {@link HttpURLConnection} and
 * parses it into a jsoup {@link Document}.
 */
public class HttpUtil {

    /** Browser-like User-Agent header sent with every request. */
    private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36";

    /** Connect timeout and read timeout, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 80000;

    /**
     * Fetches {@code url} with an HTTP GET and parses the response body.
     *
     * @param url     URL of the page to download; also passed to jsoup as the document's base URI
     * @param charset name of the character set used to decode the response body
     * @return the parsed document, or {@code null} if reading or parsing the response failed
     * @throws IOException if the URL is malformed, the connection cannot be opened,
     *                     or the request method cannot be set
     */
    public static Document get(String url, String charset) throws IOException {
        HttpURLConnection connection = (HttpURLConnection) new URL(url).openConnection();
        try {
            connection.setRequestMethod("GET");
            // Bypass any cache so the crawler always sees the current page.
            connection.setUseCaches(false);
            connection.addRequestProperty("Connection", "close");
            connection.addRequestProperty("user-agent", USER_AGENT);
            connection.setConnectTimeout(TIMEOUT_MILLIS);
            connection.setReadTimeout(TIMEOUT_MILLIS);

            // try-with-resources: Jsoup.parse(InputStream, ...) does not close the
            // stream it is given, so it must be closed here to avoid leaking it.
            try (java.io.InputStream body = connection.getInputStream()) {
                return Jsoup.parse(body, charset, url);
            } catch (Exception e) {
                // Callers treat null as "page unavailable", so failures while
                // reading or parsing are reported and converted to null.
                System.out.println("parse error: " + url);
                e.printStackTrace();
                return null;
            }
        } finally {
            // Release the underlying connection whether or not the request succeeded.
            connection.disconnect();
        }
    }

}

控制器

/**
 * Controller that crawls the division-code pages of the statistics site and
 * stores the provinces and cities it finds through {@link SpiderService}.
 */
@RestController
public class SpiderController {

    /** Index page that links to the per-year division-code pages. */
    private static final String ROOT_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/";

    /** Character set used to decode every crawled page. */
    private static final String CHARSET = "gb2312";

    @Autowired
    private SpiderService spiderService;

    /**
     * Crawls the first year page linked from the index page and saves every
     * province on it, followed by the cities of each province.
     *
     * @return {@code 0} if the index page or the year page could not be loaded
     *         or does not contain the expected link; otherwise a completion message
     * @throws Exception if a request cannot be set up or saving fails
     */
    @GetMapping({"/", ""})
    public Object spider() throws Exception {
        Document rootDoc = HttpUtil.get(ROOT_URL, CHARSET);
        if (rootDoc == null) {
            return 0;
        }

        Elements yearLists = rootDoc.getElementsByClass("center_list_contlist");
        if (yearLists.isEmpty()) {
            return 0;
        }
        // The first link in the list is taken as the most recent year.
        Element yearLink = yearLists.get(0).select("a").first();
        if (yearLink == null) {
            return 0;
        }
        // absUrl resolves a relative href against the page URL, so the result
        // can be fetched directly regardless of how the link is written.
        String yearHref = yearLink.absUrl("href");
        if (yearHref.isEmpty()) {
            return 0;
        }

        // HttpUtil.get returns null when the download fails; guard before use.
        Document yearDoc = HttpUtil.get(yearHref, CHARSET);
        if (yearDoc == null) {
            return 0;
        }

        // Walk every province link on the year page.
        for (Element provinceRow : yearDoc.getElementsByClass("provincetr")) {
            for (Element provinceLink : provinceRow.select("a[href]")) {
                String name = provinceLink.text();
                String code = extractCode(provinceLink.attr("href"));
                Province province = new Province(name, code);
                spiderService.saveProvince(province);
                saveCities(provinceLink.absUrl("href"), CHARSET, province.getId());
            }
        }

        return "spider crawl end.";
    }

    /**
     * Downloads a province page and saves every city listed on it.
     *
     * @param url        URL of the province page
     * @param charset    character set used to decode the page
     * @param provinceId id of the province the cities belong to
     * @throws Exception if the request cannot be set up or saving fails
     */
    private void saveCities(String url, String charset, int provinceId) throws Exception {
        Document provinceDoc = HttpUtil.get(url, charset);
        if (provinceDoc == null) {
            return;
        }
        for (Element cityRow : provinceDoc.getElementsByClass("citytr")) {
            Elements links = cityRow.select("a");
            if (links.size() < 2) {
                // Row does not have the expected code link + name link pair;
                // skip it instead of failing the whole crawl.
                continue;
            }
            Element nameLink = links.get(1); // the second link carries the city name
            City city = new City();
            city.setName(nameLink.text());
            city.setProvinceId(provinceId);
            city.setCode(extractCode(nameLink.attr("href")));
            spiderService.saveCity(city);
        }
    }

    /**
     * Extracts the code from a page link: the file name without its directory
     * and extension, e.g. {@code "13/1301.html"} yields {@code "1301"}.
     *
     * @param href link target as written in the page
     * @return the file name part of {@code href} without its extension
     */
    private static String extractCode(String href) {
        int start = href.lastIndexOf('/') + 1;
        int end = href.lastIndexOf('.');
        // Only strip an extension that belongs to the file name itself.
        return end > start ? href.substring(start, end) : href.substring(start);
    }

}

四、运行程序

启动程序后，访问 http://localhost:8080/ ，等待程序运行结束，再查看数据库，可以看到 31 条省的数据和 343 条市的数据。

springboot2.1入门系列十二 springboot使用jsoup爬取全国的省市数据

springboot2.1入门系列十二 springboot使用jsoup爬取全国的省市数据

整体思路还是比较清晰,重点在于掌握jsoup的几个api的用法,其他的都是常规的springboot代码。

本文内容到此结束,更多内容可关注公众号和个人微信号:

springboot2.1入门系列十二 springboot使用jsoup爬取全国的省市数据springboot2.1入门系列十二 springboot使用jsoup爬取全国的省市数据