Java强大的网络数据抓取能力 -- 解析DOM获取网络数据

时间:2024-04-02 17:18:49

通过请求 获取网页内容

Java通过HTTP请求获取页面内容的 两种方式

  • 基于Apache 的 HttpClient包实现
    通过HttpResponse实例获得请求返回的数据体,具体数据封装在HttpEntity对象中。

  • 基于Java的net工具包实现
    通过HttpURLConnection 对象设置网络连接参数,建立网络连接,获得请求返回的网络数据输入流,并从中读取数据。对于JSON格式的数据体,可进一步封装为JSONObject。

1. 基于Apache Httpclient实现Java网络访问工具

前面已经介绍了,这种方式的主要思路是通过HttpResponse实例获得请求返回的数据体,具体数据封装在HttpEntity对象中,再对其进行解析。
第一步,先引入Apache的Httpclient工具包,这里使用的是Httpclient-4.5.5的版本

	<dependency>
		<groupId>org.apache.httpcomponents</groupId>
		<artifactId>httpclient</artifactId>
		<version>4.5.5</version>
	</dependency>

第二步,创建 ApacheHttpUtil 工具类

package com.yxh.demo;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.message.BasicNameValuePair;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

/**
 * @author yangxiaohui
 * @Date: Create by 2018-10-31 12:23
 * @Description: 基于Apache的HttpClient实现的网络访问工具类
 */
public class ApacheHttpUtil {
    // Shared client instance; CloseableHttpClient is thread-safe and intended for reuse.
    public static CloseableHttpClient httpClient = HttpClientBuilder.create().build();

    /**
     * Issues a GET request and returns the response body decoded as UTF-8.
     * <p>
     * Network errors are printed to stderr and whatever content was read so far
     * (possibly an empty string) is returned, preserving the original
     * best-effort contract.
     *
     * @author Yangxiaohui
     * @date 2018-10-31 12:28
     * @param url request address
     * @return response body as a UTF-8 string; empty string on failure
     */
    public static String getHttpContent(String url){
        StringBuilder result = new StringBuilder();
        HttpGet httpGet = new HttpGet(url);
        try{
            HttpResponse response = httpClient.execute(httpGet);
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                // Decode at the reader level so multi-byte UTF-8 characters are
                // never split across buffer boundaries.
                InputStreamReader reader = new InputStreamReader(entity.getContent(), "utf-8");
                char[] buffer = new char[1024];
                int read;
                // Append only the chars actually read. The original appended the
                // whole buffer on every iteration, so short reads duplicated
                // stale characters from previous iterations.
                while ((read = reader.read(buffer)) != -1) {
                    result.append(buffer, 0, read);
                }
            }
        }catch (IOException e){
            e.printStackTrace();
        }finally {
            // Release exactly once; the original called releaseConnection() a
            // second time after the finally block, which was redundant.
            httpGet.releaseConnection();
        }
        return result.toString();
    }

    /**
     * Issues a POST request with URL-encoded form parameters and returns the
     * response body decoded as UTF-8.
     *
     * @author Yangxiaohui
     * @date 2018-10-31 12:31
     * @param url  request address
     * @param data form parameters; may be null for an empty form
     * @return response body as a UTF-8 string; empty string on failure
     */
    public static String postHttpContent(String url, Map<String,String> data){
        StringBuilder sb = new StringBuilder();
        HttpPost httpPost = new HttpPost(url);
        List<NameValuePair> valuePairs = new ArrayList<NameValuePair>();
        // Declare the charset explicitly so non-ASCII form values are not mangled.
        httpPost.addHeader("Content-type","application/x-www-form-urlencoded; charset=utf-8");
        if(null != data) {
            for (Map.Entry<String, String> entry : data.entrySet()) {
                valuePairs.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
            }
        }
        try {
            // The original first set a StringEntity of data.toString() and then
            // immediately overwrote it with the form entity below — only the
            // form entity was ever transmitted, so the dead StringEntity call
            // has been removed. UTF-8 is passed explicitly.
            httpPost.setEntity(new UrlEncodedFormEntity(valuePairs, Charset.forName("UTF-8")));
            HttpResponse response = httpClient.execute(httpPost);
            HttpEntity httpEntity = response.getEntity();
            if (httpEntity != null) {
                // Read via a UTF-8 reader instead of converting fixed 128-byte
                // chunks with new String(...): chunk-wise decoding could split a
                // multi-byte character in half and also appended trailing bytes
                // from partially filled buffers.
                InputStreamReader reader = new InputStreamReader(httpEntity.getContent(), "utf-8");
                char[] buffer = new char[1024];
                int read;
                while ((read = reader.read(buffer)) != -1) {
                    sb.append(buffer, 0, read);
                }
            }
        }catch (IOException e){
            // UnsupportedEncodingException is a subclass of IOException, so the
            // original's separate catch clause collapsed into this one.
            e.printStackTrace();
        }finally {
            httpPost.releaseConnection();
        }
        return sb.toString();
    }
}

测试代码

 	public static void main(String[] args){
        // Fetch the same page twice — once via GET, once via POST with an
        // empty form — and dump both response bodies to stdout.
        String viaGet = ApacheHttpUtil.getHttpContent("https://stock.tuchong.com/creative");
        String viaPost = ApacheHttpUtil.postHttpContent("https://stock.tuchong.com/creative", new HashMap<String, String>());
        System.out.println(viaGet);
        System.out.println(viaPost);
    }

测试结果
Java强大的网络数据抓取能力 -- 解析DOM获取网络数据

2. 基于net 工具包 实现Java网络访问工具

基于java内置的net工具包开发,不需要引入第三方包
创建NetHttpUtil工具类

package com.yxh.demo;

import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;

/**
* @author yangxiaohui
* @Date: Create by 2018-10-31 16:49
* @Description: 基于java net工具包实现的网络访问工具类
*/
public class NetHttpUtil {
  // Connect/read timeout in milliseconds applied to every request.
  private static final int TIMEOUT_MILLIS = 30000;

  /**
   * Issues a GET request and returns the response body decoded as UTF-8.
   *
   * @author Yangxiaohui
   * @date 2018-10-31 16:50
   * @param url request address
   * @return response body as a UTF-8 string, or null on any failure
   */
  public static String getHttpContent(String url){
      HttpURLConnection http = null;
      InputStream is = null;
      try {
          URL urlGet = new URL(url);
          http = (HttpURLConnection) urlGet.openConnection();
          http.setRequestMethod("GET");
          http.setRequestProperty("Content-Type","application/x-www-form-urlencoded");
          // The original called setDoOutput(true) here, which makes
          // HttpURLConnection silently switch the method to POST — removed.
          http.setDoInput(true);
          // Per-connection timeouts replace the original JVM-global
          // sun.net.client.* system properties, which were undocumented
          // internals affecting every connection in the process.
          http.setConnectTimeout(TIMEOUT_MILLIS);
          http.setReadTimeout(TIMEOUT_MILLIS);
          http.connect();
          is = http.getInputStream();
          // Read until EOF. The original sized a single buffer with
          // InputStream.available(), which only reports the bytes readable
          // without blocking and routinely truncates network responses.
          ByteArrayOutputStream bos = new ByteArrayOutputStream();
          byte[] buffer = new byte[4096];
          int read;
          while ((read = is.read(buffer)) != -1) {
              bos.write(buffer, 0, read);
          }
          return new String(bos.toByteArray(), "UTF-8");
      } catch (Exception e) {
          // The original swallowed the exception silently; log it but keep
          // the null-on-failure contract callers rely on.
          e.printStackTrace();
          return null;
      }finally {
          if(null != http) http.disconnect();
          try {
              if (null != is) is.close();
          }catch (IOException e){
              e.printStackTrace();
          }
      }
  }

  /**
   * Issues a POST request with the given raw body and returns the response
   * decoded as UTF-8.
   *
   * @author Yangxiaohui
   * @date 2018-10-31 16:50
   * @param url  request address
   * @param data raw request body (e.g. url-encoded form string)
   * @return response body as a UTF-8 string, or null on any failure
   */
  public static String postHttpContent(String url,String data){
      HttpURLConnection http = null;
      PrintWriter out = null;
      BufferedReader reader = null;
      try {
          // Open the connection and configure it for a form POST.
          URL urlPost = new URL(url);
          http = (HttpURLConnection) urlPost.openConnection();
          http.setDoOutput(true);
          http.setDoInput(true);
          http.setRequestMethod("POST");
          http.setUseCaches(false);
          http.setInstanceFollowRedirects(true);
          http.setRequestProperty("Content-Type",
                  "application/x-www-form-urlencoded");
          http.setConnectTimeout(TIMEOUT_MILLIS);
          http.setReadTimeout(TIMEOUT_MILLIS);
          http.connect();
          // Send the request body as UTF-8.
          out = new PrintWriter(new OutputStreamWriter(http.getOutputStream(), "utf-8"));
          out.print(data);
          out.flush();
          out.close();
          out = null;
          // Decode the response as UTF-8 at the reader level. The original
          // read with the platform-default charset and then round-tripped
          // each line through lines.getBytes(), which corrupts rather than
          // repairs non-ASCII text.
          reader = new BufferedReader(new InputStreamReader(
                  http.getInputStream(), "utf-8"));
          String line;
          StringBuilder sb = new StringBuilder();
          while ((line = reader.readLine()) != null) {
              sb.append(line);
          }
          reader.close();
          reader = null;
          // Stray debug System.out.println of the body removed.
          return sb.toString();
      } catch (Exception e) {
          e.printStackTrace();
          return null;
      }finally {
          if(null != http) http.disconnect();
          if(null != out) out.close();
          try{
              if(null != reader) reader.close();
          }catch (IOException e){
              e.printStackTrace();
          }
      }
  }
}

JAVA 基于 Jsoup 对抓取到的网页对象进行DOM解析

jsoup 是一款Java 的HTML解析器,可直接解析某个URL地址、HTML文本内容。它提供了一套非常省力的API,可通过DOM,CSS以及类似于jQuery的操作方法来取出和操作数据。

1. 使用Jsoup 对字符串进行处理,并解析DOM

第一步,引入Jsoup 包。这里使用的是jsoup-1.11.2版本

		<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.2</version>
        </dependency>

第二步,创建工具类DomPareUtil

  • 在解析这个网页前,我们需要确认一下,最后需要获得这个网页的哪些信息
    打开F12开发者工具,使用鼠标选择元素工具选中我们需要得到的内容
    由此处我们可以得出,我们只需要所有的a标签里的href,就能进而获取到所有的图片详情页地址
package com.yxh.demo;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.DataInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

/**
 * @author yangxiaohui
 * @Date: Create by 2018-10-31 16:59
 * @Description:
 */
public class DomPareUtil {
    /**
     * Parses a search-result page, collects the detail-page link of every
     * picture block, resolves each detail page to its real image URL and
     * downloads the images into the working directory.
     *
     * @param str raw HTML of the search-result page
     */
    public static void getInfo(String str){
        Document doc = Jsoup.parse(str);
        // Detail-page URLs of every picture found on the listing page.
        List<String> imgSrcs = new ArrayList<String>();
        // Each .new-search-works-item block represents one picture result.
        Elements rows = doc.select(".new-search-works-item");
        for (Element element : rows) {
            String imgHref = element.select("a").attr("href");
            // Keep only links that point at an HTML detail page.
            if (imgHref.indexOf("html") > 0){
                imgSrcs.add(imgHref);
            }
        }
        // Resolved direct image URLs (typed; the original used a raw List).
        List<String> imgs = new ArrayList<String>();
        for (String imgSrc : imgSrcs){
            String imgInfoStr = ApacheHttpUtil.getHttpContent(imgSrc);
            Document imgDoc = Jsoup.parse(imgInfoStr);
            // selectFirst returns null when nothing matches; the original
            // called .attr("src") on it unchecked and could throw an NPE.
            Element imgElement = imgDoc.selectFirst(".works-img");
            if (imgElement == null) {
                continue;
            }
            String img = imgElement.attr("src");
            imgs.add(img);
            System.out.println(img);
        }
        // Download everything that was resolved.
        downloadPicture(imgs);
    }

    /**
     * Downloads each URL in the list to sequentially numbered JPG files
     * (0.jpg, 1.jpg, ...) in the current working directory. Failures on one
     * URL are logged and do not abort the remaining downloads.
     *
     * @param urlList absolute image URLs to fetch
     */
    private static void downloadPicture(List<String> urlList) {
        int imageNumber = 0;
        for (String urlString : urlList) {
            DataInputStream dataInputStream = null;
            FileOutputStream fileOutputStream = null;
            try {
                URL url = new URL(urlString);
                dataInputStream = new DataInputStream(url.openStream());
                File file = new File(imageNumber + ".jpg");
                fileOutputStream = new FileOutputStream(file);
                byte[] buffer = new byte[1024];
                int length;
                while ((length = dataInputStream.read(buffer)) > 0) {
                    fileOutputStream.write(buffer, 0, length);
                }
                imageNumber++;
            } catch (MalformedURLException e) {
                e.printStackTrace();
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                // Close both streams even when the copy throws; the original
                // only closed them on the success path and leaked otherwise.
                try {
                    if (null != dataInputStream) dataInputStream.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
                try {
                    if (null != fileOutputStream) fileOutputStream.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

}

测试代码

 public static void main(String[] args){
        // Fetch a search-result page via GET, then hand the HTML to the DOM
        // parser / image downloader.
        String pageHtml = ApacheHttpUtil.getHttpContent("http://soso.nipic.com/?q=%E7%BE%8E%E5%A5%B3&or=0&y=40&g=1");
        DomPareUtil.getInfo(pageHtml);
    }

抓取结果:

Java强大的网络数据抓取能力 -- 解析DOM获取网络数据