【搜索引擎Jediael开发笔记2】使用HttpClient下载网页至本地文件

时间:2021-10-23 18:14:16

本文使用HttpClient根据url进行网页下载。其中

(1)HttpClient的相关知识请参见 HttpClient基础教程

(2)


package org.ljh.search.downloadpage;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import java.io.Writer;
import java.util.Scanner;

import org.apache.http.HttpEntity;
import org.apache.http.HttpStatus;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;

//本类用于将指定url对应的网页下载至本地一个文件。
public class PageDownloader {

public static void downloadPageByGetMethod(String url) throws IOException {

// 1、通过HttpGet获取到response对象
CloseableHttpClient httpClient = HttpClients.createDefault();
// 注意,必需要加上http://的前缀,否则会报:Target host is null异常。
HttpGet httpGet = new HttpGet(url);
CloseableHttpResponse response = httpClient.execute(httpGet);

InputStream is = null;
if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
try {
// 2、获取response的entity。
HttpEntity entity = response.getEntity();

// 3、获取到InputStream对象,并对内容进行处理
is = entity.getContent();

String fileName = getFileName(url);
saveToFile("D:\\tmp\\", fileName, is);
} catch (ClientProtocolException e) {
e.printStackTrace();
} finally {

if (is != null) {
is.close();
}
if (response != null) {
response.close();
}
}
}
}

//将输入流中的内容输出到path指定的路径,fileName指定的文件名
private static void saveToFile(String path, String fileName, InputStream is) {
Scanner sc = new Scanner(is);
Writer os = null;
try {
os = new PrintWriter(path + fileName);
while (sc.hasNext()) {
os.write(sc.nextLine());
}
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} finally {
if (sc != null) {
sc.close();
}
if (os != null) {
try{
os.flush();
os.close();
}catch(IOException e){
e.printStackTrace();
System.out.println("输出流关闭失败!");
}
}
}
}

// 将url中的特殊字符用下划线代替
private static String getFileName(String url) {
url = url.substring(7);
String fileName = url.replaceAll("[\\?:*|<>\"/]", "_") + ".html";
return fileName;
}

}