Java多线程爬虫和存储

时间:2022-10-31 08:51:06
import org.apache.http.HttpHeaders;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.util.EntityUtils;
import org.dom4j.Element;

import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Worker thread that downloads a single book detail page, extracts the
 * author, publisher, ISBN and cover image with the supplied regular
 * expressions, and appends a {@code <book>} element to a shared XML tree.
 *
 * <p>Several instances are started together by the crawler and share both the
 * same root element and the {@code Main.bookId} counter, so the section that
 * touches them is serialized with a lock.
 */
public class GetBookInfoThread extends Thread{
    private final CloseableHttpClient httpClient;
    private final String webAddress;
    private final Element rootElement;
    private final Pattern bookAuthorRegex;
    private final Pattern bookPublishRegex;
    private final Pattern bookIsbnRegex;
    private final Pattern bookImgRegex;
    private final String bookName;

    /**
     * @param httpClient       client used to fetch the page
     * @param webAddress       URL of the book detail page
     * @param bookName         title of the book, taken from the listing page
     * @param rootElement      root node of the XML document; the new book node is added under it
     * @param bookAuthorRegex  pattern whose group 1 captures the author
     * @param bookPublishRegex pattern whose group 1 captures the publisher
     * @param bookIsbnRegex    pattern whose group 1 captures the ISBN
     * @param bookImgRegex     pattern whose group 1 captures the cover image URL
     */
    public GetBookInfoThread(CloseableHttpClient httpClient,String webAddress,String bookName,Element rootElement,Pattern bookAuthorRegex,Pattern bookPublishRegex,Pattern bookIsbnRegex,Pattern bookImgRegex) {
        this.httpClient = httpClient;
        this.webAddress = webAddress;
        this.rootElement = rootElement;
        this.bookAuthorRegex = bookAuthorRegex;
        this.bookPublishRegex = bookPublishRegex;
        this.bookIsbnRegex = bookIsbnRegex;
        this.bookName = bookName;
        this.bookImgRegex = bookImgRegex;
    }

    /**
     * Fetches the page, extracts the fields and records the book. Nothing is
     * recorded when the page cannot be downloaded.
     */
    @Override
    public void run() {
        String bookInfoCode = fetchPageSource();
        if (bookInfoCode == null) {
            // The download failed and has already been reported; matching
            // against null would throw a NullPointerException.
            return;
        }

        // A field whose pattern does not match is stored as an empty string.
        String bookAuthor = firstGroupOrEmpty(bookAuthorRegex, bookInfoCode);
        String bookPublish = firstGroupOrEmpty(bookPublishRegex, bookInfoCode);
        String bookIsbn = firstGroupOrEmpty(bookIsbnRegex, bookInfoCode);
        String bookImg = firstGroupOrEmpty(bookImgRegex, bookInfoCode);

        // The XML tree and the id counter are shared by all worker threads
        // running at the same time; neither is safe for unsynchronized
        // concurrent modification, so the update is done under one lock.
        synchronized (Main.class) {
            Element bookElement = rootElement.addElement("book");
            bookElement.addAttribute("id",String.valueOf(Main.bookId++));
            bookElement.addElement("name").setText(bookName);
            bookElement.addElement("author").setText(bookAuthor);
            bookElement.addElement("publish").setText(bookPublish);
            bookElement.addElement("isbn").setText(bookIsbn);
            // Stock count is not scraped: a random value in the range 3..12 is stored.
            bookElement.addElement("count").setText(String.valueOf((int)(Math.random() * 10) + 3));
            bookElement.addElement("link").setText(webAddress);
            bookElement.addElement("img").setText(bookImg);
        }

        System.out.println("抓取了:" + webAddress + " " + bookName);
    }

    /**
     * Downloads the page at {@code webAddress}.
     *
     * @return the page source, or {@code null} if the request failed or the
     *         server did not answer with status 200
     */
    private String fetchPageSource() {
        HttpGet getBookInfo = new HttpGet(webAddress);
        getBookInfo.addHeader(HttpHeaders.USER_AGENT, "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30");
        CloseableHttpResponse bookInfoResponse = null;
        try {
            bookInfoResponse = httpClient.execute(getBookInfo);
            if (bookInfoResponse.getStatusLine().getStatusCode() != 200) {
                System.out.println("获取书本具体信息时出错,页面地址:" + webAddress + "错误信息" + bookInfoResponse.getStatusLine());
                return null;
            }
            return EntityUtils.toString(bookInfoResponse.getEntity());
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        } finally {
            // Always release the connection, including on the error paths.
            if (bookInfoResponse != null) {
                try {
                    bookInfoResponse.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /** Returns capture group 1 of the first match of {@code regex} in {@code text}, or "" if there is none. */
    private static String firstGroupOrEmpty(Pattern regex, String text) {
        Matcher matcher = regex.matcher(text);
        return matcher.find() ? matcher.group(1) : "";
    }
}
import org.apache.http.HttpHeaders;
import org.apache.http.HttpHost;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.dom4j.Document;
import org.dom4j.DocumentHelper;
import org.dom4j.Element;

import java.io.IOException;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Entry point of the crawler. For each tag it walks the listing pages, starts
 * one {@link GetBookInfoThread} per book found on a page, waits for them, and
 * finally hands the collected XML tree to a {@link WriteBookInfoToFile} thread.
 */
public class Main {

    /** User-Agent header sent with every request issued by this class. */
    private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30";

    CloseableHttpClient httpClient;
    /** Next id to assign to a book element; incremented by the worker threads. */
    static int bookId = 496;
    Map<String,Integer> proxyMap; // proxy ip -> port
    List<String> ipList;          // proxy ips in the order they are tried; the port is looked up in proxyMap
    int i = 0;                    // index into ipList of the next proxy to switch to

    public static void main(String[] args) {
        Main m = new Main();

        // The full tag list can be fetched with m.getTagList(); the tags below
        // are selected by hand instead.
        // List<String> tagList = m.getTagList();
        List<String> tagList = new LinkedList<String>();
        // tagList.add("经典");
        // tagList.add("日本文学");
        // tagList.add("散文");
        // tagList.add("中国文学");
        // tagList.add("算法");
        // tagList.add("童话");
        // tagList.add("外国文学");
        // tagList.add("文学");
        // tagList.add("小说");
        // tagList.add("漫画");
        // tagList.add("诗词");
        // tagList.add("心理学");
        tagList.add("摄影");
        tagList.add("理财");
        tagList.add("经济学");
        m.pullAndWrite(tagList,10);
    }

    /** Creates a crawler that starts with a direct (proxy-less) connection. */
    public Main() {
        // HttpHost proxy = new HttpHost("122.225.106.35",80);
        // httpClient = HttpClients.custom().setProxy(proxy).build();
        httpClient = HttpClients.createDefault();
        setProxyMap();
    }

    /** Fills {@code proxyMap} and {@code ipList} with the hard-coded proxies. */
    public void setProxyMap() {
        proxyMap = new HashMap<String, Integer>();
        ipList = new LinkedList<String>();
        proxyMap.put("211.68.122.171",80);ipList.add("211.68.122.171");
    }

    /**
     * Downloads the tag index page and extracts every tag name.
     *
     * @return the tags found, or an empty list if the page could not be downloaded
     */
    public List<String> getTagList() {
        List<String> resultTagList = new LinkedList<String>();

        HttpGet getTag = new HttpGet("http://book.douban.com/tag/");
        getTag.addHeader(HttpHeaders.USER_AGENT, USER_AGENT);
        CloseableHttpResponse tagPageResponse = null;
        String tagPageCode = null; // page source
        try {
            tagPageResponse = httpClient.execute(getTag);
            tagPageCode = EntityUtils.toString(tagPageResponse.getEntity());
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // The response is null when execute() itself failed.
            closeResponse(tagPageResponse);
        }
        if (tagPageCode == null) {
            return resultTagList;
        }

        Pattern p = Pattern.compile("class=\"tag\">(.*?)</a>");
        Matcher m = p.matcher(tagPageCode);
        while (m.find()) {
            resultTagList.add(m.group(1));
        }

        return resultTagList;
    }

    /**
     * Crawls the listing pages of every tag and writes one XML file per tag.
     *
     * <p>When a listing page cannot be fetched (I/O error or non-200 status)
     * the crawler switches to the next proxy and retries the same page; once
     * the proxies are exhausted {@code changeProxy()} terminates the JVM.
     *
     * @param tagList    book categories to crawl
     * @param maxPageNum maximum number of listing pages to crawl per category
     */
    public void pullAndWrite(List<String> tagList,int maxPageNum) {
        Pattern bookAddressRegex = Pattern.compile("href=\"(.*?)\" class=\"title\" target=\"_blank\">(.*?)</a>"); // group 1: detail page URL, group 2: title
        Pattern bookAuthorRegex = Pattern.compile("(?s)<span class=\"pl\"> 作者</span>:.*?>(.*?)</a>"); // author
        Pattern bookPublishRegex = Pattern.compile("<span class=\"pl\">出版社:</span> (.*?)<br/>");
        Pattern bookIsbnRegex = Pattern.compile("<span class=\"pl\">ISBN:</span> (.*?)<br/>");
        Pattern bookImgRegex = Pattern.compile("<img src=\"(.*?)\" title=\"点击看大图\"");

        // Crawl each category in turn.
        for (String tag:tagList) {
            int nowPageNum = 0; // index of the listing page being crawled
            Document newDocument = DocumentHelper.createDocument();
            Element rootElement = newDocument.addElement("root");

            while (nowPageNum < maxPageNum) {
                System.out.println(1);
                // The start offset advances by 15 per listing page.
                String nowPageAddress = "http://www.douban.com/tag/" + tag + "/book?start=" + nowPageNum * 15;
                String booksPageCode = fetchBooksPage(nowPageAddress);
                if (booksPageCode == null) {
                    // Switch proxy and retry the current page.
                    changeProxy();
                    continue;
                }

                // One worker thread per book found on the page.
                Matcher m = bookAddressRegex.matcher(booksPageCode);
                List<Thread> threadList = new LinkedList<Thread>();
                while (m.find()) {
                    threadList.add(new GetBookInfoThread(httpClient, m.group(1), m.group(2), rootElement, bookAuthorRegex, bookPublishRegex, bookIsbnRegex,bookImgRegex));
                }
                // No book on the page means the category is exhausted.
                if (threadList.isEmpty()) {
                    break;
                }

                for (Thread thread:threadList) {
                    thread.start();
                }
                for (Thread thread:threadList) {
                    try {
                        thread.join();
                    } catch (InterruptedException e) {
                        e.printStackTrace();
                    }
                }
                nowPageNum++;
            }
            // Write the file once the whole category is done, on a separate thread.
            new WriteBookInfoToFile(rootElement,"/home/geekgao/book/" + tag + ".xml").start();

        }
    }

    /**
     * Downloads one listing page.
     *
     * @param pageAddress URL of the listing page
     * @return the page source, or {@code null} if the request failed or the
     *         server did not answer with status 200
     */
    private String fetchBooksPage(String pageAddress) {
        HttpGet getBooksPage = new HttpGet(pageAddress);
        getBooksPage.addHeader(HttpHeaders.USER_AGENT, USER_AGENT);
        CloseableHttpResponse booksPageResponse = null;
        try {
            System.out.println(2);
            booksPageResponse = httpClient.execute(getBooksPage);
            System.out.println(3);
            if (booksPageResponse.getStatusLine().getStatusCode() != 200) {
                System.out.println("抓 " + pageAddress + " 时出错:");
                System.out.println("错误信息:" + booksPageResponse.getStatusLine());
                return null;
            }
            return EntityUtils.toString(booksPageResponse.getEntity());
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        } finally {
            closeResponse(booksPageResponse);
        }
    }

    /** Closes {@code response} if it is non-null, reporting but not propagating any I/O error. */
    private static void closeResponse(CloseableHttpResponse response) {
        if (response == null) {
            return;
        }
        try {
            response.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Replaces {@code httpClient} with one that goes through the next proxy in
     * {@code ipList}. Terminates the JVM when no proxy is left.
     */
    private void changeProxy() {
        if (i >= ipList.size()) {
            System.out.println("代理用完了,退出");
            System.exit(0);
        }
        String ip = ipList.get(i++);
        CloseableHttpClient previousClient = httpClient;
        httpClient = HttpClients.custom().setProxy(new HttpHost(ip,proxyMap.get(ip))).build();
        // Release the connections held by the client being replaced. This is
        // only called between pages, after all worker threads have been joined.
        if (previousClient != null) {
            try {
                previousClient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        System.out.println("换代理啦,使用代理:" + ip + ",端口:" + proxyMap.get(ip));
    }

}
import org.dom4j.Element;
import org.dom4j.io.XMLWriter;

import java.io.FileWriter;
import java.io.IOException;
import java.io.Writer;

/**
 * Thread that serializes a dom4j element tree to a file.
 */
public class WriteBookInfoToFile extends Thread {
    private final Element root;
    private final String fileAddress;

    /**
     * @param root        element to serialize (together with its children)
     * @param fileAddress path of the file to write
     */
    public WriteBookInfoToFile(Element root,String fileAddress) {
        this.root = root;
        this.fileAddress = fileAddress;
    }

    /**
     * Writes the element to the file as UTF-8 and reports success on stdout;
     * an I/O failure is reported via its stack trace.
     */
    @Override
    public void run() {
        try {
            // Only the element is written, so the output has no XML declaration
            // and an XML parser will read it as UTF-8. Encode explicitly instead
            // of relying on the platform default charset.
            Writer fileWriter = new java.io.OutputStreamWriter(new java.io.FileOutputStream(fileAddress), "UTF-8");
            try {
                XMLWriter xmlWriter = new XMLWriter(fileWriter);
                xmlWriter.write(root);
                xmlWriter.close();
            } finally {
                // Releases the file handle when write() fails; harmless after a
                // successful xmlWriter.close().
                fileWriter.close();
            }
            System.out.println("[" + fileAddress + "]写入成功");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;

import java.io.File;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.List;

/**
 * Command-line tool that reads every XML file in the crawler's output
 * directory and inserts one row per {@code <book>} element into the
 * {@code Book} table.
 */
public class WriteInfoToDB {

    /** Parameterized insert; values are bound instead of concatenated so quotes in the data cannot break or alter the statement. */
    private static final String INSERT_BOOK_SQL = "INSERT INTO Book(bookPublish,bookName,bookAuthor,bookTag,bookIsbn,bookCount,bookRestCount,bookLink,bookImg) VALUES (?,?,?,?,?,?,?,?,?)";

    public static void main(String[] args) {
        File folder = new File("/home/geekgao/book");
        File[] xmlFiles = folder.listFiles();
        if (xmlFiles == null) {
            // listFiles() returns null when the path is not a readable directory.
            System.err.println("无法读取目录:" + folder);
            return;
        }

        SAXReader reader = new SAXReader();
        Connection connection = null;
        PreparedStatement statement = null;
        try {
            Class.forName("com.mysql.jdbc.Driver"); // load the MySQL driver
            connection = DriverManager.getConnection("jdbc:mysql://localhost:3306/BookManage?user=root&password=root");
            statement = connection.prepareStatement(INSERT_BOOK_SQL);

            for (File f:xmlFiles) {
                if (f.isDirectory()) {
                    continue;
                }
                importFile(reader, f, statement);
            }
        } catch (SQLException e) {
            e.printStackTrace();
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(statement, connection);
        }
    }

    /**
     * Inserts every book of one XML file. A file that cannot be parsed is
     * reported and skipped; a row that cannot be inserted is reported and the
     * remaining books are still processed.
     */
    private static void importFile(SAXReader reader, File f, PreparedStatement statement) {
        Document document;
        try {
            document = reader.read(f);
        } catch (DocumentException e) {
            e.printStackTrace();
            return;
        }

        // The tag is the file name up to its first dot.
        String tag = f.getName().split("\\.")[0];
        List<Element> books = document.getRootElement().elements();
        for (Element book:books) {
            // elementText returns null for a missing child, which is bound as SQL NULL.
            String name = book.elementText("name");
            String author = book.elementText("author");
            String publish = book.elementText("publish");
            String isbn = book.elementText("isbn");
            String count = book.elementText("count");
            String link = book.elementText("link");
            String img = book.elementText("img");
            try {
                statement.setString(1, publish);
                statement.setString(2, name);
                statement.setString(3, author);
                statement.setString(4, tag);
                statement.setString(5, isbn);
                // NOTE(review): count is bound as text, as before; this relies on
                // the database converting it to the column type — confirm the schema.
                statement.setString(6, count);
                statement.setString(7, count);
                statement.setString(8, link);
                statement.setString(9, img);
                statement.executeUpdate();
            } catch (SQLException e) {
                System.err.println("sql语句处错误:" + e.getMessage());
                System.err.println("sql语句:" + INSERT_BOOK_SQL + " [name=" + name + ", link=" + link + "]");
            }
        }
    }

    /** Closes the statement and then the connection, ignoring nulls and reporting close failures. */
    private static void closeQuietly(PreparedStatement statement, Connection connection) {
        if (statement != null) {
            try {
                statement.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
        if (connection != null) {
            try {
                connection.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
    }
}