A Simple Website Crawler

Date: 2022-02-09 02:59:51




Configuration files:

Static configuration

<?xml version="1.0" encoding="UTF-8"?>
<myenv>
    <!-- Qidian -->
    <qidiannet>
        <reStartSpan>1</reStartSpan><!-- interval between crawl runs -->
        <threadNum>4</threadNum><!-- number of threads to start -->
        <parseUrl>http://www.cc222.com/novel</parseUrl><!-- URL prefix of the site to crawl -->
        <classNm>com.crawl.parsehtml.ParseQidian</classNm><!-- parser class to instantiate -->
    </qidiannet>

    <!-- available sites: music365net,zolmobilenet,pipinet,qidiannet,zhuangnet -->
    <startWorkName>qidiannet</startWorkName>
</myenv>

Dynamic configuration (runtime state)

<?xml version="1.0" encoding="UTF-8"?>
<myenv>
    <qidiannet>
        <qidiannetDate>2010-02-09 18:48:54</qidiannetDate>
        <startNum>723819</startNum>
        <endNum>1000000</endNum>
    </qidiannet>
</myenv>
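Both files are read and written through an XmlUtil helper that the post never lists. Below is a minimal sketch of what it might look like, built on the JDK's DOM API; the method names getNodeText and updateDataXml and their argument shapes come from the calls in the code further down, while everything else (String file paths, DOM parsing) is an assumption:

package com.util;

import java.io.File;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.w3c.dom.Document;
import org.w3c.dom.Element;

// Hypothetical sketch of the XmlUtil helper used throughout the crawler;
// the real implementation is not shown in the post.
public class XmlUtil {

    /** Reads the text of a top-level node, e.g. getNodeText("startWorkName", path). */
    public static String getNodeText(String tag, String xmlPath) throws Exception {
        Document doc = DocumentBuilderFactory.newInstance()
                .newDocumentBuilder().parse(new File(xmlPath));
        return doc.getElementsByTagName(tag).item(0).getTextContent();
    }

    /** Reads a child node inside a site block, e.g. getNodeText("qidiannet", "classNm", path). */
    public static String getNodeText(String parent, String tag, String xmlPath) throws Exception {
        Document doc = DocumentBuilderFactory.newInstance()
                .newDocumentBuilder().parse(new File(xmlPath));
        Element site = (Element) doc.getElementsByTagName(parent).item(0);
        return site.getElementsByTagName(tag).item(0).getTextContent();
    }

    /** Overwrites a child node's text and writes the document back to disk. */
    public static synchronized void updateDataXml(String parent, String tag,
            String value, String xmlPath) throws Exception {
        File file = new File(xmlPath);
        Document doc = DocumentBuilderFactory.newInstance()
                .newDocumentBuilder().parse(file);
        Element site = (Element) doc.getElementsByTagName(parent).item(0);
        site.getElementsByTagName(tag).item(0).setTextContent(value);
        Transformer t = TransformerFactory.newInstance().newTransformer();
        t.transform(new DOMSource(doc), new StreamResult(file));
    }
}

updateDataXml is marked synchronized in this sketch because several consumer threads write the same runtime file.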



Consumer thread

package com.crawl;

import java.util.concurrent.BlockingQueue;

import org.apache.log4j.Logger;

import com.crawl.parsehtml.IParseHtml;
import com.model.Model;
import com.util.XmlUtil;

/**
 * Consumer thread that takes page ids off the queue and crawls them.
 * @author Administrator
 */
public class CrawConsumer implements Runnable {
    Logger logger = Logger.getLogger(CrawConsumer.class);

    private String startWorkName;
    private BlockingQueue<Integer> queue;
    private IParseHtml parseHtml;
    private String url;
    private ICrawlComplete iCrawcomplete;
    private boolean isRunning;

    /**
     * @param startWorkName name of the site to crawl
     * @param queue         queue of page ids to crawl
     * @param parseHtml     parser that extracts data from a page
     * @param url           URL prefix of the pages to crawl
     * @param iCrawcomplete callback invoked after each page is crawled
     */
    public CrawConsumer(String startWorkName, BlockingQueue<Integer> queue,
            IParseHtml parseHtml, String url, ICrawlComplete iCrawcomplete) {
        super();
        this.queue = queue;
        this.parseHtml = parseHtml;
        this.url = url;
        this.startWorkName = startWorkName;
        this.iCrawcomplete = iCrawcomplete;
        this.isRunning = true;
    }

    @Override
    public void run() {
        Integer data = null;
        while (!Thread.currentThread().isInterrupted() && isRunning) {
            try {
                // BlockingQueue is thread safe, so no extra synchronization is needed
                data = queue.take();

                // crawl the page (URLs always use forward slashes)
                Model model = parseHtml.extract(url + "/" + data + ".html");

                // invoke the callback
                if (model != null) {
                    iCrawcomplete.save(true, model);
                    iCrawcomplete.otherDeal(true, model);
                } else {
                    iCrawcomplete.save(false, null);
                    iCrawcomplete.otherDeal(false, null);
                }

                // persist the latest crawled id to the runtime config file
                XmlUtil.updateDataXml(startWorkName, "startNum", data + "", CrawlMain.crawl_RunDate);

                // once the queue is drained, wait six minutes, then shut this thread down
                if (queue.isEmpty()) {
                    try {
                        Thread.sleep(360 * 1000);
                    } catch (Exception e) {
                    }
                    Thread.currentThread().interrupt();
                    isRunning = false;
                }

                Thread.sleep(1000);
            } catch (Exception e) {
                logger.info("Exception while crawling " + url + "/" + data, e);
            }
        }
    }
}
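The IParseHtml interface the consumer depends on isn't listed in the post. Judging from the way ParseQidian implements it below, it is presumably just a single extract method; this sketch is an inference, not the author's code:

package com.crawl.parsehtml;

import com.model.Model;

// Inferred from usage; the post never shows this interface.
public interface IParseHtml {
    /**
     * Fetches and parses one page.
     * @param url full URL of the page
     * @return the extracted model, or null if the page could not be parsed
     */
    Model extract(String url);
}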

Callback interface

package com.crawl;

import com.model.Model;

/**
 * Callback interface invoked after a page has been crawled.
 * @author gt
 */
public interface ICrawlComplete {
    /**
     * Save the crawled data.
     * @param isSuc whether the crawl succeeded
     * @param model the extracted data (null on failure)
     * @return
     */
    public boolean save(boolean isSuc, Model model);

    /**
     * Any further processing after the crawl.
     * @param isSuc whether the crawl succeeded
     * @param model the extracted data (null on failure)
     * @return
     */
    public boolean otherDeal(boolean isSuc, Model model);
}
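A minimal implementation might just skip failed pages and hand successful results to a persistence layer. The post doesn't include one, so the class below is a hypothetical example; the persistence step is left as a comment because the NovelService that ParseQidian imports is never shown:

package com.crawl;

import org.apache.log4j.Logger;

import com.model.Model;
import com.model.Novel;

// Hypothetical example implementation; the post does not include one.
public class SimpleCrawlComplete implements ICrawlComplete {
    private static final Logger logger = Logger.getLogger(SimpleCrawlComplete.class);

    @Override
    public boolean save(boolean isSuc, Model model) {
        if (!isSuc) {
            return false; // nothing to save for a failed page
        }
        Novel novel = (Novel) model;
        // a real implementation would persist the novel here,
        // e.g. through the NovelService that ParseQidian imports
        logger.info("saved: " + novel.getName());
        return true;
    }

    @Override
    public boolean otherDeal(boolean isSuc, Model model) {
        // e.g. update statistics, push to a search index, etc.
        return isSuc;
    }
}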

Parser class

package com.crawl.parsehtml;

import java.text.SimpleDateFormat;
import java.util.Date;

import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.util.NodeList;

import com.config.Config;
import com.model.Model;
import com.model.Novel;
import com.model.NovelType;
import com.util.DateUtil;
import com.util.FileUtil;

public class ParseQidian implements IParseHtml {

    @Override
    public Model extract(String url) {
        Novel novel = null;
        try {
            novel = new Novel();

            NodeFilter nameFilter = new AndFilter(new TagNameFilter("div"),
                    new HasAttributeFilter("class", "bookTitle"));

            NodeFilter imageFilter = new AndFilter(new TagNameFilter("img"),
                    new HasAttributeFilter("width", "200"));

            NodeFilter outlineFilter = new AndFilter(new TagNameFilter("p"),
                    new HasAttributeFilter("class", "gray"));

            NodeFilter typeFilter = new AndFilter(new TagNameFilter("span"),
                    new HasAttributeFilter("class", "blue"));

            NodeFilter clickPointFilter = new AndFilter(new TagNameFilter("span"),
                    new HasAttributeFilter("class", "red"));

            Parser parser = new Parser(url);
            parser.setEncoding("utf-8");

            // title and author: the text looks like 《小渔村》文/刘皇弟,
            // i.e. title between 《》 and author after the slash
            NodeList nodes = parser.parse(nameFilter);
            String parseStr = nodes.elementAt(0).toPlainTextString();
            String name = parseStr.split("《")[1].split("》")[0];
            String author = parseStr.split("/")[1];
            novel.setName(name);
            novel.setAuthor(author);

            // cover image
            parser.reset();
            nodes = parser.parse(imageFilter);
            String imgPath = "";
            if (nodes.size() != 0) {
                ImageTag profileTag = (ImageTag) nodes.elementAt(0);
                imgPath = "http://www.cc222.com" + profileTag.getAttribute("src");
            }
            novel.setImgPath(imgPath);

            // synopsis
            parser.reset();
            nodes = parser.parse(outlineFilter);
            String outline = nodes.elementAt(0).toPlainTextString();
            novel.setOutline(outline);

            // genre
            parser.reset();
            nodes = parser.parse(typeFilter);
            String typeName = nodes.elementAt(0).toPlainTextString();
            NovelType novelType = new NovelType();
            novelType.setName(typeName);
            novel.setNovelType(novelType);

            // click count
            parser.reset();
            nodes = parser.parse(clickPointFilter);
            String clickPoint = nodes.elementAt(0).toPlainTextString();
            novel.setClickPoint(Long.parseLong(clickPoint));

            // popularity: the count sits in the third span.blue node,
            // so typeFilter is reused here
            parser.reset();
            nodes = parser.parse(typeFilter);
            String hot = nodes.elementAt(2).toPlainTextString().split("次")[0];
            novel.setHot(Long.parseLong(hot));

            // remaining fields
            novel.setUpdateTime(DateUtil.convert_DateToTimestamp(new Date()));
            novel.setLink_url(url);

            System.out.println(name + ":" + author + ":" + imgPath + ":" + typeName);

            SimpleDateFormat sDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
            FileUtil.writeFile(sDateFormat.format(new Date()) + ":"
                    + Thread.currentThread().getName() + "parse:" + url
                    + "......Suc", Config.PROJECT_PATH + "\\com\\crawl\\log\\log.txt");

        } catch (Exception e) {
            // any parse failure (including a missing page) yields null,
            // which the consumer reports through the callback
            return null;
        }
        return novel;
    }

    public static void main(String[] args) {
        ParseQidian parse = new ParseQidian();
        parse.extract("http://www.cc222.com/novel/799274.html");
    }
}
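The Model, Novel, and NovelType classes are also missing from the post. Judging from the setters ParseQidian calls, they are plain beans along the lines below; this is a hypothetical reconstruction, with each type living in its own file under com.model:

package com.model;

import java.sql.Timestamp;

// Hypothetical reconstruction of the model types implied by ParseQidian;
// the real ones are not shown in the post.
public interface Model { }

public class NovelType {
    private String name;
    public void setName(String name) { this.name = name; }
    public String getName() { return name; }
}

public class Novel implements Model {
    private String name, author, imgPath, outline, link_url;
    private long clickPoint, hot;
    private NovelType novelType;
    private Timestamp updateTime;

    public void setName(String v) { name = v; }
    public String getName() { return name; } // used by the callback example above
    public void setAuthor(String v) { author = v; }
    public void setImgPath(String v) { imgPath = v; }
    public void setOutline(String v) { outline = v; }
    public void setLink_url(String v) { link_url = v; }
    public void setClickPoint(long v) { clickPoint = v; }
    public void setHot(long v) { hot = v; }
    public void setNovelType(NovelType v) { novelType = v; }
    public void setUpdateTime(Timestamp v) { updateTime = v; }
}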


Main program

/**
 * Start the crawler.
 * @param iCrawcomplete callback invoked after each page is crawled
 * @throws Exception
 */
public void start(ICrawlComplete iCrawcomplete) throws Exception {
    if (!isRunable) {
        logger.info("Crawler started");
        isRunable = true;
        // read the list of sites to run
        String[] startWorkQueue = XmlUtil.getNodeText("startWorkName", crawl_Config).split(",");
        int startWorkNum = startWorkQueue.length;
        IParseHtml instance = null;
        for (int i = 0; i < startWorkNum; i++) {
            // site name
            String startWorkName = startWorkQueue[i];
            // parser class name for this site
            String classType = XmlUtil.getNodeText(startWorkName, "classNm", crawl_Config);
            // number of threads to start
            Integer threadNum = Integer.parseInt(XmlUtil.getNodeText(startWorkName, "threadNum", crawl_Config));
            // URL prefix
            String parseUrl = XmlUtil.getNodeText(startWorkName, "parseUrl", crawl_Config);

            Integer startNum = Integer.parseInt(XmlUtil.getNodeText(startWorkName, "startNum", crawl_RunDate));
            Integer endNum = Integer.parseInt(XmlUtil.getNodeText(startWorkName, "endNum", crawl_RunDate));
            // crawl interval between runs (unused in version 1.0)
            //String reStartSpan = XmlUtil.getNodeText(startWorkName, "reStartSpan", crawl_Config);

            // instantiate the parser class by name via reflection
            instance = (IParseHtml) Class.forName(classType).newInstance();

            // build the queue of page ids to crawl
            LinkedBlockingQueue<Integer> queue = new LinkedBlockingQueue<Integer>();
            for (int j = startNum; j < endNum; j++) {
                queue.offer(j);
            }

            // thread pool that reclaims idle threads automatically
            ExecutorService threadPool = Executors.newCachedThreadPool();
            // one consumer shared by all worker threads; safe because the queue is thread safe
            CrawConsumer consumer = new CrawConsumer(startWorkName, queue, instance, parseUrl, iCrawcomplete);

            // launch the configured number of worker threads
            for (int z = 0; z < threadNum; z++) {
                threadPool.execute(consumer);
            }
        }
    } else {
        logger.warn("Crawler is already running; start request ignored");
    }
}
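Wiring everything together might look like the snippet below. The post shows only the start method, so CrawlMain's no-arg constructor is an assumption:

// Hypothetical entry point; assumes CrawlMain exposes the start(...) method above.
public class Bootstrap {
    public static void main(String[] args) throws Exception {
        CrawlMain main = new CrawlMain();
        main.start(new SimpleCrawlComplete()); // callback example from earlier
    }
}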