Java爬虫学习(2)之用对象保存文件demo(1)

时间:2023-03-10 03:49:44
Java爬虫学习(2)之用对象保存文件demo(1)
 package com.mieba.spider;

 import java.util.ArrayList;
import java.util.List;
import java.util.Vector; import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html; public class WanhoPageProcessor implements PageProcessor
{ private Site site = Site
.me()
.setTimeOut(10000)
.setRetryTimes(3)
.setSleepTime(1000)
.setCharset("UTF-8"); @Override
public Site getSite()
{
// TODO Auto-generated method stub
return site;
} @Override
public void process(Page page)
{
// TODO Auto-generated method stub
//获取当前页的所有喜报
List<String> list = page.getHtml().xpath("//div[@class='main_l']/ul/li").all();
//要保存喜报的集合
Vector<ArticleVo> voLst = new Vector<>();
//遍历喜报
String title;
String content;
String img;
for (String item : list)
{
Html tmp = Html.create(item);
//标题
title = tmp.xpath("//div[@class='content']/h4/a/text()").toString();
//内容
content = tmp.xpath("//div[@class='content']/p/text()").toString();
//图片路径
img = tmp.xpath("//a/img/@src").toString();
//加入集合
ArticleVo vo = new ArticleVo(title, content, img);
voLst.add(vo);
}
//保存数据至page中,后续进行持久化
page.putField("e_list", voLst);
//加载其它页
page.addTargetRequests( getOtherUrls());
} //其它页
public List<String> getOtherUrls()
{
List<String> urlLsts = new ArrayList<>();
for(int i=2;i<7;i++){
urlLsts.add("http://www.wanho.net/a/jyxb/list_15_"+i+".html");
}
return urlLsts;
} }
 package com.mieba.spider;

 import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.Vector; import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline; public class WanhoPipeline implements Pipeline
{ @Override
public void process(ResultItems resultItems, Task arg1)
{
// TODO Auto-generated method stub
// 获取抓取过程中保存的数据
Vector<ArticleVo> voLst = resultItems.get("e_list");
// 持久到文件中
PrintWriter pw = null;
try
{
pw = new PrintWriter(new FileWriter("wanho.txt", true));
for (ArticleVo vo : voLst)
{
pw.println(vo);
pw.flush();
saveImg(vo.getImg());
}
} catch (FileNotFoundException e)
{
e.printStackTrace();
} catch (IOException e)
{
e.printStackTrace();
} finally
{
pw.close();
}
} private void saveImg(String img)
{
// TODO Auto-generated method stub
String imgUrl = "http://www.wanho.net" + img;
InputStream is = null;
BufferedInputStream bis = null;
BufferedOutputStream bos = null;
try
{
URL url = new URL(imgUrl);
URLConnection uc = url.openConnection();
is = uc.getInputStream();
bis = new BufferedInputStream(is);
File photoFile = new File("photo");
if (!photoFile.exists())
{
photoFile.mkdirs();
}
String imgName = img.substring(img.lastIndexOf("/") + 1);
File saveFile = new File(photoFile, imgName);
bos = new BufferedOutputStream(new FileOutputStream(saveFile));
byte[] bs = new byte[1024];
int len;
while ((len = bis.read(bs)) != -1)
{
bos.write(bs, 0, len);
} } catch (MalformedURLException e)
{
// TODO: handle exception
e.printStackTrace();
} catch (IOException e)
{
e.printStackTrace();
} finally
{
try
{
bos.close();
} catch (IOException e)
{
e.printStackTrace();
}
try
{
bis.close();
} catch (IOException e)
{
e.printStackTrace();
}
try
{
is.close();
} catch (IOException e)
{
e.printStackTrace();
} }
} }
 package com.mieba.spider;

 public class ArticleVo
{
private String title;
private String content;
private String img;
public String getTitle()
{
return title;
}
public void setTitle(String title)
{
this.title = title;
}
public String getContent()
{
return content;
}
public void setContent(String content)
{
this.content = content;
}
public String getImg()
{
return img;
}
public void setImg(String img)
{
this.img = img;
}
public ArticleVo(String title, String content, String img)
{
super();
this.title = title;
this.content = content;
this.img = img;
}
@Override
public String toString()
{
return "ArticleVo [title=" + title + ", content=" + content + ", img=" + img + "]";
} }
package com.mieba.spider;

import us.codecraft.webmagic.Spider;

/**
 * Entry point: wires the page processor and pipeline together and starts
 * the crawl from the first announcement list page with five worker threads.
 */
public class Demo
{
    public static void main(String[] args)
    {
        // Extraction logic for each downloaded page.
        Spider spider = Spider.create(new WanhoPageProcessor());
        // Persist results to file and download images.
        spider.addPipeline(new WanhoPipeline());
        // Seed URL: first page of the announcement list.
        spider.addUrl("http://www.wanho.net/a/jyxb/");
        // Crawl with five threads until the queue is exhausted.
        spider.thread(5).run();
    }
}

爬取到的照片

Java爬虫学习(2)之用对象保存文件demo(1)

爬取到的简报

Java爬虫学习(2)之用对象保存文件demo(1)

大家如果要使用本文的代码,只需在项目中配置 WebMagic 的依赖包(webmagic-core 与 webmagic-extension)即可运行。