Java爬虫学习(2)之用对象保存文件demo(1)

时间:2023-03-10 03:49:44
Java爬虫学习(2)之用对象保存文件demo(1)
 package com.mieba.spider;

 import java.util.ArrayList;
import java.util.List;
import java.util.Vector; import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html; public class WanhoPageProcessor implements PageProcessor
{ private Site site = Site
.me()
.setTimeOut(10000)
.setRetryTimes(3)
.setSleepTime(1000)
.setCharset("UTF-8"); @Override
public Site getSite()
{
// TODO Auto-generated method stub
return site;
} @Override
public void process(Page page)
{
// TODO Auto-generated method stub
//获取当前页的所有喜报
List<String> list = page.getHtml().xpath("//div[@class='main_l']/ul/li").all();
//要保存喜报的集合
Vector<ArticleVo> voLst = new Vector<>();
//遍历喜报
String title;
String content;
String img;
for (String item : list)
{
Html tmp = Html.create(item);
//标题
title = tmp.xpath("//div[@class='content']/h4/a/text()").toString();
//内容
content = tmp.xpath("//div[@class='content']/p/text()").toString();
//图片路径
img = tmp.xpath("//a/img/@src").toString();
//加入集合
ArticleVo vo = new ArticleVo(title, content, img);
voLst.add(vo);
}
//保存数据至page中,后续进行持久化
page.putField("e_list", voLst);
//加载其它页
page.addTargetRequests( getOtherUrls());
} //其它页
public List<String> getOtherUrls()
{
List<String> urlLsts = new ArrayList<>();
for(int i=2;i<7;i++){
urlLsts.add("http://www.wanho.net/a/jyxb/list_15_"+i+".html");
}
return urlLsts;
} }
 package com.mieba.spider;

 import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.Vector; import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline; public class WanhoPipeline implements Pipeline
{ @Override
public void process(ResultItems resultItems, Task arg1)
{
// TODO Auto-generated method stub
// 获取抓取过程中保存的数据
Vector<ArticleVo> voLst = resultItems.get("e_list");
// 持久到文件中
PrintWriter pw = null;
try
{
pw = new PrintWriter(new FileWriter("wanho.txt", true));
for (ArticleVo vo : voLst)
{
pw.println(vo);
pw.flush();
saveImg(vo.getImg());
}
} catch (FileNotFoundException e)
{
e.printStackTrace();
} catch (IOException e)
{
e.printStackTrace();
} finally
{
pw.close();
}
} private void saveImg(String img)
{
// TODO Auto-generated method stub
String imgUrl = "http://www.wanho.net" + img;
InputStream is = null;
BufferedInputStream bis = null;
BufferedOutputStream bos = null;
try
{
URL url = new URL(imgUrl);
URLConnection uc = url.openConnection();
is = uc.getInputStream();
bis = new BufferedInputStream(is);
File photoFile = new File("photo");
if (!photoFile.exists())
{
photoFile.mkdirs();
}
String imgName = img.substring(img.lastIndexOf("/") + 1);
File saveFile = new File(photoFile, imgName);
bos = new BufferedOutputStream(new FileOutputStream(saveFile));
byte[] bs = new byte[1024];
int len;
while ((len = bis.read(bs)) != -1)
{
bos.write(bs, 0, len);
} } catch (MalformedURLException e)
{
// TODO: handle exception
e.printStackTrace();
} catch (IOException e)
{
e.printStackTrace();
} finally
{
try
{
bos.close();
} catch (IOException e)
{
e.printStackTrace();
}
try
{
bis.close();
} catch (IOException e)
{
e.printStackTrace();
}
try
{
is.close();
} catch (IOException e)
{
e.printStackTrace();
} }
} }
 package com.mieba.spider;

 public class ArticleVo
{
private String title;
private String content;
private String img;
public String getTitle()
{
return title;
}
public void setTitle(String title)
{
this.title = title;
}
public String getContent()
{
return content;
}
public void setContent(String content)
{
this.content = content;
}
public String getImg()
{
return img;
}
public void setImg(String img)
{
this.img = img;
}
public ArticleVo(String title, String content, String img)
{
super();
this.title = title;
this.content = content;
this.img = img;
}
@Override
public String toString()
{
return "ArticleVo [title=" + title + ", content=" + content + ", img=" + img + "]";
} }
package com.mieba.spider;

import us.codecraft.webmagic.Spider;

/**
 * Entry point: wires the page processor and pipeline together and starts
 * the crawl from the first announcement list page with five worker threads.
 */
public class Demo
{
    public static void main(String[] args)
    {
        // Extraction logic for each downloaded page.
        Spider spider = Spider.create(new WanhoPageProcessor());
        // Persist results to file and download images.
        spider.addPipeline(new WanhoPipeline());
        // Seed URL: first page of the announcement list.
        spider.addUrl("http://www.wanho.net/a/jyxb/");
        // Crawl with five threads until the queue is exhausted.
        spider.thread(5).run();
    }
}

爬取到的照片

Java爬虫学习(2)之用对象保存文件demo(1)

爬取到的简报

Java爬虫学习(2)之用对象保存文件demo(1)

大家如果要使用本文的代码,只需在项目中配置 WebMagic 的依赖包(webmagic-core 与 webmagic-extension)即可运行。