爬虫初探--PHP

　　我有收藏的cms网站，偶尔会下载一些资源，老司机都懂的:-D。然后有一次好几天没上，堆了好些没弄，心想：cao，这好麻烦啊，能不能写个脚本自动帮我搞？然后忽然就想到，这是不是就是所谓的爬虫呢？心中一阵激动。

　　因为本人还是小白，只会用PHP，所以只能将就一下吧。网站主页就是罗列各个资源的小图标以及入口，我用自己封装的curl函数get请求过去，获取所有详情页的入口链接，然后看有没有上次请求的锚点记录链接，如果没有，就继续请求下一页的链接，如果有就停止。然后再遍历这个记录链接的数组，依次请求。详情页内有成对的大图与小图，我只要大图，过滤掉小图，然后就是PHP强大的file_get_contents和file_put_contents函数了，well，talk is cheap，show my code now。

   1 <?php

   2

   3 // 加载封装好的curl请求函数

   4 require "../curl_request.php";

   5

   /**
    * Small incremental crawler: collects detail-page links from the listing
    * pages until the link saved by the previous run is reached, then downloads
    * the full-size images referenced by those detail pages.
    *
    * Relies on getRequest(), which is loaded from the project's curl helper file.
    * NOTE(review): this class assumes getRequest($url) returns the response body
    * as a string and accepts the site-relative URLs matched below - confirm
    * against the helper.
    */
   class grab
   {
       // Listing (home) page of the site.
       public $url = "/portal.php";
       // Pattern matching detail-page links.
       private $content_preg = "/\/content-\d{4}-1-1\.html/i";
       // Prefix of the paginated listing URL; the page number is appended.
       private $page = "https://www.xibixibi.com/portal.php?page=";
       // Pattern matching full-size image paths.
       private $bigPic_preg = "/\/data\/attachment\/forum\/20\d{2}[01]\d{1}\/[0123]\d{1}\/[a-zA-Z0-9]{22}\.(jpg|png)/";
       // Newest detail link recorded by the previous run (the stop anchor).
       public $lastSave = "";
       // Root directory images are stored under; must end with a slash.
       public $root = "E:/root/";
       // Number of times grabDetailSites() has been entered.
       private $count = 0;
       // Detail-page links collected so far.
       public $gallery = array();

       /**
        * Lifts the script execution time limit, since a crawl can run long.
        */
       public function __construct()
       {
           set_time_limit(0);
       }

       /**
        * Collects detail-page links starting at $url and following the
        * paginated listing until the anchor in $this->lastSave is found.
        *
        * Links newer than the anchor are appended to $this->gallery. On the
        * first call the newest link is written to ./lastsave.txt so the next
        * run knows where to stop.
        *
        * The crawl also stops when a page yields no links, or only links that
        * were already collected; without that, a missing anchor would recurse
        * forever.
        *
        * @param string $url listing page to fetch
        * @return bool always TRUE
        */
       public function grabDetailSites($url = "")
       {
           $isFirstCall = ($this->count == 0);
           ++$this->count;

           $body = getRequest($url);
           $found = array();
           if (is_string($body) && $body !== '') {
               preg_match_all($this->content_preg, $body, $found, PREG_PATTERN_ORDER);
           }

           // array_unique() keeps the original (sparse) keys; re-index so the key
           // returned by array_search() below is also the element's position.
           $links = empty($found[0]) ? array() : array_values(array_unique($found[0]));

           // Nothing usable on this page: request failed or we ran past the end.
           if (count($links) === 0) {
               return TRUE;
           }

           // Keep the first 12 links only; later ones are not gallery entries
           // (the page ends with a contact link).
           if (count($links) > 12) {
               $links = array_slice($links, 0, 12);
           }

           // Remember the newest link of this run as the anchor for the next run.
           if ($isFirstCall) {
               file_put_contents("./lastsave.txt", $links[0]);
           }

           $offset = array_search($this->lastSave, $links, true);
           if ($offset !== FALSE) {
               // Anchor reached: keep only the links listed before it and stop.
               $this->gallery = array_merge($this->gallery, array_slice($links, 0, $offset));
               return TRUE;
           }

           // A page with no new links means the site is repeating itself
           // (e.g. the page number is past the last page); stop.
           if (count(array_diff($links, $this->gallery)) === 0) {
               return TRUE;
           }

           // Anchor not on this page: keep everything and continue with the next page.
           $this->gallery = array_merge($this->gallery, $links);
           $this->grabDetailSites($this->page . ($this->count + 1));
           return TRUE;
       }

       /**
        * Downloads every full-size image referenced by the detail pages in
        * $this->gallery into "<root>/<Y>/<m>/", using the current date.
        *
        * Pages or images whose request returns nothing are skipped, so no
        * empty files are written.
        */
       public function grabBigPic()
       {
           foreach ($this->gallery as $detailUrl) {
               $html = getRequest($detailUrl);
               if (!is_string($html) || $html === '') {
                   continue;
               }
               if (!preg_match_all($this->bigPic_preg, $html, $found)) {
                   continue;
               }

               foreach (array_unique($found[0]) as $picUrl) {
                   $data = getRequest($picUrl);
                   if (!is_string($data) || $data === '') {
                       continue;
                   }

                   $dir = $this->root . date("Y/m/");
                   // The mode must be octal: decimal 777 is octal 01411.
                   if (!is_dir($dir) && !mkdir($dir, 0777, TRUE) && !is_dir($dir)) {
                       continue;
                   }
                   file_put_contents($dir . basename($picUrl), $data);
               }
           }
       }

       /**
        * Moves the files lying directly in the root directory into
        * "<root>/<Y>/<m>/" sub-directories chosen by each file's modification time.
        *
        * Only regular files are moved. Directories, including "." and ".." and
        * the year folders created earlier, are left alone.
        */
       public function sortPic()
       {
           $entries = scandir($this->root);
           if ($entries === FALSE) {
               return;
           }

           foreach ($entries as $name) {
               $path = $this->root . $name;
               // "." and ".." are not guaranteed to be the first two entries,
               // so filter by type instead of by position.
               if (!is_file($path)) {
                   continue;
               }

               $dir = $this->root . date("Y/m/", filemtime($path));
               if (!is_dir($dir) && !mkdir($dir, 0777, TRUE) && !is_dir($dir)) {
                   continue;
               }
               rename($path, $dir . $name);
           }
       }

       /**
        * Lets outside code assign properties that are otherwise inaccessible
        * (the private ones), e.g. to override the URL patterns.
        *
        * @param string $key   property name
        * @param mixed  $value new value
        */
       public function __set($key, $value)
       {
           $this->$key = $value;
       }
   }

　　因为网站不是很复杂，所以这个类写得还算比较简单吧，本来想做个定时任务的，不过还是等以后我的老爷机换了Ubuntu吧。

相关文章