PHP抓取豆瓣读书爬虫代码

时间:2023-03-09 04:15:30
PHP抓取豆瓣读书爬虫代码
<?php
//演示地址 http://asizu.sinaapp.com/reptile_douban.php
//数据量不是特别大,没有写抓完数据便停止。 喜欢的朋友拿去自己改改就好了
header("Content-Type:text/html;charset=utf-8"); define("MYSQL_HOST",SAE_MYSQL_HOST_M);
define("MYSQL_NAME","douban");
define("MYSQL_USER",SAE_MYSQL_USER);
define("MYSQL_PASSWORD",SAE_MYSQL_PASS);
define("MYSQL_PORT",SAE_MYSQL_PORT); $action = $_GET['url']; if(!empty($action))
{
$data = getLink($action);
$data = str_substr('<div class="article">','<div class="aside">',$data);
$dataArray = explode('<dl>',$data); array_splice($dataArray,0,1); foreach($dataArray as $key => $item)
{
$bookArray[$key]['title'] = str_substr('class="title" target="_blank">','</a>',$item);
$bookArray[$key]['rating'] = str_substr('<span class="rating_nums">','</span>',$item);
$bookArray[$key]['book_id'] = str_substr('/subject/','/?from',$item); if(!selectBookSaveed($bookArray[$key]['title']))
{
insertMysql($bookArray[$key]);
} //$dataArray[$key] = htmlspecialchars($item);
} $page = explode('book?start=',$action);
$new_url = $page[0] ."book?start=". ($page[1] + 15);
// var_dump($page); // var_dump($bookArray);
} //获取连接
function getLink($url)
{
$data = getData($url);
return $data;
} //抓取数据的函数
function getData($url)
{
//初始化
$ch = curl_init(); //设置选项,包括URL
curl_setopt($ch,CURLOPT_URL,$url);
curl_setopt($ch,CURLOPT_RETURNTRANSFER,1);
curl_setopt($ch,CURLOPT_HEADER,0);
curl_setopt($ch,CURLOPT_USERAGENT,'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.47 Safari/536.11'); //执行并获取HTML文档内容
$output = curl_exec($ch); //释放curl句柄
curl_close($ch); //返回数据
return $output;
} //保存入库
function insertMysql($dataArray)
{
$field = "";
$value = "";
foreach($dataArray as $key => $item)
{
$field .= ",".$key;
$value .= ",'".$item."'";
}
$field = substr($field,1,strlen($field)-1);
$value = substr($value,1,strlen($value)-1); $sql = "INSERT INTO douban (".$field.") VALUES(".$value.")";
// var_dump($sql);
// exit;
$result = mysqlOperation($sql); return $result;
} //查询是否已经录入
function selectBookSaveed($title)
{
$sql = "SELECT * FROM douban WHERE title = '".$title."'";
$result = mysqlOperation($sql,"select");
if(empty($result))
{
return false;//未存在
}
return true;
} //数据库操作方法
function mysqlOperation($sql,$method = "query")
{
$mysqli = new mysqli(
SAE_MYSQL_HOST_M,
SAE_MYSQL_USER,
SAE_MYSQL_PASS,
SAE_MYSQL_DB,
SAE_MYSQL_PORT
);
if( mysqli_connect_errno() )
echo 'error';
// return false; if($method == "select")
return mysqli_fetch_assoc($mysqli->query($sql)); if($method == "query")
return $mysqli->query($sql); } // 字符串截取函数
function str_substr($start, $end, $str)
{
$temp = explode($start, $str, 2);
$content = explode($end, $temp[1], 2);
return $content[0];
}
?>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<script src="http://ajax.aspnetcdn.com/ajax/jQuery/jquery-1.4.4.min.js"></script>
<script>
$(document).ready(function(){
if($("#input").val() != ""){
$("#froms").submit();
}
});
</script>
<form action="?" method="get" id="froms">
<input id="input" value="<?php echo $new_url; ?>" type="text" name="url">
<input type="submit">
</form>