利用jsoup解析网站网页

时间:2022-11-01 09:17:17

用到的工具是 jsoup-1.7.2.jar。jsoup 的相关文档请参考官网 http://jsoup.org/ ,那里可以查询全部 API。

首先请求网页:

// Request the search URL and parse the response into a DOM tree;
// the connection is configured with a 5000 ms timeout.
Document doc = Jsoup
        .connect(search)
        .timeout(5000)
        .get();

 

获取html:

<!DOCTYPE html>
<!--STATUS OK-->
<html>
<head>
<meta charset="utf-8" />
<meta content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no;" name="viewport" />
<meta name="apple-mobile-web-app-capable" content="yes" />
<meta name="apple-mobile-web-app-status-bar-style" content="black" />
<title>口袋_站内搜索</title>
<link href="/static/css/mobileDefault.css?v=0.9" rel="stylesheet" type="text/css" />
<link href="/static/css/mobileGameresult.css?v=0.9" rel="stylesheet" type="text/css" />
<script type="text/javascript" src="/static/js/jquery-1.7.2.min.js"></script>
<script type="text/javascript" src="/static/js/baidu-ajax.js"></script>
<script type="text/javascript" src="/static/js/statistic.js"></script>
</head>
<body>
<div id="wrap" style="position: relative;">
<div id="head">
<div id="head_d">
<form name="f" action="http://zhannei.baidu.com/cse/search" class="fm" id="bdcs-search-form">
<span class="s_back_wr">
<div onclick="backRefer();" class="head-a">
<img src="/static/img/back.png" width="12px" height="20px" border="0" class="s_back" id="s_back_i" />
</div> </span>
<span class="s_ipt_wr" id="search_input_container" style="background-color:#f8f8f8;"> <b class="icon" id="icon_remove" onclick="removeText();"></b> <input name="q" id="kw" class="s_ipt" style="background-color:#f8f8f8;" value="口袋" maxlength="100" autocomplete="off" /> <input name="click" value="1" type="hidden" /> </span>
<div onclick="submitMobile();" class="head-a">
<span class="s_btn_wr"> <input name="s" value="287293036948159515" type="hidden" /> <input name="nsid" value="" type="hidden" /> <img src="/static/img/fdj2.png" width="20px" height="20px" border="0" id="su" class="s_btn" /> </span>
</div>
</form>
<ul id="bdcs-search-sug-list" class="bdcs-search-sug-list"></ul>
</div>
</div>
<style type="text/css">
.bdcs-search-sug-list
{
display
:none;
height
:auto;
border
: 1px solid #DDD;
position
: absolute;
z-index
: 2147483647;
background-color
: #FFF;
}
.bdcs-search-hot-list
{
height
:40%;
border
: 1px solid #DDD;
position
: absolute;
z-index
: 2147483647;
background-color
: #FFF;
}
.hotRadious
{
border
:2px solid;
border-radius
:25px;
-moz-border-radius
:25px;
}
.bdcs-search-sug-list-item
{
display
: block;
list-style
: none;
cursor
: pointer;
padding-top
:9px;
padding-left
:8px;
vertical-align
:middle;
white-space
: nowrap;
text-overflow
: ellipsis;
overflow
: hidden;
height
: 46px;
line-height
: 36px;
font-family
: arial;
font-size
: 18px;
}
.fdj
{
display
:block;
vertical-align
:middle;
float
:left;
margin-top
:4px;
margin-right
:8px;
}
.bdcs-search-sug-list-item-current
{
background-color
: #EBEBEB;
}

</style>
<script type="text/javascript">

if ($("#kw").attr("value") == "") {
$(
"#icon_remove").hide();
}
function removeText() {
$(
"#kw").attr("value", "");
$(
"#icon_remove").hide();
}
function submitMobile() {
var frm = document.getElementById('bdcs-search-form');
frm.submit();
}

function backRefer() {
var refer = document.referrer;
if (refer && refer.indexOf('cse/search') == -1) {
window.location.href
= refer;
}
else {
history.go(
-1);
}

}
</script>
<script>
$(
"#kw").bind("input", function() {
if ($("#kw").attr("value") == "" || $("#kw").attr("value").match(/^\s+$/)) {
$(
"#icon_remove").hide();
}
else {
$(
"#icon_remove").show();
}
});
</script>
<!-- 兼容新老数据-->
<div id="container" class="clearfix ">
<div id="center" class="content">
<div id="results" class="content-main">
<div class="result-list" id="result-list-game">
<div class="result-item result-game-item">
<div onclick="window.location='http://www.biquge.com/32_32224/'" class="game-legend-a" ontouchstart="this.className = 'result_content_div'" ontouchend="this.className = 'game-legend-a result_content_end';" ontouchmove="this.className = 'game-legend-a result_content_end';">
<div class="result-game-item-pic" style="width:80px;height:110px;">
<img src="http://www.biquge.com/files/article/image/33/32224/32224.jpg" alt="梦想&lt;em&gt;口袋&lt;/em&gt;" onerror="$(this).attr('src', '/static/img/novel-noimg.jpg')" style="border: 1px solid rgb(229,229,229);width:80px;height:110px;" class="result-game-item-pic-link-img" />
</div>
<div class="result-game-item-detail">
<h3 class="result-item-title result-game-item-title"> 梦想<em>口袋</em> </h3>
<p class="result-game-item-desc">第一百六十二章大结局(下)轮回! 时光飞逝,又过了三十三年。W wW、 Qb ⑸.C0M \ 天…</p>
<div class="result-game-item-info">
<p class="result-game-item-info-tag"> <span class="result-game-item-info-tag-title">作者:</span> <span> 天天不休 </span> </p>
<p class="result-game-item-info-tag"> <span class="result-game-item-info-tag-title">类型:</span> <span class="result-game-item-info-tag-title">都市言情</span> </p>
<p class="result-game-item-info-tag"> <span class="result-game-item-info-tag-title">更新时间:</span> <span class="result-game-item-info-tag-title">2016-08-18</span> </p>
<p class="result-game-item-info-tag"> <span class="result-game-item-info-tag-title">最新章节:</span> <span class="result-game-item-uspan" onclick="clickInner('http://www.biquge.com/32_32224/1796032.html');"> 第六十五章豪华,该死,龙过江 </span> </p>
</div>
</div>
<!--<a href="javascript:void(0);" onclick="clickOut('');" title="梦想口袋" class="result-game-item-title-link result-all-a"></a>-->
<div style="clear: both;"></div>
<!-- </div> -->
</div>
</div>
<div class="result-item result-game-item">
<div onclick="window.location='http://www.biquge.com/20_20518/'" class="game-legend-a" ontouchstart="this.className = 'result_content_div'" ontouchend="this.className = 'game-legend-a result_content_end';" ontouchmove="this.className = 'game-legend-a result_content_end';">
<div class="result-game-item-pic" style="width:80px;height:110px;">
<img src="http://www.biquge.com/files/article/image/21/20518/20518.jpg" alt="超次元&lt;em&gt;口袋&lt;/em&gt;" onerror="$(this).attr('src', '/static/img/novel-noimg.jpg')" style="border: 1px solid rgb(229,229,229);width:80px;height:110px;" class="result-game-item-pic-link-img" />
</div>
<div class="result-game-item-detail">
<h3 class="result-item-title result-game-item-title"> 超次元<em>口袋</em> </h3>
<p class="result-game-item-desc">平安夜,本想告别童贞,却没想到惨遭失恋。 莫小天对着流星许下若干愿望,却没想到获得一个超次元<em>口袋</em>,可以打开任何次元,并且从那里摸出一件东西。 炮姐的安全裤!吾王的咖喱棒!蓝胖子</p>
<div class="result-game-item-info">
<p class="result-game-item-info-tag"> <span class="result-game-item-info-tag-title">作者:</span> <span> 秋子鸣 </span> </p>
<p class="result-game-item-info-tag"> <span class="result-game-item-info-tag-title">类型:</span> <span class="result-game-item-info-tag-title">都市言情</span> </p>
<p class="result-game-item-info-tag"> <span class="result-game-item-info-tag-title">更新时间:</span> <span class="result-game-item-info-tag-title">2016-09-04</span> </p>
<p class="result-game-item-info-tag"> <span class="result-game-item-info-tag-title">最新章节:</span> <span class="result-game-item-uspan" onclick="clickInner('http://www.biquge.com/20_20518/1624811.html');"> 第一百八十章 监控学校 </span> </p>
</div>
</div>
<!--<a href="javascript:void(0);" onclick="clickOut('');" title="超次元口袋" class="result-game-item-title-link result-all-a"></a>-->
<div style="clear: both;"></div>
<!-- </div> -->
</div>
</div>
<div class="result-item result-game-item">
<div onclick="window.location='http://www.biquge.com/26_26070/'" class="game-legend-a" ontouchstart="this.className = 'result_content_div'" ontouchend="this.className = 'game-legend-a result_content_end';" ontouchmove="this.className = 'game-legend-a result_content_end';">
<div class="result-game-item-pic" style="width:80px;height:110px;">
<img src="http://www.biquge.com/files/article/image/27/26070/26070.jpg" alt...

接着获取搜索到的小说列表:先通过 class 名 result-list 找到列表容器,再从中取出所有 class 为 result-item 的元素:

// Find the container that holds the search hits. Note that get(0) throws
// IndexOutOfBoundsException when the page has no "result-list" element.
es = doc.getElementsByClass("result-list");
element = es.get(0);

// Each "result-item" element inside that container is one search hit.
results = element.getElementsByClass("result-item");

item的HTML:

<div class="result-item result-game-item"> 
<div onclick="window.location='http://www.biquge.com/32_32224/'" class="game-legend-a" ontouchstart="this.className = 'result_content_div'" ontouchend="this.className = 'game-legend-a result_content_end';" ontouchmove="this.className = 'game-legend-a result_content_end';">
<div class="result-game-item-pic" style="width:80px;height:110px;">
<img src="http://www.biquge.com/files/article/image/33/32224/32224.jpg" alt="梦想&lt;em&gt;口袋&lt;/em&gt;" onerror="$(this).attr('src', '/static/img/novel-noimg.jpg')" style="border: 1px solid rgb(229,229,229);width:80px;height:110px;" class="result-game-item-pic-link-img" />
</div>
<div class="result-game-item-detail">
<h3 class="result-item-title result-game-item-title"> 梦想<em>口袋</em> </h3>
<p class="result-game-item-desc">第一百六十二章大结局(下)轮回! 时光飞逝,又过了三十三年。W wW、 Qb ⑸.C0M \ 天…</p>
<div class="result-game-item-info">
<p class="result-game-item-info-tag"> <span class="result-game-item-info-tag-title">作者:</span> <span> 天天不休 </span> </p>
<p class="result-game-item-info-tag"> <span class="result-game-item-info-tag-title">类型:</span> <span class="result-game-item-info-tag-title">都市言情</span> </p>
<p class="result-game-item-info-tag"> <span class="result-game-item-info-tag-title">更新时间:</span> <span class="result-game-item-info-tag-title">2016-08-18</span> </p>
<p class="result-game-item-info-tag"> <span class="result-game-item-info-tag-title">最新章节:</span> <span class="result-game-item-uspan" onclick="clickInner('http://www.biquge.com/32_32224/1796032.html');"> 第六十五章豪华,该死,龙过江 </span> </p>
</div>
</div>
<!--<a href="javascript:void(0);" onclick="clickOut('');" title="梦想口袋" class="result-game-item-title-link result-all-a"></a>-->
<div style="clear: both;"></div>
<!-- </div> -->
</div>
</div>

 

解析item:

// Turn every search hit into a Book and append it to the list.
// Each field is extracted inside its own try/catch: when a node is missing
// the stack trace is printed, only that field is left unassigned, and the
// Book is still added.
for (Element result : results) {
    Book book = new Book();

    try {
        // Novel URL: the wrapper div's onclick looks like window.location='<url>',
        // so the URL is the second piece after splitting on single quotes.
        String onclickScript = result.getElementsByClass("game-legend-a").get(0).attr("onclick").trim();
        String[] quotedParts = onclickScript.split("'");
        book.url = quotedParts[1];
    } catch (Exception e) {
        e.printStackTrace();
    }

    try {
        // Cover image URL: src of the <img> inside the picture container.
        Element coverBox = result.getElementsByClass("result-game-item-pic").get(0);
        String coverSrc = coverBox.select("img").attr("src");
        book.src = coverSrc.trim();
    } catch (Exception e) {
        e.printStackTrace();
    }

    try {
        // Novel title, with any literal <em> markers removed.
        String rawTitle = result.getElementsByClass("result-item-title").get(0).text();
        book.name = rawTitle.replace("<em>", "").replace("</em>", "").trim();
    } catch (Exception e) {
        e.printStackTrace();
    }

    try {
        // Synopsis, with any literal <em> markers removed.
        String rawDescription = result.getElementsByClass("result-game-item-desc").get(0).text();
        book.description = rawDescription.replace("<em>", "").replace("</em>", "").trim();
    } catch (Exception e) {
        e.printStackTrace();
    }

    try {
        // Author: second <span> of the first info row.
        Element authorRow = result.getElementsByClass("result-game-item-info-tag").get(0);
        Element authorValue = authorRow.select("span").get(1);
        book.autor = authorValue.text().trim();
    } catch (Exception e) {
        e.printStackTrace();
    }

    try {
        // Genre: second <span> of the second info row.
        Element genreRow = result.getElementsByClass("result-game-item-info-tag").get(1);
        Element genreValue = genreRow.select("span").get(1);
        book.type = genreValue.text().trim();
    } catch (Exception e) {
        e.printStackTrace();
    }

    try {
        // Last update date: second <span> of the third info row.
        Element updatedRow = result.getElementsByClass("result-game-item-info-tag").get(2);
        Element updatedValue = updatedRow.select("span").get(1);
        book.lastUpdateTime = updatedValue.text().trim();
    } catch (Exception e) {
        e.printStackTrace();
    }

    try {
        // Latest chapter: second <span> of the fourth info row.
        Element chapterRow = result.getElementsByClass("result-game-item-info-tag").get(3);
        Element chapterValue = chapterRow.select("span").get(1);
        book.lastUpdateChapter = chapterValue.text().trim();
    } catch (Exception e) {
        e.printStackTrace();
    }

    list.add(book);
}

这样就解析完成了。