jsoup 解析html 页面数据

时间:2022-11-01 00:25:31

我的 HTML 页面元素(XPath 路径):

/html/body/table[2]/tbody/tr[1]/td/table/tbody/tr[1]/td[2]/font
/html/body/table[2]/tbody/tr[1]/td/table/tbody/tr[2]/td[2]/div/font/span
/html/body/table[2]/tbody/tr[3]/td/font/b
/html/body/table[2]/tbody/tr[5]/td/div/table/tbody/tr[1]/td[1]/div/b/font/span

以下是代码实现:

  

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Downloads an HTML page, extracts its text with jsoup and writes the text to a file.
 *
 * <p>{@link #main(String[])} fetches a fixed URL, collects the text of the page via
 * {@link #gettxtlist(String)} and stores it through {@link #writefile(String)} as
 * {@code test.txt} inside {@link #txtpathstr}.
 */
public class JsoupParseHtml {

    /** Directory the output file {@code test.txt} is written into. */
    static String txtpathstr="d:\\one\\";

    /**
     * Fetches the body of {@code url} with an HTTP GET using Apache HttpClient.
     *
     * @param url the address to request
     * @return the response body as a string, or {@code null} when the status is not
     *         200, the response has no entity, or the request failed (the failure is
     *         printed, not rethrown)
     */
    public static String getHtmlByUrl(String url) {
        String html = null;
        HttpClient httpClient = new DefaultHttpClient();
        HttpGet httpget = new HttpGet(url);
        try {
            HttpResponse response = httpClient.execute(httpget);
            int statusCode = response.getStatusLine().getStatusCode();
            // Only a 200 response is treated as success.
            if (statusCode == HttpStatus.SC_OK) {
                HttpEntity entity = response.getEntity();
                if (entity != null) {
                    // Fall back to UTF-8 when the response declares no charset; the
                    // single-argument overload falls back to ISO-8859-1, which
                    // garbles non-Latin pages.
                    html = EntityUtils.toString(entity, "UTF-8");
                }
            }
        } catch (Exception e) {
            System.out.println("访问【"+url+"】出现异常!");
            e.printStackTrace();
        } finally {
            // Always release the connections held by the client.
            httpClient.getConnectionManager().shutdown();
        }
        return html;
    }

    /**
     * Entry point: extracts the text of a fixed local page and writes it to disk.
     *
     * @param args unused
     * @throws Exception if the page cannot be loaded or the file cannot be written
     */
    public static void main(String[] args) throws Exception {
        String urlbase = "http://localhost:8080/1.htm";
        String contents = gettxtlist(urlbase);

        // Persist the extracted text.
        writefile(contents);
    }

    /**
     * Loads {@code txturl} with jsoup and returns the text of its {@code html} element(s),
     * each followed by {@code "\r\n"}. The accumulated text is also echoed to stdout.
     *
     * <p>A narrower selector (for example {@code "div.content"}) can be substituted for
     * {@code "html"} to target a specific part of the page.
     *
     * @param txturl the page to load
     * @return the extracted text; empty when no element matched
     * @throws IllegalStateException if the page could not be loaded after all retries
     * @throws Exception declared for backward compatibility with existing callers
     */
    public static String gettxtlist(String txturl) throws Exception {
        Document doc = jsoupconnect(txturl, 360000);
        if (doc == null) {
            // jsoupconnect returns null once its retries are exhausted; fail with a
            // descriptive error instead of a bare NullPointerException.
            throw new IllegalStateException("Unable to load page after retries: " + txturl);
        }

        StringBuilder content = new StringBuilder();
        for (Element el : doc.select("html")) {
            // Skip elements that contain more than one body.
            if (el.select("body").size() > 1) {
                continue;
            }
            content.append(el.text()).append("\r\n");
            System.out.println();
            System.out.println(content);
        }
        return content.toString();
    }

    /**
     * Fetches and parses {@code url} with jsoup, trying up to five times.
     *
     * @param url     the page to load
     * @param timeout the timeout passed to jsoup's {@code timeout(int)}
     * @return the parsed document, or {@code null} if every attempt failed
     */
    public static Document jsoupconnect(String url, int timeout) {
        Document doc = null;
        for (int attempt = 0; doc == null && attempt < 5; attempt++) {
            try {
                doc = Jsoup.connect(url)
                        .userAgent("Mozilla/5.0 (Windows NT 6.1; rv:5.0)")
                        .timeout(timeout)
                        .get();
            } catch (Exception e) {
                // Report the failure and fall through to the next attempt.
                e.printStackTrace();
            }
        }
        return doc;
    }

    /**
     * Writes {@code txtstr} to {@code test.txt} under {@link #txtpathstr}, creating the
     * directory when it does not exist and overwriting any existing file.
     *
     * @param txtstr the text to write (encoded with the platform default charset)
     * @throws Exception if the file cannot be opened or written
     */
    public static void writefile(String txtstr) throws Exception {
        File txtpath = new File(txtpathstr);
        if (!txtpath.exists()) {
            txtpath.mkdirs();
        }
        File htxt = new File(txtpathstr + "test.txt");
        // try-with-resources closes the stream even when write/flush throws.
        try (BufferedOutputStream outBuff = new BufferedOutputStream(new FileOutputStream(htxt))) {
            outBuff.write(txtstr.getBytes());
            outBuff.flush();
        }
    }

}

  

存在问题:目前只能一次性读取整个页面的文本,不能按要求按 table 分开提取,下一版本会解决这个问题。