Java爬虫:用Jsoup爬取龙腾网第一页信息

时间:2022-10-31 11:06:30

如果不太熟悉Jsoup,可以先看这篇介绍:http://blog.csdn.net/disiwei1012/article/details/51614177

要爬取的网站:http://www.ltaaa.com

======================下面是正文=========================

先看下需要爬取的网页
[图片:java爬虫(Jsoup爬取龙腾网)第一页信息]

先用Firebug看下龙腾网(http://www.ltaaa.com)的页面结构:
[图片:java爬虫(Jsoup爬取龙腾网)第一页信息]

再看下我抓取后的:
[图片:java爬虫(Jsoup爬取龙腾网)第一页信息]

实体:

/**
 * Mutable data holder for a single news entry.
 *
 * <p>Carries a numeric id, a title, a link ({@code href}), a content string
 * and an image link ({@code imghref}). No validation is performed; every
 * field may be {@code null} (or {@code 0} for the id) until it is set.
 */
public class News {

    /** Numeric identifier of the entry. */
    private int id;

    /** Headline text. */
    private String title;

    /** Link to the entry. */
    private String href;

    /** Body text of the entry. */
    private String content;

    /** Link to the image associated with the entry. */
    private String imghref;

    /** Creates an entry with all fields at their default values. */
    public News() {
    }

    /**
     * Creates a fully populated entry.
     *
     * @param title   headline text
     * @param href    link to the entry
     * @param content body text
     * @param imghref link to the associated image
     * @param id      numeric identifier
     */
    public News(String title, String href, String content, String imghref, int id) {
        this.id = id;
        this.title = title;
        this.href = href;
        this.content = content;
        this.imghref = imghref;
    }

    /** @return the numeric identifier */
    public int getId() {
        return this.id;
    }

    /** @param id the numeric identifier to store */
    public void setId(int id) {
        this.id = id;
    }

    /** @return the headline text */
    public String getTitle() {
        return this.title;
    }

    /** @param title the headline text to store */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the link to the entry */
    public String getHref() {
        return this.href;
    }

    /** @param href the link to store */
    public void setHref(String href) {
        this.href = href;
    }

    /** @return the body text */
    public String getContent() {
        return this.content;
    }

    /** @param content the body text to store */
    public void setContent(String content) {
        this.content = content;
    }

    /** @return the link to the associated image */
    public String getImghref() {
        return this.imghref;
    }

    /** @param imghref the image link to store */
    public void setImghref(String imghref) {
        this.imghref = imghref;
    }
}

代码

/**
 * Fetches the front page of http://www.ltaaa.com with jsoup, collects one
 * {@link News} entry per {@code <li>} found under the element with id
 * {@code ngro1}, and writes the entries to a local HTML table.
 */
public class JsoupLong {

    /** Page that is fetched; also the base against which relative links resolve. */
    private static final String SITE_URL = "http://www.ltaaa.com";

    /** id of the element whose {@code <li>} descendants are scraped. */
    private static final String LIST_ELEMENT_ID = "ngro1";

    /** Location of the generated HTML report. */
    private static final String OUTPUT_PATH = "d://longteng.html";

    /**
     * Runs the scraper once.
     *
     * @param args ignored
     * @throws Exception if the page cannot be fetched or the report cannot be written
     */
    public static void main(String[] args) throws Exception {
        getNews();
    }

    /**
     * Scrapes the front page and writes the report file.
     *
     * @return the scraped entries in page order, ids numbered from 1; never {@code null}
     * @throws IllegalStateException if the page has no element with id {@code ngro1}
     * @throws Exception if fetching the page or writing the report fails
     */
    static ArrayList<News> getNews() throws Exception {
        Document doc = Jsoup.connect(SITE_URL).get();
        Element container = doc.getElementById(LIST_ELEMENT_ID);
        if (container == null) {
            // Fail with a descriptive message instead of a bare NullPointerException.
            throw new IllegalStateException(
                    "Element with id '" + LIST_ELEMENT_ID + "' not found at " + SITE_URL);
        }

        ArrayList<News> newsList = new ArrayList<News>();
        int nextId = 1;
        for (Element item : container.getElementsByTag("li")) {
            Element link = item.select("a").first();
            if (link == null) {
                // An item without a link has neither title nor target; skip it.
                continue;
            }
            Element img = item.select("img").first();

            News news = new News();
            // absUrl resolves relative values against the page's base URI and
            // leaves already-absolute URLs untouched.
            news.setImghref(img == null ? "" : img.absUrl("src"));
            news.setTitle(link.text());
            news.setHref(link.absUrl("href"));
            news.setId(nextId++);
            newsList.add(news);
        }

        writeReport(newsList);
        return newsList;
    }

    /** Writes the entries as a UTF-8 HTML table to {@link #OUTPUT_PATH}. */
    private static void writeReport(ArrayList<News> newsList) throws java.io.IOException {
        File file = new File(OUTPUT_PATH);
        // Explicit UTF-8 so the bytes match the charset declared in the <meta> tag;
        // try-with-resources closes the file even if a write fails.
        try (java.io.Writer fw = new java.io.OutputStreamWriter(
                new java.io.FileOutputStream(file), java.nio.charset.StandardCharsets.UTF_8)) {
            fw.write("<html><head><meta charset='UTF-8'><title>by dqf</title></head><body><center>");
            fw.write("<br><br><br><br><table>");
            fw.write("<tr style='background-color: gray'>");
            fw.write("<td align='center'>id</td>");
            fw.write("<td align='center'>国籍</td>");
            fw.write("<td align='center'>标题</td>");
            fw.write("</tr>");

            for (News news : newsList) {
                fw.write("<tr>");
                fw.write("<td>" + news.getId() + "</td>");
                fw.write("<td><img src='" + escapeHtml(news.getImghref()) + "'></td>");
                fw.write("<td><a href='" + escapeHtml(news.getHref()) + "'>"
                        + escapeHtml(news.getTitle()) + "</a></td>");
                fw.write("</tr>");
            }

            fw.write("</table></center></body></html>");
        }
    }

    /** Escapes the characters that are special in HTML text and quoted attribute values. */
    private static String escapeHtml(String text) {
        if (text == null) {
            return "";
        }
        StringBuilder out = new StringBuilder(text.length() + 16);
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            switch (c) {
                case '&':
                    out.append("&amp;");
                    break;
                case '<':
                    out.append("&lt;");
                    break;
                case '>':
                    out.append("&gt;");
                    break;
                case '"':
                    out.append("&quot;");
                    break;
                case '\'':
                    out.append("&#39;");
                    break;
                default:
                    out.append(c);
            }
        }
        return out.toString();
    }

}