JAVA爬取is老师信息并存到数据库

时间:2022-09-18 17:54:57
package CityU;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;

import org.apache.commons.httpclient.*;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.HasChildFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.nodes.TagNode;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.tags.TableHeader;
import org.htmlparser.tags.TableRow;
import org.htmlparser.tags.TableTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

public class extractTeachersInfos {

// 创建静态全局变量
static Connection conn;
static Statement st;

public static void main(String[] args) {

ArrayList<String> links= extractLinks("http://www.cb.cityu.edu.hk/is/people/academic/");
for (int i = 0; i < links.size(); i++) {
HashMap<String, String> staffInfo = extractStaff(links.get(i));
/*键有address, phone, fax, email, personalweb
* */
insert(staffInfo);


}

}


public static ArrayList<String> extractLinks(String url) {
ArrayList<String> links =new ArrayList<String>();
try {
Parser parser = new Parser();
parser.setURL(url);
parser.setEncoding(parser.getEncoding());
//第一步到staff表格
NodeFilter Tablefilter = new HasAttributeFilter( "class","staff_profile" );
NodeList tableNodelist = parser.extractAllNodesThatMatch(Tablefilter);
TableTag staffTable = (TableTag)tableNodelist.elementAt(0);
TableRow[] rows = staffTable.getRows();
for (int i = 0; i < rows.length; i++) {
//到每一行
TableRow row =rows[i];
TableColumn[] columns =row.getColumns();
for (int j = 0; j < columns.length; j++) {
//到每一列
String tempString = columns[j].toHtml();
Parser parser2 = new Parser(tempString);
//通过图片的父类tag或得链接信息
NodeFilter innerFilter = new TagNameFilter("img");
NodeFilter filter = new HasChildFilter(innerFilter);
NodeList nodes = parser2.extractAllNodesThatMatch(filter);
TagNode node = (TagNode)nodes.elementAt(0);
String nodeString = node.getText();
String nodeInfo =nodeString.substring(nodeString.indexOf("\"")); //获取tag的属性信息
nodeInfo= nodeInfo.replaceAll("\"","");
links.add("http://www.cb.cityu.edu.hk"+nodeInfo);
}
}
} catch (Exception e) {
e.printStackTrace();
}
return links;
}

public static HashMap<String, String> extractStaff(String url){
HashMap<String, String> staffInfo = new HashMap<String, String>();
try {
//第一个parser 获得 cityu-cb-staff-contact所在的那个div
Parser parser = new Parser();
parser.setURL(url);
parser.setEncoding(parser.getEncoding());
NodeFilter filter = new HasAttributeFilter( "id","cityu-cb-staff-contact" );
NodeList nodes = parser.extractAllNodesThatMatch(filter);
String divInfo = nodes.elementAt(0).toHtml();
//第二个parser开始对table解析
Parser tableParser =Parser.createParser(divInfo, "utf-8") ;
//tableParser.setURL(divInfo);
//tableParser.setEncoding(parser.getEncoding());
NodeFilter tableFilter = new NodeClassFilter(TableTag.class);
NodeList tableList = tableParser.extractAllNodesThatMatch(tableFilter);
for (int i=0; i<tableList.size(); i++) {
TableTag table = (TableTag) tableList.elementAt(i);
//取得表中的行集
TableRow[] rows = table.getRows();
//遍历每行
for (int r=0; r<rows.length; r++) {
String keyString=null;
String valueString =null;

TableRow tr = rows[r];
//从而得到<th> tag
TableHeader[] headers = tr.getHeaders();
for (int j = 0; j < headers.length; j++) {
// System.out.println("tag标签为:" + headers[j].getTagName());
// System.out.println("标签的内容为:" + headers[j].getStringText());
keyString = headers[j].getStringText().toLowerCase();
keyString=keyString.replaceAll(" ","");
keyString = keyString.substring(0,keyString.indexOf(":"));
}
//行中的列
TableColumn[] td = tr.getColumns();
if (td.length==0){
r++;
TableRow nextTr = rows[r];
TableColumn[] nextTd = nextTr.getColumns();
valueString=nextTd[0].toPlainTextString().trim();
}else {
valueString=td[0].toPlainTextString().trim();
}
staffInfo.put(keyString, valueString);
}
}

} catch (ParserException e) {
e.printStackTrace();
}

return staffInfo;
}

public static void insert(HashMap<String, String> staffInfo) {
String adress=staffInfo.get("address")==null?null:"\""+staffInfo.get("address")+"\"";
String phone=staffInfo.get("phone")==null?null:"\""+staffInfo.get("phone")+"\"";
String fax =staffInfo.get("fax")==null?null:"\""+staffInfo.get("fax")+"\"";
String email=staffInfo.get("e-mail")==null?null:"\""+staffInfo.get("e-mail")+"\"";
String personal_web= staffInfo.get("personalweb")==null?null:"\""+staffInfo.get("personalweb")+"\"";
conn = getConnection();// 首先要获取连接,即连接到数据库
String sqlString = "VALUES("+adress+","+phone +","+fax+","+email +","+personal_web+")";
try {
String sql = "INSERT INTO cityu_staff(address, phone, fax,email, personal_web)"+ sqlString;// 插入数据的sql语句

st = (Statement) conn.createStatement();// 创建用于执行静态sql语句的Statement对象

int count = st.executeUpdate(sql);// 执行插入操作的sql语句,并返回插入数据的个数

System.out.println("向staff表中插入 " + count + " 条数据");//输出插入操作的处理结果

conn.close();//关闭数据库连接

} catch (SQLException e) {
System.out.println("插入数据失败" + e.getMessage());
}
}
public static Connection getConnection() {
Connection con = null;//创建用于连接数据库的Connection对象
try {
Class.forName("com.mysql.jdbc.Driver");// 加载Mysql数据驱动

con = DriverManager.getConnection(
"jdbc:mysql://localhost:3306/myuser", "root", "1234");// 创建数据连接

} catch (Exception e) {
System.out.println("数据库连接失败" + e.getMessage());
}
return con;//返回所建立的数据库连接
}
}