Jsoup爬取带登录验证码的网站

时间:2023-03-09 23:07:04
Jsoup爬取带登录验证码的网站

  今天学完爬虫之后,想着爬一下我们学校的教务系统,可是发现登录的时候有验证码。因此研究了一下如何用Jsoup爬取带验证码的网站:

大体的思路是:(需要注意的是__VIEWSTATE一直变化,所以我们每个页面都需要重新获取并带着爬取下一个页面)

  1.先爬取网站的主页,由于我们学校的网站是ASP.NET,所以需要爬到每个网页的__VIEWSTATE。同时爬取主页也可以获得一个cookie(ASP.NET_SessionId)

  2.带着__VIEWSTATE和ASP.NET_SessionId爬取验证码。(网上说有专门识别验证码的软件,在这里我只是把验证码下载到本地,然后由用户手动输入验证码)获取验证码图片的时候需要带着cookie去获取,用来标识这是本次session请求的验证码;如果不带sessionId,下载验证码之后再输入也是无效的。

  3.输入用户名,密码和验证码登录系统,登录系统需要携带一些其他参数(值为空也需要携带)。

  4.登录之后不能直接爬取成绩,需要先爬取登录成功之后的主页面,以获取新的__VIEWSTATE。

  5.爬完登录成功的主页之后就可以进行爬取成绩,将爬到的成绩收集起来,最后输出到html页面中。

(在这个爬虫的过程中需要注意__VIEWSTATE,每个页面都需要获取这个值,这个值放在input隐藏域中。另外爬取过程中请求头需要携带Referer参数(也就是表示你是从哪个页面跳转过来的),用来通过网站的防盗链检查)

下面是代码:

1.爬虫的入口

package cn.qlq.craw.JsoupCrawJWXT;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map.Entry;
import java.util.Scanner; /**
* 爬虫主的程序调度器(爬虫教务系统的入口)
*
* @author liqiang
*
*/
public class MainClass { public static void main(String[] args) { // 输入学号和密码
System.out.print("请输入你要查询学号:");
Scanner sc = new Scanner(System.in);
String xuehao = sc.next();
System.out.print("请输入密码:");
String password = sc.next();
// Console con = System.console();
// String pswd = new String(con.readPassword());// 因为读取的是字符数组,所以需要用new try {
DownloadLoginfo downloadLoginfo = new DownloadLoginfo();
LoginClass loginClass = new LoginClass();
GradeOutput gradeOutput = new GradeOutput();
// 1.访问主页,获取验证码与viewstate
downloadLoginfo.getLogInfo();
// 2.登录
loginClass.login(downloadLoginfo, xuehao, password);
for (Entry<String, String> entry : loginClass.getCookies().entrySet()) {
System.out.println("key:" + entry.getKey() + ";value" + entry.getValue());
}
CrawGrade crawGrade = new CrawGrade();
//3. 爬取成绩的上一个页面
crawGrade.crawGradeLastPage(downloadLoginfo.getCookies(), downloadLoginfo.getViewState(), xuehao);
List<String> condition = geneQueryCondition();
//4.循环分学年爬取成绩
for (String xuenian : condition) {
String html_content = crawGrade.crawGrade(xuenian, "2", downloadLoginfo.getCookies(),
// 4.1爬取成绩页面
downloadLoginfo.getViewState(), xuehao);
gradeOutput.collectGrade(html_content); }
//5.输出爬到的数据到html文件中
gradeOutput.outputDatas2Html();
} catch (IOException e) {
System.out.println("无法连接学校服务器");
} catch (Exception e) {
e.printStackTrace();
}
} /**
* 构造需要查询的年份和学期
*
* @return
*/
public static List<String> geneQueryCondition() {
List<String> condition = new ArrayList<String>();
condition.add("2014-2015");
condition.add("2015-2016");
condition.add("2016-2017");
condition.add("2017-2018");
return condition;
} }

2.爬取学校主页获取__VIEWSTATE和cookie

package cn.qlq.craw.JsoupCrawJWXT;

import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry; import org.jsoup.Connection;
import org.jsoup.Connection.Method;
import org.jsoup.Connection.Response;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element; /**
* url获取图片并且保存到本地
*
* @author liqiang
*
*/
// Requests the site's home page once to obtain the session cookies and the
// __VIEWSTATE token, then triggers the captcha download for that session.
public class DownloadLoginfo {

    /**
     * Cookies returned by the first request (the author observed a single
     * ASP.NET_SessionId cookie).
     */
    private Map<String, String> cookies;

    /**
     * Value of the __VIEWSTATE hidden field of the last parsed page.
     */
    private String viewState;

    /**
     * Creates an instance with no cookies and an empty view state.
     */
    public DownloadLoginfo() {
        this.cookies = new HashMap<String, String>();
        this.viewState = "";
    }

    /**
     * Requests the home page, stores the returned cookies and the page's
     * __VIEWSTATE, then downloads the captcha image using those cookies.
     *
     * @throws Exception if a request fails; an {@link IllegalStateException} is
     *                   thrown when the page carries no __VIEWSTATE field
     */
    public void getLogInfo() throws Exception {
        String urlLogin = "http://newjwc.tyust.edu.cn/";
        Connection connect = Jsoup.connect(urlLogin);

        // Forged browser-like request headers.
        // NOTE(review): the fixed Content-Length does not describe any real request
        // body — confirm whether the server needs this header at all.
        connect.header("Accept", "application/json, text/javascript, */*; q=0.01")
                .header("Accept-Encoding", "gzip, deflate");
        connect.header("Accept-Language", "zh-CN,zh;q=0.9").header("Connection", "keep-alive");
        connect.header("Content-Length", "213")
                .header("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8");
        connect.header("Host", "newjwc.tyust.edu.cn").header("Referer", "http://newjwc.tyust.edu.cn/");
        connect.header("User-Agent",
                "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36")
                .header("X-Requested-With", "XMLHttpRequest");

        // Execute the request and read the response.
        Response res = connect.ignoreContentType(true).method(Method.POST).execute();

        // Keep the cookies that came back.
        this.cookies = res.cookies();
        for (Entry<String, String> entry : cookies.entrySet()) {
            System.out.println(entry.getKey() + "-" + entry.getValue());
        }

        // Extract the view state from the response body.
        String body = res.body();
        this.getViewState(body);

        // Download the captcha with the same cookies so that it belongs to this session.
        JsoupDoloadPicture.downloadImg("http://newjwc.tyust.edu.cn/CheckCode.aspx", cookies);
    }

    /**
     * Extracts the __VIEWSTATE hidden field from a page and stores it in this object.
     *
     * @param htmlContent HTML of the page to parse
     * @return the value of the __VIEWSTATE field
     * @throws IllegalStateException if the page has no __VIEWSTATE field
     */
    public String getViewState(String htmlContent) {
        Document document = Jsoup.parse(htmlContent);
        Element ele = document.select("input[name='__VIEWSTATE']").first();
        if (ele == null) {
            // Previously this surfaced as a bare NullPointerException.
            throw new IllegalStateException("__VIEWSTATE hidden field not found in the page");
        }
        String value = ele.attr("value");
        this.viewState = value;
        return value;
    }

    /**
     * @return the cookies stored by the last call to {@link #getLogInfo()}
     */
    public Map<String, String> getCookies() {
        return cookies;
    }

    /**
     * @param cookies cookies to store
     */
    public void setCookies(Map<String, String> cookies) {
        this.cookies = cookies;
    }

    /**
     * @return the last stored __VIEWSTATE value
     */
    public String getViewState() {
        return viewState;
    }

    /**
     * @param viewState __VIEWSTATE value to store
     */
    public void setViewState(String viewState) {
        this.viewState = viewState;
    }
}

3.带着cookie爬取验证码,并下载到本地

package cn.qlq.craw.JsoupCrawJWXT;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.Map; import org.apache.commons.io.FileUtils;
import org.jsoup.Connection;
import org.jsoup.Jsoup; /**
* Jsoup带着cookie下载验证码到本地(必须带着cookie下载验证码,否则下载的验证码无效)
*
* @author liqiang
*
*/
// Downloads the captcha image with the session cookies and stores it on disk.
public class JsoupDoloadPicture {

    /**
     * Downloads the image at {@code url} with the given cookies and saves it as
     * {@code yzm.png} in the directory configured under key {@code file} of the
     * {@code path} resource bundle.
     *
     * @param url     address of the captcha image
     * @param cookies cookies sent with the request, so the captcha belongs to the
     *                caller's session
     * @throws IOException if the HTTP request fails
     */
    public static void downloadImg(String url, Map<String, String> cookies) throws IOException {
        Connection connect = Jsoup.connect(url);
        connect.cookies(cookies);
        connect.timeout(5 * 10000); // 50 000 ms
        Connection.Response response = connect.ignoreContentType(true).execute();
        byte[] img = response.bodyAsBytes();
        System.out.println(img.length);

        // Read the target directory from the configuration.
        String directory = ResourcesUtil.getValue("path", "file");
        savaImage(img, directory, "yzm.png");
    }

    /**
     * Writes the image bytes to {@code fileName} inside {@code filePath}.
     * I/O failures are reported on stderr and otherwise ignored (best effort).
     *
     * @param img      image content
     * @param filePath target directory; created if missing, and a non-directory
     *                 file at that path is deleted first
     * @param fileName name of the file to create inside the directory
     */
    public static void savaImage(byte[] img, String filePath, String fileName) {
        File dir = new File(filePath);
        BufferedOutputStream bos = null;
        try {
            // A plain file occupying the directory's path is removed first.
            if (dir.exists() && !dir.isDirectory()) {
                FileUtils.deleteQuietly(dir);
            }
            // mkdirs() also creates missing parent directories.
            dir.mkdirs();

            // Resolve the child against the directory instead of gluing the path
            // together with a hard-coded "\\", which only works on Windows.
            File file = new File(dir, fileName);
            bos = new BufferedOutputStream(new FileOutputStream(file));
            bos.write(img);
            // Flush before announcing success so the message is only printed once
            // the bytes have actually been handed to the file.
            bos.flush();
            System.out.println("验证码已经下载到:" + filePath);
        } catch (IOException e) {
            // Also covers FileNotFoundException, which is an IOException.
            e.printStackTrace();
        } finally {
            if (bos != null) {
                try {
                    // Closing the buffered stream closes the wrapped file stream too.
                    bos.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}

4.登录类

package cn.qlq.craw.JsoupCrawJWXT;

import java.util.Map;
import java.util.Map.Entry;
import java.util.Scanner; import org.jsoup.Connection;
import org.jsoup.Connection.Method;
import org.jsoup.Connection.Response;
import org.jsoup.Jsoup; /**
* 登录类(访问登录页面获取登录的cookie)
*
* @author liqiang
*
*/
// Performs the login request and keeps the cookies returned by it.
public class LoginClass {

    /**
     * Cookies returned by the login request.
     */
    private Map<String, String> cookies = null;

    /**
     * Asks the user for the captcha, posts the login form and stores the cookies
     * of the response. The response body is printed to stdout.
     *
     * @param downloadLoginfo source of the session cookies and the __VIEWSTATE
     *                        obtained from the home page
     * @param xuehao          student number, sent as {@code txtUserName}
     * @param mima            password, sent as {@code TextBox2}
     * @throws Exception if the request fails
     */
    public void login(DownloadLoginfo downloadLoginfo, String xuehao, String mima) throws Exception {
        String urlLogin = "http://newjwc.tyust.edu.cn/default2.aspx";
        Connection connect = Jsoup.connect(urlLogin);
        connect.timeout(5 * 100000); // 500 000 ms

        // Forged request headers.
        // NOTE(review): the fixed Content-Length is not derived from the real body,
        // and the xm value in the Referer looks like a hard-coded URL-encoded
        // student name — confirm whether the server depends on either.
        connect.header("Content-Length", "213").header("Content-Type", "application/x-www-form-urlencoded");
        connect.header("Host", "newjwc.tyust.edu.cn").header("Referer",
                "http://newjwc.tyust.edu.cn/xscjcx.aspx?xh=" + xuehao + "&xm=%C7%C7%C0%FB%C7%BF&gnmkdm=N121613");
        connect.header("User-Agent",
                "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36");

        // Ask the user for the captcha that was downloaded earlier.
        System.out.println("-----------请输入验证码---------");
        // Not closed on purpose: Scanner.close() would close System.in for the
        // whole process, breaking any later read from standard input.
        @SuppressWarnings("resource")
        Scanner sc = new Scanner(System.in);
        String yzm = sc.next();

        // Login form fields; the empty ones are sent as well.
        connect.data("txtUserName", xuehao).data("__VIEWSTATE", downloadLoginfo.getViewState()).data("TextBox2", mima)
                .data("Textbox1", "").data("RadioButtonList1", "").data("Button1", "").data("lbLanguage", "")
                .data("hidPdrs", "").data("hidsc", "").data("txtSecretCode", yzm);
        connect.cookies(downloadLoginfo.getCookies());

        // Execute the request and read the response.
        Response res = connect.ignoreContentType(true).method(Method.POST).execute();

        // Keep the cookies that came back.
        this.cookies = res.cookies();
        for (Entry<String, String> entry : cookies.entrySet()) {
            System.out.println(entry.getKey() + "-" + entry.getValue());
        }

        System.out.println("---------获取的登录之后的页面-----------");
        String body = res.body();
        System.out.println(body);
    }

    /**
     * @return the cookies of the last login response, or {@code null} before any login
     */
    public Map<String, String> getCookies() {
        return cookies;
    }

    /**
     * @param cookies cookies to store
     */
    public void setCookies(Map<String, String> cookies) {
        this.cookies = cookies;
    }
}

5.爬取登录之后的主页和成绩

package cn.qlq.craw.JsoupCrawJWXT;

import java.io.IOException;
import java.util.Map; import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element; /**
* 爬取成绩的类
*
* @author liqiang
*
*/
// Crawls the grade-query page and the grade pages, carrying the __VIEWSTATE of
// each response over to the next request.
public class CrawGrade {

    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36";

    /**
     * __VIEWSTATE of the most recently parsed page; {@code null} until a page was parsed
     * or {@link #setViewState(String)} was called.
     */
    private String viewState;

    /**
     * Extracts the __VIEWSTATE hidden field from a page and stores it in this object.
     *
     * @param html HTML of the page to parse
     * @return the value of the __VIEWSTATE field
     * @throws IllegalStateException if the page has no __VIEWSTATE field
     */
    public String getViewState(String html) {
        Document document = Jsoup.parse(html);
        Element ele = document.select("input[name='__VIEWSTATE']").first();
        if (ele == null) {
            // Previously this surfaced as a bare NullPointerException.
            throw new IllegalStateException("__VIEWSTATE hidden field not found in the page");
        }
        String value = ele.attr("value");
        this.viewState = value;
        return value;
    }

    /**
     * Crawls the page that precedes the grade query (the page reached right after
     * logging in), prints it and stores its __VIEWSTATE.
     *
     * @param cookies   session cookies
     * @param viewStata value sent in the {@code xm} form field
     * @param xuehao    student number
     * @return the HTML of the page
     * @throws IOException if the request fails
     */
    public String crawGradeLastPage(Map<String, String> cookies, String viewStata, String xuehao) throws IOException {
        String pageUrl = gradePageUrl(xuehao);
        Connection connect = Jsoup.connect(pageUrl);
        connect.timeout(5 * 100000); // 500 000 ms

        // Forged request headers.
        // NOTE(review): the fixed Content-Length is not derived from the real body —
        // confirm whether the server needs this header at all.
        connect.header("Content-Length", "74642").header("Content-Type", "application/x-www-form-urlencoded");
        // The Referer now uses the caller's student number; it used to contain a
        // hard-coded one.
        connect.header("Host", "newjwc.tyust.edu.cn").header("Referer", pageUrl);
        connect.header("User-Agent", USER_AGENT);

        // Form fields. The student number comes from the parameter; it used to be a
        // hard-coded literal.
        // NOTE(review): "xm" receives the view state here, while the URL uses xm for
        // what looks like an encoded name — confirm what the server expects.
        connect.data("xh", xuehao)
                .data("xm", viewStata)
                .data("hidLanguage", "")
                .data("gnmkdm", "N121613");
        connect.cookies(cookies);

        Document document = connect.post();
        System.out.println("-----------爬到的成绩的上一个页面--------------");
        String html = document.toString();
        System.out.println(html);

        // Remember the view state of this page for the next request.
        this.getViewState(html);
        return html;
    }

    /**
     * Crawls the grade page of one academic year / semester, prints it and stores
     * its __VIEWSTATE for the next request.
     *
     * @param xuenian   academic year, sent as {@code ddlXN}
     * @param xueqi     semester, sent as {@code ddlXQ}
     * @param cookies   session cookies
     * @param viewStata view state used only when this object has not stored one yet
     * @param xuehao    student number
     * @return the HTML of the grade page
     * @throws IOException if the request fails
     */
    public String crawGrade(String xuenian, String xueqi, Map<String, String> cookies, String viewStata, String xuehao)
            throws IOException {
        String pageUrl = gradePageUrl(xuehao);
        Connection connect = Jsoup.connect(pageUrl);
        connect.timeout(5 * 100000); // 500 000 ms

        // Forged request headers (see the note on Content-Length in crawGradeLastPage).
        connect.header("Accept",
                "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8")
                .header("Accept-Encoding", "gzip, deflate");
        connect.header("Accept-Language", "zh-CN,zh;q=0.9").header("Connection", "keep-alive");
        connect.header("Content-Length", "74642").header("Content-Type", "application/x-www-form-urlencoded");
        connect.header("Host", "newjwc.tyust.edu.cn").header("Referer", pageUrl);
        connect.header("User-Agent", USER_AGENT);

        // Prefer the view state of the page parsed last; fall back to the caller's
        // value when nothing has been parsed yet instead of sending null.
        String currentViewState = (this.viewState != null) ? this.viewState : viewStata;

        connect.data("__EVENTTARGET", "")
                .data("__EVENTARGUMENT", "")
                .data("__VIEWSTATE", currentViewState)
                .data("hidLanguage", "")
                .data("ddlXN", xuenian)
                .data("ddlXQ", xueqi)
                .data("btn_xn", "")
                .data("ddl_kcxz", "");
        connect.cookies(cookies);

        Document document = connect.post();
        System.out.println("-----------爬到的成绩的页面--------------");
        String html = document.toString();
        // Print before parsing so the page is visible even when it has no view state.
        System.out.println(html);

        // Update the view state for the next request.
        this.getViewState(html);
        return html;
    }

    /**
     * @param viewState __VIEWSTATE value to use for the next request
     */
    public void setViewState(String viewState) {
        this.viewState = viewState;
    }

    /** Builds the grade-query URL for the given student number. */
    private static String gradePageUrl(String xuehao) {
        return "http://newjwc.tyust.edu.cn/xscjcx.aspx?xh=" + xuehao + "&xm=%C7%C7%C0%FB%C7%BF&gnmkdm=N121613";
    }
}

6.收集成绩的类

package cn.qlq.craw.JsoupCrawJWXT;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements; /**
* 收集成绩与输出成绩
*
* @author liqiang
*
*/
// Collects the grade rows found in crawled pages and writes them out.
public class GradeOutput {

    /**
     * Collected grade rows; each map has the keys xuenian, xueqi, kecheng and chengji.
     */
    private List<Map<String, Object>> datas;

    /**
     * Creates an instance with no collected rows.
     */
    public GradeOutput() {
        this.datas = new ArrayList<Map<String, Object>>();
    }

    /**
     * Parses a grade page and appends every row of the {@code #Datagrid1} table,
     * except the first one, to the collected data. A page without that table
     * contributes nothing.
     *
     * @param html HTML of the grade page
     * @return always {@code null}
     */
    public String collectGrade(String html) {
        Document document = Jsoup.parse(html);
        Element table = document.select("#Datagrid1").first();
        if (table == null) {
            // No grade table in this page; skip it instead of failing with a
            // NullPointerException.
            return null;
        }

        // Every row except the header row.
        Elements rows = table.select("tr:gt(0)");
        for (Element row : rows) {
            Map<String, Object> result = new LinkedHashMap<String, Object>();
            result.put("xuenian", row.select("td:eq(0)").text()); // academic year
            result.put("xueqi", row.select("td:eq(1)").text());   // semester
            result.put("kecheng", row.select("td:eq(3)").text()); // course name
            result.put("chengji", row.select("td:eq(8)").text()); // grade
            this.datas.add(result);
        }
        return null;
    }

    /**
     * Prints the collected grades to stdout; does nothing when there are none.
     */
    public void outPutGrade() {
        if (this.datas == null || this.datas.size() == 0) {
            return;
        }
        System.out.println("-------下面是提取到的成绩--------");
        for (Map<String, Object> result : datas) {
            System.out.println(result.get("xuenian") + "\t" + result.get("xueqi") + "\t" + result.get("kecheng") + "\t"
                    + result.get("chengji") + "\t");
        }
    }

    /**
     * Writes the collected grades as an HTML table to {@code gradeOut.html} in the
     * directory configured under key {@code file} of the {@code path} resource
     * bundle. Nothing is written when no grades were collected.
     *
     * @throws IOException if the file cannot be written
     */
    public void outputDatas2Html() throws IOException {
        if (datas == null || datas.size() == 0) {
            return;
        }

        // Read the target directory from the configuration and resolve the file
        // against it instead of using a hard-coded "\\" separator.
        String directory = ResourcesUtil.getValue("path", "file");
        File file = new File(directory, "gradeOut.html");

        // Write UTF-8 explicitly and declare it in the page: the report contains
        // Chinese text, which the platform default charset may not represent.
        // Opening the stream creates the file when it does not exist yet.
        java.io.Writer writer = new java.io.OutputStreamWriter(new java.io.FileOutputStream(file), "UTF-8");
        try {
            writer.write("<html>");
            writer.write("<head>");
            writer.write("<meta charset='UTF-8'>");
            writer.write("<title>xxx成绩单</title>");
            writer.write("<style>table{width:100%;table-layout: fixed;word-break: break-all; word-wrap: break-word;}"
                    + "table td{border:1px solid black;width:300px}</style>");
            writer.write("</head>");
            writer.write("<body>");
            writer.write("<table cellpadding='0' cellspacing='0' style='text-align:center;'>");
            writer.write(
                    "<tr style='background-color:#95caca;font-size:20px'><td>学年</td><td>学期</td><td>课程名字</td><td>成绩</td></tr>");
            for (Map<String, Object> data : datas) {
                writer.write("<tr>");
                writer.write("<td>" + escapeHtml(data.get("xuenian")) + "</td>");
                writer.write("<td>" + escapeHtml(data.get("xueqi")) + "</td>");
                writer.write("<td>" + escapeHtml(data.get("kecheng")) + "</td>");
                writer.write("<td>" + escapeHtml(data.get("chengji")) + "</td>");
                writer.write("</tr>");
            }
            writer.write("</table>");
            writer.write("</body>");
            writer.write("</html>");
        } finally {
            // Close the file even when a write fails.
            writer.close();
        }
    }

    /**
     * @return the collected grade rows
     */
    public List<Map<String, Object>> getDatas() {
        return datas;
    }

    /**
     * @param datas grade rows to use from now on
     */
    public void setDatas(List<Map<String, Object>> datas) {
        this.datas = datas;
    }

    /** Converts a cell value to text and escapes the characters that are special in HTML. */
    private static String escapeHtml(Object value) {
        String text = String.valueOf(value);
        StringBuilder escaped = new StringBuilder(text.length() + 16);
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            switch (c) {
            case '&':
                escaped.append("&amp;");
                break;
            case '<':
                escaped.append("&lt;");
                break;
            case '>':
                escaped.append("&gt;");
                break;
            case '"':
                escaped.append("&quot;");
                break;
            case '\'':
                escaped.append("&#39;");
                break;
            default:
                escaped.append(c);
            }
        }
        return escaped.toString();
    }
}

path.properties (设置验证码图片和最后的成绩单输出到哪个位置)

#fileToSave
#yzm
file=C:\\Users\\liqiang\\Desktop

读取上述配置文件的工具类:

package cn.qlq.craw.JsoupCrawJWXT;

import java.io.Serializable;
import java.text.MessageFormat;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.ResourceBundle;
import java.util.Set; /**
* 资源文件读取工具类
*
*/
// Helper for reading values from resource bundles, always using the zh_CN locale.
public class ResourcesUtil implements Serializable {

    private static final long serialVersionUID = -7657898714983901418L;

    /**
     * Language part of the locale used for every lookup.
     */
    public static final String LANGUAGE = "zh";

    /**
     * Country part of the locale used for every lookup.
     */
    public static final String COUNTRY = "CN";

    /**
     * @return a locale built from {@link #LANGUAGE} and {@link #COUNTRY}
     */
    private static Locale getLocale() {
        return new Locale(LANGUAGE, COUNTRY);
    }

    /**
     * Looks up one key in a resource bundle.
     *
     * @param baseName base name of the resource bundle
     * @param section  key to look up
     * @return the value, or an empty string when the lookup fails for any reason
     *         (the stack trace of the failure is printed)
     */
    private static String getProperties(String baseName, String section) {
        try {
            ResourceBundle bundle = ResourceBundle.getBundle(baseName, getLocale());
            return bundle.getString(section);
        } catch (Exception e) {
            e.printStackTrace();
            return "";
        }
    }

    /**
     * Reads the value stored under a key of a resource bundle.
     *
     * @param fileName base name of the resource bundle
     * @param key      key to look up
     * @return the value, or an empty string when the lookup fails
     */
    public static String getValue(String fileName, String key) {
        return getProperties(fileName, key);
    }

    /**
     * Lists the keys of a resource bundle.
     *
     * @param baseName base name of the resource bundle
     * @return a new list holding the bundle's keys
     */
    public static List<String> gekeyList(String baseName) {
        ResourceBundle bundle = ResourceBundle.getBundle(baseName, getLocale());
        return new ArrayList<String>(bundle.keySet());
    }

    /**
     * Reads the value stored under a key of a resource bundle and formats it with
     * {@link MessageFormat}.
     *
     * @param fileName base name of the resource bundle
     * @param key      key to look up
     * @param objs     arguments for the placeholders of the pattern
     * @return the formatted value
     */
    public static String getValue(String fileName, String key, Object[] objs) {
        return MessageFormat.format(getValue(fileName, key), objs);
    }

    /**
     * Small manual check: prints key {@code 101} of bundle {@code resources.messages}
     * formatted with two arguments.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Object[] arguments = new Object[] { 100, 200 };
        String formatted = getValue("resources.messages", "101", arguments);
        System.out.println(formatted);
    }
}

git地址:https://github.com/qiao-zhi/javaCraw