分享一个爬取哈尔滨理工大学(HRBUST)学生成绩的Python程序(使用OCR自动识别验证码)

时间:2023-01-17 14:58:54

Python版本:3.5.2

日期:2018/1/21

__Author__ = "Lance#"

# -*- coding: utf-8 -*-

from urllib import request
from urllib import parse
from http import cookiejar
from aip.ocr import AipOcr
import re


class Hust(object):
    """Log in to the HRBUST academic site and fetch the student's scores.

    The captcha shown by the login form is downloaded and recognised through
    the ``AipOcr`` client; the recognised text is then posted together with
    the student id and password.  All requests share one cookie jar so the
    session established by ``login`` is reused by ``get_score``.
    """

    # Layout of the flat <td> cell list consumed by display(): the first
    # cells are skipped and the rest is read in fixed-size groups, one group
    # per course.  (Presumably the skipped cells belong to a page header --
    # confirm against the live page.)
    _HEADER_CELLS = 3
    _CELLS_PER_ROW = 13

    # Regular expression used to pull every table cell out of the score page.
    _TD_PATTERN = re.compile('<td>(.*?)</td>', re.S)

    def __init__(self, stu_id, passwd):
        """Prepare URLs, request headers, credentials and the cookie-aware opener.

        Args:
            stu_id: student id, sent as the ``j_username`` form field.
            passwd: password, sent as the ``j_password`` form field.
        """
        # Captcha image, login endpoint and score query page.
        self.__url_check = "http://jwzx.hrbust.edu.cn/academic/getCaptcha.do"
        self.__url_login = "http://jwzx.hrbust.edu.cn/academic/j_acegi_security_check"
        self.__url_score = "http://jwzx.hrbust.edu.cn/academic/manager/score/studentOwnScore.do"
        # Request headers that mimic a browser.
        self.__headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:57.0) Gecko/20100101 Firefox/57.0"
        }
        self.__captcha = ''
        # ID and keys you applied for yourself on the AI platform behind AipOcr.
        self.__APP_ID = 'xxxxxx'
        self.__API_KEY = 'xxxxxx'
        self.__SECRET_KEY = 'xxxxxx'
        # Login form fields (can be captured in the browser).
        self.__post_data = {
            "groupId": "",
            "j_username": stu_id,
            "j_password": passwd,
            "j_captcha": '',
        }
        # Cookie jar -> cookie processor -> opener, installed globally so
        # that every request.urlopen() call below shares the same session.
        self.__cookie = cookiejar.CookieJar()
        self.__cookieProc = request.HTTPCookieProcessor(self.__cookie)
        self.__opener = request.build_opener(self.__cookieProc)
        request.install_opener(self.__opener)

    @staticmethod
    def _extract_captcha(res):
        """Return the first recognised word of an OCR response, or '' if none.

        An empty ``words_result`` list, a response without that key (e.g. an
        error payload) or a non-dict response all yield the empty string so
        the caller can simply retry instead of crashing.
        """
        try:
            return res['words_result'][0]['words']
        except (KeyError, IndexError, TypeError):
            return ''

    @classmethod
    def _group_rows(cls, cells):
        """Split the flat cell list into complete per-course rows.

        The leading ``_HEADER_CELLS`` cells are skipped; a trailing group
        shorter than ``_CELLS_PER_ROW`` is dropped.
        """
        width = cls._CELLS_PER_ROW
        first = cls._HEADER_CELLS
        # max() guards against inputs shorter than the skipped prefix.
        row_count = max((len(cells) - first) // width, 0)
        return [cells[start:start + width]
                for start in range(first, first + row_count * width, width)]

    def ocr_captcha(self):
        """Download the captcha image and store the OCR-recognised text.

        Stores the empty string when the OCR service recognises nothing.
        """
        req = request.Request(self.__url_check, headers=self.__headers)
        # Close the HTTP response as soon as the image bytes are read.
        with request.urlopen(req) as resp:
            image = resp.read()
        # OCR service call.
        client = AipOcr(self.__APP_ID, self.__API_KEY, self.__SECRET_KEY)
        self.__captcha = self._extract_captcha(client.basicGeneral(image))

    def get_captcha(self):
        """Return the most recently recognised captcha text."""
        return self.__captcha

    def set_postdata(self):
        """Copy the recognised captcha into the login form data."""
        self.__post_data["j_captcha"] = self.__captcha

    def login(self):
        """Post the login form and return the response page as text."""
        # urlencode turns the form dict into a URL-encoded string.
        data = parse.urlencode(self.__post_data).encode()
        req = request.Request(self.__url_login, headers=self.__headers)
        with request.urlopen(req, data=data) as resp:
            # The login page is GBK-encoded.
            return resp.read().decode("GBK")

    def get_score(self):
        """Fetch the score page and return the text of every ``<td>`` cell."""
        req = request.Request(self.__url_score, headers=self.__headers)
        with request.urlopen(req) as resp:
            page = resp.read().decode()
        # Tidy each captured cell.
        return [cell.replace("\n ", "") for cell in self._TD_PATTERN.findall(page)]

    def display(self, list):
        """Print the score table built from the cell list of ``get_score``.

        Args:
            list: flat list of cell strings (name kept for compatibility).
        """
        print("学年 学期 及格标志 分数 学分 课程名")
        for item in self._group_rows(list):
            # Failed scores are wrapped in a red <span>; strip the markup.
            score = item[6].replace('<span style=" color:#FF0000">', "").replace("</span>", "")
            print("{} {} {:>5s} {:5s} {:^5s} {:^20s}".format(
                item[0], item[1], item[12], score, item[7], item[3]))


def main():
    """Retry captcha recognition until login succeeds, then print the scores."""
    attempt = 1
    err_str = "输入的验证码不正确!"
    # Put your own student id and password here.
    stu = Hust("xxxxxx", "xxxxxx")
    while True:
        stu.ocr_captcha()
        print("识别到的验证码为: %s ------ " % stu.get_captcha(), end="")
        # An empty recognition result cannot be right; skip the login
        # request and go straight to the next attempt.
        if stu.get_captcha():
            stu.set_postdata()
            html = stu.login()
            if err_str not in html:
                print("验证码正确")
                break
        attempt += 1
        print("验证码错误,启动第%d次识别" % attempt)
    print()
    print("Score Info".center(70, "-"))
    stu.display(stu.get_score())
    print("End".center(70, "-"))


if __name__ == '__main__':
    main()

完成效果图:

分享一个爬取HUST(哈理工)学生成绩的Python程序(OCR自动识别验证码)

请自动忽略这个人挂科的消息,0.0