# Python3批量爬取指定微博中的图片 (Python 3: batch-crawl the images of the specified Weibo posts)
import requests
import json
import re
import time
def downloadImage(href, cnt, save_dir="E:\\pics\\", timeout=30):
    """Download one image and store it as ``<save_dir><cnt><ext>``.

    Args:
        href: URL of the image to fetch.
        cnt: Value used as the file-name stem; converted with ``str()``.
        save_dir: Directory the file is written to; created when missing.
            Defaults to the location the script has always used.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None. A failed download is reported on stdout and skipped so that a
        single bad image does not abort a long batch run.
    """
    import os
    from urllib.parse import urlsplit

    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36",
    }
    try:
        # Without a timeout a stalled connection would block the crawl forever.
        resp = requests.get(href, headers=headers, timeout=timeout)
        # Do not save an HTTP error page to disk as if it were an image.
        resp.raise_for_status()
    except requests.RequestException as err:
        print("图片" + str(cnt) + "下载失败: " + str(err))
        return

    # Keep the real image type (callers also pass .gif URLs); fall back to
    # the historical ".jpg" when the URL carries no recognisable extension.
    ext = os.path.splitext(urlsplit(href).path)[1].lower()
    if ext not in (".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp"):
        ext = ".jpg"

    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    # The with-block closes the file; no explicit close() is needed.
    with open(os.path.join(save_dir, str(cnt) + ext), "wb") as f:
        f.write(resp.content)
    print("图片" + str(cnt) + "已下载")
def pic_spider(headers, weibo_id, surl="/mblog/picAll/Ku9M9q9cQ?rl=1",
               base_url="/", timeout=30):
    """Download every ``<img>`` found on a post's picture-list page.

    Args:
        headers: HTTP headers sent with the page request.
        weibo_id: Post identifier; prefixed to the index of each image to
            build the saved file's name.
        surl: Path (or absolute URL) of the picture-list page.
        base_url: URL that ``surl`` is resolved against.
        timeout: Seconds to wait for the page before giving up.
    """
    from urllib.parse import urljoin

    # NOTE(review): the default base_url has no scheme or host, and requests
    # refuses URLs without a scheme -- presumably the site origin was
    # stripped from this file; confirm and restore it (or pass base_url).
    # urljoin avoids the doubled slash of plain concatenation and also
    # copes with an absolute ``surl``.
    url = urljoin(base_url, surl)
    res = requests.get(url, headers=headers, timeout=timeout)

    pic_url_ls = re.findall(r'<img src="(.*?)"', res.text)
    for i, pic_url in enumerate(pic_url_ls):
        downloadImage(pic_url, str(weibo_id) + str(i))
def main(start_index=0):
    """Crawl the pictures of every post id listed in ``weibo_id.json``.

    For each id the post's comment page is fetched. If the page links to a
    multi-picture album the album is handed to ``pic_spider``; otherwise the
    first jpg (or, failing that, the first gif) on the page is downloaded.

    Args:
        start_index: Position in the id list to start from, so an
            interrupted run can be resumed without re-crawling earlier ids.
    """
    request_timeout = 30  # seconds; keeps a stalled server from hanging the run

    # presumably a JSON array of post ids -- verify against the data file
    with open("weibo_id.json", "r", encoding="utf-8") as f:
        weibo_id_ls = json.load(f)

    # NOTE(review): base_url is empty, so the request URL has no scheme or
    # host and requests will reject it -- presumably the site origin was
    # stripped from this file; confirm and restore it.
    base_url = ""
    # NOTE(security): a logged-in session cookie is hard-coded below; it
    # should be loaded from the environment or a private config file rather
    # than committed with the source.
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36",
        "cookie": "SCF=AqVSTARJXRNZouo6nNF9xKz9Al9c_XbFdUndXfHBZANMf_O3I1wzz_pEtetOy0hNNNfGEZdvePHWT6mws0tpf34.; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9W5-_hGmCeI-rYRjN3uql2sS5NHD95Qf1hz0S0nR1KMRWs4Dqcj-i--fi-z7iKysi--fi-2RiKnp9Jqt; _T_WM=9f3070657cd64c960c880a9e53056c8c; SUB=_2A25MsDbuDeRhGeNG6VEW-CbFzDSIHXVsW1qmrDV6PUJbkdANLUnekW1NS3quSjDPfopWfmhP9G3AZ7_MLspv1O1D; SSOLoginState=1639204542"
    }

    # Raw strings: "\[" and "\." are invalid escapes in ordinary literals.
    album_link_re = re.compile(r'\[<a href="(.*?)">组图共')
    # [^"] keeps a match inside one src attribute instead of letting it run
    # on into a later tag on the same line. jpg is tried before gif.
    single_pic_res = (
        (re.compile(r'<img src="http:([^"]*?)\.jpg'), ".jpg"),
        (re.compile(r'<img src="http:([^"]*?)\.gif'), ".gif"),
    )

    for i, weibo_id in enumerate(weibo_id_ls):
        if i < start_index:
            continue
        time.sleep(10)  # throttle: one post every 10 seconds
        url = f"{base_url}/comment/{weibo_id}"
        print(i, url)
        res = requests.get(url, headers=headers, timeout=request_timeout)

        # This marker text is treated as "the post URL is no longer valid".
        if "如果没有自动跳转" in res.text:
            # f-string so a non-string id (e.g. an int) cannot raise TypeError
            print(f"{weibo_id}网址已失效")
            continue

        album_links = album_link_re.findall(res.text)
        if album_links:
            pic_spider(headers, weibo_id, album_links[0])
            continue

        # No album link: dump the page (debug aid) and fall back to the
        # first single picture embedded in it.
        print(res.text)
        for pattern, ext in single_pic_res:
            found = pattern.findall(res.text)
            if found:
                downloadImage("http:" + found[0] + ext, str(weibo_id) + "0")
                break
        else:
            # Neither a jpg nor a gif was found; skip instead of crashing
            # with IndexError on an empty match list.
            print(f"{weibo_id}未找到图片")
# Script entry point: start the crawl only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()