Python3批量爬取指定微博中的图片

时间:2025-04-24 17:01:29
import json
import re
import time

import requests

# Seconds to wait on any single HTTP request before giving up, so that one
# stalled connection cannot hang the whole batch.
REQUEST_TIMEOUT = 30

USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36"
)

# Compiled once at import time; raw strings keep `\[` and `\.` as regex
# escapes instead of (invalid) Python string escapes.
_ALBUM_LINK_RE = re.compile(r'\[<a href="(.*?)">组图共')
_IMG_SRC_RE = re.compile(r'<img src="(.*?)"')
_SINGLE_JPG_RE = re.compile(r'<img src="http:(.*?)\.jpg')
_SINGLE_GIF_RE = re.compile(r'<img src="http:(.*?)\.gif')


def downloadImage(href, cnt):
    """Download one image and save it as ``E:\\pics\\<cnt>.jpg``.

    Args:
        href: Absolute URL of the image.
        cnt: Value used (via ``str``) as the file name stem.

    Note:
        The file is always written with a ``.jpg`` suffix, including when the
        caller passes a ``.gif`` URL.
    """
    headers = {"user-agent": USER_AGENT}
    dimg = requests.get(href, headers=headers, timeout=REQUEST_TIMEOUT).content
    # The context manager closes the file; no explicit close() is needed.
    with open("E:\\pics\\" + str(cnt) + ".jpg", "wb") as f:
        f.write(dimg)
    print("图片" + str(cnt) + "已下载")


def pic_spider(headers, weibo_id, surl="/mblog/picAll/Ku9M9q9cQ?rl=1", base_url="/"):
    """Download every image found on a post's "all pictures" page.

    Args:
        headers: HTTP headers sent with the page request.
        weibo_id: Post id; used as the prefix of each saved file name.
        surl: Path of the album page, appended to ``base_url``.
        base_url: Prefix prepended to ``surl``.
            NOTE(review): the default "/" does not form an absolute URL, so
            the site host looks like it was stripped from this source -
            confirm and pass the real host here.
    """
    url = base_url + surl
    res = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    pic_url_ls = _IMG_SRC_RE.findall(res.text)
    for i, pic_url in enumerate(pic_url_ls):
        # File name stem is the post id followed by the image index.
        downloadImage(pic_url, str(weibo_id) + str(i))


def _find_single_image_url(html):
    """Return the first ``.jpg`` image URL in *html*, else the first ``.gif``, else None."""
    pic = _SINGLE_JPG_RE.findall(html)
    if pic:
        return "http:" + pic[0] + ".jpg"
    pic = _SINGLE_GIF_RE.findall(html)
    if pic:
        return "http:" + pic[0] + ".gif"
    return None


def main(base_url=""):
    """Download the images of every post id listed in ``weibo_id.json``.

    For each id the comment page is fetched (after a 10 second pause). If the
    page links to a multi-image album, the album is handed to ``pic_spider``;
    otherwise the first ``.jpg`` (or, failing that, ``.gif``) image on the
    page is downloaded.

    Args:
        base_url: Prefix prepended to ``/comment/<id>``.
            NOTE(review): the default "" does not form an absolute URL, so
            the site host looks like it was stripped from this source -
            confirm and pass the real host here.
    """
    with open("weibo_id.json", "r", encoding="utf-8") as f:
        weibo_id_ls = json.load(f)

    # NOTE(review): a session cookie is hard-coded here. It is a credential
    # and should be rotated and loaded from outside the source file.
    headers = {
        "user-agent": USER_AGENT,
        "cookie": (
            "SCF=AqVSTARJXRNZouo6nNF9xKz9Al9c_XbFdUndXfHBZANMf_O3I1wzz_pEtetOy0hNNNfGEZdvePHWT6mws0tpf34.; "
            "SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9W5-_hGmCeI-rYRjN3uql2sS5NHD95Qf1hz0S0nR1KMRWs4Dqcj-i--fi-z7iKysi--fi-2RiKnp9Jqt; "
            "_T_WM=9f3070657cd64c960c880a9e53056c8c; "
            "SUB=_2A25MsDbuDeRhGeNG6VEW-CbFzDSIHXVsW1qmrDV6PUJbkdANLUnekW1NS3quSjDPfopWfmhP9G3AZ7_MLspv1O1D; "
            "SSOLoginState=1639204542"
        ),
    }

    for i, weibo_id in enumerate(weibo_id_ls):
        # Pause between posts to keep the request rate low.
        time.sleep(10)
        url = f"{base_url}/comment/{weibo_id}"
        print(i, url)
        res = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)

        # The page shows this redirect notice when the post is unavailable.
        if "如果没有自动跳转" in res.text:
            # str() so that integer ids from the JSON file do not raise.
            print(str(weibo_id) + "网址已失效")
            continue

        album_links = _ALBUM_LINK_RE.findall(res.text)
        if album_links:
            pic_spider(headers, weibo_id, album_links[0])
            continue

        # No album link: dump the page for inspection, then fall back to the
        # first single image on it.
        print(res.text)
        pic_url = _find_single_image_url(res.text)
        if pic_url is None:
            # Previously this indexed an empty list and crashed the batch.
            print(str(weibo_id) + "未找到图片")
            continue
        downloadImage(pic_url, str(weibo_id) + "0")


if __name__ == '__main__':
    main()