【python】nvshens按目录图片批量下载爬虫1.00(多线程版)

时间:2024-03-09 14:57:56
# nvshens按目录图片批量下载爬虫1.00(多线程版)
from bs4 import BeautifulSoup
import requests
import datetime
import urllib.request
import os
import threading

# Spoof a legacy browser User-Agent so the site serves the pages normally.
# (Fixed the malformed token "MEIE" -> "MSIE" and the broken \' quoting.)
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}


# 下载图片到本地
# 下载图片到本地
def downloadPics(pictures):
    """Download every image URL in *pictures* to a local folder.

    pictures: list of image URLs shaped like .../<folder>/<filename>;
              the list is consumed in place (popped from the end).
    A failed download is pushed back onto the list and retried until it
    succeeds (original best-effort behavior kept).
    """
    while pictures:
        pic = pictures.pop()

        # The last two path segments give the file name and its folder.
        name = pic.split('/')[-1]
        folder = pic.split('/')[-2]

        # 判断目录是否存在,不存在则创建之 (exist_ok avoids the check/create race)
        os.makedirs('./' + folder, exist_ok=True)

        try:
            # Close the HTTP response deterministically via the context manager.
            with urllib.request.urlopen(pic) as rsp:
                img = rsp.read()
            with open('./' + folder + "/" + name, 'wb') as f:
                f.write(img)
            print('图片' + pic + '下载完成')
        except Exception:
            # NOTE(review): a permanently broken URL loops forever here;
            # kept as-is to preserve the original "retry until done" behavior.
            print('图片' + pic + '下载异常,塞回重试')
            pictures.append(pic)


#下载线程类
# 下载线程类: crawl one gallery (following "下一页" links), collect the
# image URLs, then download them all via downloadPics().
class dldThread(threading.Thread):
    def __init__(self, name, url):
        """name: thread/gallery id used in log messages.
        url:  first page of the gallery to crawl."""
        threading.Thread.__init__(self, name=name)
        self.name = name
        self.url = url
        self.pictures = []  # image URLs collected while crawling

    def run(self):
        # self.url is set to the sentinel "none" after each fetch; finding a
        # "下一页" (next page) link re-arms it so the loop continues.
        while self.url != "none":
            print("线程" + self.name + "开始爬取页面" + self.url)

            try:
                rsp = requests.get(self.url, headers=headers)
                self.url = "none"  # 用完之后置空,看下一页能否取到值
                # rsp.text is already decoded unicode, so from_encoding would
                # be ignored by BeautifulSoup anyway — dropped.
                soup = BeautifulSoup(rsp.text, 'html.parser')

                for divs in soup.find_all(class_="gallery_wrapper"):
                    # 把找到的图片放到数组里去
                    for img in divs.find_all('img'):
                        print(img.get("src"))
                        self.pictures.append(img.get("src"))

                    # 找下一页
                    for link in divs.find_all('a', class_='a1'):
                        if link.string == '下一页' and link.get("href").find('.html') != -1:
                            self.url = 'https://www.nvshens.com' + link.get("href")

                if self.url != "none":
                    print("线程" + self.name + "前往下一页")
                else:
                    print("线程" + self.name + '爬取结束,开始下载...')
                    downloadPics(self.pictures)
                    print("线程" + self.name + '下载图片结束.')
            except Exception:
                # 不管怎么出现的异常,就让它一直爬到底 (best-effort: keep crawling)
                print("线程" + self.name + "发生异常。重新爬行")


# 循环下载图片
# 循环下载图片: spawn one crawler thread per gallery id.
# NOTE(review): this launches up to 10000 threads with no throttling; a
# ThreadPoolExecutor would be safer if system resources run out.
def main():
    for i in range(10000, 20000):  # 范围自己调整 (gallery id range)
        url = 'https://www.nvshens.com/g/' + str(i) + '/'

        th = dldThread(name=str(i), url=url)
        th.start()

# Kickoff Start — guarded so importing this module does not start the crawl.
if __name__ == '__main__':
    main()