# 20170912 多线程Python爬取图片

import threading               #导入线程

from urllib import request #导入网页请求模块

import re                            #导入正则表达式模块

import os                           # 引入模块

from openpyxl import Workbook

from openpyxl import load_workbook

class customThread(threading.Thread):
    """Thread that runs one ``downloadimg(imgurl, imgpath)`` call.

    Attributes:
        imgurl: URL handed to ``downloadimg`` when the thread runs.
        imgpath: Destination path handed to ``downloadimg``.
    """

    def __init__(self, imgurl, imgpath):
        """Store the download job; nothing is fetched until ``start()``."""
        super().__init__()
        self.imgurl, self.imgpath = imgurl, imgpath

    def run(self):
        """Thread body: delegate the stored job to ``downloadimg``."""
        downloadimg(self.imgurl, self.imgpath)

def downloadimg(imgurl, imgpath):
    """Fetch ``imgurl`` and write the response body to ``imgpath``.

    The download is best-effort: if the request or the read fails, an error
    line is printed and the function returns without creating the file.
    Errors while writing the file are not caught and propagate to the caller.

    Args:
        imgurl: URL of the image to fetch.
        imgpath: Path of the file to write (opened in binary mode).

    Returns:
        None.
    """
    try:
        # In practice the request sometimes fails (e.g. the page answers 404).
        # The ``with`` block closes the response even if ``read()`` raises.
        with request.urlopen(imgurl) as response:
            imgcontents = response.read()
    except Exception:
        # Deliberately broad so one bad image never kills its worker thread,
        # but no longer a bare ``except`` that would also swallow
        # KeyboardInterrupt / SystemExit.
        print(imgpath +'下载出错')
        return

    # The original wrote ``f.close`` without parentheses, so the file was
    # never closed explicitly; the context manager guarantees the close.
    with open(imgpath, 'wb') as f:
        f.write(imgcontents)
    print('保存成功>>>>'+ imgpath )

# Matches protocol-relative image URLs of the form "//www.dhresource.com/....jpg".
# The look-behind requires the opening quote and slashes without capturing them.
# Dots in the host name are escaped so they only match a literal ".", and the
# path part may not cross a quote or a newline, so a match can never run from
# one quoted URL into the next.
_IMG_URL_PATTERN = re.compile(r'(?<="//)www\.dhresource\.com/[^"\n]*?\.jpg')


def getimageurl(weburl, folder, imgname):
    """Download every matching .jpg referenced by the page at ``weburl``.

    The page is fetched and scanned for ``"//www.dhresource.com/....jpg"``
    references. Each hit is downloaded on its own ``customThread`` to
    ``<folder>/<imgname>_<n>.jpg``, with ``n`` counting from 1 in page order.
    The threads are started but not joined here.

    Args:
        weburl: URL of the page to scan.
        folder: Directory that receives the image files.
        imgname: File-name prefix for the saved images.

    Returns:
        None.

    Raises:
        Whatever ``urllib.request.urlopen`` / ``read`` raise when the page
        cannot be fetched; such errors are not caught here.
    """
    with request.urlopen(weburl) as response:
        page = response.read()

    # Only ASCII URLs are extracted, so undecodable bytes elsewhere in the
    # page are replaced instead of aborting the whole page.
    html = page.decode('utf-8', errors='replace')

    for n, each_match in enumerate(_IMG_URL_PATTERN.findall(html), start=1):
        imgurl = 'http://' + each_match
        imgpath = os.path.join(folder, imgname + "_" + str(n) + '.jpg')
        # One thread per image; downloadimg reports its own success/failure.
        customThread(imgurl, imgpath).start()

def mkdir(path):
    """Create directory ``path`` (including parents) unless it already exists.

    Leading/trailing whitespace and any trailing backslashes are removed from
    ``path`` before it is used.

    Args:
        path: Directory path to create.

    Returns:
        True if the directory was created, False if the path already existed.
    """
    # Normalise: trim whitespace first, then drop trailing "\" characters.
    target = path.strip().rstrip("\\")

    # Already present: report it and leave the filesystem untouched.
    if os.path.exists(target):
        print(target+' 目录已存在')
        return False

    # Missing: create it together with any missing parent directories.
    os.makedirs(target)
    print(target+' 创建成功')
    return True

if __name__ == "__main__":
    # Script entry point: read (name, page URL) pairs from URL.xlsx and, for
    # each pair, create a folder named after the item in the current working
    # directory and download the page's images into it.
    print('!!!!!!开始运行!!!!!!')

    wb = load_workbook('URL.xlsx')
    ws = wb.active

    # Rows 2..50 are scanned; column 1 holds the item name, column 2 the URL.
    # NOTE(review): row 1 is presumably a header row - confirm against URL.xlsx.
    for i in range(2, 51):
        name_value = ws.cell(row=i, column=1).value
        if name_value is None:
            continue

        imgname = str(name_value)
        folder = os.path.join(os.getcwd(), imgname)
        print(mkdir(folder))

        weburl = ws.cell(row=i, column=2).value
        if weburl is None:
            # A name without a URL used to crash inside urlopen(None).
            print(imgname + ' 缺少网址，已跳过')
            continue

        try:
            getimageurl(weburl, folder, imgname)
        except Exception as err:
            # Best-effort, like downloadimg: one unreachable page must not
            # abort the remaining rows.
            print(imgname + ' 页面抓取失败: ' + str(err))

    # Wait for the download threads so the closing banner is only printed
    # once every image has been saved or reported as failed.
    for worker in threading.enumerate():
        if isinstance(worker, customThread):
            worker.join()

    print('!!!!!!运行结束!!!!!!')
# 秒客网

# 20170912多线程Python爬取图片

# 相关文章