20170912多线程Python爬取图片

2024-03-30 05:45•python•阅读 4849

import threading               #导入线程
from urllib import request #导入网页请求模块
import re                            #导入正则表达式模块
import os                           # 引入模块
from openpyxl import Workbook
from openpyxl import load_workbook
class customThread(threading.Thread):
    """Worker thread that downloads one image to one local file."""

    def __init__(self, imgurl, imgpath):
        super().__init__()
        # Keep the destination path and source URL for run().
        self.imgpath = imgpath
        self.imgurl = imgurl

    def run(self):
        """Thread entry point: hand the stored URL and path to downloadimg()."""
        source, destination = self.imgurl, self.imgpath
        downloadimg(source, destination)
def downloadimg(imgurl,imgpath):
    """Fetch ``imgurl`` and save the response body to ``imgpath``.

    A failed fetch is reported on stdout instead of being raised: in
    practice some image URLs fail (for example with HTTP 404), and one bad
    image should not abort the rest of the run.

    Args:
        imgurl: URL of the image to fetch.
        imgpath: Destination file path; it is written in binary mode.
    """
    try:
        # The context manager closes the response once the body is read.
        with request.urlopen(imgurl) as response:
            imgcontents = response.read()
    except Exception:
        # Not a bare ``except`` so KeyboardInterrupt/SystemExit still propagate.
        print(imgpath +'下载出错')
        return
    # ``with`` guarantees the file is closed even if the write fails.
    with open(imgpath, 'wb') as f:
        f.write(imgcontents)
    print('保存成功>>>>'+ imgpath )
        
def getimageurl(weburl,folder,imgname):
    """Find dhresource.com JPG links in a page and download each on its own thread.

    The page source is also printed to stdout.

    Args:
        weburl: URL of the page to scan.
        folder: Directory that receives the downloaded images.
        imgname: File-name prefix; images are saved as ``<imgname>_<n>.jpg``
            with ``n`` counting from 1 in match order.
    """
    # The context manager closes the HTTP response once the body is read.
    with request.urlopen(weburl) as response:
        page = response.read()
    # Only ASCII URLs are extracted, so a stray undecodable byte should not
    # abort the whole page; it is replaced instead of raising.
    js = page.decode('utf-8', errors='replace')
    print(js)
    # Dots are escaped so only the literal host "www.dhresource.com" matches;
    # the look-behind requires the URL to follow `"//` (protocol-relative link).
    pat = re.compile(r'(?<="//)www\.dhresource\.com/.*?\.jpg')
    for n, each_match in enumerate(pat.findall(js), start=1):
        imgurl = 'http://' + each_match
        # os.path.join uses the platform's separator instead of a fixed "\\".
        imgpath = os.path.join(folder, imgname + "_" + str(n) + '.jpg')
        customThread(imgurl, imgpath).start()  # one download thread per image
def mkdir(path):
    """Create the directory ``path`` (with parents) unless it already exists.

    Surrounding whitespace and trailing backslashes are removed from
    ``path`` before it is used. The outcome is reported on stdout.

    Returns:
        True if the directory was created, False if the path already existed.
    """
    target = path.strip().rstrip("\\")
    # Guard clause: nothing to create when the path is already present.
    if os.path.exists(target):
        print(target+' 目录已存在')
        return False
    os.makedirs(target)
    print(target+' 创建成功')
    return True
    
if __name__ == "__main__":
    # Script entry point: read (name, page URL) pairs from rows 2-50 of the
    # active sheet of URL.xlsx, create one folder per name under the current
    # working directory, and download every matching image into it.
    print('!!!!!!开始运行!!!!!!')
    wb = load_workbook('URL.xlsx')
    ws = wb.active
    for i in range(2, 51):
        name_cell = ws.cell(row=i, column=1).value
        if name_cell is None:
            continue
        imgname = str(name_cell)
        weburl = ws.cell(row=i, column=2).value
        if weburl is None:
            # A name without a URL would crash urlopen() and abort the
            # remaining rows; report it and move on instead.
            print(imgname + ' 缺少网址，已跳过')
            continue
        # os.path.join uses the platform's separator instead of a fixed "\\".
        folder = os.path.join(os.getcwd(), imgname)
        print(mkdir(folder))
        getimageurl(weburl, folder, imgname)

    # Wait for the download threads so the final message really means that
    # every download has finished.
    for worker in threading.enumerate():
        if isinstance(worker, customThread):
            worker.join()
    print('!!!!!!运行结束!!!!!!')

上一篇 »python3 scrapy 爬取腾讯招聘
下一篇 »爬虫实战，三用Python爬取拉勾网

20170912多线程Python爬取图片

相关推荐

java===TCP，多线程多客户端同时上传字节数据：图片为例

python学习，七--豆瓣爬取电影名，评分以及演员

python爬取网页数据并存储到mysql数据库

python3爬取拉钩招聘数据

python爬取图片遇见src乱码： data:image/png;base64

解决python3爬取网页，GB2312编码中文乱码问题

Python爬取王者荣耀图片，安琪拉

Python之爬取网页时遇到的问题——BeautifulSoup