# python3 多进程下载指定列表
#
# 本代码实现多进程下载指定下载列表的功能。
#
# 注意事项有:
#
# 1、下载过程中,显示总数、已存在、已下载、出错、剩余等信息,以便随时掌握进度。
#
# 2、可以指定重试次数(在程序中指定)
#
# 3、进程数、下载列表由命令行参数指定
#
# 4、保存位置需要在程序中指定

# -*- coding: utf-8 -*-
"""
Created on Sat Nov 16 07:52:40 2019

@author: mi
"""
import requests
import os
import csv

# Progress counters for the whole run.  They are only ever mutated in the
# parent process: `downloading_over` runs as a Pool callback (which executes
# in the process that owns the Pool), and `total_count` is incremented while
# reading the download list in the __main__ block.
exist_count=0  # files already present on disk, download skipped
downloaded_count=0  # files fetched and written successfully
total_count=0  # total entries read from the download list
error_count=0  # entries that failed after all retries

def downloading_over(arg):
    """Pool callback: fold one worker result into the shared counters.

    `arg` is the string returned by `get_page` ('EXISTS', 'SUCCESS' or
    'ERROR').  Runs in the parent process, so plain module globals are safe.
    Prints a one-line progress summary after every result.
    """
    global downloaded_count, total_count, exist_count, error_count
    print("返回状态:", arg)
    if arg == 'EXISTS':
        exist_count += 1
    elif arg == 'SUCCESS':
        downloaded_count += 1
    elif arg == 'ERROR':
        error_count += 1
    # Whatever has not been accounted for yet is still pending.
    remaining = total_count - exist_count - downloaded_count - error_count
    print('总数:%s / 已存在:%s / 已下载:%s / 出错:%s / 剩余:%s'
          % (str(total_count), str(exist_count), str(downloaded_count),
             str(error_count), str(remaining)))


def get_page(link):
    """Download one URL to disk, retrying up to 3 times.

    Parameters
    ----------
    link : sequence of two str
        link[0] is the URL to fetch, link[1] the destination file path.

    Returns
    -------
    str
        'EXISTS'  -- destination file already present, nothing fetched;
        'SUCCESS' -- fetched with HTTP 2xx/3xx and written to disk;
        'ERROR'   -- every retry attempt failed.
    """
    url = link[0]
    savePath = link[1]
    print(savePath)
    # Skip work that was already done in a previous run.
    if os.path.exists(savePath):
        print('已存在')
        return 'EXISTS'
    times = 3  # retry budget (fixed in-program, per the file header)
    while times > 0:
        times = times - 1
        try:
            resp = requests.get(url, timeout=30)
            # Bug fix: previously a 404/500 response body was written to
            # disk and reported as 'SUCCESS'.  Treat HTTP error statuses
            # as retryable failures instead.
            resp.raise_for_status()
        except requests.RequestException as e:
            print(e)
            continue
        # Create the destination directory on demand; exist_ok avoids a
        # race when several workers create the same directory at once.
        save_dir = os.path.dirname(savePath)
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)
        with open(savePath, 'wb') as fw:
            fw.write(resp.content)
        return 'SUCCESS'
    # Retry budget exhausted without a successful fetch.
    return 'ERROR'

from multiprocessing import Pool
import sys


if __name__ == "__main__":
    # Workaround so multiprocessing can spawn workers when the script is
    # launched from Spyder/IPython (which lack a normal module __spec__).
    __spec__ = "ModuleSpec(name='builtins', loader=<class '_frozen_importlib.BuiltinImporter'>)"
    # Bug fix: validate the command line up front; previously a missing
    # argument crashed with an unexplained IndexError.
    if len(sys.argv) < 3:
        print('用法: python 本脚本 <进程数> <下载列表csv>')
        sys.exit(1)
    process_num = sys.argv[1]
    print('进程数量:' + process_num)
    download_list = sys.argv[2]
    print('下载列表:' + download_list)

    pool = Pool(processes=int(process_num))    # set the processes max number
    with open(download_list, 'r', encoding='utf-8') as downlist:
        lines = csv.reader(downlist)
        # 下载列表,第一列为下载链接,第二列为保存位置
        # (column 1: URL path appended to the site root; column 2: file stem
        # under the hard-coded save folder)
        for line in lines:
            total_count += 1
            url = 'http://www.xxx.com/' + line[0]
            savePath = 'D:/saveFolder/' + line[1] + '.htm'
            # Each result string is delivered back to downloading_over,
            # which runs in this (parent) process and updates the counters.
            pool.apply_async(func=get_page, args=([url, savePath],),
                             callback=downloading_over)
    pool.close()   # no more tasks will be submitted
    pool.join()    # block until every queued download has finished