python_crawler: batch-download files from a website

This is a first Python 3 web crawler, written with reference to the book Web Scraping with Python (《python网络数据采集》). Its main job is to crawl a target site and batch-download every .rar, .doc, .docx and .zip file it finds.

Planned improvements: recognize and download files by their extension; for sites with a very large number of pages, use a Bloom filter to deduplicate visited URLs; and learn more about the site's anti-crawling mechanisms. A rough sketch of the first two ideas follows.
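This is only an illustrative sketch of what suffix-based filtering and URL de-duplication could look like; FILE_SUFFIXES, seen and collect_file_links are made-up names and are not part of the script below. On a truly large site the plain set could be swapped for a Bloom filter to keep memory bounded.

from urllib.request import urlopen
from bs4 import BeautifulSoup

# Hypothetical helper: yield links whose path ends with one of the target
# suffixes, remembering what has already been seen in a plain set.
FILE_SUFFIXES = (".rar", ".doc", ".docx", ".zip")
seen = set()

def collect_file_links(page_url):
    soup = BeautifulSoup(urlopen(page_url), "html.parser")
    for a in soup.findAll("a", href=True):
        href = a["href"]
        if href.lower().endswith(FILE_SUFFIXES) and href not in seen:
            seen.add(href)
            yield href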

# -*- coding: utf-8 -*-
import os
import re
from urllib.request import urlopen, urlretrieve
from urllib.parse import quote
from bs4 import BeautifulSoup

downloadDirectory = "downloaded"
baseUrl = "http://computer.hdu.edu.cn"

def is_chinese(text):
    # True if the string contains any CJK character (rough range check);
    # such hrefs must be percent-encoded before they can be downloaded.
    for ch in text:
        if u'\u2E80' <= ch <= u'\uFE4F':
            return True
    return False

def getAbsoluteURL(baseUrl, source):
    # Turn a link found on the page into an absolute URL and
    # drop links that point outside the target site.
    if source.startswith("http://www."):
        url = "http://" + source[11:]
    elif source.startswith("http://"):
        url = source
    elif source.startswith("www."):
        url = "http://" + source[4:]
    else:
        url = baseUrl + source
    if baseUrl not in url:
        return None
    return url

def getDownloadPath(baseUrl, absoluteUrl, downloadDirectory):
    # Map the remote URL onto a local path under downloadDirectory,
    # creating intermediate directories as needed.
    path = absoluteUrl.replace("www.", "")
    path = path.replace(baseUrl, "")
    path = downloadDirectory + path
    directory = os.path.dirname(path)
    if not os.path.exists(directory):
        os.makedirs(directory)
    print(path)
    return path

pages = set()

def getLinks(pageUrl):
    global pages
    html = urlopen(baseUrl + pageUrl)
    bsObj = BeautifulSoup(html, "html.parser")
    try:
        print(bsObj.h1.get_text())
        print(bsObj.h2.get_text())
        print(bsObj.h3.get_text())
    except AttributeError:
        print("This page is missing an h1/h2/h3 tag.")
    # Attachments on this site are served from /uploads/attachments/
    my_files = bsObj.findAll("a", {"href": re.compile(r"/uploads/attachments/")})
    for my_file in my_files:
        if is_chinese(my_file["href"]):
            # percent-encode hrefs that contain Chinese characters
            my_file["href"] = quote(my_file["href"])
        print("attachment: " + my_file["href"])
        url = getAbsoluteURL(baseUrl, my_file["href"])
        print(url)
        if url is not None:
            urlretrieve(url, getDownloadPath(baseUrl, url, downloadDirectory))
    # Follow internal links (skipping the attachments themselves) so the
    # whole site gets visited exactly once
    for link in bsObj.findAll("a", href=re.compile(r"^/")):
        newPage = link.attrs["href"]
        if newPage not in pages and "/uploads/" not in newPage:
            pages.add(newPage)
            getLinks(newPage)

getLinks("")