# Download the css, js and img files referenced in a web page's HTML with BeautifulSoup and save them locally.

# -*- coding:utf-8 -*-
import os
import urllib.request as rqst
from urllib.parse import urljoin, urlsplit

from bs4 import BeautifulSoup as BS
# Page to scrape and the request headers sent with it.
url = 'http://xxxxxxx'
# NOTE(review): Accept-Encoding normally carries content codings such as
# "gzip" or "identity", not a charset -- confirm 'utf-8' is intended.
headers = {'User-Agent': 'xxxxxx(这个网上随便找一个都可以)', 'Accept-Encoding': 'utf-8'}
r = rqst.Request(url, headers=headers)

# Open the Request object (not the bare URL) so the custom headers are
# actually sent, and parse the response with BeautifulSoup while it is open.
with rqst.urlopen(r) as html:
    bs = BS(html, 'lxml')

# Collect the tags that reference css, js and img files; the loops below
# download and save them.
elc = bs.find_all('link', type='text/css')
elj = bs.find_all('script')
eli = bs.find_all('img')

# Download every stylesheet referenced by the <link type="text/css"> tags.
for c in elc:
    # The href is used as-is: a relative href must be made absolute first,
    # otherwise urlopen rejects it and the file is reported as failed.
    css_url = c.get('href')
    if not css_url:
        continue
    try:
        # File name = last path segment, ignoring any ?query or #fragment.
        css_name = os.path.basename(urlsplit(css_url).path)
        if not css_name or os.path.exists(css_name):
            continue  # nothing to name the file after, or already saved
        with rqst.urlopen(css_url) as res:
            data = res.read()
        # res.read() returns bytes, so the file is opened in binary mode.
        # It is opened only after the download succeeded, so a failed
        # download never leaves an empty file behind.
        with open(css_name, 'wb') as f:
            f.write(data)
    except Exception as exc:
        # Best-effort: report the failure and carry on with the next file.
        print('failed to save %s: %s' % (css_url, exc))

# Download every external script referenced by the <script src=...> tags.
for j in elj:
    # Inline scripts have no src attribute and are skipped. The src is used
    # as-is: a relative src must be made absolute first, otherwise urlopen
    # rejects it and the file is reported as failed.
    js_url = j.get('src')
    if not js_url:
        continue
    try:
        # File name = last path segment, ignoring any ?query or #fragment.
        js_name = os.path.basename(urlsplit(js_url).path)
        if not js_name or os.path.exists(js_name):
            continue  # nothing to name the file after, or already saved
        with rqst.urlopen(js_url) as res:
            data = res.read()
        # res.read() returns bytes, so the file is opened in binary mode.
        # It is opened only after the download succeeded, so a failed
        # download never leaves an empty file behind.
        with open(js_name, 'wb') as f:
            f.write(data)
    except Exception as exc:
        # Best-effort: report the failure and carry on with the next file.
        print('failed to save %s: %s' % (js_url, exc))

# Download every image referenced by the <img> tags.
for i in eli:
    src = i.get('src')
    if not src:
        continue  # <img> without a src attribute: nothing to download
    try:
        # Resolve the src against the site root. urljoin leaves an already
        # absolute src untouched and inserts the missing "/" for a relative
        # one, which plain string concatenation got wrong.
        img_url = urljoin('http://www.fmhhqb.com', src)
        # File name = last path segment, ignoring any ?query or #fragment.
        img_name = os.path.basename(urlsplit(img_url).path)
        if not img_name or os.path.exists(img_name):
            continue  # nothing to name the file after, or already saved
        # Send the same custom headers as the page request.
        req = rqst.Request(img_url, headers=headers)
        with rqst.urlopen(req) as res:
            data = res.read()
        # Open the file only after the download succeeded, so a failed
        # download never leaves an empty file behind.
        with open(img_name, 'wb') as f:
            f.write(data)
    except Exception as exc:
        # Best-effort: report the failure and carry on with the next file.
        print('failed to save %s: %s' % (src, exc))