Python in practice: a crawler example for static web pages

An example that crawls a novel from Qidian (起点中文网):

# -*- coding: utf-8 -*-
import requests
import urllib2
from bs4 import BeautifulSoup
import sys
reload(sys)
sys.setdefaultencoding('utf-8')       # allow reading and writing Chinese text

class QDZW:
    def __init__(self, url):
        self.url = url                # URL of the chapter to fetch
        self.user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:61.0) Gecko/20100101 Firefox/61.0'      # request header (pretend to be a browser)
        self.headers = {'User-Agent': self.user_agent}

    # Fetch the static HTML of the current URL; returns None on failure so the caller can retry until it succeeds
    def gethtml(self):
        try:
            # return requests.get(self.url, headers=self.headers, timeout=2)    # alternative: fetch with requests (see the sketch at the end)
            request = urllib2.Request(self.url, headers=self.headers)
            response = urllib2.urlopen(request, timeout=2)    # limit how long a single request may take
            return response
        except Exception as e:
            print e
            return None
    # Parse the HTML and extract the chapter title and body, then follow the link to the next chapter
    def novel(self):
        while True:
            r = self.gethtml()
            while r is None:           # None means the request failed or timed out, so fetch again
                r = self.gethtml()
            # print r.headers.getparam('charset')    # print the encoding declared in the response headers
            soup = BeautifulSoup(r, "html.parser")     # parse the page with BeautifulSoup; prefix classes with '.' and ids with '#' in selectors
            # select() returns a list; the selector can start at any first-level node under <body>
            # and be chained level by level to narrow down the match
            # tag.text reads the text inside a tag; tag.attrs['xx'] reads the value of its attribute xx
            section_name = soup.select('.wrap #j_readMainWrap #j_chapterBox .text-wrap .main-text-wrap .text-head .j_chapterName')[0].text
            print section_name
            section_text = soup.select('.wrap #j_readMainWrap #j_chapterBox .text-wrap .main-text-wrap .read-content')[0].text
            section_text = self.spliting(section_text)
            #print section_text
            self.writing(section_name + "\r\n" + section_text)

            next_url = soup.select('.wrap #j_readMainWrap .chapter-control #j_chapterNext')[0]
            #print next_url.text
            if next_url.text == "下一章":                      # "下一章" means "next chapter"; anything else means the last chapter was reached
                self.url = "https:" + next_url.attrs['href']
            else:
                break
    # Re-insert "\r\n" line breaks where the <p></p> tags were stripped out
    def spliting(self, string):
        str_split = string.split()    # split() cuts the string on whitespace (or on whatever separator is passed to it)
        string = ""
        for text in str_split:
            if text != "":
                string = string + "    " + text + "\r\n"
        return string
    # Append the extracted chapter text to the output file
    def writing(self, novel):
        outputs = open(unicode('至尊剑皇.txt', 'utf8'), 'a')      # 'a' opens the file in append mode
        outputs.write(novel)
        outputs.close()

qdzw = QDZW('https://read.qidian.com/chapter/JDViC81SWM41/pskSqRrpbXDgn4SMoDUcDQ2')     # argument is the URL of the first chapter
qdzw.novel()
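
The commented-out line in gethtml() hints at fetching the page with requests instead of urllib2. Below is a minimal, untested sketch of that variant (assuming the requests package is installed); the rest of the class would stay the same, except that novel() would then receive the decoded HTML string rather than a response object, which BeautifulSoup also accepts directly:

    # Sketch only: gethtml() rewritten around requests (assumption: the requests package is available)
    def gethtml(self):
        try:
            response = requests.get(self.url, headers=self.headers, timeout=2)   # limit how long a single request may take
            response.raise_for_status()        # treat HTTP error codes (404, 503, ...) as failures as well
            return response.text               # decoded HTML string, ready for BeautifulSoup(..., "html.parser")
        except Exception as e:
            print e
            return None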