爬虫2:html页面+BeautifulSoup模块+post方式+demo

  爬取html页面,有时需要设置参数post方式请求,生成json,保存文件中。

1)引入模块

import requests
from bs4 import BeautifulSoup

# Target URL (placeholder). Named `url` so that the later example
# `requests.post(url, data=datas)` refers to an existing variable.
url = 'http://www.c.....................'

2)设置参数

 datas = {
        'yyyy':'2014',
        'mm':'-12-31',
        'cwzb':"incomestatements",
        'button2':"%CC%E1%BD%BB",
    }

3)post请求

# Send the form fields as an HTTP POST body (requests encodes `data` for us).
r = requests.post(url,data = datas)

4)设置编码

# Decode using the encoding sniffed from the response body rather than the
# HTTP-header default, so non-ASCII (e.g. Chinese) pages decode correctly.
r.encoding = r.apparent_encoding

5)BeautifulSoup解析request请求

# Parse the response HTML. Pass an explicit parser so the result does not
# depend on which parser libraries happen to be installed (omitting it also
# triggers GuessedAtParserWarning in current bs4 versions).
soup = BeautifulSoup(r.text, 'html.parser')

6)find_all筛选

# Find the first <strong> tag whose text contains "股票代码" (stock code),
# go up to its parent element, and take that parent's second child — the
# value sitting next to the label. NOTE: this needs `import re` (the snippet
# imports above do not include it; the full demo below does).
soup.find_all('strong',text=re.compile(u"股票代码"))[0].parent.contents[1]

7)css选择select

# CSS selector: the first <option> element carrying a `selected` attribute;
# its first child is the visible option text (e.g. the selected year).
soup.select("option[selected]")[0].contents[0]

BeautifulSoup的API请查看  https://www.crummy.com/software/BeautifulSoup/bs4/doc/index.zh.html#beautifulsoup

Demo:从文件读url,设置参数,post方式,BeautifulSoup解析,生成json,保存到文件中

import requests
from bs4 import BeautifulSoup
import re
import json
import time


def _row_value(soup, label, td_index):
    """Return the text content of the td at `td_index` within the table row
    whose cell text contains `label` (e.g. the figure next to "营业收入")."""
    row = soup.find_all("td", text=re.compile(label))[0].parent
    return row.find_all('td')[td_index].contents[0]


def main():
    """Read URLs from E:\\aa.txt, POST the report form for each extracted id,
    scrape a few income-statement fields with BeautifulSoup, and append each
    record as one JSON line to E:\\a---.txt."""
    url_pre = 'http://www.c.....................'

    # `with` guarantees the files are closed even on error (the original
    # script leaked both handles).
    with open(r"E:\aa.txt", "r") as fd:
        mylist = [line for line in fd]

    with open(r"E:\a---.txt", "a", encoding="utf-8") as code:
        for index, line in enumerate(mylist):
            print(index)
            # The 6-character id sits just before the trailing newline in the
            # query-string part of the stored URL.
            url_id = line.split('?')[-1][-7:-1]

            datas = {
                'yyyy': '2014',
                'mm': '-12-31',
                'cwzb': "incomestatements",
                'button2': "%CC%E1%BD%BB",
            }
            url = url_pre + str(url_id)
            print(url)
            print(datas)

            r = requests.post(url, data=datas)
            # Use the sniffed body encoding so Chinese pages decode correctly.
            r.encoding = r.apparent_encoding
            print(r)
            # Parse once, with an explicit parser (the original built the soup
            # twice and relied on bs4's parser auto-detection).
            soup = BeautifulSoup(r.text, 'html.parser')

            # Skip pages that do not contain an income statement at all.
            if not soup.find_all("td", text=re.compile(u"营业收入")):
                continue

            header = soup.find_all('strong', text=re.compile(u"股票代码"))[0].parent
            selected = soup.select("option[selected]")

            json_map = {
                u'股票代码': header.contents[1],
                u'股票简称': header.contents[3],
                u'年度': selected[0].contents[0],
                u'报告期': selected[1].contents[0],
                u'营业收入': _row_value(soup, u"营业收入", 1),
                u'营业利润': _row_value(soup, u"营业利润", 3),
            }

            str_json = json.dumps(json_map, ensure_ascii=False)
            print(str_json)
            # The file is opened with encoding="utf-8", so write str directly
            # instead of manually encoding to bytes.
            code.write(str_json + '\n')
            time.sleep(0.1)  # throttle requests; be polite to the server
            code.flush()


if __name__ == "__main__":
    main()