python3爬虫再探之豆瓣影评数据抓取

    一个关于豆瓣影评的爬虫,涉及:模拟登录、翻页抓取。直接上代码:

import re
import time
import requests
import xlsxwriter
from bs4 import BeautifulSoup


# Common request headers: a desktop Chrome User-Agent plus a Referer pointing
# at the Douban login page, so requests look like they come from a browser.
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36',
           'Referer':'https://www.douban.com/accounts/login?source=movie'}
# One shared Session so the cookies set at login persist for later page fetches.
s = requests.Session()
def log_in(login_url):
    """Log in to Douban through *login_url* using the shared session ``s``.

    Fetches the login page, and — only if a captcha is present — downloads
    the captcha image to ``douban.jpg`` and prompts the user to type it in.
    Then POSTs the credentials.  SSL verification is disabled (verify=False)
    to match the original behaviour.

    Side effects: may write ``douban.jpg``; prompts on stdin; prints the
    HTTP status code of the login POST.
    """
    page = s.get("https://www.douban.com/accounts/login?source=movie",
                 headers=headers, verify=False).text

    payload = {
        "source": "movie",
        "redir": "https://movie.douban.com/",
        "form_email": "你的邮箱",
        "form_password": "你的密码",
        "login": "登录",
    }

    # The captcha is not always shown; when it is missing, skip it instead of
    # crashing on an empty findall() result (the original regex r'<img />'
    # could never capture a URL at all).
    # NOTE(review): pattern assumes Douban's classic captcha markup
    # (<img id="captcha_image" src="...">) — confirm against a live page.
    img_match = re.search(r'<img id="captcha_image" src="(.*?)"', page)
    id_match = re.search(
        r'<input type="hidden" name="captcha-id" value="(.*?)"/>', page)
    if img_match and id_match:
        img_url = img_match.group(1)
        print(img_url)
        with open("douban.jpg", 'wb') as f:
            f.write(s.get(img_url).content)
        payload["captcha-solution"] = input("输入验证码:")
        payload["captcha-id"] = id_match.group(1)

    resp = s.post(login_url, data=payload, verify=False)  # 绕过了SSL验证
    print(resp.status_code)


i = 0  # global row counter: one worksheet row per comment, shared across pages


def get_data(url):
    """Scrape one page of comments at *url* into the module-level worksheet,
    then follow the "next page" link recursively until there are no more.

    Columns written per comment: A=date, B=star rating, C=vote count,
    D=comment text.  Rows are indexed by the global counter ``i``.
    Pages that cannot be fetched even after one retry are skipped.
    """
    global i
    time.sleep(2)  # throttle: be polite to the server
    print("#" * 50)
    print(i)

    # Fetch the page; on failure wait and retry once, then give up on this
    # page (the original bare-except path left `data` unbound and crashed).
    try:
        data = s.get(url, headers=headers).text
    except requests.RequestException:
        time.sleep(3)
        print("正在尝试重新加载页面...")
        try:
            data = s.get(url, headers=headers).text
        except requests.RequestException:
            return  # skip this page; the caller closes the workbook

    soup = BeautifulSoup(data, "lxml")
    comments = soup.findAll("div", {"class": "comment-item"})

    pa_date = re.compile(r"\d{4}-\d\d-\d\d")
    for comment in comments:
        i += 1
        info = comment.find("span", {"class": "comment-info"})

        # Date — write blank instead of raising IndexError when absent.
        date_span = info.find("span", {"class": ""})
        dates = pa_date.findall(date_span.get_text()) if date_span else []
        worksheet.write(i, 0, dates[0] if dates else "")

        # Star rating — unrated comments have no "allstar.." span; the
        # original grabbed the first span blindly, which silently broke
        # the row for unrated comments (the "missing ~1/5 of data" issue).
        star_span = info.find("span", {"class": re.compile(r"allstar")})
        star = star_span["class"][0][-2:-1] if star_span else ""
        worksheet.write(i, 1, star)

        # Useful-vote count.
        vote = comment.find("span", {"class": "comment-vote"}).find("span").get_text()
        worksheet.write(i, 2, vote)

        # Comment body.
        content = comment.find("div", {"class": "comment"}).find("p").get_text()
        print(content)
        worksheet.write(i, 3, content)

    # Follow the "next page" link, if any, and recurse.
    pa_next = re.compile('<a href="?(.*?)" .*? class="next">后一页</a>')
    matches = pa_next.findall(data)
    if matches:
        query = str(matches[0]).replace("amp;", "")
        next_url = "https://movie.douban.com/subject/25958717/comments" + query
        print("正在抓取" + next_url + "...")
        get_data(next_url)


# Output workbook: one worksheet with columns A=date, B=star rating,
# C=vote count, D=comment text (widths set to fit each kind of value).
workbook = xlsxwriter.Workbook('海蒂和爷爷影评.xlsx')
worksheet = workbook.add_worksheet()
worksheet.set_column('A:A', 20)
worksheet.set_column('B:B', 10)
worksheet.set_column('C:C', 10)
worksheet.set_column('D:D', 500)

# Log in first (the shared session keeps the cookies), then crawl all
# comment pages for movie subject 25958717, and finally close the workbook.
login_url = "https://accounts.douban.com/login"
log_in(login_url)
comment_data = get_data("https://movie.douban.com/subject/25958717/comments")
workbook.close()

    这里有两个问题:

  1.首先,登录的时候,可能会不需要验证码(当然也不会抓到验证码的图片),加上 try 处理即可。

  2.数据抓取不全:总是剩下约 1/5 的数据抓不到,目前还未解决,请看到的大神指点!