Crawling Toutiao (今日头条) street-photo (街拍) data in parallel with Python, with source code and explanatory comments

  This uses json + re + requests + BeautifulSoup, with a multiprocessing.pool.Pool (a process pool, not threads) for the parallel crawl.
import json
import os
import re
from multiprocessing.pool import Pool

import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException

# from config import *  # MongoDB settings; only needed if save_to_mongo() is used


def get_page_index(offset, keyword):
    '''Fetch one page of the search index (the Ajax JSON feed).'''
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': '1',
        'from': 'search_tab'
    }
    # Request variant 1: build the query string by hand
    # url = 'https://www.toutiao.com/search_content/?' + urlencode(data)
    # response = requests.get(url)

    # Request variant 2: let requests encode the parameters
    url = 'https://www.toutiao.com/search_content/'
    try:
        response = requests.get(url, params=data)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def parse_page_index(html):
    '''Parse the index JSON and yield the article URLs it contains.'''
    data = json.loads(html)
    if data and 'data' in data.keys():
        for item in data.get('data'):
            yield item.get('article_url')


def get_page_detail(url):
    '''Fetch the HTML of a detail (gallery) page.'''
    # Browser User-Agent so the request is not rejected as a bot
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    }
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def parse_page_detail(html, url):
    '''Parse a detail page: extract the title and the gallery image URLs.'''
    soup = BeautifulSoup(html, 'lxml')
    title_tag = soup.select_one('title')
    title = title_tag.get_text() if title_tag else ''

    # The gallery is embedded as an escaped JSON string: gallery: JSON.parse("...")
    pattern = re.compile(r'gallery: JSON\.parse\("(.*?)"\),', re.S)
    result = re.search(pattern, html)
    if result:
        # Strip the backslash escaping so the captured text becomes valid JSON
        d = re.sub(r'\\', '', result.group(1))
        data = json.loads(d)
        if data:
            images = [item.get('url') for item in data.get('sub_images')]
            for image in images:
                download_image(image, title)
            return {
                'title': title,
                'url': url,
                'images': images
            }
    return None


def download_image(url, title):
    '''
    Download one image.
    :param url: image URL
    :param title: page title, used to build the file name
    '''
    print('Downloading', url)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            save_to_image(response.content, title)
        return None
    except RequestException:
        return None


count = 0


def save_to_image(content, title):
    '''
    Save the image bytes to disk.
    :param content: raw image bytes
    :param title: page title, used to build the file name
    '''
    global count
    os.makedirs('./头条', exist_ok=True)  # make sure the target folder exists
    name = title + str(count)
    file_path = './头条/{}.{}'.format(name, 'jpg')
    with open(file_path, 'wb') as f:
        count += 1  # the counter is per process, so file names can still collide
        f.write(content)


def main(offset):
    '''Entry point for one offset: index page -> detail pages -> images.'''
    html = get_page_index(offset, '街拍')
    if not html:
        return
    for url in parse_page_index(html):
        if url:
            html = get_page_detail(url)
            if html:
                result = parse_page_detail(html, url)
                if result:
                    print(result)
                    # save_to_mongo(result)


GROUP_START = 1
GROUP_END = 20

if __name__ == '__main__':
    # offsets 20, 40, ..., 400 -- one task per page group
    groups = [i * 20 for i in range(GROUP_START, GROUP_END + 1)]
    pool = Pool()
    pool.map(main, groups)
    pool.close()
    pool.join()
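
The least obvious step above is the re + json combination in parse_page_detail: the detail page does not list the gallery images as plain HTML, it embeds them as a backslash-escaped JSON string inside gallery: JSON.parse("..."), so the script captures that string with a regex, strips the escaping, and only then hands it to json.loads. Below is a minimal, self-contained sketch of that extraction; the HTML snippet is made up for illustration, and the real Toutiao markup may have changed since this was written.

import json
import re

# Made-up fragment in the same shape as the embedded gallery data
sample_html = r'''<script>
    var gallery = {
        gallery: JSON.parse("{\"sub_images\": [{\"url\": \"http:\/\/example.com\/a.jpg\"}]}"),
    };
</script>'''

pattern = re.compile(r'gallery: JSON\.parse\("(.*?)"\),', re.S)
match = re.search(pattern, sample_html)
if match:
    raw = re.sub(r'\\', '', match.group(1))   # drop the backslash escaping
    data = json.loads(raw)                    # now it parses as plain JSON
    print([item.get('url') for item in data.get('sub_images')])
    # -> ['http://example.com/a.jpg']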
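
The script also refers to a config module and has a save_to_mongo(result) call commented out in main(), but neither config.py nor save_to_mongo is shown in the post. The sketch below shows one way they could look, assuming pymongo and the hypothetical names MONGO_URL, MONGO_DB and MONGO_TABLE; these names and values are assumptions, not part of the original code.

# config.py -- assumed contents, names are hypothetical
MONGO_URL = 'localhost'    # MongoDB host
MONGO_DB = 'toutiao'       # database name
MONGO_TABLE = 'jiepai'     # collection name


# A sketch of the commented-out persistence step, using pymongo
import pymongo

from config import MONGO_URL, MONGO_DB, MONGO_TABLE

client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]


def save_to_mongo(result):
    '''Insert one parsed result (title, url, images) into MongoDB.'''
    if db[MONGO_TABLE].insert_one(result):
        print('Saved to MongoDB:', result.get('title'))
        return True
    return False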