目录
(1)抓取今日头条街拍图片
(2)分析今日头条街拍图片结构
keyword: 街拍 pd: atlas dvpf: pc aid: 4916 page_num: 1 search_json: {\"from_search_id\":\"20220104115420010212192151532E8188\",\"origin_keyword\":\"街拍\",\"image_keyword\":\"街拍\"} rawJSON: 1 search_id: 202201041159040101501341671A4749C4
可以找到规律,page_num从1开始累加,其他参数不变
(3)按功能不同编写不同方法组织代码
获取网页json格式数据
def get_page(page_num):
    """Fetch one page of Toutiao street-snap search results as parsed JSON.

    Args:
        page_num: Value sent as the ``page_num`` query parameter.

    Returns:
        The decoded JSON body when the server answers with HTTP 200 and the
        body is valid JSON; ``None`` on a non-200 status, a connection
        failure, a timeout, or an undecodable body.
    """
    # NOTE(review): the Cookie below looks like a session value captured from
    # a browser and will presumably expire -- confirm before relying on it.
    headers = {
        'Host': 'so.toutiao.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
        'Cookie': 'msToken=S0DFBkZ9hmyLOGYd3_QjhhXgrm38qTyOITnkNb0t_oavfbVxuYV1JZ0tT5hLgswSfmZLFD6c2lONm_5TomUQXVXjen7CIxM2AGwbhHRYKjhg; _S_DPR=1.5; _S_IPAD=0; MONITOR_WEB_ID=7046351002275317255; ttwid=1%7C0YdWalNdIiSpIk3CvvHwV25U8drq3QAj08E8QOApXhs%7C1640607595%7C720e971d353416921df127996ed708931b4ae28a0a8691a5466347697e581ce8; _S_WIN_WH=262_623',
    }
    params = {
        'keyword': '街拍',
        'pd': 'atlas',
        'dvpf': 'pc',
        'aid': '4916',
        'page_num': page_num,
        # Kept as plain JSON text: urlencode() below percent-encodes it, so a
        # pre-encoded value would end up double-encoded (%7B -> %257B).
        'search_json': '{"from_search_id":"202112272022060101510440283EE83D67","origin_keyword":"街拍","image_keyword":"街拍"}',
        'rawJSON': 1,
        'search_id': '2021122721183101015104402851E3883D',
    }
    url = 'https://so.toutiao.com/search?' + urlencode(params)
    print(url)
    try:
        # The query string is already part of ``url``; passing ``params=``
        # as well would append every parameter a second time.
        response = requests.get(url, headers=headers, timeout=10)
    except (requests.ConnectionError, requests.Timeout):
        return None
    if response.status_code != 200:
        return None
    try:
        return response.json()
    except ValueError:
        # The body was not valid JSON (e.g. an HTML error page).
        return None
从json格式数据提取街拍图片
def get_images(json):
    """Yield the image URLs contained in a search-result payload.

    Args:
        json: Mapping shaped like ``{'rawData': {'data': [{'img_url': ...}]}}``,
            or ``None`` when the page could not be fetched.

    Yields:
        Each non-empty ``img_url`` value, in payload order. Nothing is
        yielded when the payload is ``None``/empty or lacks the
        ``rawData``/``data`` levels.
    """
    if not json:
        return
    raw_data = json.get('rawData') or {}
    for image in raw_data.get('data') or []:
        link = image.get('img_url')
        # Entries without a URL cannot be downloaded, so skip them.
        if link:
            yield link
将街拍图片以其md5码命名并保存图片
实现一个保存图片的方法 save_image(),其参数 link 就是前面 get_images() 方法逐个产出的图片链接。在该方法中,首先请求这个图片链接,获取图片的二进制数据,然后以二进制的形式写入 ./image 目录下的文件。图片的名称使用其内容的 MD5 值,这样可以去除重复。相关代码如下:
def save_image(link):
    """Download *link* and store it as ``./image/<md5-of-content>.jpg``.

    The file name is the MD5 hex digest of the downloaded bytes, so
    identical images map to the same file. Responses with a status other
    than 200 are not written to disk.

    Args:
        link: URL of the image to download.
    """
    import os

    response = requests.get(link, timeout=10)
    if response.status_code != 200:
        # Do not save an error page under a .jpg name.
        return
    data = response.content
    # open() does not create missing directories.
    os.makedirs('./image', exist_ok=True)
    # Use the MD5 digest of the data as the image file name.
    with open(f'./image/{md5(data).hexdigest()}.jpg', 'wb') as f:
        f.write(data)
main()调用其他函数
def main(page_num):
    """Download every image listed on one search-result page.

    Args:
        page_num: Page number forwarded to ``get_page``.
    """
    page = get_page(page_num)
    if page is None:
        # get_page() returns None when the page could not be fetched.
        return
    for link in get_images(page):
        save_image(link)
(4)抓取20页今日头条街拍图片数据
这里定义了分页的起始页数和终止页数,分别为 GROUP_START 和 GROUP_END,还利用了多进程的进程池(multiprocessing.pool.Pool),调用其 map() 方法实现多进程并行下载。
if __name__ == '__main__':
    # First and last page numbers to fetch (both inclusive).
    GROUP_START = 1
    GROUP_END = 20
    pool = Pool()
    try:
        # One task per page; map() blocks until all pages are processed.
        pool.map(main, range(GROUP_START, GROUP_END + 1))
    finally:
        # Shut the workers down even if a task raised.
        pool.close()
        pool.join()
import os
from hashlib import md5
from multiprocessing.pool import Pool
from urllib.parse import urlencode

import requests


def get_page(page_num):
    """Fetch one page of Toutiao street-snap search results as parsed JSON.

    Args:
        page_num: Value sent as the ``page_num`` query parameter.

    Returns:
        The decoded JSON body when the server answers with HTTP 200 and the
        body is valid JSON; ``None`` on a non-200 status, a connection
        failure, a timeout, or an undecodable body.
    """
    # NOTE(review): the Cookie below looks like a session value captured from
    # a browser and will presumably expire -- confirm before relying on it.
    headers = {
        'Host': 'so.toutiao.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
        'Cookie': 'msToken=S0DFBkZ9hmyLOGYd3_QjhhXgrm38qTyOITnkNb0t_oavfbVxuYV1JZ0tT5hLgswSfmZLFD6c2lONm_5TomUQXVXjen7CIxM2AGwbhHRYKjhg; _S_DPR=1.5; _S_IPAD=0; MONITOR_WEB_ID=7046351002275317255; ttwid=1%7C0YdWalNdIiSpIk3CvvHwV25U8drq3QAj08E8QOApXhs%7C1640607595%7C720e971d353416921df127996ed708931b4ae28a0a8691a5466347697e581ce8; _S_WIN_WH=262_623',
    }
    params = {
        'keyword': '街拍',
        'pd': 'atlas',
        'dvpf': 'pc',
        'aid': '4916',
        'page_num': page_num,
        # Kept as plain JSON text: urlencode() below percent-encodes it, so a
        # pre-encoded value would end up double-encoded (%7B -> %257B).
        'search_json': '{"from_search_id":"202112272022060101510440283EE83D67","origin_keyword":"街拍","image_keyword":"街拍"}',
        'rawJSON': 1,
        'search_id': '2021122721183101015104402851E3883D',
    }
    url = 'https://so.toutiao.com/search?' + urlencode(params)
    print(url)
    try:
        # The query string is already part of ``url``; passing ``params=``
        # as well would append every parameter a second time.
        response = requests.get(url, headers=headers, timeout=10)
    except (requests.ConnectionError, requests.Timeout):
        return None
    if response.status_code != 200:
        return None
    try:
        return response.json()
    except ValueError:
        # The body was not valid JSON (e.g. an HTML error page).
        return None


def get_images(json):
    """Yield the image URLs contained in a search-result payload.

    Args:
        json: Mapping shaped like ``{'rawData': {'data': [{'img_url': ...}]}}``,
            or ``None`` when the page could not be fetched.

    Yields:
        Each non-empty ``img_url`` value, in payload order. Nothing is
        yielded when the payload is ``None``/empty or lacks the
        ``rawData``/``data`` levels.
    """
    if not json:
        return
    raw_data = json.get('rawData') or {}
    for image in raw_data.get('data') or []:
        link = image.get('img_url')
        # Entries without a URL cannot be downloaded, so skip them.
        if link:
            yield link


def save_image(link):
    """Download *link* and store it as ``./image/<md5-of-content>.jpg``.

    The file name is the MD5 hex digest of the downloaded bytes, so
    identical images map to the same file. Responses with a status other
    than 200 are not written to disk.

    Args:
        link: URL of the image to download.
    """
    response = requests.get(link, timeout=10)
    if response.status_code != 200:
        # Do not save an error page under a .jpg name.
        return
    data = response.content
    # open() does not create missing directories.
    os.makedirs('./image', exist_ok=True)
    # Use the MD5 digest of the data as the image file name.
    with open(f'./image/{md5(data).hexdigest()}.jpg', 'wb') as f:
        f.write(data)


def main(page_num):
    """Download every image listed on one search-result page.

    Args:
        page_num: Page number forwarded to ``get_page``.
    """
    page = get_page(page_num)
    if page is None:
        # get_page() returns None when the page could not be fetched.
        return
    for link in get_images(page):
        save_image(link)


if __name__ == '__main__':
    # First and last page numbers to fetch (both inclusive).
    GROUP_START = 1
    GROUP_END = 20
    pool = Pool()
    try:
        # One task per page; map() blocks until all pages are processed.
        pool.map(main, range(GROUP_START, GROUP_END + 1))
    finally:
        # Shut the workers down even if a task raised.
        pool.close()
        pool.join()
到此这篇关于Python抓取今日头条街拍图片数据的文章就介绍到这了,更多相关Python抓取今日头条图片内容请搜索以前的文章或继续浏览下面的相关文章,希望大家以后多多支持!
© 版权声明
THE END
暂无评论内容