Scraping images under a Zhihu topic with Python

This post describes a Python image crawler for Zhihu: it parses the data returned for a question, extracts the image links, and downloads the images into a specified local folder. It walks through setting the request headers, handling the data the site returns, and saving the images.
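Before the full script, here is a minimal sketch of the idea, assuming the question page's HTML already carries data-original attributes (the header values, the reuse of the question URL, and the downloads folder name are placeholders for illustration, not the exact values used in the script below): send a request with browser-like headers, pull the image links out of the returned content with a regular expression, and write each image's bytes into a local folder.

import os
import re
import requests

headers = {'User-Agent': 'Mozilla/5.0', 'Referer': 'https://www.zhihu.com'}  # placeholder browser-like headers
save_dir = 'downloads'                                                       # placeholder folder name
os.makedirs(save_dir, exist_ok=True)

content = requests.get('https://www.zhihu.com/question/37787176', headers=headers).text
for img_url in set(re.findall(r'data-original="(https://.*?)"', content)):
    data = requests.get(img_url, headers=headers).content
    name = img_url.rsplit('/', 1)[-1]            # name the file after the last URL segment
    with open(os.path.join(save_dir, name), 'wb') as f:
        f.write(data)

The full script below goes through the answers API instead, so it can page through every answer rather than only what the question page happens to render.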


As a beginner, this took quite a bit of effort.
The complete code is as follows:

import requests
import re
import time
import os


def getHTMltext(url_, header={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; '
                                            'Win64; x64) AppleWebKit/537.36 '
                                            '(KHTML, like Gecko) '
                                            'Chrome/69.0.3497.100 Safari/537.36',
                              'Referer': 'https://www.zhihu.com'
                                         '/question/37787176'}):
    # fetch a URL and return the Response object, or None if the request fails
    try:
        r = requests.get(url_, headers=header)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r
    except:
        print('Connection failed')
        return None


def main():
    url0 = input("Enter the answers API URL: ")
    # e.g. https://www.zhihu.com/api/v4/questions/305888519/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_labeled%3Bdata%5B*%5D.mark_infos%5B*%5D.url%3Bdata%5B*%5D.author.follower_count%2Cbadge%5B*%5D.topics&offset=&limit=&sort_by=default&platform=desktop
    html0 = getHTMltext(url0, header1).json()
    page = int(html0['paging']['totals'])
    # total number of answers, used as the upper bound for paging
    n = 0
    while n <= page:
        try:
            # strip the empty trailing query string (48 characters) and re-append it with real offset/limit values
            url1 = url0[:-48] + '&offset={}&limit=20&sort_by=default&platform=desktop'.format(n)
            html1 = getHTMltext(url1, header1).json()
            # the question title is used later as the folder name
            questitle = html1['data'][0]['question']['title']
            for datas in html1['data']:
                urlfile = re.findall(r'data-original="(https://.*?)"', datas['content'])
                # extract the image URLs from the answer's HTML content
                urls = list(set(urlfile))
                # drop duplicate URLs
                if len(urls) == 0:
                    continue
                    # skip answers that contain no images
                for url in urls:
                    # save each image under a name derived from its URL
                    local_path = url[8:].replace('/', '')[32:]
                    # use part of the URL as the file name
                    url = url[:11] + '3' + url[12:]
                    # only URLs on the ***pic3*** host can be fetched; the reason is unclear
                    rs = getHTMltext(url)
                    time.sleep(0.1)
                    # take it slowly
                    dir_path = 'F://{}//'.format(questitle)
                    # use the question title as the folder name
                    if not os.path.exists(dir_path):
                        os.mkdir(dir_path)
                        # create the folder if it does not exist yet; otherwise leave it alone
                    path = dir_path + local_path
                    # full path of the image file
                    with open(path, "wb") as f:
                        f.write(rs.content)
                        print("Saving {}".format(path))
                        # write the image to disk
        except:
            pass
        n += 20
        time.sleep(2)


header1 = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                        'AppleWebKit/537.36 (KHTML, like Gecko) '
                        'Chrome/69.0.3497.100 Safari/537.36', 'Host': 'www.zhihu.com',
                        'Referer': "https://www.zhihu.com/question/37787176"
           }


if __name__ == '__main__':
    main()
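
To check the parsing step in isolation, the small example below runs the same regular expression on a made-up answer content snippet (the HTML and the image URL are invented for illustration); re.findall pulls out every data-original link and set() removes the duplicates, exactly as in the loop above.

import re

sample_content = ('<p>answer text</p>'
                  '<img data-original="https://pic1.zhimg.com/v2-abc123_r.jpg">'
                  '<img data-original="https://pic1.zhimg.com/v2-abc123_r.jpg">')  # duplicate on purpose
links = list(set(re.findall(r'data-original="(https://.*?)"', sample_content)))
print(links)  # ['https://pic1.zhimg.com/v2-abc123_r.jpg']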
