1. Code Overview
import os

import requests
from bs4 import BeautifulSoup
url = 'https://www.zhihu.com/explore'
headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'}
cookies = {'Cookie': 'YD00517437729195%3AWM_TID=9XQHePujZ8lBQVRBRRKA3bT%2BW5zVHR7q; YD00517437729195%3AWM_NI=tpq0eNA77VFvFgGgRSK0c8X4F%2Fuv4ktLQ7R%2BKSvJuNGmcQ7p8YAWAj11UklcnfA%2FrfZ5OqqEg1oRd5PjCcUnZ6ssKFLVL0deyXL5LZh9v7XGK3wp8TKaV4XZSEJDHPyQck8%3D; YD00517437729195%3AWM_NIKE=9ca17ae2e6ffcda170e2e6ee97ea398abac0acd170f6b48fa2c15a929f8f83d5658cbd9982f65ff4f1b9ace92af0fea7c3b92ab8b0bfd1e84683e9f9b7c57e86b98988d15989f0add9db4086f1bb97b761b8949fd8d121898dbaabf84685b3adb2b67cfc878fb8c462f3ef9b8cef69aeba8fd0f16e9aeb8192f44fa5efa1b3e73f94b68d84d8669aeabd94ca4e908a84d1e76fbb8884d6cf52ace7bad8db4d89edb991bb6089bf9ad5db52919b86a5cb5cbb8fad8cbb37e2a3; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1704696788,1705663245,1705888329,1706926989; _xsrf=Xn2fnEjzsoTSFiJeSzI3LQ8SPSicj8qp; _zap=c052d10c-dc64-4d7e-857e-cd79941d3287; d_c0=AGBSjA2nRBmPTlvtCxRw-lfz5CPygmNrewY=|1726883924; __snaker__id=Wi4A0wL5T6Tp5f9R; q_c1=24f86e68d263413c914f026306b89ea7|1727229845000|1727229845000; __zse_ck=003_bGA93Nn0RH7JG2M=kmCgnqdO3p1AtCozgD/q28/ILzb6okMtjvqWtWONR56vJ1TuNb00hdAV0mAYphiiJmTnpVRMDPHzo6bvKZusv9ma9dvH; SESSIONID=m7QM6iRf9Fi9dWoxV6fMRagucK9FjnIKehNUnMJdS6a; BEC=b7b0f394f3fd074c6bdd2ebbdd598b4e; gdxidpyhxdE=ESipWCcLeN6kHuArNf3lCD%2F1pbYGut5PbXJuQenUi%2BdKygIi5O5uSc1fEYGLYBfuBwL6kEd0ZLEl5%2FGh6xKXKQrU2Obr91kJfiJqGmAHRJql3IS%2BZcO0Z0TyhNrYuxhLojgDuclvU%5CSpT0WrgCHJ4PQm4vmNxKQPLZC%2FU%5CWh%5C4dTQrNE%3A1728350688067; captcha_session_v2=2|1:0|10:1728350138|18:captcha_session_v2|88:MnVCcnlBYitBMEZUSTJBTmxocFpUbXliUGtoQzB4enhsNVZDeXJTZk5JQm5tR3FIQ2dhOTRPQngxWWh6cGJCcw==|f62723192f0a534c395516c9357ed627cae24eca94aeb20d962ed72b490be26c'}
# Fetch the explore page with the logged-in session cookies
res = requests.get(url, headers=headers, cookies=cookies)
html_content = res.content
soup = BeautifulSoup(html_content, "html.parser")
# Question titles on the explore page (the class name comes from Zhihu's current markup)
title_tags = soup.find_all(class_="css-1g4zjtl")
List = []
for tag in title_tags:
    List.append(tag.getText())
# Links to the corresponding questions
link_tags = soup.find_all('a', target="_blank", class_="css-1u3u1p5")
Link = []
for tag in link_tags:
    Link.append(tag.get('href'))
# os.path.join avoids the unescaped backslash of the original u'\知乎...' string
path = os.path.join(os.getcwd(), '知乎每日热议爬取')
filename = path + '.txt'
# Interleave titles and links: [title1, link1, title2, link2, ...]
L = [x for pair in zip(List, Link) for x in pair]
a = ",".join(map(str, L))
print(a)
with open(filename, 'w', encoding="utf-8") as new:
    new.write("新闻资讯: " + "\n" + a)
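The interleaving step is the least obvious part of the script, so here is a minimal standalone sketch of it with made-up sample data (the titles and URLs below are placeholders, not real scrape results):

# zip pairs each title with its link; the nested comprehension flattens the pairs,
# yielding [title1, link1, title2, link2, ...]
titles = ["Question A", "Question B"]
links = ["https://www.zhihu.com/question/1", "https://www.zhihu.com/question/2"]
interleaved = [x for pair in zip(titles, links) for x in pair]
print(",".join(interleaved))
# Question A,https://www.zhihu.com/question/1,Question B,https://www.zhihu.com/question/2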
2. Results
3. Problems Encountered and Solutions
I ran into something interesting: after opening one of the scraped links, the Zhihu page it redirected to was garbled. A quick search online showed the culprit was a cookie value, specifically the value of the `__zse_ck` cookie. In short, it exists to block crawlers (-_-||).
Related link: "[Solved] Surfing plugin shows garbled text when opening Zhihu second-level pages (Zhihu anti-crawler, just change the cookie)", a Q&A thread on the Obsidian 中文论坛.
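For the same reason, when requesting Zhihu pages from Python, the `__zse_ck` cookie is the one worth carrying along. The snippet below is a minimal sketch of doing that with a requests.Session; the cookie value is a placeholder to be copied from your own browser's developer tools, and the assumption that `__zse_ck` alone is enough may not hold for every page:

import requests

session = requests.Session()
session.headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'})
# Placeholder value; copy the real __zse_ck from your browser's developer tools.
session.cookies.set('__zse_ck', 'PASTE_YOUR_VALUE_HERE', domain='.zhihu.com')
resp = session.get('https://www.zhihu.com/explore')
print(resp.status_code)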
4. Code Improvement: Print and Save with Line Breaks
import os

import requests
from bs4 import BeautifulSoup
url = 'https://www.zhihu.com/explore'
headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'}
cookies = {'Cookie': 'YD00517437729195%3AWM_TID=9XQHePujZ8lBQVRBRRKA3bT%2BW5zVHR7q; YD00517437729195%3AWM_NI=tpq0eNA77VFvFgGgRSK0c8X4F%2Fuv4ktLQ7R%2BKSvJuNGmcQ7p8YAWAj11UklcnfA%2FrfZ5OqqEg1oRd5PjCcUnZ6ssKFLVL0deyXL5LZh9v7XGK3wp8TKaV4XZSEJDHPyQck8%3D; YD00517437729195%3AWM_NIKE=9ca17ae2e6ffcda170e2e6ee97ea398abac0acd170f6b48fa2c15a929f8f83d5658cbd9982f65ff4f1b9ace92af0fea7c3b92ab8b0bfd1e84683e9f9b7c57e86b98988d15989f0add9db4086f1bb97b761b8949fd8d121898dbaabf84685b3adb2b67cfc878fb8c462f3ef9b8cef69aeba8fd0f16e9aeb8192f44fa5efa1b3e73f94b68d84d8669aeabd94ca4e908a84d1e76fbb8884d6cf52ace7bad8db4d89edb991bb6089bf9ad5db52919b86a5cb5cbb8fad8cbb37e2a3; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1704696788,1705663245,1705888329,1706926989; _xsrf=Xn2fnEjzsoTSFiJeSzI3LQ8SPSicj8qp; _zap=c052d10c-dc64-4d7e-857e-cd79941d3287; d_c0=AGBSjA2nRBmPTlvtCxRw-lfz5CPygmNrewY=|1726883924; __snaker__id=Wi4A0wL5T6Tp5f9R; q_c1=24f86e68d263413c914f026306b89ea7|1727229845000|1727229845000; __zse_ck=003_bGA93Nn0RH7JG2M=kmCgnqdO3p1AtCozgD/q28/ILzb6okMtjvqWtWONR56vJ1TuNb00hdAV0mAYphiiJmTnpVRMDPHzo6bvKZusv9ma9dvH; SESSIONID=m7QM6iRf9Fi9dWoxV6fMRagucK9FjnIKehNUnMJdS6a; BEC=b7b0f394f3fd074c6bdd2ebbdd598b4e; gdxidpyhxdE=ESipWCcLeN6kHuArNf3lCD%2F1pbYGut5PbXJuQenUi%2BdKygIi5O5uSc1fEYGLYBfuBwL6kEd0ZLEl5%2FGh6xKXKQrU2Obr91kJfiJqGmAHRJql3IS%2BZcO0Z0TyhNrYuxhLojgDuclvU%5CSpT0WrgCHJ4PQm4vmNxKQPLZC%2FU%5CWh%5C4dTQrNE%3A1728350688067; captcha_session_v2=2|1:0|10:1728350138|18:captcha_session_v2|88:MnVCcnlBYitBMEZUSTJBTmxocFpUbXliUGtoQzB4enhsNVZDeXJTZk5JQm5tR3FIQ2dhOTRPQngxWWh6cGJCcw==|f62723192f0a534c395516c9357ed627cae24eca94aeb20d962ed72b490be26c'}
# Fetch the explore page with the logged-in session cookies
res = requests.get(url, headers=headers, cookies=cookies)
html_content = res.content
soup = BeautifulSoup(html_content, "html.parser")
# Question titles on the explore page (the class name comes from Zhihu's current markup)
title_tags = soup.find_all(class_="css-1g4zjtl")
List = []
for tag in title_tags:
    List.append(tag.getText())
# Links to the corresponding questions
link_tags = soup.find_all('a', target="_blank", class_="css-1u3u1p5")
Link = []
for tag in link_tags:
    Link.append(tag.get('href'))
# os.path.join avoids the unescaped backslash of the original u'\知乎...' string
path = os.path.join(os.getcwd(), '知乎每日热议爬取')
filename = path + '.txt'
# Interleave titles and links, then join with newlines so each entry sits on its own line
L = [x for pair in zip(List, Link) for x in pair]
a = "\n".join(map(str, L))
print(a)
with open(filename, 'w', encoding="utf-8") as new:
    new.write("新闻资讯: " + "\n" + a)
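With "\n" as the separator, a title and its link still land on separate lines. If you prefer one entry per line with the title and link side by side, a small variation of the join and write step (my own sketch, not part of the original post) could be:

# Hypothetical variation: write each entry as "title<TAB>link" on a single line.
lines = [f"{title}\t{href}" for title, href in zip(List, Link)]
with open(filename, 'w', encoding="utf-8") as f:
    f.write("新闻资讯:\n" + "\n".join(lines))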