from lxml import etree
import requests
def get_html(url, headers, timeout=10):
    """Download a page and return its body as text.

    Args:
        url: Address of the page to fetch.
        headers: Mapping of HTTP request headers sent with the GET request.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script indefinitely.

    Returns:
        The decoded response body when the server answers with HTTP 200;
        None when the request fails or any other status code is returned.
    """
    try:
        # The network call is what can actually raise (connection errors,
        # timeouts, invalid URLs), so it is the only statement guarded here.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Best-effort fetch: signal failure with None instead of crashing.
        return None

    if response.status_code == 200:
        return response.text
    return None
def get_parse(html):
    """Print every (user name, reply text) pair found in a thread page.

    User names are taken from the text of ``a`` elements and reply bodies
    from the text of nested table cells, using the two XPath queries below.
    The two result lists are paired positionally; pairing stops at the end
    of the shorter list.

    Args:
        html: Page markup as a string. ``None`` or an empty string (for
            example when the download failed) is accepted and produces no
            output.

    Returns:
        None. Results are written to standard output.
    """
    # Nothing to parse when the download failed or returned an empty body.
    if not html:
        return

    tree = etree.HTML(html)
    # Defensive: guard against the parser yielding no root element.
    if tree is None:
        return

    user = tree.xpath('//*/table/tbody/tr/td/div/a/text()')
    reply = tree.xpath('//*/table/tbody/tr/td/div/div/table/tbody/tr/td/text()')
    for name, text in zip(user, reply):
        print('用户名:' + name, '回复内容:' + text.strip())
def main():
    """Fetch the hard-coded forum thread and print its replies."""
    # A browser-like User-Agent is sent with the request.
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
    }
    thread_url = 'http://www.dxy.cn/bbs/thread/626626'
    # Download the page, then hand the markup straight to the parser/printer.
    get_parse(get_html(thread_url, request_headers))
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
```
爬取结果:
利用xpath爬丁香园论坛帖子的所有回复内容
最新推荐文章于 2023-12-29 12:13:25 发布
部署运行你感兴趣的模型镜像
您可能感兴趣的与本文相关的镜像
Python3.9
Conda
Python
Python 是一种高级、解释型、通用的编程语言,以其简洁易读的语法而闻名,适用于广泛的应用,包括Web开发、数据分析、人工智能和自动化脚本
22万+

被折叠的 条评论
为什么被折叠?



