Without further ado, here's the code:
import csv, requests, re
from bs4 import BeautifulSoup
from lxml import etree
url = 'https://www.v2ex.com/?tab=all'
'''
# Approach 1: BeautifulSoup plus regular expressions
html = requests.get(url).text
soup = BeautifulSoup(html, 'html.parser')
articles = []
for article in soup.find_all(class_='cell item'):
    title = article.find(class_='item_title').get_text()
    category = article.find(class_='node').get_text()
    # Pull the author name out of the /member/ link in the raw HTML
    author = re.findall(r'(?<=<a href="/member/).+(?="><img)', str(article))[0]
    u = article.select('.item_title > a')
    # Non-greedy match so only the href value is captured
    link = 'https://www.v2ex.com' + re.findall(r'(?<=href=").+?(?=")', str(u))[0]
    articles.append([title, category, author, link])
print(articles)
'''
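
A side note on that first approach: the regex calls on str(article) can be avoided entirely, since BeautifulSoup exposes tag attributes directly. A minimal sketch of that alternative, assuming the same class names as above and that the author link sits inside a <strong> tag:

# Attribute access instead of regex on the serialized tag
soup = BeautifulSoup(requests.get(url).text, 'html.parser')
for article in soup.find_all(class_='cell item'):
    a = article.select_one('.item_title > a')              # title anchor
    link = 'https://www.v2ex.com' + a['href']              # href attribute, no regex needed
    author = article.select_one('strong > a').get_text()   # assumes <strong><a> holds the author
    print(a.get_text(), author, link)
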
# Approach 2: XPath
response = requests.get(url).text
html = etree.HTML(response)
tag_div = html.xpath('//div[@class="box"]/div[@class="cell item"]')
articles = []
for each in tag_div:
    title = each.xpath('./table//tr/td[3]/span[1]/a/text()')[0]
    # The original post is truncated here; the expressions below are a
    # reconstruction that assumes v2ex's markup (the second span in the
    # same cell holds the node link and the author link).
    link = 'https://www.v2ex.com' + each.xpath('./table//tr/td[3]/span[1]/a/@href')[0]
    category = each.xpath('./table//tr/td[3]/span[2]/a[1]/text()')[0]
    author = each.xpath('./table//tr/td[3]/span[2]/strong[1]/a/text()')[0]
    articles.append([title, category, author, link])
print(articles)
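
The csv import at the top and the summary below both point at a CSV export step that falls behind the paywall cut-off. A minimal sketch of that step, assuming an output file named articles.csv:

# Write the scraped rows to a CSV file (the filename is an assumption)
with open('articles.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['title', 'category', 'author', 'link'])
    writer.writerows(articles)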

This post shows how to use Python's BeautifulSoup, regular expressions (re), and XPath (lxml) to scrape article information from a website and export the data to a CSV file. It is well suited to beginners learning the basics of web scraping.