Without further ado, straight to the code. Two versions are shown below: a BeautifulSoup + regex version (commented out) and an XPath version.
import csv, requests, re
from bs4 import BeautifulSoup
from lxml import etree
url = 'https://www.v2ex.com/?tab=all'
'''
# Version 1: BeautifulSoup + regex
html = requests.get(url).text
soup = BeautifulSoup(html, 'html.parser')
articles = []
for article in soup.find_all(class_='cell item'):
    title = article.find(class_='item_title').get_text()
    category = article.find(class_='node').get_text()
    # the author name sits inside the href of the avatar link: <a href="/member/xxx"><img ...
    author = re.findall(r'(?<=<a href="/member/).+(?="><img)', str(article))[0]
    # the topic link is the href of the anchor inside the title span
    u = article.select('.item_title > a')
    link = 'https://www.v2ex.com' + re.findall(r'(?<=href=").+(?=")', str(u))[0]
    articles.append([title, category, author, link])
print(articles)
'''
# Version 2: XPath
response = requests.get(url).text
html = etree.HTML(response)
# every topic on the front page is a div.cell.item inside div.box
tag_div = html.xpath('//div[@class="box"]/div[@class="cell item"]')
articles = []
for each in tag_div:
    title = each.xpath('./table//tr/td[3]/span[1]/a/text()')[0]
    href = each.xpath('./table//tr/td[3]/span[1]/a/@href')[0]
    link = 'https://www.v2ex.com' + href
    category = each.xpath('.//a[@class="node"]/text()')[0]
    # same idea as the regex above: the avatar link's href carries the username
    author = each.xpath('.//a[contains(@href, "/member/")]/@href')[0].replace('/member/', '')
    articles.append([title, category, author, link])
print(articles)
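The csv module is imported at the top but never used in the snippet, so presumably the scraped rows are meant to end up in a file. A minimal sketch of that step, assuming a file name of v2ex.csv (not specified in the original post):
# write the scraped rows to disk with the csv module imported above
# (the file name 'v2ex.csv' and the header row are assumptions for illustration)
with open('v2ex.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['title', 'category', 'author', 'link'])
    writer.writerows(articles)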