下面直接爬取所有数据:
import requests
from bs4 import BeautifulSoup
# Scrape poster names, user-level blurbs, and post bodies from one DXY forum
# thread, then print the user-level list and the joined/cleaned records.
url = 'http://www.dxy.cn/bbs/thread/626626#626626'
# Headers copied from a real browser session. NOTE(review): the Cookie is
# session-specific and will expire — refresh it if requests start failing.
headers = {'Accept':'*/*',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN',
'Connection': 'Keep-Alive',
'Host': 'www.dxy.cn',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134',
'Cookie':'td_cookie=18446744069748759520; cms_token=116baec5-d226-4564-9b91-cd533c73ead6; JUTE_TOKEN=adcda86b-4efe-47d2-9641-c93dee0fb3fc; JUTE_SESSION=5c5b88a12ae80054925bc7393dc69743c48afc8a7344c99f908728ce982be11d6c843a36a37c78eb3983844955fe35699c25641dcb40ef3063116d2db88a4dc27841f30fd1563b17; __utmc=1; Hm_lpvt_8a6dad3652ee53a288a11ca184581908=1551957091; __utmz=1.1551939616.1.1.utmcsr=so.com|utmccn=(organic)|utmcmd=organic|utmctr=http%3A%2F%2Fwww.dxy.cn%2Fbbs%2Fthread%2F626626%23626626; Hm_lvt_8a6dad3652ee53a288a11ca184581908=1551939616,1551956252,1551956744,1551957091; DXY_USER_GROUP=72; __utma=1.324699068.1551939616.1551942120.1551956252.4; __auc=8eb3a17e1694280806eca5af6a3; __utmb=1.10.8.1551956906398; _gid=GA1.2.12931236.1551940767; __asc=7330b3c016957cd13e7d51fdb7d; _ga=GA1.2.324699068.1551939616; __utmt=1; bannerData={"banner":false,"message":"banner"}'}

response = requests.get(url, headers=headers, timeout=3)
# Fail loudly on HTTP errors instead of silently parsing an error page.
response.raise_for_status()
# Let requests' charset detection pick the real encoding (forum pages are
# often GBK rather than what the header claims).
response.encoding = response.apparent_encoding
soup = BeautifulSoup(response.text, 'html.parser')

# Parallel lists, one entry per post on the page.
names = [tag.text for tag in soup.find_all('div', class_="auth")]          # poster names
bodies = [tag.text for tag in soup.find_all('td', class_="postbody")]      # post bodies (was mislabeled "名字" in the original)
levels = [tag.text for tag in soup.find_all('div', class_="user-level-area")]  # user-level blurbs
badges = [tag.text for tag in soup.find_all('span', class_="adm")]         # moderator badge text

# The post at position 2 has a moderator badge instead of a user-level div,
# so splice the badge in to realign `levels` with `names`/`bodies`.
# (Replaces the original append-then-shuffle at jiesao[2]/jiesao[3].)
if badges and len(levels) >= 2:
    levels.insert(2, badges[0])
print(levels)

# Join the parallel lists into "name***level###body" records for the first
# four posts; clamp to the shortest list so short threads don't IndexError.
count = min(4, len(names), len(levels), len(bodies))
data = [names[i] + '***' + levels[i] + "###" + bodies[i] for i in range(count)]

# Strip all newline/tab/space characters in a single C-level pass.
_whitespace = str.maketrans('', '', '\n\t ')
new_data = [record.translate(_whitespace) for record in data]
print(new_data)
结果为: