import csv
import time

import requests
from bs4 import BeautifulSoup
# Browser-style User-Agent so Eastmoney serves the normal listing page
# instead of rejecting the scraper as a bot.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/83.0.4103.97 Safari/537.36"
    ),
}
def parse_onepage(page):
    """Scrape one Eastmoney Guba listing page and append its rows to 股吧1.csv.

    Columns written per post: read count, comment count, title, source, time.

    Module globals used (shared with the driver loop at the bottom of the file):
      count -- running total of rows written (incremented here)
      t     -- post time of the most recently processed item (written here)
      t1    -- last post time from the previous page, set by the driver loop;
               items matching it are treated as page-boundary duplicates
      k     -- current page index from the driver loop (read only)

    :param page: page index; the URL takes page*20 as its offset segment.
    """
    global count
    global t
    global t1
    url = "https://guba.eastmoney.com/default,1_{}.html".format(page * 20)
    resp = requests.get(url, headers=headers)
    soup = BeautifulSoup(resp.text, "lxml")
    posts = soup.findAll("ul", {"class": "newlist"})[0].findAll("li")
    # Open the file once per page (not once per row) and let the csv module
    # handle quoting, so titles containing commas or newlines no longer
    # corrupt the CSV.  Explicit utf-8 avoids the locale-dependent default
    # codec (gbk on Chinese Windows); newline="" is required by csv.writer.
    with open("股吧1.csv", "a", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        for item in posts:
            cites = item.findAll("cite")
            # Collapse all whitespace (spaces, \n, \r, tabs) out of the counts.
            read = "".join(cites[0].text.split())
            comment = "".join(cites[1].text.split())
            title = item.span.a.attrs["title"]
            source = cites[2].text.replace("\n", "")
            t = cites[3].text
            # Listing pages shift while crawling, so the first items of a new
            # page may repeat the previous page's tail; skip those duplicates.
            if k > 1 and t == t1:
                print("跳过此条")
                continue
            print([read, comment, title, source, t])
            count += 1
            print(count)
            writer.writerow([read, comment, title, source, t])
count = 1  # running row counter, incremented by parse_onepage via `global`

# Crawl 1400 listing pages.  t1 carries the last post time seen on the
# previous page so parse_onepage can skip items repeated across the page
# boundary (t is set by parse_onepage itself).
for k in range(1400):
    if k > 1:
        t1 = t
    try:
        parse_onepage(k)
    except requests.exceptions.RequestException as e:
        # A single transient network failure should not abort a
        # 1400-page crawl; log it and move on to the next page.
        print("请求失败, 跳过第{}页: {}".format(k, e))
    time.sleep(0.5)  # throttle requests to be polite to the server
# --- Trailing page residue from the hosting site (not code); kept as
# --- comments so the file remains valid Python.
# python爬虫-股吧
# 最新推荐文章于 2024-03-30 13:15:46 发布