Target page: Guba (股吧), the stock community under Eastmoney (东方财富网)
URL: https://guba.eastmoney.com/o/list.zssh000001_1.html (the post list for the SSE Composite Index)
Scraper code (RequestSoup fetches and parses one page, UrlList collects the pager links from the first page, and CrawlPages scrapes every list page and writes the posts to a CSV):
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin
index_url='https://guba.eastmoney.com/o/list.zssh000001_1.html'
def RequestSoup(url):
    # Spoof a desktop-browser User-Agent so the site serves the normal page
    header = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36'}
    r = requests.get(url, headers=header)
    soup = BeautifulSoup(r.text, 'html.parser')
    return soup
def UrlList(index_url):
    '''Collect the URLs of all list pages from the pager on the first page.'''
    urls = []
    soup = RequestSoup(index_url)
    urls_ul = soup.find('ul', class_='pagernums')
    urls_a = urls_ul.find('span').find_all('a')
    for one in urls_a:
        aurl = urljoin(index_url, one.get('href'))
        if aurl not in urls:  # the pager repeats some links, so de-duplicate
            urls.append(aurl)
    return urls
def CrawlPages(urls):
    items = []
    for url in urls:
        soup = RequestSoup(url)
        ul_tag = soup.find('ul', class_='newlist')
        li_list = ul_tag.find_all('li')
        for one in li_list:
            aitem = one.find_all()
            read_ct = aitem[0].get_text().strip()     # read count
            comment_ct = aitem[1].get_text().strip()  # comment count
            title = aitem[2].get_text().strip()       # post title
            title_url = aitem[2].find('a', class_='note').get('href').strip()
            author = one.find('cite', class_='aut').get_text().strip()
            date = one.find('cite', class_='last').get_text().strip()
            items.append([read_ct, comment_ct, title, title_url, author, date])
    # Columns: read count, comment count, title, post URL, author, date
    df = pd.DataFrame(items, columns=['阅读数', '评论数', '标题', '文章网址', '作者', '日期'])
    df['文章网址'] = df['文章网址'].apply(lambda x: urljoin(urls[0], x))  # make post links absolute
    df.to_csv(r"C:\Users\Administrator\Downloads\guba.csv", encoding='gbk', index=False)
if __name__ == "__main__":
    CrawlPages(UrlList(index_url))
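
The RequestSoup above sends each request with no timeout, no status check, and no pause, so a single bad response crashes the whole crawl. Below is a drop-in replacement sketch; the retry count, delay, and timeout values are illustrative assumptions, not part of the original script:

import time
import requests
from bs4 import BeautifulSoup

session = requests.Session()  # reuse one connection across pages
session.headers.update({'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                                      'Chrome/130.0.0.0 Safari/537.36'})

def RequestSoup(url, retries=3, delay=1.0):
    '''Fetch url and return the parsed soup, retrying transient failures.'''
    for attempt in range(retries):
        try:
            r = session.get(url, timeout=10)
            r.raise_for_status()  # fail loudly on HTTP 4xx/5xx instead of parsing an error page
            return BeautifulSoup(r.text, 'html.parser')
        except requests.RequestException:
            if attempt == retries - 1:
                raise  # give up after the last attempt
            time.sleep(delay * (attempt + 1))  # simple linear back-off before retrying

On top of this, a short time.sleep between pages inside CrawlPages keeps the crawl polite. Note also that post titles may contain characters outside the GBK codepage, in which case to_csv raises UnicodeEncodeError; writing with encoding='utf-8-sig' (Excel-friendly UTF-8), or keeping gbk and passing errors='replace', is a safer choice.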