爬取宋词写入csv

最新推荐文章于 2025-04-12 09:22:09 发布

原创 · 最新推荐文章于 2025-04-12 09:22:09 发布 · 433 阅读

0 ·

CC 4.0 BY-SA版权

爬虫专栏收录该内容

0 篇文章

订阅专栏

本文介绍了一个使用Python爬取宋词数据并将其整理成CSV文件的过程。爬虫利用requests库获取网页内容，BeautifulSoup解析HTML，正则表达式抽取信息，并用pandas将数据写入CSV文件。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

import requests
import pandas as pd
from bs4 import BeautifulSoup
import re

# Site root; the relative poem links are appended to it to form full URLs.
u = "https://so.gushiwen.org"
# Index page from which the poem links are collected.
url = "https://so.gushiwen.org/gushi/songci.aspx"

# Relative links gathered from the index page.
url_list = []

# Per-poem accumulators, extended together so they stay index-aligned:
# title, first and second part of the introduction line, and poem text.
pname, pintroduction1, pintroduction2, pcontent = [], [], [], []

def get_herf(url):
    """Collect the site-relative poem links found on an index page.

    Downloads ``url``, walks every ``<span>`` inside each
    ``<div class="typecont">`` and extracts the first double-quoted value
    starting with ``/`` from the span's HTML. Each extracted link is
    appended to the module-level ``url_list``.

    Args:
        url: Address of the index page to download.

    Returns:
        The module-level ``url_list`` (the same list object, extended in
        place).

    Raises:
        requests.RequestException: If the page cannot be fetched, including
            when the server does not answer within the timeout.
    """
    # Spans whose HTML is 41 characters or shorter are skipped; presumably
    # they carry no link. The threshold is kept from the existing heuristic.
    min_span_length = 41
    # First double-quoted value that begins with "/", compiled once.
    link_pattern = re.compile(r'"/.+?"')

    # A timeout keeps an unresponsive server from hanging the run forever.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, "lxml")
    for section in soup.find_all("div", class_="typecont"):
        for span in section.find_all("span"):
            markup = str(span)
            if len(markup) <= min_span_length:
                continue
            match = link_pattern.search(markup)
            if match is None:
                # Long span without a site-relative link: skip it rather
                # than fail with IndexError on an empty match list.
                continue
            url_list.append(match.group(0).replace('"', ""))
    return url_list
def get_poems(url):
    """Download one poem page and append its fields to the module lists.

    The first ``<div class="sons">`` of the page supplies the title (its
    ``<h1>``), the introduction line (``<p class="source">``) and the poem
    text (``<div class="contson">``). Only Chinese characters and the
    punctuation ``。：，？！`` are kept from the introduction and the text.
    The introduction is split on ``：``; its first piece goes to
    ``pintroduction1`` and its second piece to ``pintroduction2``.

    Args:
        url: Full address of the poem page.

    Returns:
        The tuple ``(pname, pintroduction1, pintroduction2, pcontent)`` of
        module-level lists, each extended by one entry.

    Raises:
        requests.RequestException: If the page cannot be fetched, including
            when the server does not answer within the timeout.
        AttributeError: If the page has no ``<div class="sons">`` or that
            element has no ``<h1>``.
    """
    text_pattern = re.compile("[\u4e00-\u9fa5。：，？！]+")

    # A timeout keeps an unresponsive server from hanging the run forever.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, "lxml")  # parse the HTML
    poem = soup.find("div", class_="sons")

    title = poem.find("h1").string  # poem title
    source_html = str(poem.find("p", class_="source"))  # introduction line
    body_html = str(poem.find("div", class_="contson"))  # poem text

    introduction = "".join(text_pattern.findall(source_html))
    body = "".join(text_pattern.findall(body_html))

    # Work out both introduction pieces before touching any list, so a
    # line without "：" cannot leave the four lists with unequal lengths.
    parts = introduction.split("：")
    first_part = parts[0]
    second_part = parts[1] if len(parts) > 1 else ""

    pname.append(title)
    pintroduction1.append(first_part)
    pintroduction2.append(second_part)
    pcontent.append(body)
    return pname, pintroduction1, pintroduction2, pcontent

def write_csv(list1, list2, list3, list4, path="D://songci.csv"):
    """Write four parallel lists to a CSV file, one column per list.

    Args:
        list1: Values for the ``name`` column.
        list2: Values for the ``chaodai`` column.
        list3: Values for the ``zuozhe`` column.
        list4: Values for the ``content`` column.
        path: Destination file. Defaults to ``D://songci.csv``, the
            location that was previously hard-coded.

    Raises:
        ValueError: If the lists differ in length (raised by pandas while
            building the frame).
    """
    table = pd.DataFrame(
        {"name": list1, "chaodai": list2, "zuozhe": list3, "content": list4}
    )
    # utf_8_sig prepends a UTF-8 byte-order mark; presumably so spreadsheet
    # tools detect the encoding of the Chinese text.
    table.to_csv(path, index=False, sep=",", encoding="utf_8_sig")

if __name__ == "__main__":
    # Collect the relative links from the index page, then scrape each
    # poem page in turn, reporting progress after every page.
    url_list = get_herf(url)
    for link in url_list:
        poem_url = u + str(link)
        get_poems(poem_url)
        print(poem_url, "已完成！")
    # Every page has been processed: write the accumulated columns out.
    write_csv(pname, pintroduction1, pintroduction2, pcontent)