这是一段爬取网易云音乐榜单的代码。
其主体思路和方法与之前发布的爬取 Boss 招聘信息的代码一致,可以按此套路尝试爬取其他网站的信息。
--------代码--------
import pandas as pd
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
#from selenium.webdriver.chrome.options import Options
from selenium.webdriver import ActionChains
from lxml import etree
import time
from datetime import datetime,timedelta
from selenium.webdriver.edge.service import Service
from webdriver_manager.microsoft import EdgeChromiumDriverManager
import csv
from urllib.parse import quote
# Scrape one NetEase Cloud Music top-list page and append its rows to a CSV.
#
# Flow: open the CSV, start Edge, load the top-list URL, switch into the
# content iframe, wait for the table rows to exist, grab the rendered HTML,
# then parse it with lxml and write one CSV row per <tr>.

CSV_PATH = './网易云列表.csv'
CSV_HEADER = ['排名', '名称', '时长', '歌手']
TOPLIST_URL = r'https://music.163.com/#/discover/toplist?id=19723756'
WAIT_SECONDS = 10


def first_or_default(values, default=''):
    """Return the first item of *values*, or *default* when it is empty.

    XPath queries always return a list; this unwraps it without raising
    IndexError when the query matched nothing.
    """
    return values[0] if values else default


def extract_row(tr):
    """Build one CSV row ``[rank, title, duration, singer]`` from a table row.

    Args:
        tr: an lxml element for one ``<tr>`` of the top-list table.

    Returns:
        A list of four strings; a field whose XPath matches nothing is ''.
    """
    rank = first_or_default(tr.xpath(".//span[@class='num']/text()"))
    title = first_or_default(tr.xpath(".//b/@title"))
    # The class values contain literal leading/trailing spaces; the XPath
    # compares the attribute text exactly, so they are kept verbatim.
    duration = first_or_default(
        tr.xpath(".//td[@class=' s-fc3']/span[@class='u-dur ']/text()")
    )
    singer = first_or_default(tr.xpath(".//div[@class='text']/span/@title"))
    return [rank, title, duration, singer]


def fetch_toplist_html(driver, url=TOPLIST_URL, timeout=WAIT_SECONDS):
    """Load *url* in *driver* and return the HTML of the content iframe.

    Uses explicit waits instead of a fixed sleep: first for the iframe to be
    available (switching into it), then for at least one table row to be
    present, so the HTML is captured only after the list has been rendered.

    Raises:
        selenium TimeoutException if either condition is not met in *timeout*
        seconds.
    """
    driver.get(url)
    print(url)
    wait = WebDriverWait(driver, timeout)
    # The song table is rendered inside the iframe, not the top document.
    wait.until(
        EC.frame_to_be_available_and_switch_to_it(
            (By.XPATH, "//iframe[@id='g_iframe']")
        )
    )
    wait.until(EC.presence_of_element_located((By.XPATH, '//tbody/tr')))
    return driver.execute_script("return document.documentElement.outerHTML")


def main():
    """Scrape the top list and append its rows to the CSV file."""
    # `with` guarantees the file is closed even if the browser fails to start.
    with open(CSV_PATH, mode='a', encoding='utf-8-sig', newline='') as f:
        csv_writer = csv.writer(f)
        # The file is opened in append mode: only write the header when the
        # file is new/empty, otherwise every run would add a header line in
        # the middle of the data.
        if f.tell() == 0:
            csv_writer.writerow(CSV_HEADER)

        # Start the browser.
        service = Service(EdgeChromiumDriverManager().install())
        driver = webdriver.Edge(service=service)
        try:
            page_text = fetch_toplist_html(driver)
        finally:
            # Always close the browser, including on timeout/navigation errors.
            driver.quit()

        # Parse the captured HTML and write one CSV row per table row.
        html = etree.HTML(page_text)
        for tr in html.xpath('//tbody/tr'):
            csv_writer.writerow(extract_row(tr))


if __name__ == '__main__':
    main()