数据爬取
1.导入库
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from matplotlib import pyplot as plt
from pandas import Series, DataFrame
from time import sleep
import undetected_chromedriver as uc
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.by import By
2.设置 ChromeOptions
# Browser launch flags: present an Edge-on-Windows user agent string and open
# the window at 1920x1080.
option = ChromeOptions()
for chrome_flag in (
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 Edg/132.0.0.0",
    "--window-size=1920,1080",
):
    option.add_argument(chrome_flag)
3.实例化浏览器对象并发起请求
# Weather-history page to scrape; the path encodes the city (guilin) and the
# month (202412).
url = 'https://lishi.tianqi.com/guilin/202412.html'
# Start Chrome through undetected_chromedriver using the `option` flags set up
# earlier, then navigate to the page.
driver=uc.Chrome(options=option)
driver.get(url)
# Fixed 3-second pause after navigation; presumably gives the page time to
# finish loading before elements are looked up.
sleep(3)
以桂林市2024年12月为例
4.滚动页面
# CSS path of the <div> that sits directly inside the <ul> of div.tian_three.
scroll_target_selector = 'body > div.main.clearfix > div.main_left.inleft > div.tian_three > ul > div'
target_element = driver.find_element(By.CSS_SELECTOR, scroll_target_selector)
# Scroll the page so that the element is vertically centered in the viewport.
driver.execute_script(
    "arguments[0].scrollIntoView({block:'center'});",
    target_element,
)
# Fixed 3-second pause after scrolling.
sleep(3)
block 为可选参数,定义垂直方向的对齐方式,取值为 start、center、end 或 nearest 之一,默认为 start。
5.完成点击操作并切换到最新打开的窗口
# Locate the "view more" element and click it.
view_more_selector = 'body > div.main.clearfix > div.main_left.inleft > div.tian_three > ul > div'
view_more_element = driver.find_element(By.CSS_SELECTOR, view_more_selector)
view_more_element.click()
# Make the last entry in the window-handle list the active window.
newest_handle = driver.window_handles[-1]
driver.switch_to.window(newest_handle)
# Fixed 3-second pause after switching windows.
sleep(3)
6.获取网页源代码并解析
# Grab the HTML of the window the driver is currently focused on.
data=driver.page_source
# Parse it with BeautifulSoup using the built-in 'html.parser' backend; `html`
# is the parsed tree that the extraction step searches.
html = BeautifulSoup(data, 'html.parser')
7.关闭浏览器
# The page source has already been captured and parsed, so the browser session
# can be shut down before the data is processed.
driver.quit()
8.提取天气信息并存储
def _split_first_two(column, names):
    """Split each value of *column* on single spaces and keep the first two parts.

    Args:
        column: pandas Series whose values are converted to ``str`` before
            splitting.
        names: two column labels for the returned frame.

    Returns:
        DataFrame indexed like *column* with exactly two columns named after
        *names*; a part that is missing for a row is left empty (NaN/None).
    """
    parts = column.astype(str).str.split(" ", expand=True)
    # reindex guarantees that both columns exist even when no value contained
    # a space, so assigning the two names below cannot fail on a length mismatch.
    parts = parts.reindex(columns=[0, 1])
    parts.columns = names
    return parts


# Collect one row per <li> inside div.tian_three; each row holds the text of
# every <div> found in that <li>.
data_all = []
tian_three = html.find("div", {"class": "tian_three"})
if tian_three is None:
    # Fail with an explicit message instead of an AttributeError on None.
    raise RuntimeError(
        "div.tian_three was not found in the page source; "
        "the page layout may have changed or the request was blocked"
    )
lishitable_content = tian_three.find_all("li")
for i in lishitable_content:
    lishi_div = i.find_all("div")
    data_all.append([j.text for j in lishi_div])
print(data_all)

# Raw table: one column per <div> cell, in page order.
weather = pd.DataFrame(data_all)
weather.columns = ["当日信息", "最高气温", "最低气温", "天气", "风向信息"]
weather_shape = weather.shape
print(weather)

# The first and last raw columns each hold two space-separated values;
# split them into separate columns.
result = _split_first_two(weather["当日信息"], ["日期", "星期"])
result1 = _split_first_two(weather["风向信息"], ["风向", "级数"])

# Replace the two combined columns with their split parts. Final column order:
# 日期, 星期, 最高气温, 最低气温, 天气, 风向, 级数.
weather = weather.drop(columns=["当日信息", "风向信息"])
weather.insert(loc=0, column="日期", value=result["日期"])
weather.insert(loc=1, column="星期", value=result["星期"])
weather.insert(loc=5, column="风向", value=result1["风向"])
weather.insert(loc=6, column="级数", value=result1["级数"])
weather.to_csv("桂林24年12月天气.csv", encoding="utf_8")