http://blog.sina.com.cn/s/articlelist_1973273451_0_1.html

The link above is a sample Sina blog article-list page; its specific content is not reproduced here, but such blogs typically contain the blogger's notes, tutorials, and technical articles.
The problem has improved somewhat, but issues remain:

Processing article 1: https://blog.sina.com.cn/s/blog_475b3d560102w5rf.html
✅ Successfully extracted data from https://blog.sina.com.cn/s/blog_475b3d560102w5rf.html: reads 1679 | favorites 2 | likes 17 | gold pens 0
Processing article 2: https://blog.sina.com.cn/s/blog_475b3d560102vkkf.html
✅ Successfully extracted data from https://blog.sina.com.cn/s/blog_475b3d560102vkkf.html: reads 0 | favorites 0 | likes 472 | gold pens 268
Processing article 3: https://blog.sina.com.cn/s/blog_475b3d560102wdtc.html
✅ Successfully extracted data from https://blog.sina.com.cn/s/blog_475b3d560102wdtc.html: reads 0 | favorites 0 | likes 1071 | gold pens 128
Processing article 4: https://blog.sina.com.cn/s/blog_475b3d560102w5lw.html
❌ Failed to extract data from https://blog.sina.com.cn/s/blog_475b3d560102w5lw.html: Message: Stacktrace: GetHandleVerifier [0x0x7ff6e669e8e5+80021] GetHandleVerifier [0x0x7ff6e669e9
Processing article 5: https://blog.sina.com.cn/s/blog_475b3d560102vpz3.html
❌ Failed to extract data from https://blog.sina.com.cn/s/blog_475b3d560102vpz3.html: Message: Stacktrace: GetHandleVerifier [0x0x7ff6e669e8e5+80021] GetHandleVerifier [0x0x7ff6e669e9
Processing article 6: https://blog.sina.com.cn/s/blog_475b3d560102wh5w.html
❌ Failed to extract data from https://blog.sina.com.cn/s/blog_475b3d560102wh5w.html: Message: Stacktrace: GetHandleVerifier [0x0x7ff6e669e8e5+80021] GetHandleVerifier [0x0x7ff6e669e9
Processing article 7: https://blog.sina.com.cn/s/blog_475b3d560102v8fk.html
❌ Failed to extract data from https://blog.sina.com.cn/s/blog_475b3d560102v8fk.html: Message: Stacktrace: GetHandleVerifier [0x0x7ff6e669e8e5+80021] GetHandleVerifier [0x0x7ff6e669e9
Processing article 8: https://blog.sina.com.cn/s/blog_475b3d560102vns9.html
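Once the first Stacktrace / GetHandleVerifier failure appears, every later article fails the same way, which points at the Chrome session itself having died rather than at individual pages missing elements. Below is a minimal sketch of one way to recover, assuming that diagnosis is right: probe the session before each article and rebuild it when it is gone. crawl_one is a hypothetical helper, not part of the original script; setup_driver() and get_dynamic_data() are the functions defined in the full script further down.

    from selenium.common.exceptions import WebDriverException

    def crawl_one(driver, url):
        # Cheap health check: reading current_url raises WebDriverException
        # once the Chrome session has crashed or been closed.
        try:
            _ = driver.current_url
        except WebDriverException:
            try:
                driver.quit()
            except WebDriverException:
                pass
            driver = setup_driver()  # fresh browser for the remaining articles
        return driver, get_dynamic_data(driver, url)

In main(), the call dynamic_data = get_dynamic_data(driver, url) would then become driver, dynamic_data = crawl_one(driver, url), so the rebuilt driver is carried forward to the next article.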
Full script:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd
import requests
from bs4 import BeautifulSoup
from sqlalchemy import create_engine
from urllib.parse import quote_plus
import re

# --- 1. Database connection setup ---
DBSERVER = 'DESKTOP-NU4IJON'
DBNAME = 'BlogData'
DBUSER = 'sa'
DBPASSWORD = '1'
connection_string = f"DRIVER={{ODBC Driver 17 for SQL Server}};SERVER={DBSERVER};DATABASE={DBNAME};UID={DBUSER};PWD={DBPASSWORD}"
connection_url = f"mssql+pyodbc:///?odbc_connect={quote_plus(connection_string)}"
try:
    engine = create_engine(connection_url)
    print("Database connection successful!")
except Exception as e:
    print(f"Database connection failed: {e}")
    exit()

# --- 2. Chrome configuration (for Chrome 141.0.7390.122) ---
def setup_driver():
    chrome_options = Options()
    chrome_options.binary_location = r"D:\Google\chrome-win64\chrome.exe"  # your Chrome path
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument(
        "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.7390.122 Safari/537.36"
    )
    service = Service(r"C:\Users\Lenovo\Downloads\chromedriver-win64\chromedriver.exe")  # your ChromeDriver path
    driver = webdriver.Chrome(service=service, options=chrome_options)
    driver.implicitly_wait(10)
    return driver

# --- 3. Dynamic data extraction (matches the exact element structure) ---
def get_dynamic_data(driver, url):
    driver.get(url)
    time.sleep(10)  # longer wait to make sure the page has loaded
    data = {'url': url, 'read_count': 0, 'collect_count': 0, 'like_count': 0, 'gold_pen_count': 0}
    try:
        # Read count: <span id="r_475b3d560102wq5k" class="SG_txtb">(12872)</span>
        read_elem = WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.XPATH, '//span[contains(@id, "r_") and contains(@class, "SG_txtb")]'))
        )
        data['read_count'] = int(read_elem.text.strip('()'))  # number inside the parentheses
        # Favorite count: <span id="f_475b3d560102wq5k" class="SG_txtb">(39)</span>
        collect_elem = WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.XPATH, '//span[contains(@id, "f_") and contains(@class, "SG_txtb")]'))
        )
        data['collect_count'] = int(collect_elem.text.strip('()'))
        # Like count: <p ti_title="..." id="dbox2_475b3d560102wq5k" class="count" mnum="206">206</p>
        like_elem = WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.XPATH, '//p[contains(@class, "count") and @mnum]'))
        )
        data['like_count'] = int(like_elem.text)
        # Gold pen count: <p class="count" id="goldPan-num">36</p>
        gold_pen_elem = WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.XPATH, '//p[@id="goldPan-num" and @class="count"]'))
        )
        data['gold_pen_count'] = int(gold_pen_elem.text)
        print(f"✅ Successfully extracted data from {url}: reads {data['read_count']} | favorites {data['collect_count']} | likes {data['like_count']} | gold pens {data['gold_pen_count']}")
    except Exception as e:
        print(f"❌ Failed to extract data from {url}: {str(e)[:100]}")
    return data

# --- 4. Collect article links ---
def get_all_urls(uid, page):
    url = f'https://blog.sina.com.cn/s/articlelist_{uid}_0_{page}.html'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/141.0.7390.122 Safari/537.36'
    }
    try:
        r = requests.get(url, headers=headers, timeout=15)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
    except requests.RequestException as e:
        print(f"Request for page {page} failed: {e}")
        return []
    soup = BeautifulSoup(r.text, 'html.parser')
    article_links = []
    links = soup.find_all('a', href=re.compile(r'/s/blog_475b3d56[\w]+\.html'))
    if not links:
        print(f"No article links on page {page}")
        return []
    for link in links:
        full_url = requests.compat.urljoin(url, link['href'])
        article_links.append(full_url)
    return list(set(article_links))

# --- 5. Main ---
def main():
    uid = '1197161814'
    TARGET_POST_COUNT = 250
    post_count = 0
    page = 1
    all_dynamic_data = []
    try:
        driver = setup_driver()
        print("Chrome started successfully, starting to crawl dynamic data...\n")
    except Exception as e:
        print(f"Failed to start the browser: {e}")
        return
    while post_count < TARGET_POST_COUNT:
        article_urls = get_all_urls(uid, page)
        if not article_urls:
            print(f"\nNo more links on page {page}, stopping")
            break
        for url in article_urls:
            if post_count >= TARGET_POST_COUNT:
                break
            print(f"Processing article {post_count + 1}: {url}")
            dynamic_data = get_dynamic_data(driver, url)
            all_dynamic_data.append(dynamic_data)
            post_count += 1
            time.sleep(3)
        page += 1
        time.sleep(5)
    driver.quit()
    print("\n--- Crawl finished ---")
    # 'url' is a string, so exclude it before summing the counters
    valid_data = [d for d in all_dynamic_data if sum(v for k, v in d.items() if k != 'url') > 0]
    print(f"Processed {len(all_dynamic_data)} articles, {len(valid_data)} valid records")
    if valid_data:
        df = pd.DataFrame(valid_data)
        try:
            df.to_sql(
                name='blog_posts',
                con=engine,
                if_exists='append',
                index=False,
                chunksize=10
            )
            print(f"✅ Stored {len(df)} rows in the database!")
        except Exception as e:
            print(f"❌ Database insert failed: {e}")
    else:
        print("❌ No valid data to store")

if __name__ == '__main__':
    main()

Run output:

C:\Users\Lenovo\.conda\envs\Anaconda\python.exe F:\dongtaidate\dongtai.py
Database connection successful!
Chrome started successfully, starting to crawl dynamic data...

Processing article 1: https://blog.sina.com.cn/s/blog_475b3d560102vjtc.html
✅ Successfully extracted data from https://blog.sina.com.cn/s/blog_475b3d560102vjtc.html: reads 5627 | favorites 3 | likes 29 | gold pens 5
Processing article 2: https://blog.sina.com.cn/s/blog_475b3d560102vhae.html
❌ Failed to extract data from https://blog.sina.com.cn/s/blog_475b3d560102vhae.html: invalid literal for int() with base 10: ''
Processing article 3: https://blog.sina.com.cn/s/blog_475b3d560102vo42.html
❌ Failed to extract data from https://blog.sina.com.cn/s/blog_475b3d560102vo42.html: invalid literal for int() with base 10: ''
Processing article 4: https://blog.sina.com.cn/s/blog_475b3d560102vlbl.html
❌ Failed to extract data from https://blog.sina.com.cn/s/blog_475b3d560102vlbl.html: invalid literal for int() with base 10: ''
Processing article 5: https://blog.sina.com.cn/s/blog_475b3d560102dvxh.html
❌ Failed to extract data from https://blog.sina.com.cn/s/blog_475b3d560102dvxh.html: invalid literal for int() with base 10: ''
Processing article 6: https://blog.sina.com.cn/s/blog_475b3d560102w5rg.html
❌ Failed to extract data from https://blog.sina.com.cn/s/blog_475b3d560102w5rg.html: invalid literal for int() with base 10: ''
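The "invalid literal for int() with base 10: ''" failures in this run mean the element's text was an empty string at the moment it was read; the counters are exactly the dynamically loaded values the script targets, so they can still be blank even after the explicit waits. Below is a small sketch of a more forgiving parser, assuming the text is either empty or something like "(1,679)"; parse_count is a hypothetical helper, not part of the original script.

    import re

    def parse_count(text, default=0):
        # Pull the first run of digits (commas allowed) out of strings like
        # "(12872)", "(1,679)" or ""; fall back to `default` when nothing matches.
        m = re.search(r'\d[\d,]*', text or '')
        return int(m.group().replace(',', '')) if m else default

Each int(elem.text.strip('()')) / int(elem.text) call in get_dynamic_data could then become parse_count(elem.text), so a counter that has not rendered yet is recorded as 0 instead of aborting the whole article.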