Introduction
In today's data-driven world, web crawlers have become an important tool for collecting and analyzing online data. With its concise syntax and rich ecosystem of third-party libraries, Python has become the language of choice for crawler development. This article walks through seven hands-on Python crawler cases, from simple to complex, to help you quickly master the core skills of crawler development.
Case 1: Static Page Scraping - Scraping the Douban Movie Top 250
import requests
from bs4 import BeautifulSoup
import csv

def get_douban_top250():
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    base_url = "https://movie.douban.com/top250"
    with open('douban_top250.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Rank', 'Title', 'Rating', 'Votes', 'Quote'])
        # The Top 250 list is split across 10 pages, 25 movies per page
        for start in range(0, 250, 25):
            url = f"{base_url}?start={start}"
            response = requests.get(url, headers=headers)
            soup = BeautifulSoup(response.text, 'html.parser')
            for item in soup.find_all('div', class_='item'):
                rank = item.find('em').text
                title = item.find('span', class_='title').text
                rating = item.find('span', class_='rating_num').text
                # The last <span> in the .star block holds the vote count, e.g. "123456人评价"
                num = item.find('div', class_='star').find_all('span')[-1].text[:-3]
                quote_tag = item.find('span', class_='inq')
                quote = quote_tag.text if quote_tag else 'N/A'
                writer.writerow([rank, title, rating, num, quote])
    print("Douban Top 250 data saved to douban_top250.csv")

if __name__ == '__main__':
    get_douban_top250()
Key techniques:
Sending HTTP requests with the requests library
Parsing HTML with BeautifulSoup (a CSS-selector variant is sketched after this list)
Handling paginated data
Saving the data to a CSV file
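As a side note on the BeautifulSoup point above, the same parsing can also be written with CSS selectors through soup.select(), which is easy to keep in sync with what you see in the browser's dev tools. A minimal sketch against the same Douban markup used above (no new class names are assumed):

from bs4 import BeautifulSoup

def parse_douban_page(html):
    # CSS-selector equivalent of the find()/find_all() calls in the case above
    soup = BeautifulSoup(html, 'html.parser')
    for item in soup.select('div.item'):
        rank = item.select_one('em').text
        title = item.select_one('span.title').text
        rating = item.select_one('span.rating_num').text
        yield rank, title, rating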
Case 2: Dynamic Content Scraping - Fetching JD Product Listings with Selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
import time

def get_jd_product(keyword):
    # Point Service at your local chromedriver (newer Selenium versions can also manage the driver for you)
    service = Service('chromedriver.exe')  # replace with the path to your chromedriver
    driver = webdriver.Chrome(service=service)
    try:
        driver.get("https://www.jd.com")
        search_box = driver.find_element(By.ID, "key")
        search_box.send_keys(keyword)
        search_box.submit()
        time.sleep(3)  # wait for the results page to load
        # Scroll down a few times to trigger lazy loading of more products
        for _ in range(3):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)
        products = driver.find_elements(By.CSS_SELECTOR, ".gl-item")
        for i, product in enumerate(products[:10], 1):
            name = product.find_element(By.CSS_SELECTOR, ".p-name a").text
            price = product.find_element(By.CSS_SELECTOR, ".p-price strong").text
            print(f"{i}. {name} - price: {price}")
    finally:
        driver.quit()

if __name__ == '__main__':
    get_jd_product("Python编程书籍")
Key techniques:
Driving a real browser with Selenium
Handling JavaScript-rendered dynamic content
Scrolling the page to load more data
Locating and interacting with elements (an explicit-wait sketch follows this list)
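The fixed time.sleep() calls above work, but they either wait too long or not long enough. If you stay with the same Chrome setup, Selenium's explicit waits are usually a more reliable option; here is a minimal sketch (the 10-second timeout is just an example value):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_for_products(driver, timeout=10):
    # Block until at least one product card is present in the DOM, or raise TimeoutException
    wait = WebDriverWait(driver, timeout)
    return wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".gl-item")))

# Inside get_jd_product(), this could replace the time.sleep(3) after submitting the search:
# products = wait_for_products(driver)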
Case 3: Calling an API - Fetching Weather Data
import requests
import json

def get_weather(city):
    # Free, undocumented endpoint; it may change or go offline at any time
    url = f"http://wthrcdn.etouch.cn/weather_mini?city={city}"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers)
        data = json.loads(response.text)
        if data['status'] == 1000:
            weather_info = data['data']
            print(f"City: {weather_info['city']}")
            print(f"Temperature: {weather_info['wendu']}℃")
            print("Forecast:")
            for forecast in weather_info['forecast']:
                print(f"{forecast['date']}: {forecast['type']}, "
                      f"{forecast['high']}/{forecast['low']}, wind: {forecast['fengxiang']}")
        else:
            print("Failed to fetch weather information")
    except Exception as e:
        print(f"Error: {e}")

if __name__ == '__main__':
    get_weather("北京")
Key techniques:
Calling an API endpoint directly to fetch data
Handling JSON responses (a more defensive pattern is sketched after this list)
Error handling and exception catching
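For JSON APIs in general, it usually pays to let requests do the status checking and decoding itself. A small, generic sketch of that pattern (the helper name and the example URL are mine, not part of the case above):

import requests

def fetch_json(url, params=None, timeout=10):
    # raise_for_status() turns 4xx/5xx responses into exceptions,
    # and response.json() decodes the body without a separate json.loads() call
    response = requests.get(url, params=params, timeout=timeout)
    response.raise_for_status()
    return response.json()

# Hypothetical usage:
# data = fetch_json("https://api.example.com/weather", params={"city": "Beijing"})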
Case 4: Login Handling - Simulating a GitHub Login
import requests
from bs4 import BeautifulSoup

def login_github(username, password):
    session = requests.Session()
    login_url = "https://github.com/login"
    session_url = "https://github.com/session"
    # Fetch the login page and extract the CSRF token (authenticity_token)
    response = session.get(login_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    token = soup.find('input', {'name': 'authenticity_token'})['value']
    # Build the login form data
    # (note: GitHub may additionally require hidden form fields, device verification or 2FA,
    # in which case this simple flow is not enough)
    login_data = {
        'commit': 'Sign in',
        'authenticity_token': token,
        'login': username,
        'password': password
    }
    # Submit the login request
    response = session.post(session_url, data=login_data)
    # Check whether the login succeeded
    if response.url == "https://github.com/":
        print("Login successful!")
        # Fetch the user's profile page
        profile_response = session.get(f"https://github.com/{username}")
        soup = BeautifulSoup(profile_response.text, 'html.parser')
        name = soup.find('span', {'itemprop': 'name'}).text.strip()
        print(f"Welcome, {name}!")
    else:
        print("Login failed, please check your username and password")

if __name__ == '__main__':
    login_github("your_username", "your_password")  # replace with real credentials
Key techniques:
Keeping the login state with a Session
Handling the form and the CSRF token
Simulating the login flow
Verifying the login result (a cookie-persistence sketch follows this list)
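Once a Session is logged in, you may want to reuse it across runs instead of logging in every time. One common approach, sketched below, is to pickle the session's cookie jar to disk; the file name is only an example, and for a real account you would want to protect this file carefully:

import pickle
import requests

COOKIE_FILE = "github_cookies.pkl"  # example path

def save_cookies(session, path=COOKIE_FILE):
    # Persist the session's cookies so the next run can skip the login step
    with open(path, "wb") as f:
        pickle.dump(session.cookies, f)

def load_session(path=COOKIE_FILE):
    # Rebuild a Session from previously saved cookies
    session = requests.Session()
    with open(path, "rb") as f:
        session.cookies.update(pickle.load(f))
    return session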
Case 5: Image Scraping - Downloading Unsplash Wallpapers
import os
import requests
from bs4 import BeautifulSoup

def download_unsplash_images(keyword, count=10):
    url = f"https://unsplash.com/s/photos/{keyword}"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    # Create a local directory for the downloads
    save_dir = f"unsplash_{keyword}"
    os.makedirs(save_dir, exist_ok=True)
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Collect image tags (this class name is auto-generated by Unsplash and may change at any time)
    img_tags = soup.find_all('img', {'class': 'YVj9w'})[:count]
    for i, img in enumerate(img_tags, 1):
        img_url = img['src'] if 'src' in img.attrs else img.get('data-src', '')
        if not img_url.startswith('http'):
            continue
        try:
            img_data = requests.get(img_url, headers=headers).content
            with open(f"{save_dir}/{keyword}_{i}.jpg", 'wb') as f:
                f.write(img_data)
            print(f"Downloaded image {i}")
        except Exception as e:
            print(f"Failed to download image {i}: {e}")

if __name__ == '__main__':
    download_unsplash_images("nature", 5)
Key techniques:
Downloading images in batches
Creating a local directory for the files
Handling image URLs
Writing binary files (a streamed-download sketch follows this list)
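The case above reads each image fully into memory with .content, which is fine for small files. For large images or other big downloads, a streamed download keeps memory use flat; here is a minimal sketch (the helper name and chunk size are my own choices):

import requests

def download_file(url, path, chunk_size=8192, timeout=30):
    # stream=True defers downloading the body until we iterate over it in chunks
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        with open(path, "wb") as f:
            for chunk in response.iter_content(chunk_size=chunk_size):
                if chunk:  # skip keep-alive chunks
                    f.write(chunk)

# download_file("https://example.com/large-image.jpg", "large-image.jpg")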
Case 6: Dealing with Anti-Scraping - Fetching Data Through Proxy IPs
import requests
from fake_useragent import UserAgent
import random

def get_with_proxy(url, proxy_list):
    ua = UserAgent()
    headers = {'User-Agent': ua.random}
    # Try the proxies in random order until one of them works
    for proxy in random.sample(proxy_list, len(proxy_list)):
        try:
            response = requests.get(
                url,
                headers=headers,
                proxies={"http": proxy, "https": proxy},
                timeout=10
            )
            if response.status_code == 200:
                return response.text
        except requests.RequestException:
            continue
    return None

def scrape_with_anti_anti_spider():
    # Example proxy list (replace with working proxy IPs before running)
    proxy_list = [
        'http://123.456.789.101:8080',
        'http://111.222.333.444:8888',
        # add more proxies here...
    ]
    target_url = "https://www.example.com/data"  # replace with the target site
    html = get_with_proxy(target_url, proxy_list)
    if html:
        print("Data fetched successfully")
        # process the HTML here
    else:
        print("Failed to fetch data")

if __name__ == '__main__':
    scrape_with_anti_anti_spider()
Key techniques:
Random User-Agent rotation
Proxy IP rotation
Exception handling and retries (a retry-with-backoff sketch follows this list)
Coping with anti-scraping measures
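Besides rotating proxies and User-Agents, automatic retries with backoff make a scraper noticeably more robust. A minimal sketch using requests' HTTPAdapter together with urllib3's Retry (the retry counts and status codes below are just example values):

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def build_retrying_session(retries=3, backoff=1.0):
    # Retry on connection errors and on a few "try again later" status codes,
    # with an exponentially growing delay between attempts
    retry = Retry(
        total=retries,
        backoff_factor=backoff,
        status_forcelist=[429, 500, 502, 503, 504],
    )
    adapter = HTTPAdapter(max_retries=retry)
    session = requests.Session()
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session

# session = build_retrying_session()
# response = session.get("https://www.example.com/data", timeout=10)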
Case 7: Scrapy in Practice - Building a Zhihu Hot List Crawler
First, install Scrapy:
pip install scrapy
Then create a Scrapy project:
scrapy startproject zhihu_hot
cd zhihu_hot
Edit items.py:
import scrapy

class ZhihuHotItem(scrapy.Item):
    rank = scrapy.Field()
    title = scrapy.Field()
    hot_score = scrapy.Field()
    url = scrapy.Field()
Create spiders/zhihu_spider.py:
import scrapy
from zhihu_hot.items import ZhihuHotItem

class ZhihuSpider(scrapy.Spider):
    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['https://www.zhihu.com/hot']

    def parse(self, response):
        # Each entry on the hot list sits in a .HotItem container
        # (note: Zhihu may require valid login cookies to serve this page)
        items = response.css('.HotItem')
        for item in items:
            zhihu_item = ZhihuHotItem()
            zhihu_item['rank'] = item.css('.HotItem-rank::text').get()
            zhihu_item['title'] = item.css('.HotItem-title::text').get()
            zhihu_item['hot_score'] = item.css('.HotItem-metrics::text').get()
            zhihu_item['url'] = item.css('.HotItem-content a::attr(href)').get()
            yield zhihu_item
Add the following settings to settings.py:
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 2
# FEED_FORMAT/FEED_URI still work but are deprecated in Scrapy 2.1+ in favor of the FEEDS setting
FEED_FORMAT = 'json'
FEED_URI = 'zhihu_hot.json'
Run the spider:
scrapy crawl zhihu
Key techniques:
Basic usage of the Scrapy framework
Project structure and components
Item definition and data processing (an item-pipeline sketch follows this list)
Selectors (CSS selectors)
Exporting the results
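The spider above only yields raw items; cleaning or persisting them is normally the job of an item pipeline. A minimal sketch of a pipeline that trims whitespace before export (the class name is my own, and it would live in pipelines.py and be enabled through ITEM_PIPELINES in settings.py):

# pipelines.py (example)
class CleanTextPipeline:
    def process_item(self, item, spider):
        # Strip surrounding whitespace from the string fields before they are exported
        for field in ('rank', 'title', 'hot_score'):
            value = item.get(field)
            if isinstance(value, str):
                item[field] = value.strip()
        return item

# settings.py (example)
# ITEM_PIPELINES = {
#     'zhihu_hot.pipelines.CleanTextPipeline': 300,
# }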
Things to Keep in Mind When Developing Crawlers
Respect robots.txt: check the site's robots.txt before crawling to learn its crawling rules (a small sketch follows this list)
Set reasonable request intervals: avoid putting excessive load on the target site
Handle exceptions: network errors, page-structure changes and similar situations need to be handled gracefully
Respect copyright and privacy: do not scrape sensitive or copyright-protected data
Set a proper User-Agent: identify your crawler with a reasonable User-Agent
Think about data storage: choose a storage method (CSV, database, etc.) that matches the data volume
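The first two points above can be partly automated. Here is a minimal sketch that checks robots.txt with the standard library's urllib.robotparser and inserts a fixed delay before each request (the URL, user agent string and delay are placeholders):

import time
import urllib.robotparser
from urllib.parse import urlsplit
import requests

def polite_get(url, user_agent="my-crawler", delay=2):
    # Consult the site's robots.txt before fetching the page
    parts = urlsplit(url)
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(f"{parts.scheme}://{parts.netloc}/robots.txt")
    rp.read()
    if not rp.can_fetch(user_agent, url):
        print(f"Blocked by robots.txt: {url}")
        return None
    time.sleep(delay)  # simple, fixed rate limiting
    return requests.get(url, headers={"User-Agent": user_agent}, timeout=10)

# polite_get("https://www.example.com/page")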
Conclusion
These seven hands-on cases take us from simple static-page scraping to complex dynamic content, and from plain requests to a full framework, covering the core skills of Python crawler development step by step. Learning to write crawlers is a matter of continuous practice and accumulated experience; hopefully these cases give you a solid starting point.
Finally:
I hope you approach your programming studies calmly and steadily, working through your plan step by step. Doing anything really well is never easy, so keep going and believe in yourself!