As a mainstream language for data science and automation, Python plays a central role in web crawler development. This article walks through the Python crawling tech stack, implementation techniques, and best practices.
Overview of Web Crawling
A web crawler is a program that automatically fetches information from the internet according to specified rules. It can browse the web, download content, and extract valuable data without manual intervention, and is widely used in search engines, data analysis, and business intelligence.
Core Libraries and Tech Stack
1. Basic request libraries
- Requests: a simple, easy-to-use HTTP library, suitable for most static-page scraping
- urllib: the HTTP toolkit in Python's standard library
2. Parsing libraries
- BeautifulSoup: an HTML/XML parsing library, well suited to beginners
- lxml: a high-performance parser with XPath support (see the sketch after this list)
- PyQuery: a jQuery-style parsing library
3. Advanced frameworks
- Scrapy: a complete crawling framework, suited to large projects
- Selenium: a browser automation tool for handling JavaScript-rendered pages
- Playwright: a newer browser automation library with multi-browser support
4. Asynchronous processing
- aiohttp: an asynchronous HTTP client/server framework
- asyncio: Python's asynchronous I/O framework
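As a quick illustration of the XPath support mentioned for lxml above, here is a minimal sketch; the HTML snippet and the XPath expressions are made up purely for demonstration.

```python
from lxml import html

# A small, made-up HTML snippet for demonstration
page = html.fromstring("""
<html><body>
  <h1>Demo</h1>
  <ul>
    <li><a href="/a">Link A</a></li>
    <li><a href="/b">Link B</a></li>
  </ul>
</body></html>
""")

# XPath expressions select nodes directly
title = page.xpath('//h1/text()')   # ['Demo']
hrefs = page.xpath('//a/@href')     # ['/a', '/b']
print(title, hrefs)
```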
Hands-on Examples
Example 1: Basic static page scraping
```python
import requests
from bs4 import BeautifulSoup

def scrape_basic_website(url):
    """Scrape basic information from a static website."""
    try:
        # Set request headers to mimic a browser
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        # Send a GET request
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # Raise an error if the request failed
        # Parse the HTML content
        soup = BeautifulSoup(response.content, 'lxml')
        # Extract data
        data = {
            'title': soup.title.string if soup.title else '',
            'headings': [h.get_text().strip() for h in soup.find_all(['h1', 'h2', 'h3'])],
            'links': [a.get('href') for a in soup.find_all('a') if a.get('href')],
            'text_content': soup.get_text()[:500] + '...'  # Limit text length
        }
        return data
    except requests.exceptions.RequestException as e:
        print(f"Request error: {e}")
        return None

# Usage example
if __name__ == "__main__":
    result = scrape_basic_website('https://httpbin.org/html')
    if result:
        print("Page title:", result['title'])
        print("First 5 links:", result['links'][:5])
```
Example 2: Handling dynamic content (using Selenium)
```python
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options

def scrape_dynamic_content(url):
    """Scrape dynamic content that requires JavaScript rendering."""
    # Configure browser options
    chrome_options = Options()
    chrome_options.add_argument('--headless')  # Headless mode
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')

    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(url)
        # Wait for a specific element to finish loading
        wait = WebDriverWait(driver, 10)
        wait.until(
            EC.presence_of_element_located((By.TAG_NAME, "main"))
        )
        # Extract the dynamically generated content
        dynamic_content = driver.find_element(By.TAG_NAME, "main").text
        # Take a screenshot (useful for debugging)
        driver.save_screenshot('page_screenshot.png')
        return dynamic_content[:1000]  # Return part of the content
    finally:
        driver.quit()

# Usage example
# content = scrape_dynamic_content('https://example.com')
# print(content)
```
Example 3: Using the Scrapy framework
Create a Scrapy project:
```bash
scrapy startproject myproject
cd myproject
```
Define the spider (spiders/example_spider.py):
```python
import scrapy
from myproject.items import WebsiteItem

class ExampleSpider(scrapy.Spider):
    name = "example"
    allowed_domains = ["example.com"]
    start_urls = ["https://example.com"]

    custom_settings = {
        'CONCURRENT_REQUESTS': 1,
        'DOWNLOAD_DELAY': 2,  # Be polite: throttle requests
        'USER_AGENT': 'MyWebCrawler/1.0 (+https://mywebsite.com)'
    }

    def parse(self, response):
        # Extract data
        item = WebsiteItem()
        item['url'] = response.url
        item['title'] = response.css('title::text').get()
        item['content'] = response.css('p::text').getall()
        yield item

        # Follow links (optional)
        for next_page in response.css('a::attr(href)').getall():
            if next_page:
                yield response.follow(next_page, callback=self.parse)
```
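The spider imports WebsiteItem from myproject.items, which is not shown above; a minimal sketch of a matching definition in myproject/items.py, consistent with the fields assigned in parse(), could look like this.

```python
# myproject/items.py
import scrapy

class WebsiteItem(scrapy.Item):
    # Fields must match what the spider assigns in parse()
    url = scrapy.Field()
    title = scrapy.Field()
    content = scrapy.Field()
```

The spider can then be run with `scrapy crawl example -o output.json` to export the collected items.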
Advanced Techniques and Best Practices
1. Dealing with anti-scraping measures
```python
import random
import time
import requests

def advanced_scraper(url):
    """More robust scraping in the face of anti-bot measures."""
    headers_list = [
        {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'},
        {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'},
        {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36'}
    ]

    # Proxies (optional)
    proxies = {
        'http': 'http://10.10.1.10:3128',
        'https': 'http://10.10.1.10:1080',
    }

    try:
        # Pick a random set of request headers
        headers = random.choice(headers_list)
        response = requests.get(
            url,
            headers=headers,
            timeout=15,
            # proxies=proxies  # Uncomment to route requests through a proxy
        )
        # Random delay to avoid sending requests too frequently
        time.sleep(random.uniform(1, 3))
        return response
    except Exception as e:
        print(f"Advanced scraping error: {e}")
        return None
```
2. Data storage
```python
import json
import csv
import sqlite3

def save_data(data, format='json', filename='data'):
    """Save data in one of several formats."""
    if format == 'json':
        with open(f'{filename}.json', 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
    elif format == 'csv':
        if data and isinstance(data, list) and len(data) > 0:
            keys = data[0].keys()
            with open(f'{filename}.csv', 'w', newline='', encoding='utf-8') as f:
                writer = csv.DictWriter(f, fieldnames=keys)
                writer.writeheader()
                writer.writerows(data)
    elif format == 'sqlite':
        conn = sqlite3.connect(f'{filename}.db')
        c = conn.cursor()
        # Create the table (adjust to your actual data structure)
        c.execute('''CREATE TABLE IF NOT EXISTS scraped_data
                     (id INTEGER PRIMARY KEY, title TEXT, content TEXT)''')
        # Insert the data (adjust to your actual data structure)
        for item in data:
            c.execute("INSERT INTO scraped_data (title, content) VALUES (?, ?)",
                      (item.get('title'), str(item.get('content'))))
        conn.commit()
        conn.close()
```
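A short usage sketch of save_data with a couple of made-up records:

```python
# Example usage with made-up records
records = [
    {'title': 'Page A', 'content': 'First paragraph...'},
    {'title': 'Page B', 'content': 'Another paragraph...'},
]
save_data(records, format='json', filename='pages')    # writes pages.json
save_data(records, format='csv', filename='pages')     # writes pages.csv
save_data(records, format='sqlite', filename='pages')  # writes pages.db
```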
3. Asynchronous crawling for higher throughput
```python
import aiohttp
import asyncio

async def async_scraper(urls):
    """Asynchronous crawler: fetch many URLs concurrently."""
    async with aiohttp.ClientSession() as session:
        tasks = []
        for url in urls:
            task = asyncio.ensure_future(fetch(session, url))
            tasks.append(task)
        results = await asyncio.gather(*tasks)
        return results

async def fetch(session, url):
    """Fetch a single URL asynchronously."""
    try:
        async with session.get(url, timeout=aiohttp.ClientTimeout(total=10)) as response:
            return await response.text()
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return None

# Usage example
# urls = ['https://example.com/page1', 'https://example.com/page2']
# results = asyncio.run(async_scraper(urls))
```