Integrating Apache Airflow with Data Warehouses: Hands-On with Snowflake and BigQuery
Overview
In modern data engineering architectures, Apache Airflow has become the de facto standard for orchestrating data pipelines. Combined with cloud data warehouses such as Snowflake and BigQuery, it enables powerful, scalable data processing solutions. This article walks through how to integrate Apache Airflow with these two mainstream data warehouses and provides hands-on examples.
Core Integration Architecture
Airflow and Data Warehouse Integration Patterns
Technology Stack Comparison
| Feature | Snowflake | BigQuery |
|---|---|---|
| Compute/storage separation | ✅ Fully separated | ✅ Fully separated |
| Automatic scaling | ✅ Supported | ✅ Supported |
| Pricing model | Compute + storage | Queries + storage |
| Geographic distribution | Multi-region support | Multi-region support |
| Data sharing | ✅ Secure Data Sharing | ✅ Analytics Hub |
| Machine learning integration | ✅ Snowpark ML | ✅ BigQuery ML |
Hands-On Snowflake Integration
Environment Setup
First, install the Snowflake provider package:
pip install apache-airflow-providers-snowflake
Connection Configuration
Configure the Snowflake connection in the Airflow Web UI:
- Connection ID: snowflake_default
- Connection Type: Snowflake
- Host: {account}.snowflakecomputing.com
- Schema: {schema} (the database is supplied in Extra)
- Login: {username}
- Password: {password}
- Extra:
{
    "warehouse": "COMPUTE_WH",
    "database": "PROD_DB",
    "role": "SYSADMIN"
}
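Before wiring up a full pipeline, it can help to exercise the connection from a short Python callable. The snippet below is a minimal sketch: the connection ID matches the configuration above, while the function name is purely illustrative.

from airflow.providers.snowflake.hooks.snowflake import SnowflakeHook

def verify_snowflake_connection():
    # Open a connection using the Airflow connection configured above
    hook = SnowflakeHook(snowflake_conn_id='snowflake_default')
    # Run a trivial query; this raises if the account URL or credentials are wrong
    version = hook.get_first('SELECT CURRENT_VERSION()')
    print(f"Connected to Snowflake, server version: {version[0]}")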
Basic Operations
1. Running SQL Queries
from airflow import DAG
from airflow.providers.snowflake.operators.snowflake import SnowflakeOperator
from datetime import datetime

with DAG(
    'snowflake_etl_pipeline',
    start_date=datetime(2024, 1, 1),
    schedule_interval='@daily'
) as dag:

    create_table = SnowflakeOperator(
        task_id='create_table',
        sql='''
            CREATE TABLE IF NOT EXISTS user_activity (
                user_id INT,
                activity_date DATE,
                page_views INT,
                duration NUMBER  -- Snowflake has no INTERVAL column type; store the duration as a number (e.g. seconds)
            )
        ''',
        snowflake_conn_id='snowflake_default'
    )

    load_data = SnowflakeOperator(
        task_id='load_data',
        sql='''
            INSERT INTO user_activity
            SELECT
                user_id,
                CURRENT_DATE(),
                COUNT(*) as page_views,
                SUM(duration) as total_duration
            FROM raw_events
            WHERE event_date = CURRENT_DATE()
            GROUP BY user_id
        ''',
        snowflake_conn_id='snowflake_default'
    )

    create_table >> load_data
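The SQL above keys everything off CURRENT_DATE(), so retries and backfills would reprocess the wrong day. A common refinement is to template the logical date instead; the sketch below assumes the same raw_events table and simply swaps in Airflow's {{ ds }} macro, deleting any existing rows for that day first so the task stays idempotent.

    load_data_idempotent = SnowflakeOperator(
        task_id='load_data_idempotent',
        sql=[
            # Remove any rows from a previous run of the same logical date
            "DELETE FROM user_activity WHERE activity_date = '{{ ds }}'",
            # Recompute the aggregates for that date only
            """INSERT INTO user_activity
               SELECT user_id, '{{ ds }}', COUNT(*), SUM(duration)
               FROM raw_events
               WHERE event_date = '{{ ds }}'
               GROUP BY user_id""",
        ],
        snowflake_conn_id='snowflake_default'
    )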
2. Data Quality Checks
from airflow.providers.snowflake.operators.snowflake import SnowflakeCheckOperator

data_quality_check = SnowflakeCheckOperator(
    task_id='data_quality_check',
    sql='''
        SELECT
            CASE
                WHEN COUNT(*) = 0 THEN 0
                WHEN COUNT(*) > 100000 THEN 0
                ELSE 1
            END as is_valid
        FROM user_activity
        WHERE activity_date = CURRENT_DATE()
    ''',
    snowflake_conn_id='snowflake_default'
)
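SnowflakeCheckOperator fails the task when the first row of the result contains a falsy value (0, NULL, empty string), which is why the query above collapses its rules into a single 0/1 flag. If you would rather compare a metric against an expected value, the provider also ships SnowflakeValueCheckOperator; the threshold and tolerance below are illustrative.

from airflow.providers.snowflake.operators.snowflake import SnowflakeValueCheckOperator

row_count_check = SnowflakeValueCheckOperator(
    task_id='row_count_check',
    sql="SELECT COUNT(*) FROM user_activity WHERE activity_date = CURRENT_DATE()",
    pass_value=50000,   # expected daily volume (illustrative)
    tolerance=0.5,      # accept +/- 50% around pass_value
    snowflake_conn_id='snowflake_default'
)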
3. Advanced Processing with Snowpark
from airflow.providers.snowflake.operators.snowflake import SnowflakeOperator

advanced_processing = SnowflakeOperator(
    task_id='advanced_processing',
    sql='''
CREATE OR REPLACE PROCEDURE process_user_behavior()
RETURNS STRING
LANGUAGE PYTHON
RUNTIME_VERSION = '3.10'
PACKAGES = ('snowflake-snowpark-python')
HANDLER = 'process_data'
AS
$$
def process_data(session):
    # Complex transformations with the Snowpark DataFrame API
    df = session.table('user_activity')
    result = df.filter(df['page_views'] > 10).group_by('user_id').count()
    result.write.mode('overwrite').save_as_table('active_users')
    return "Processing completed"
$$
    ''',
    snowflake_conn_id='snowflake_default'
)
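The task above only creates the stored procedure; it still has to be invoked. Continuing the same example, a follow-up task (task name illustrative) would run it and could be chained after advanced_processing:

call_procedure = SnowflakeOperator(
    task_id='call_process_user_behavior',
    sql='CALL process_user_behavior()',
    snowflake_conn_id='snowflake_default'
)

advanced_processing >> call_procedure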
Hands-On BigQuery Integration
Environment Setup
Install the Google provider package (which includes BigQuery support):
pip install apache-airflow-providers-google
Connection Configuration
Configure the BigQuery (Google Cloud) connection:
- Connection ID: google_cloud_default
- Connection Type: Google Cloud
- Project ID: your-project-id
- Keyfile JSON: the service account key JSON content
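As with Snowflake, the connection can be sanity-checked from a small Python callable before building pipelines on top of it. This is a minimal sketch assuming the project ID above; get_client builds a google-cloud-bigquery client from the Airflow connection.

from airflow.providers.google.cloud.hooks.bigquery import BigQueryHook

def verify_bigquery_connection():
    # Build a BigQuery client from the Airflow connection configured above
    hook = BigQueryHook(gcp_conn_id='google_cloud_default', use_legacy_sql=False)
    client = hook.get_client(project_id='your-project-id')
    # Listing a few datasets confirms the key and project are valid
    for dataset in client.list_datasets(max_results=5):
        print(dataset.dataset_id)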
Core Operations
1. Running Query Jobs
from airflow import DAG
from airflow.providers.google.cloud.operators.bigquery import BigQueryInsertJobOperator
from datetime import datetime

with DAG(
    'bigquery_analytics_pipeline',
    start_date=datetime(2024, 1, 1),
    schedule_interval='@daily'
) as dag:

    daily_aggregation = BigQueryInsertJobOperator(
        task_id='daily_aggregation',
        configuration={
            'query': {
                'query': '''
                    CREATE OR REPLACE TABLE `project.dataset.daily_metrics` AS
                    SELECT
                        DATE(timestamp) as event_date,
                        user_id,
                        COUNT(*) as event_count,
                        SUM(value) as total_value,
                        APPROX_QUANTILES(value, 100)[OFFSET(50)] as median_value
                    FROM `project.dataset.raw_events`
                    WHERE DATE(timestamp) = CURRENT_DATE()
                    GROUP BY event_date, user_id
                ''',
                'useLegacySql': False
                # writeDisposition is omitted: it cannot be combined with a DDL statement,
                # and CREATE OR REPLACE TABLE already replaces the table.
            }
        },
        project_id='your-project-id',
        location='US'
    )
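Here too the query is tied to CURRENT_DATE(). One way to make backfills recompute the correct day, sketched below against the same table names, is a multi-statement query keyed on Airflow's {{ ds }} macro that deletes and reinserts only that day's rows.

    daily_aggregation_idempotent = BigQueryInsertJobOperator(
        task_id='daily_aggregation_idempotent',
        configuration={
            'query': {
                'query': '''
                    DELETE FROM `project.dataset.daily_metrics` WHERE event_date = DATE('{{ ds }}');
                    INSERT INTO `project.dataset.daily_metrics`
                    SELECT
                        DATE(timestamp) as event_date,
                        user_id,
                        COUNT(*) as event_count,
                        SUM(value) as total_value,
                        APPROX_QUANTILES(value, 100)[OFFSET(50)] as median_value
                    FROM `project.dataset.raw_events`
                    WHERE DATE(timestamp) = DATE('{{ ds }}')
                    GROUP BY event_date, user_id
                ''',
                'useLegacySql': False
            }
        },
        location='US'
    )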
2. Loading Data
from airflow.providers.google.cloud.transfers.gcs_to_bigquery import GCSToBigQueryOperator

load_from_gcs = GCSToBigQueryOperator(
    task_id='load_from_gcs',
    bucket='your-bucket-name',
    source_objects=['data/*.parquet'],
    destination_project_dataset_table='project.dataset.staging_table',
    source_format='PARQUET',
    write_disposition='WRITE_TRUNCATE',
    create_disposition='CREATE_IF_NEEDED',
    autodetect=True
)
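If the upstream export lands at unpredictable times, a sensor can gate the load. A minimal sketch, assuming the files arrive under the data/ prefix of the same bucket:

from airflow.providers.google.cloud.sensors.gcs import GCSObjectsWithPrefixExistenceSensor

wait_for_files = GCSObjectsWithPrefixExistenceSensor(
    task_id='wait_for_files',
    bucket='your-bucket-name',
    prefix='data/',       # matches the source_objects wildcard above
    poke_interval=60,     # check once per minute
    timeout=60 * 60       # give up after an hour
)

wait_for_files >> load_from_gcs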
3. Machine Learning Integration
bq_ml_training = BigQueryInsertJobOperator(
    task_id='bq_ml_training',
    configuration={
        'query': {
            'query': '''
                CREATE OR REPLACE MODEL `project.dataset.customer_churn_model`
                OPTIONS(
                    model_type='logistic_reg',
                    auto_class_weights=True,
                    input_label_cols=['churned']
                ) AS
                SELECT
                    tenure,
                    monthly_charges,
                    total_charges,
                    contract,
                    payment_method,
                    CASE WHEN churn_date IS NOT NULL THEN 1 ELSE 0 END as churned
                FROM `project.dataset.customers`
                WHERE partition_date < DATE_SUB(CURRENT_DATE(), INTERVAL 30 DAY)
            ''',
            'useLegacySql': False
        }
    }
)
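Once the model is trained, predictions can be materialized by a follow-up task. The sketch below reuses the model above; the output table name is illustrative.

bq_ml_predict = BigQueryInsertJobOperator(
    task_id='bq_ml_predict',
    configuration={
        'query': {
            'query': '''
                CREATE OR REPLACE TABLE `project.dataset.churn_predictions` AS
                SELECT *
                FROM ML.PREDICT(
                    MODEL `project.dataset.customer_churn_model`,
                    (SELECT tenure, monthly_charges, total_charges, contract, payment_method
                     FROM `project.dataset.customers`
                     WHERE churn_date IS NULL)
                )
            ''',
            'useLegacySql': False
        }
    }
)

bq_ml_training >> bq_ml_predict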
Advanced Integration Patterns
Cross-Warehouse Data Synchronization
from airflow import DAG
from airflow.providers.snowflake.operators.snowflake import SnowflakeOperator
from airflow.providers.google.cloud.transfers.bigquery_to_gcs import BigQueryToGCSOperator
from datetime import datetime

with DAG(
    'cross_warehouse_sync',
    start_date=datetime(2024, 1, 1),
    schedule_interval='@weekly'
) as dag:

    # Export from BigQuery to GCS
    export_to_gcs = BigQueryToGCSOperator(
        task_id='export_to_gcs',
        source_project_dataset_table='project.dataset.source_table',
        destination_cloud_storage_uris=['gs://your-bucket/data/export_*.parquet'],
        export_format='PARQUET'
    )

    # Load from GCS into Snowflake via an external stage that points at the bucket
    load_to_snowflake = SnowflakeOperator(
        task_id='load_to_snowflake',
        sql='''
            COPY INTO analytics.target_table
            FROM @my_gcs_stage/data/
            FILE_FORMAT = (TYPE = PARQUET)
            MATCH_BY_COLUMN_NAME = CASE_INSENSITIVE
            PATTERN = '.*export_.*parquet'
        ''',
        snowflake_conn_id='snowflake_default'
    )

    export_to_gcs >> load_to_snowflake
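For the COPY INTO to work, Snowflake needs an external stage over the GCS bucket. A one-time setup along these lines (integration and stage names illustrative, matching @my_gcs_stage above) is usually run by an administrator rather than by the DAG, but it can also be expressed as a task:

create_gcs_stage = SnowflakeOperator(
    task_id='create_gcs_stage',
    sql=[
        # Storage integration that grants Snowflake access to the bucket
        """CREATE STORAGE INTEGRATION IF NOT EXISTS gcs_int
               TYPE = EXTERNAL_STAGE
               STORAGE_PROVIDER = 'GCS'
               ENABLED = TRUE
               STORAGE_ALLOWED_LOCATIONS = ('gcs://your-bucket/data/')""",
        # External stage referenced by the COPY INTO statement
        """CREATE STAGE IF NOT EXISTS my_gcs_stage
               URL = 'gcs://your-bucket/data/'
               STORAGE_INTEGRATION = gcs_int
               FILE_FORMAT = (TYPE = PARQUET)""",
    ],
    snowflake_conn_id='snowflake_default'
)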
Data Quality Monitoring Framework
from airflow import DAG
from airflow.providers.snowflake.operators.snowflake import SnowflakeCheckOperator
from airflow.providers.google.cloud.operators.bigquery import BigQueryCheckOperator
from airflow.operators.python import PythonOperator
from datetime import datetime

def validate_cross_warehouse_consistency(**context):
    # Cross-warehouse consistency-check logic goes here
    pass

with DAG(
    'data_quality_monitoring',
    start_date=datetime(2024, 1, 1),
    schedule_interval='@daily'
) as dag:

    # Snowflake data quality check
    snowflake_quality_check = SnowflakeCheckOperator(
        task_id='snowflake_quality_check',
        sql='''
            SELECT
                COUNT(*) as total_records,
                COUNT(DISTINCT user_id) as distinct_users,
                MIN(event_date) as min_date,
                MAX(event_date) as max_date
            FROM snowflake_db.schema.user_activity
            WHERE event_date = CURRENT_DATE() - 1
        ''',
        snowflake_conn_id='snowflake_default'
    )

    # BigQuery data quality check
    bigquery_quality_check = BigQueryCheckOperator(
        task_id='bigquery_quality_check',
        sql='''
            SELECT
                COUNT(*) as total_records,
                COUNT(DISTINCT user_id) as distinct_users
            FROM `project.dataset.user_activity`
            WHERE event_date = DATE_SUB(CURRENT_DATE(), INTERVAL 1 DAY)
        ''',
        gcp_conn_id='google_cloud_default',
        use_legacy_sql=False
    )

    # Cross-warehouse consistency validation (provide_context is not needed in Airflow 2;
    # the context is passed to the callable automatically)
    cross_validation = PythonOperator(
        task_id='cross_validation',
        python_callable=validate_cross_warehouse_consistency
    )

    [snowflake_quality_check, bigquery_quality_check] >> cross_validation
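The validation callable is left as a stub above. One way to flesh it out, sketched under the assumption that both warehouses hold a comparable user_activity table, is to pull yesterday's row counts through the provider hooks and fail the task when they diverge beyond a tolerance:

from airflow.providers.snowflake.hooks.snowflake import SnowflakeHook
from airflow.providers.google.cloud.hooks.bigquery import BigQueryHook

def validate_cross_warehouse_consistency(**context):
    # Yesterday's row count in Snowflake
    sf_hook = SnowflakeHook(snowflake_conn_id='snowflake_default')
    sf_count = sf_hook.get_first(
        "SELECT COUNT(*) FROM snowflake_db.schema.user_activity "
        "WHERE event_date = CURRENT_DATE() - 1"
    )[0]

    # Yesterday's row count in BigQuery
    bq_hook = BigQueryHook(gcp_conn_id='google_cloud_default', use_legacy_sql=False)
    bq_count = bq_hook.get_first(
        "SELECT COUNT(*) FROM `project.dataset.user_activity` "
        "WHERE event_date = DATE_SUB(CURRENT_DATE(), INTERVAL 1 DAY)"
    )[0]

    # Fail the task if the two warehouses disagree by more than 1%
    if max(sf_count, bq_count) > 0 and abs(sf_count - bq_count) / max(sf_count, bq_count) > 0.01:
        raise ValueError(f"Row counts diverge: Snowflake={sf_count}, BigQuery={bq_count}")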
Performance Optimization Strategies
Query Optimization Tips
Concrete Optimization Examples
Snowflake Optimization
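This part of the article is only outlined. As an illustrative example of the kind of tuning it refers to, two common first steps are a short warehouse auto-suspend and a clustering key on the main filter column; the statements below reuse the tables and warehouse from the earlier examples and are a sketch, not a prescription.

warehouse_tuning = SnowflakeOperator(
    task_id='warehouse_tuning',
    sql=[
        # Suspend the warehouse quickly when idle to avoid paying for unused compute
        "ALTER WAREHOUSE COMPUTE_WH SET AUTO_SUSPEND = 60",
        # Cluster the fact table on its most common filter column to prune micro-partitions
        "ALTER TABLE user_activity CLUSTER BY (activity_date)",
    ],
    snowflake_conn_id='snowflake_default'
)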



