Spark SQL Commands: A Complete Reference
This article compiles the core Spark SQL commands, covering data manipulation, table management, performance tuning, and other key scenarios, with examples illustrating best practices.
I. Data Manipulation Commands
1. Reading and Writing Data
-- Read a Parquet file
CREATE TABLE users USING parquet OPTIONS (path "hdfs://data/users.parquet")
-- Read a CSV file (with a header row)
CREATE TABLE sales USING csv OPTIONS (
path "hdfs://data/sales.csv",
header "true",
inferSchema "true"
)
-- Write out as JSON files
INSERT OVERWRITE DIRECTORY '/output/json'
USING json
SELECT * FROM orders
-- Read/write via JDBC
CREATE TABLE jdbc_table USING jdbc OPTIONS (
url "jdbc:mysql://dbserver:3306/test",
dbtable "mytable",
user "admin",
password "secret"
)
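For ad-hoc exploration you can also skip the catalog table and register a temporary view directly over the files; a minimal sketch (the view name is illustrative), reusing the CSV options from above:
-- Session-scoped view; nothing is written to the metastore
CREATE TEMPORARY VIEW sales_staging
USING csv
OPTIONS (path "hdfs://data/sales.csv", header "true", inferSchema "true")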
2. Data Transformation Commands
-- Column operations
SELECT
name,
UPPER(email) AS email_upper, -- built-in function
salary * 1.1 AS new_salary -- arithmetic expression
FROM employees
-- Type casting
SELECT
CAST(age AS DOUBLE) AS age_double,
DATE_FORMAT(hire_date, 'yyyy-MM') AS hire_month
FROM staff
-- Conditional expressions
SELECT
product_id,
CASE
WHEN price > 1000 THEN 'premium'
WHEN price > 500 THEN 'standard'
ELSE 'budget'
END AS category
FROM products
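A caveat on casting: with ANSI mode enabled, an invalid CAST fails the query (otherwise it silently returns NULL). Spark 3.2+ adds TRY_CAST, which always returns NULL on a failed conversion; a small sketch against the staff table above:
SELECT
TRY_CAST(age AS DOUBLE) AS age_double -- NULL instead of an error on bad input
FROM staff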
II. Table Management Commands
1. Creating and Altering Tables
-- Create a managed table
-- (note: Spark SQL does not support PRIMARY KEY constraints)
CREATE TABLE employees (
id INT,
name STRING NOT NULL,
salary DECIMAL(10,2),
department_id INT
) USING parquet
-- Create an external table (requires Hive support)
CREATE EXTERNAL TABLE logs (
log_time TIMESTAMP,
message STRING
) LOCATION 'hdfs://data/logs/'
-- Add a partition
ALTER TABLE sales ADD PARTITION (dt='2023-10-01')
-- Change a column's type (supported only for v2 sources such as Delta)
ALTER TABLE employees ALTER COLUMN salary TYPE DOUBLE
-- Add a column
ALTER TABLE users ADD COLUMNS (last_login TIMESTAMP)
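Most of these operations have an inverse; for example, dropping a partition (IF EXISTS makes the statement idempotent):
-- Remove a partition
ALTER TABLE sales DROP IF EXISTS PARTITION (dt='2023-10-01')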
2. Partitioning and Bucketing
-- Create a partitioned table
CREATE TABLE web_logs (
ip STRING,
url STRING
) PARTITIONED BY (dt STRING, country STRING)
-- Create a bucketed table
CREATE TABLE user_behavior (
user_id BIGINT,
action STRING
) CLUSTERED BY (user_id) INTO 32 BUCKETS
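Bucketing pays off when two tables are bucketed identically on the join key: Spark can then join them bucket by bucket without shuffling either side. A sketch, assuming a hypothetical user_profile table also clustered by user_id into 32 buckets:
-- Neither table needs a shuffle: matching buckets are joined directly
SELECT b.user_id, b.action, p.country
FROM user_behavior b
JOIN user_profile p ON b.user_id = p.user_id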
III. Query Commands
1. Basic Queries
-- Filtering and sorting
SELECT * FROM orders
WHERE amount > 1000
ORDER BY order_date DESC
LIMIT 100
-- Grouping and aggregation
SELECT
department,
AVG(salary) AS avg_salary,
COUNT(*) AS employee_count
FROM employees
GROUP BY department
HAVING AVG(salary) > 8000
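For subtotals and grand totals in the same query, Spark SQL also supports ROLLUP, CUBE, and GROUPING SETS; for example:
-- The row with department = NULL is the grand total
SELECT department, AVG(salary) AS avg_salary
FROM employees
GROUP BY ROLLUP(department)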
2. Advanced Queries
-- Window functions
SELECT
product_id,
sale_date,
amount,
SUM(amount) OVER (PARTITION BY product_id ORDER BY sale_date) AS running_total
FROM sales
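Another everyday window pattern is deduplication: keep one row per key with ROW_NUMBER. A sketch, assuming a hypothetical updated_at column that orders the versions of each record:
-- Keep only the most recent row per product
SELECT * FROM (
SELECT *,
ROW_NUMBER() OVER (PARTITION BY product_id ORDER BY updated_at DESC) AS rn
FROM sales
) t
WHERE rn = 1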
-- Common table expression (CTE)
WITH top_customers AS (
SELECT customer_id, SUM(amount) AS total
FROM orders
GROUP BY customer_id
ORDER BY total DESC
LIMIT 100
)
SELECT * FROM top_customers
-- Inspect the execution plan
EXPLAIN EXTENDED
SELECT department, AVG(salary) FROM employees GROUP BY department
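Spark 3.0+ also supports EXPLAIN FORMATTED, which separates the plan outline from per-operator details and is usually easier to read:
EXPLAIN FORMATTED
SELECT department, AVG(salary) FROM employees GROUP BY department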
IV. Performance Tuning Commands
1. Configuration Parameters
-- Set the number of shuffle partitions
SET spark.sql.shuffle.partitions=200;
-- Enable Adaptive Query Execution (AQE)
SET spark.sql.adaptive.enabled=true;
-- Set the broadcast join threshold
SET spark.sql.autoBroadcastJoinThreshold=10485760; -- 10MB
-- Enable the vectorized Parquet reader
SET spark.sql.parquet.enableVectorizedReader=true;
2. Handling Data Skew
-- Skewed-join optimization: in open-source Spark 3.x, AQE handles skewed
-- joins automatically (the SKEW hint seen in some guides is Databricks-only)
SET spark.sql.adaptive.enabled=true;
SET spark.sql.adaptive.skewJoin.enabled=true;
SELECT o.order_id, c.name
FROM orders o JOIN customers c
ON o.customer_id = c.id
-- Salting: spread a skewed key with a random prefix
SELECT
customer_id,
CONCAT(CAST(FLOOR(RAND() * 10) AS STRING), '_', customer_id) AS salted_id
FROM big_table
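Salting only helps if the other side of the join is expanded to cover every possible prefix. A sketch of the full pattern, assuming a hypothetical dim_customers dimension table and the 10 prefixes generated above:
SELECT b.customer_id, d.name
FROM (
SELECT *,
CONCAT(CAST(FLOOR(RAND() * 10) AS STRING), '_', customer_id) AS salted_id
FROM big_table
) b
JOIN (
-- replicate each dimension row once per prefix 0..9
SELECT CONCAT(CAST(p AS STRING), '_', customer_id) AS salted_id, name
FROM dim_customers
LATERAL VIEW explode(sequence(0, 9)) tmp AS p
) d
ON b.salted_id = d.salted_id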
3. Caching and Persistence
-- Cache a table (in memory)
CACHE TABLE hot_products;
-- Persist to disk
CREATE TABLE sales_cache USING parquet
AS SELECT * FROM sales WHERE year=2023;
-- Inspect storage details
DESCRIBE EXTENDED sales_cache;
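Note that CACHE TABLE is eager by default (the table is materialized immediately); add LAZY to defer caching until the first scan:
CACHE LAZY TABLE hot_products;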
V. Metadata Commands
1. Database Operations
-- Create a database
CREATE DATABASE ecommerce
COMMENT 'E-commerce business database'
LOCATION 'hdfs://data/warehouse/ecommerce'
-- Switch the current database
USE ecommerce;
-- List all tables
SHOW TABLES;
2. Inspecting Table Structure
-- Show table details
DESCRIBE FORMATTED customers;
-- Show partition information
SHOW PARTITIONS web_logs;
-- Show the CREATE TABLE statement
SHOW CREATE TABLE orders;
3. Function Management
-- List built-in functions
SHOW FUNCTIONS LIKE 'substring*';
-- Register a UDF
CREATE TEMPORARY FUNCTION area AS 'com.example.GeoUDF';
-- Show function details
DESCRIBE FUNCTION EXTENDED area;
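Once registered, the UDF is called like any built-in function; a hypothetical usage (the table and column names are illustrative):
SELECT name, area(boundary) FROM regions;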
VI. Utility Commands
1. Importing and Exporting Data
-- Import from a Hive catalog
CREATE TABLE hive_import AS
SELECT * FROM hive_catalog.default.sales
-- Export to CSV
INSERT OVERWRITE LOCAL DIRECTORY '/tmp/export'
USING csv
OPTIONS (header 'true')
SELECT * FROM results
2. Transaction Control (Delta Lake)
-- Create a Delta table
CREATE TABLE events USING delta
LOCATION 'hdfs://data/delta/events'
-- Time-travel query
SELECT * FROM events VERSION AS OF 12
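Delta also supports time travel by timestamp, and its SQL extensions add DESCRIBE HISTORY for listing the available versions (both require the Delta Lake extensions to be installed):
SELECT * FROM events TIMESTAMP AS OF '2023-10-01';
DESCRIBE HISTORY events;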
3. Maintenance Commands
-- Repair partition metadata (Hive-compatible)
MSCK REPAIR TABLE partitioned_table
-- Refresh cached table metadata
REFRESH TABLE external_data
-- Drop a table from the cache
UNCACHE TABLE temp_results
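Collecting statistics helps the cost-based optimizer choose better join strategies; ANALYZE TABLE computes table- and column-level stats:
-- Table-level statistics (row count, size)
ANALYZE TABLE orders COMPUTE STATISTICS;
-- Column-level statistics for join/filter columns
ANALYZE TABLE orders COMPUTE STATISTICS FOR COLUMNS customer_id, amount;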
VII. Best-Practice Examples
1. An Efficient ETL Pipeline
-- Step 1: create a partitioned table (a data source table, so the
-- dynamic overwrite setting in step 2 applies)
CREATE TABLE user_events (
event_time TIMESTAMP,
event_type STRING,
user_id BIGINT
) USING parquet PARTITIONED BY (event_date DATE);
-- Step 2: insert with dynamic partition overwrite
SET spark.sql.sources.partitionOverwriteMode=dynamic;
INSERT OVERWRITE TABLE user_events
PARTITION (event_date)
SELECT
event_time,
event_type,
user_id,
CAST(event_time AS DATE) AS event_date
FROM raw_events;
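A quick sanity check after the load verifies which partitions were written and how many rows each received:
SHOW PARTITIONS user_events;
SELECT event_date, COUNT(*) AS rows_loaded
FROM user_events
GROUP BY event_date;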
2. A Data Analysis Workflow
-- Create a temporary view
CREATE OR REPLACE TEMP VIEW active_users AS
SELECT user_id, COUNT(*) AS events
FROM user_events
WHERE event_date > current_date() - 30
GROUP BY user_id
HAVING COUNT(*) > 10;
-- A more complex analytical query
WITH user_segments AS (
SELECT
u.user_id,
CASE
WHEN u.events > 100 THEN 'power'
WHEN u.events > 50 THEN 'active'
ELSE 'casual'
END AS segment
FROM active_users u
)
SELECT
segment,
AVG(p.order_amount) AS avg_order_value
FROM user_segments s
JOIN purchases p ON s.user_id = p.user_id
GROUP BY segment;
VIII. Notes and Tips
- Execution mode: local mode is chosen when the session is launched (e.g. spark-submit --master local[*]), not through a SQL SET command.
- Small-file merging: the spark.sql.mergeSmallFile.* settings are vendor-fork extensions; in open-source Spark 3.x the equivalent is AQE partition coalescing:
SET spark.sql.adaptive.coalescePartitions.enabled=true;
SET spark.sql.adaptive.advisoryPartitionSizeInBytes=134217728; -- 128MB
- Resource queue: spark.yarn.queue=production only takes effect at submission time (e.g. spark-submit --queue production), not as a runtime SET.
- Plan hints:
SELECT /*+ COALESCE(3) */ * FROM large_table;
SELECT /*+ REPARTITION(100) */ * FROM skewed_data;
- Error handling:
-- Skip corrupt files instead of failing the job
SET spark.sql.files.ignoreCorruptFiles=true;
Tip: on Spark 3.x, Delta Lake is the recommended way to get ACID transaction support:
CREATE TABLE delta_table USING DELTA LOCATION '/data/delta/table'
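ACID support unlocks upserts via MERGE INTO; a minimal sketch, assuming a hypothetical updates staging table whose schema matches delta_table:
MERGE INTO delta_table t
USING updates u
ON t.id = u.id
WHEN MATCHED THEN UPDATE SET *
WHEN NOT MATCHED THEN INSERT *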
Mastering these commands covers the vast majority of day-to-day Spark SQL development scenarios; combined with performance analysis in the Spark UI, they let you build efficient data processing pipelines.