insert into/overwrite table xxx partition(dt=xxx) select xxx from tmp;语句执行成功,但插入失败

本文针对Hive SQL执行过程中出现的内存溢出等问题进行了详细排查,并提供了有效的解决方案,包括更改执行引擎、调整YARN配置及优化SQL语句等。

报错信息

在hive 中执行insert into/overwrite table dwd_xxx partition(dt=xxx) select xxx from tmp语句,发现插入失败,其中dwd_xxx存储格式为parquet,tmp存储格式为textfile。

问题排查思路

  1. 查日志,一般查两个日志第一个为hive.log => 缺省情况下 /tmp/root/hive.log (hive-site.conf),第二个为MR的日志 => 启动historyserver、日志聚合 + SQL运行在集群模式。

  2. 排查jobhistory日志未发现报错信息,排查hive.log未发现报错信息。在这里插入图片描述

  3. 关闭hive的local模式,重新运行,查看jobhistory报错信息发现报错java.lang.reflect.InvocationTargetException/java.lang.NullPointException错误。在这里插入图片描述

  4. 上述报错信息无明显定位,实际作用不大。将hive的执行引擎由mr替换为tez,set hive.execution.engine=tez;执行语句时报错Exception in thread main java.lang.NoClassDefFoundError: org/apache/tez/dag/api/TezConfiguration,在~/.bashrc中加入

for jar in `ls $TEZ_HOME |grep jar`; do
    export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:$TEZ_HOME/$jar
done
for jar in `ls $TEZ_HOME/lib`; do
    export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:$TEZ_HOME/lib/$jar
done

然后source ~/.bashrc;

5.重新执行insert into … select…语句报错xxx is running beyond virtual memory limits. Current usage:201.9MB of 1GB physical memory used; 2.6GB of 2.1 GB virtual memory used. Killing container.在这里插入图片描述
6.经查yarn虚拟内存不足,cd $HADOOP_HOME/conf/hadoop,修改yarn-site.xml,加个配置

<property>
	<name>yarn.nodemanager.vmem-pmem-ratio</name>
	<value>5</value>
</property>

7.再次运行即可,上面扩展5倍后有可能还是执行不成功,这时候试试改写sql语句,先做select,将结果插入临时表,再将临时表的内容插入dwd。
改写前

insert overwrite table dwd_xxx
partition(dt='$do_date')
select * from ods_xxx where ...

改写后

create table tmp.tmp_xxx as
( select *from ods_xxx where...);

insert overwrite table dwd_xxx
partition(dt='$do_date')
select * from tmp_xxx;

相当于中间数据落盘,减少内存消耗。

-- ******************************************************************** -- -- Author: 蔡义 -- CreateTime: 2025-09-11 -- Comment: 扩展用户常听歌曲序列的side_info信息表 - 性能优化版 -- 优化策略: 热点用户识别+分层处理+动态盐值 -- ******************************************************************** -- -- 设置优化参数 SET hive.map.aggr=true; SET hive.map.aggr.hash.percentmemory=0.5; SET hive.groupby.skewindata=true; SET hive.optimize.skewjoin=true; SET hive.skewjoin.key=100000; SET hive.exec.reducers.bytes.per.reducer=256000000; SET hive.exec.reducers.max=999; SET hive.auto.convert.join=true; SET hive.mapjoin.smalltable.filesize=100000000; SET mapreduce.map.memory.mb=4096; SET mapreduce.reduce.memory.mb=8192; SET hive.exec.parallel=true; SET hive.exec.parallel.thread.number=8; -- 建表语句(保持不变) CREATE TABLE IF NOT EXISTS music_most.user_song_sequence_with_side_info ( user_id STRING COMMENT '用户ID', user_often_listen_song_id_list_7d ARRAY COMMENT '用户7天常听的歌曲ID列表', user_often_listen_song_name_list_7d ARRAY COMMENT '用户7天常听的歌曲名称列表', user_often_listen_artist_id_list_7d ARRAY COMMENT '用户7天常听歌曲的艺人ID列表', user_often_listen_artist_name_list_7d ARRAY COMMENT '用户7天常听歌曲的艺人名称列表', user_often_listen_genre_list_7d ARRAY COMMENT '用户7天常听歌曲的曲风列表', user_often_listen_language_list_7d ARRAY COMMENT '用户7天常听歌曲的语种列表', user_often_listen_play_cnt_list_7d ARRAY COMMENT '用户7天常听歌曲的播放次数列表', user_often_listen_duration_list_7d ARRAY COMMENT '用户7天常听歌曲的时长列表(毫秒)', user_often_listen_song_id_list_30d ARRAY COMMENT '用户30天常听的歌曲ID列表', user_often_listen_song_name_list_30d ARRAY COMMENT '用户30天常听的歌曲名称列表', user_often_listen_artist_id_list_30d ARRAY COMMENT '用户30天常听歌曲的艺人ID列表', user_often_listen_artist_name_list_30d ARRAY COMMENT '用户30天常听歌曲的艺人名称列表', user_often_listen_genre_list_30d ARRAY COMMENT '用户30天常听歌曲的曲风列表', user_often_listen_language_list_30d ARRAY COMMENT '用户30天常听歌曲的语种列表', user_often_listen_play_cnt_list_30d ARRAY COMMENT '用户30天常听歌曲的播放次数列表', user_often_listen_duration_list_30d ARRAY COMMENT '用户30天常听歌曲的时长列表(毫秒)', user_often_listen_song_id_list_90d ARRAY COMMENT '用户90天常听的歌曲ID列表', user_often_listen_song_name_list_90d ARRAY COMMENT '用户90天常听的歌曲名称列表', user_often_listen_artist_id_list_90d ARRAY COMMENT '用户90天常听歌曲的艺人ID列表', user_often_listen_artist_name_list_90d ARRAY COMMENT '用户90天常听歌曲的艺人名称列表', user_often_listen_genre_list_90d ARRAY COMMENT '用户90天常听歌曲的曲风列表', user_often_listen_language_list_90d ARRAY COMMENT '用户90天常听歌曲的语种列表', user_often_listen_play_cnt_list_90d ARRAY COMMENT '用户90天常听歌曲的播放次数列表', user_often_listen_duration_list_90d ARRAY COMMENT '用户90天常听歌曲的时长列表(毫秒)', total_play_count_7d BIGINT COMMENT '7天总播放次数', total_play_count_30d BIGINT COMMENT '30天总播放次数', total_play_count_90d BIGINT COMMENT '90天总播放次数', distinct_song_count_7d BIGINT COMMENT '7天独立歌曲数', distinct_song_count_30d BIGINT COMMENT '30天独立歌曲数', distinct_song_count_90d BIGINT COMMENT '90天独立歌曲数', update_time STRING COMMENT '数据更新时间' ) COMMENT '用户常听歌曲序列扩展side_info表(7天/30天/90天窗口)' PARTITIONED BY (dt STRING COMMENT '时间分区yyyy-mm-dd') STORED AS PARQUET TBLPROPERTIES ( 'PARTITION_LIFECYCLE'='30d', 'table.source'='自定义', 'table.creator'='caiyi05@corp.netease.com' ); -- 优化版数据插入 INSERT OVERWRITE TABLE music_most.user_song_sequence_with_side_info PARTITION(dt='${bizdate_1}') WITH -- ========== 步骤1: 识别热点用户(基于最近一天数据快速识别) ========== -- hot_users AS ( SELECT /+ MAPJOIN / user_id, COUNT() as record_count FROM music_most.user_song_daily_with_side_info WHERE dt = '${bizdate_1}' GROUP BY user_id HAVING COUNT() > 2000 -- 热点用户阈值 ), -- ========== 步骤2: 处理7天窗口 ========== -- -- 2.1 普通用户7天数据(不加盐) normal_user_7d AS ( SELECT t1.user_id, t1.song_id, MAX(t1.song_name) as song_name, MAX(t1.singer_artist_id_list) as singer_artist_id_list, MAX(t1.singer_artist_name_list) as singer_artist_name_list, MAX(t1.genre_tag_name) as genre_tag_name, MAX(t1.norm_language) as norm_language, MAX(t1.duration) as duration, SUM(t1.effective_play_cnt_1d) as play_count FROM music_most.user_song_daily_with_side_info t1 LEFT ANTI JOIN hot_users t2 ON t1.user_id = t2.user_id WHERE t1.dt BETWEEN date_sub('${bizdate_1}', 6) AND '${bizdate_1}' AND t1.effective_play_cnt_1d > 0 GROUP BY t1.user_id, t1.song_id ), -- 2.2 热点用户7天数据(加盐处理) hot_user_7d_salted AS ( SELECT CASE WHEN t2.record_count > 10000 THEN CONCAT(t1.user_id, '', CAST(RAND() * 50 AS INT)) WHEN t2.record_count > 5000 THEN CONCAT(t1.user_id, '', CAST(RAND() * 30 AS INT)) ELSE CONCAT(t1.user_id, '_', CAST(RAND() * 20 AS INT)) END as user_id_salt, t1.user_id, t1.song_id, t1.song_name, t1.singer_artist_id_list, t1.singer_artist_name_list, t1.genre_tag_name, t1.norm_language, t1.duration, t1.effective_play_cnt_1d FROM music_most.user_song_daily_with_side_info t1 INNER JOIN hot_users t2 ON t1.user_id = t2.user_id WHERE t1.dt BETWEEN date_sub('${bizdate_1}', 6) AND '${bizdate_1}' AND t1.effective_play_cnt_1d > 0 ), -- 2.3 热点用户第一阶段聚合 hot_user_7d_stage1 AS ( SELECT user_id_salt, user_id, song_id, MAX(song_name) as song_name, MAX(singer_artist_id_list) as singer_artist_id_list, MAX(singer_artist_name_list) as singer_artist_name_list, MAX(genre_tag_name) as genre_tag_name, MAX(norm_language) as norm_language, MAX(duration) as duration, SUM(effective_play_cnt_1d) as partial_play_count FROM hot_user_7d_salted GROUP BY user_id_salt, user_id, song_id ), -- 2.4 热点用户第二阶段聚合(去盐) hot_user_7d AS ( SELECT user_id, song_id, MAX(song_name) as song_name, MAX(singer_artist_id_list) as singer_artist_id_list, MAX(singer_artist_name_list) as singer_artist_name_list, MAX(genre_tag_name) as genre_tag_name, MAX(norm_language) as norm_language, MAX(duration) as duration, SUM(partial_play_count) as play_count FROM hot_user_7d_stage1 GROUP BY user_id, song_id ), -- 2.5 合并7天数据 user_song_7d AS ( SELECT * FROM normal_user_7d UNION ALL SELECT * FROM hot_user_7d ), -- 2.6 7天排序和聚合 user_song_7d_ranked AS ( SELECT user_id, song_id, play_count, song_name, singer_artist_id_list, singer_artist_name_list, genre_tag_name, norm_language, duration, ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY play_count DESC, song_id) as rank FROM user_song_7d ), user_song_7d_agg AS ( SELECT user_id, collect_list(CASE WHEN rank <= 100 THEN song_id END) as user_often_listen_song_id_list_7d, collect_list(CASE WHEN rank <= 100 THEN song_name END) as user_often_listen_song_name_list_7d, collect_list(CASE WHEN rank <= 100 THEN singer_artist_id_list END) as user_often_listen_artist_id_list_7d, collect_list(CASE WHEN rank <= 100 THEN singer_artist_name_list END) as user_often_listen_artist_name_list_7d, collect_list(CASE WHEN rank <= 100 THEN genre_tag_name END) as user_often_listen_genre_list_7d, collect_list(CASE WHEN rank <= 100 THEN norm_language END) as user_often_listen_language_list_7d, collect_list(CASE WHEN rank <= 100 THEN play_count END) as user_often_listen_play_cnt_list_7d, collect_list(CASE WHEN rank <= 100 THEN duration END) as user_often_listen_duration_list_7d, SUM(CASE WHEN rank <= 100 THEN play_count ELSE 0 END) as total_play_count_7d, COUNT(DISTINCT CASE WHEN rank <= 100 THEN song_id END) as distinct_song_count_7d FROM user_song_7d_ranked WHERE rank <= 100 GROUP BY user_id ), -- ========== 步骤3: 处理30天窗口(复用相同逻辑) ========== -- normal_user_30d AS ( SELECT t1.user_id, t1.song_id, MAX(t1.song_name) as song_name, MAX(t1.singer_artist_id_list) as singer_artist_id_list, MAX(t1.singer_artist_name_list) as singer_artist_name_list, MAX(t1.genre_tag_name) as genre_tag_name, MAX(t1.norm_language) as norm_language, MAX(t1.duration) as duration, SUM(t1.effective_play_cnt_1d) as play_count FROM music_most.user_song_daily_with_side_info t1 LEFT ANTI JOIN hot_users t2 ON t1.user_id = t2.user_id WHERE t1.dt BETWEEN date_sub('${bizdate_1}', 29) AND '${bizdate_1}' AND t1.effective_play_cnt_1d > 0 GROUP BY t1.user_id, t1.song_id HAVING SUM(t1.effective_play_cnt_1d) > 2 -- 30天播放超过2次 ), hot_user_30d_salted AS ( SELECT CASE WHEN t2.record_count > 10000 THEN CONCAT(t1.user_id, '', CAST(RAND() * 50 AS INT)) WHEN t2.record_count > 5000 THEN CONCAT(t1.user_id, '', CAST(RAND() * 30 AS INT)) ELSE CONCAT(t1.user_id, '_', CAST(RAND() * 20 AS INT)) END as user_id_salt, t1.user_id, t1.song_id, t1.song_name, t1.singer_artist_id_list, t1.singer_artist_name_list, t1.genre_tag_name, t1.norm_language, t1.duration, t1.effective_play_cnt_1d FROM music_most.user_song_daily_with_side_info t1 INNER JOIN hot_users t2 ON t1.user_id = t2.user_id WHERE t1.dt BETWEEN date_sub('${bizdate_1}', 29) AND '${bizdate_1}' AND t1.effective_play_cnt_1d > 0 ), hot_user_30d_stage1 AS ( SELECT user_id_salt, user_id, song_id, MAX(song_name) as song_name, MAX(singer_artist_id_list) as singer_artist_id_list, MAX(singer_artist_name_list) as singer_artist_name_list, MAX(genre_tag_name) as genre_tag_name, MAX(norm_language) as norm_language, MAX(duration) as duration, SUM(effective_play_cnt_1d) as partial_play_count FROM hot_user_30d_salted GROUP BY user_id_salt, user_id, song_id ), hot_user_30d AS ( SELECT user_id, song_id, MAX(song_name) as song_name, MAX(singer_artist_id_list) as singer_artist_id_list, MAX(singer_artist_name_list) as singer_artist_name_list, MAX(genre_tag_name) as genre_tag_name, MAX(norm_language) as norm_language, MAX(duration) as duration, SUM(partial_play_count) as play_count FROM hot_user_30d_stage1 GROUP BY user_id, song_id HAVING SUM(partial_play_count) > 2 ), user_song_30d AS ( SELECT * FROM normal_user_30d UNION ALL SELECT * FROM hot_user_30d ), user_song_30d_ranked AS ( SELECT user_id, song_id, play_count, song_name, singer_artist_id_list, singer_artist_name_list, genre_tag_name, norm_language, duration, ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY play_count DESC, song_id) as rank FROM user_song_30d ), user_song_30d_agg AS ( SELECT user_id, collect_list(CASE WHEN rank <= 100 THEN song_id END) as user_often_listen_song_id_list_30d, collect_list(CASE WHEN rank <= 100 THEN song_name END) as user_often_listen_song_name_list_30d, collect_list(CASE WHEN rank <= 100 THEN singer_artist_id_list END) as user_often_listen_artist_id_list_30d, collect_list(CASE WHEN rank <= 100 THEN singer_artist_name_list END) as user_often_listen_artist_name_list_30d, collect_list(CASE WHEN rank <= 100 THEN genre_tag_name END) as user_often_listen_genre_list_30d, collect_list(CASE WHEN rank <= 100 THEN norm_language END) as user_often_listen_language_list_30d, collect_list(CASE WHEN rank <= 100 THEN play_count END) as user_often_listen_play_cnt_list_30d, collect_list(CASE WHEN rank <= 100 THEN duration END) as user_often_listen_duration_list_30d, SUM(CASE WHEN rank <= 100 THEN play_count ELSE 0 END) as total_play_count_30d, COUNT(DISTINCT CASE WHEN rank <= 100 THEN song_id END) as distinct_song_count_30d FROM user_song_30d_ranked WHERE rank <= 100 GROUP BY user_id ), -- ========== 步骤4: 处理90天窗口(复用相同逻辑) ========== -- normal_user_90d AS ( SELECT t1.user_id, t1.song_id, MAX(t1.song_name) as song_name, MAX(t1.singer_artist_id_list) as singer_artist_id_list, MAX(t1.singer_artist_name_list) as singer_artist_name_list, MAX(t1.genre_tag_name) as genre_tag_name, MAX(t1.norm_language) as norm_language, MAX(t1.duration) as duration, SUM(t1.effective_play_cnt_1d) as play_count FROM music_most.user_song_daily_with_side_info t1 LEFT ANTI JOIN hot_users t2 ON t1.user_id = t2.user_id WHERE t1.dt BETWEEN date_sub('${bizdate_1}', 89) AND '${bizdate_1}' AND t1.effective_play_cnt_1d > 0 GROUP BY t1.user_id, t1.song_id HAVING SUM(t1.effective_play_cnt_1d) > 3 -- 90天播放超过3次 ), hot_user_90d_salted AS ( SELECT CASE WHEN t2.record_count > 10000 THEN CONCAT(t1.user_id, '', CAST(RAND() * 50 AS INT)) WHEN t2.record_count > 5000 THEN CONCAT(t1.user_id, '', CAST(RAND() * 30 AS INT)) ELSE CONCAT(t1.user_id, '_', CAST(RAND() * 20 AS INT)) END as user_id_salt, t1.user_id, t1.song_id, t1.song_name, t1.singer_artist_id_list, t1.singer_artist_name_list, t1.genre_tag_name, t1.norm_language, t1.duration, t1.effective_play_cnt_1d FROM music_most.user_song_daily_with_side_info t1 INNER JOIN hot_users t2 ON t1.user_id = t2.user_id WHERE t1.dt BETWEEN date_sub('${bizdate_1}', 89) AND '${bizdate_1}' AND t1.effective_play_cnt_1d > 0 ), hot_user_90d_stage1 AS ( SELECT user_id_salt, user_id, song_id, MAX(song_name) as song_name, MAX(singer_artist_id_list) as singer_artist_id_list, MAX(singer_artist_name_list) as singer_artist_name_list, MAX(genre_tag_name) as genre_tag_name, MAX(norm_language) as norm_language, MAX(duration) as duration, SUM(effective_play_cnt_1d) as partial_play_count FROM hot_user_90d_salted GROUP BY user_id_salt, user_id, song_id ), hot_user_90d AS ( SELECT user_id, song_id, MAX(song_name) as song_name, MAX(singer_artist_id_list) as singer_artist_id_list, MAX(singer_artist_name_list) as singer_artist_name_list, MAX(genre_tag_name) as genre_tag_name, MAX(norm_language) as norm_language, MAX(duration) as duration, SUM(partial_play_count) as play_count FROM hot_user_90d_stage1 GROUP BY user_id, song_id HAVING SUM(partial_play_count) > 3 ), user_song_90d AS ( SELECT * FROM normal_user_90d UNION ALL SELECT * FROM hot_user_90d ), user_song_90d_ranked AS ( SELECT user_id, song_id, play_count, song_name, singer_artist_id_list, singer_artist_name_list, genre_tag_name, norm_language, duration, ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY play_count DESC, song_id) as rank FROM user_song_90d ), user_song_90d_agg AS ( SELECT user_id, collect_list(CASE WHEN rank <= 100 THEN song_id END) as user_often_listen_song_id_list_90d, collect_list(CASE WHEN rank <= 100 THEN song_name END) as user_often_listen_song_name_list_90d, collect_list(CASE WHEN rank <= 100 THEN singer_artist_id_list END) as user_often_listen_artist_id_list_90d, collect_list(CASE WHEN rank <= 100 THEN singer_artist_name_list END) as user_often_listen_artist_name_list_90d, collect_list(CASE WHEN rank <= 100 THEN genre_tag_name END) as user_often_listen_genre_list_90d, collect_list(CASE WHEN rank <= 100 THEN norm_language END) as user_often_listen_language_list_90d, collect_list(CASE WHEN rank <= 100 THEN play_count END) as user_often_listen_play_cnt_list_90d, collect_list(CASE WHEN rank <= 100 THEN duration END) as user_often_listen_duration_list_90d, SUM(CASE WHEN rank <= 100 THEN play_count ELSE 0 END) as total_play_count_90d, COUNT(DISTINCT CASE WHEN rank <= 100 THEN song_id END) as distinct_song_count_90d FROM user_song_90d_ranked WHERE rank <= 100 GROUP BY user_id ) -- ========== 最终结果合并 ========== -- SELECT COALESCE(s7.user_id, s30.user_id, s90.user_id) as user_id, -- 7天窗口 s7.user_often_listen_song_id_list_7d, s7.user_often_listen_song_name_list_7d, s7.user_often_listen_artist_id_list_7d, s7.user_often_listen_artist_name_list_7d, s7.user_often_listen_genre_list_7d, s7.user_often_listen_language_list_7d, s7.user_often_listen_play_cnt_list_7d, s7.user_often_listen_duration_list_7d, -- 30天窗口 s30.user_often_listen_song_id_list_30d, s30.user_often_listen_song_name_list_30d, s30.user_often_listen_artist_id_list_30d, s30.user_often_listen_artist_name_list_30d, s30.user_often_listen_genre_list_30d, s30.user_often_listen_language_list_30d, s30.user_often_listen_play_cnt_list_30d, s30.user_often_listen_duration_list_30d, -- 90天窗口 s90.user_often_listen_song_id_list_90d, s90.user_often_listen_song_name_list_90d, s90.user_often_listen_artist_id_list_90d, s90.user_often_listen_artist_name_list_90d, s90.user_often_listen_genre_list_90d, s90.user_often_listen_language_list_90d, s90.user_often_listen_play_cnt_list_90d, s90.user_often_listen_duration_list_90d, -- 统计信息 COALESCE(s7.total_play_count_7d, 0) as total_play_count_7d, COALESCE(s30.total_play_count_30d, 0) as total_play_count_30d, COALESCE(s90.total_play_count_90d, 0) as total_play_count_90d, COALESCE(s7.distinct_song_count_7d, 0) as distinct_song_count_7d, COALESCE(s30.distinct_song_count_30d, 0) as distinct_song_count_30d, COALESCE(s90.distinct_song_count_90d, 0) as distinct_song_count_90d, current_timestamp() as update_time FROM user_song_7d_agg s7 FULL OUTER JOIN user_song_30d_agg s30 ON s7.user_id = s30.user_id FULL OUTER JOIN user_song_90d_agg s90 ON COALESCE(s7.user_id, s30.user_id) = s90.user_id WHERE COALESCE(s7.user_id, s30.user_id, s90.user_id) IS NOT NULL; 🤖 错误内容 通用异常,Spark任务执行失败,错误信息为: java.lang.Exception: process exitCode: 1 该错误信息为Spark任务执行失败的通用异常,未包含具体的SQL语法或逻辑错误提示。 根据提供的信息,无法定位具体的SQL错误原因。建议检查Spark执行环境配置、资源分配及运行日志以获取详细错误信息。 错误原因:无具体语法或逻辑错误提示,任务执行失败可能由环境配置或资源分配问题引起。 修正建议:无SQL语法错误,暂不修改SQL。看下sql
最新发布
09-19
评论
成就一亿技术人!
拼手气红包6.0元
还能输入1000个字符
 
红包 添加红包
表情包 插入表情
 条评论被折叠 查看
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值