array_name vs. &array_name: similarities and differences

This article explains arrays and pointers in C and how they differ, including how to correctly obtain a pointer to an array's first element versus a pointer to the whole array, with example code showing the different pointer declarations and assignments.

array_name is a pointer to the first element of the array; &array_name is a pointer to the whole array.

char a[MAX];              /* array of MAX characters */
char *p = a;              /* OK: a decays to &a[0], so p points to the first element */
char *pa = &a;            /* wrong: pa has type 'char *', but &a has type 'char (*)[MAX]' */
char (*pb)[MAX] = &a;     /* OK: pb has type 'char (*)[MAX]', a pointer to the whole array */
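
The difference becomes concrete with pointer arithmetic: a + 1 steps to the next char, while &a + 1 steps past the whole array. A minimal sketch (not from the original post; MAX is assumed to be 8 here):

#include <stdio.h>

#define MAX 8

int main(void)
{
    char a[MAX];

    /* a and &a hold the same address but have different types,
       so pointer arithmetic steps by different amounts:
       a + 1  advances by sizeof(char) (1 byte),
       &a + 1 advances by sizeof(a)    (MAX bytes). */
    printf("a      = %p\n", (void *)a);
    printf("a + 1  = %p\n", (void *)(a + 1));
    printf("&a     = %p\n", (void *)&a);
    printf("&a + 1 = %p\n", (void *)(&a + 1));

    return 0;
}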

A complete example (compiled with VC 6.0):

#include <stdio.h>

int main(void)
{
    char a[5] = {'a', 'b', 'c', 'd', '\0'};
    char *p = a;

    /* Uncommenting the next line makes VC 6.0 report:
       "cannot convert from 'char (*)[5]' to 'char *'",
       because &a is a pointer to the whole array, not a pointer to char. */
    /* char *pa = &a; */

    /* So to hold the value of &a, declare a pointer to an array
       of the same element type and size. */
    char (*point_to_str)[5];
    point_to_str = &a;

    /* Print the addresses of the two pointer variables themselves
       (%d as in the original VC 6.0 code; %p is the portable choice),
       then the strings they point at. */
    printf("%d\n%d\n", &p, &point_to_str);
    printf("%s\n%s\n", p, point_to_str);

    return 0;
}

 

Output:

1245044
1245040
abcd
abcd

The first two numbers are the addresses of the pointer variables p and point_to_str themselves, two adjacent 4-byte slots on the stack under 32-bit VC 6.0. Both %s lines print "abcd" because p and point_to_str hold the same address, the start of array a; they differ only in type.
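
Another way to see the type difference is through sizeof and dereference: *pb is the whole array object, while p is just a pointer to char. A minimal sketch along the same lines (not from the original post):

#include <stdio.h>

int main(void)
{
    char a[5] = {'a', 'b', 'c', 'd', '\0'};
    char *p = a;
    char (*pb)[5] = &a;

    /* *pb is the array object itself, so sizeof(*pb) is 5 bytes;
       sizeof(p) is only the size of a pointer. */
    printf("sizeof(*pb) = %zu\n", sizeof(*pb));
    printf("sizeof(p)   = %zu\n", sizeof(p));

    /* Elements are reached as (*pb)[i] through the array pointer. */
    printf("(*pb)[2] = %c\n", (*pb)[2]);
    printf("p[2]     = %c\n", p[2]);

    return 0;
}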

 
