现在开头:Fairseq是一个正在快速迭代的产品,而且是开源的!
这不是表扬,这意味着四件事情:
1.他没有文档!所有框架代码都没有任何注释,包括函数docstring都没有
2.他没有经过有效测试,估计是抢时间吧!即使是官网Readme里的例子也是无法跑起来的!
3.他是一个框架,而且是一个非常不Pythonic的框架,充斥着inline/包装器/莫名其妙的语法。
4.他大量使用类的静态方法和全局函数,这是一种破坏对象化的机制,实在是太难理解了。
虽然这四点决定他真的对不住Facebook的金字招牌,但是作为一个学习者,总要把他运行起来,那么开始这场针对Facebook派“全新老爷车”的修车之旅吧!
通过调试Fairseq,我深刻地感到痛苦:各种莫名其妙的跳转,你永远不会知道在什么时候、什么地方会发生跳转。
如果想利用Fairseq的机制提供管线封装,那么肯定会很累!
Fairseq的代码质量非常低,模型/模型配置参数/模型管线的参数对应的变量名都是model,任务/任务名称/任务的参数集合的变量名都是task,具体的含义需要调试上下文获得,并且不能保证一个函数的两次重入都是一样的含义。
以上批评意见的时间是1.0版,20201123
Step1 安装
略。。。网上教程很多的,反正你要自己动手
Fairseq
apex
下载预训练模型
下载训练数据 LibriSpeech
Step2 跑wav2vec
先用官网的例子跑一个看看:
# Example script (introduced above as the official example): load a checkpoint
# from disk, build a Wav2VecModel from it, and run the feature extractor and
# feature aggregator on a dummy input tensor.
import torch
from fairseq.models.wav2vec import Wav2VecModel
# Load the checkpoint; the lines below read its 'args' and 'model' entries.
cp = torch.load('/home/**/Documents/Research/fairseq/model/wav2vec_vox_960h_pl.pt.zip')
# Build the model from the arguments stored in the checkpoint (no task passed).
# NOTE(review): the traceback quoted right after this snippet reports
# "'Namespace' object has no attribute 'prediction_steps'" in wav2vec.py —
# presumably raised from this call; confirm when reproducing.
model = Wav2VecModel.build_model(cp['args'], task=None)
# Load the state dict stored under the checkpoint's 'model' key.
model.load_state_dict(cp['model'])
# Put the model into evaluation mode.
model.eval()
# Random tensor of shape (1, 10000) used as a stand-in input; presumably meant
# to mimic 16 kHz audio samples, judging by the variable name — TODO confirm.
wav_input_16khz = torch.randn(1,10000)
# Run the feature extractor on the input, then the aggregator on its output.
z = model.feature_extractor(wav_input_16khz)
c = model.feature_aggregator(z)
你猜会咋样呢?
>>> Exception "AttributeError"
'Namespace' object has no attribute 'prediction_steps'
File: /home/**/Documents/workspace/fairseq/fairseq/models/wav2vec/wav2vec.py, Line: 175
Exception "unhandled AttributeError"
'Namespace' object has no attribute 'prediction_steps'
File: /home/**/Documents/workspace/fairseq/fairseq/models/wav2vec/wav2vec.py, Line: 175
哈哈,很神奇吧,其实也没啥,就是单纯的代码质量太差!
# Excerpt of the Wav2VecModel class as quoted in this article. The original
# indentation was lost in the quote and __init__ is cut off after the
# activation selection, so this is illustrative rather than runnable as shown.
# The class is registered under the name "wav2vec" with Wav2VecConfig as its
# dataclass.
@register_model("wav2vec", dataclass=Wav2VecConfig)
class Wav2VecModel(BaseFairseqModel):
@classmethod
# Constructs a Wav2VecModel from cfg, logs it, and returns it. The `task`
# argument is not used in the visible body.
def build_model(cls, cfg: Wav2VecConfig, task: FairseqTask):
"""Build a new model instance."""
model = Wav2VecModel(cfg)
logger.info(model)
return model
# The constructor reads attributes directly off cfg (prediction_steps, offset,
# activation); a cfg object lacking any of them raises AttributeError here.
def __init__(self, cfg: Wav2VecConfig):
super().__init__()
# First attribute read from cfg; this is the attribute named in the
# AttributeError quoted earlier in the article.
self.prediction_steps = cfg.prediction_steps
# Stored in a local; its use is not visible in this excerpt.
offset = cfg.offset
# Select the activation module by name; only "relu" and "gelu" are accepted.
if cfg.activation == "relu":
activation = nn.ReLU()
elif cfg.activation == "gelu":
activation = nn.GELU()
else:
# Any other activation name is rejected with a generic Exception.
raise Exception("unknown activation " + cfg.activation)
调试可以看到,cfg这个参数根本就不是Wav2VecConfig类型,这个args的类型是<class 'argparse.Namespace'>,内容是:
Namespace(activation_dropout=0.1, adam_betas='(0.9, 0.98)', adam_eps=1e-08, all_gather_list_size=16384,
apply_mask=True, arch='wav2vec_ctc', attention_dropout=0.0, best_checkpoint_metric='wer',
bf16=False, bpe=None, broadcast_buffers=False, bucket_cap_mb=25, checkpoint_suffix='', clip_norm=0.0,
cpu=False, criterion='wav2vec', curriculum=0, data='/private/home/qiantong/w2v/data/train_36s_short_960h_rescore',
data_buffer_size=10, dataset_impl=None, ddp_backend='no_c10d', decay_steps=250000, device_id=0, disable_validation=False,
distributed_backend='nccl', distributed_init_method='tcp://learnfair1679:56443', distributed_no_spawn=True,
distributed_num_procs=None, distributed_port=56443, distributed_rank=0, distributed_world_size=24,
distributed_wrapper='DDP',
dropout=0.0, dropout_input=0,
empty_cache_freq=0, enable_padding=False, fast_stat_sync=False, feature_grad_mult=0.0, final_dropout=0.0, final_lr_scale=0.05, find_unused_parameters=False,
finetune_from_model=None, fix_batches_to_gpus=False,
fixed_validation_seed=None, fp16=True, fp16_init_scale=128, fp16_no_flatten_grads=False, fp16_scale_tolerance=0.0, fp16_scale_window=None, freeze_finetune_updates=10000,
hold_steps=210000, init_lr_scale=0.01, keep_best_checkpoints=-1, keep_interval_updates=-1,
keep_last_epochs=-1, labels='ltr', layerdrop=0.1, localsgd_frequency=3, log_format='json', log_interval=500, lr=[3e-05], lr_scheduler='tri_stage',
mask_channel_length=64, mask_channel_other=0.0, mask_channel_prob=0.1, mask_channel_selection='static',
mask_length=10, mask_other=0.0, mask_prob=0.1, mask_selection='static', max_epoch=0, max_sample_size=None,
max_sentences=None, max_sentences_valid=None, max_tokens=1280000, max_tokens_valid=1280000, max_update=500000, maximize_best_checkpoint_metric=False,
memory_efficient_bf16=False, memory_efficient_fp16=False, min_loss_scale=0.0001,
min_lr=-1, min_sample_size=None, model_parallel_size=1, no_epoch_checkpoints=True, no_last_checkpoints=False,
no_mask_channel_overlap=False, no_mask_overlap=False, no_pretrained_weights=False, no_progress_bar=False, no_save=False,
no_save_optimizer_state=False, no_seed_provided=False, normalize=True, nprocs_per_node=8, num_workers=4,
optimizer='adam', optimizer_overrides='{}', patience=-1, pipeline_balance=None, pipeline_checkpoint='never',
pipeline_chunks=None, pipeline_devices=None, pipeline_model_parallel=False, profile=False, quantization_config_path=None,
remove_bpe='letter', required_batch_size_multiple=8, required_seq_len_multiple=1, reset_dataloader=False,
reset_lr_scheduler=False, reset_meters=False, reset_optimizer=False, restore_file='checkpoint_last.pt',
sample_rate=16000, save_dir='/checkpoint/michaelauli/asr/pseudolbl/960h_vox.fp16.u500000.savg.nrm.ltr.m_static.mstd0.mask10.mprob0.1.ld0.1.mc_static.mcstd0.maskc64.mcprob0.1.fgm0.0.ffu10000.lr3e-05.warmup40000.hld210000.dec250000.frs0.05.fd0.0.drop0.0.ad0.1.atd0.0.ms1280000.sd1337.uf1.ngpu24',
save_interval=1, save_interval_updates=0, scoring='bleu', seed=1337, sentence_avg=True, skip_invalid_size_inputs_valid_test=False, slowmo_algorithm='LocalSGD',
slowmo_momentum=None, stop_time_hours=0, task='audio_pretraining', tensorboard_logdir='',
threshold_loss_scale=None, tokenizer=None, tpu=False, train_subset='train', update_freq=[1], use_bmuf=False,
use_old_adam=False, user_dir=None, valid_subset='dev_other', validate_after_updates=10000,
validate_interval=1, validate_interval_updates=0, w2v_args=Namespace(activation_dropout=0.0, activation_fn='gelu', adam_betas='(0.9,0.98)',
adam_eps=1e-06, arch='wav2vec2', attention_dropout=0.1, attention_type='default', augment=False,
best_checkpoint_metric='loss', bpe=None, bucket_cap_mb=25, centroids=None, clip_norm=25, codebook_negatives=0, combine_banks=False, combine_dataset=False, conv_bias=True,
conv_feature_layers='[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] * 2', conv_logs=True, conv_pos=128, conv_pos_groups=16, conv_pos_layers=1, cpu=False, criterion='wav2vec',
cross_sample_negatives=0, curriculum=0, data='/private/home/abaevski/data/librivox/no_silence', dataset_impl=None, ddp_backend='c10d', debug=False, device_id=0, disable_validation=False,
distributed_backend='nccl', distributed_init_method='tcp://learnfair1331:55498', distributed_no_spawn=True, distributed_port=55498, distributed_rank=0, distributed_world_size=128,
div_drop_percent=0, div_pen_threshold=None, dropout=0.0, dropout_features=0.1, dropout_input=0.1, duplicate_negatives=0, empty_cache_freq=0, enable_padding=False,
encode_padded_indiv=False, encoder_attention_heads=16, encoder_embed_dim=1024, encoder_ffn_embed_dim=4096, encoder_layerdrop=0.0, encoder_layers=24, encoder_normalize_before=True,
encoder_optimizer_params=None, encoder_schedule=0, end_learning_rate=0.0, extractor_mode='layer_norm', extractor_model=None, extractor_norm_location='default',
fast_stat_sync=False, feature_glu=False, feature_grad_mult=1.0, feature_noise=0, feature_noise_last=0, features_pen=True, final_dim=768, find_unused_parameters=True, finetune_extractor=True, fix_batches_to_gpus=False, fixed_validation_seed=None, force_anneal=None, fp16=True, fp16_init_scale=128, fp16_no_flatten_grads=False, fp16_scale_tolerance=0.0, fp16_scale_window=None, group_norm_features=False, group_norm_groups=512,
gumbel_noise_gain=1, infomax=True, input_noise=0.0, keep_interval_updates=1, keep_last_epochs=-1, label_smoothing=0.0, labels=None, latent_groups=2, latent_temp='(2.0,0.1,0.999995)',
latent_var_banks=2, latent_vars=320, layer_norm_after=-1, layer_norm_before=0, layer_norm_features=True, layer_norm_first=True, layer_norm_repr=True, lazy_load_labels=False,
log_format='json', log_interval=100, logit_temp=0.1, loss_weights=None, lr=[0.005], lr_scheduler='polynomial_decay', mask_min_space=1, mask_multiple_length=10,
mask_prob=0.65, mask_same_channels=False, mask_same_timesteps=False, mask_selection='static',
mask_stdev=0.0, masking_schedule=0, max_epoch=0, max_positions=8000, max_pred_length=0, max_sample_size=320000, max_sentences=None, max_sentences_valid=None, max_tokens=1200000, max_tokens_valid=1200000, max_update=1000000,
maximize_best_checkpoint_metric=False, memory_efficient_fp16=False, min_loss_scale=0.0001, min_lr=-1, min_sample_size=32000, mlp_mi=768, negatives_from_everywhere=False, new_emb_pen=True, new_logit_pen=False, no_bert_init=False,
no_epoch_checkpoints=True, no_last_checkpoints=False, no_mask_channel_overlap=False, no_mask_overlap=False, no_norm_after=None, no_progress_bar=False, no_save=False, no_save_optimizer_state=False, no_token_positional_embeddings=True, noise_type='gaussian', norm_init_weight=1.0, normalize=True, num_negatives=100, num_workers=6, optimizer='adam', optimizer_overrides='{}',
penalize_transformer=False, penalty_coeff='[0,0,0.1,0]', penalty_temp=1.0, pooler_activation_fn='tanh', pooler_dropout=0.0, power=1.0, pre_norm=False, predict_everything=False, predictor_grad_mult=1.0, preemp=False, project_quantized=True,
quantize_input=False, quantize_targets=True, quantized=False, quantizer_chance=0.0, quantizer_grad_mult=1.0, quantizer_init=True, quantizer_init_gain=1.0, quantizer_init_normal=True, relative_positional_embeddings=0, required_batch_size_multiple=8, rescale_sample_size=False,
reset_dataloader=False, reset_lr_scheduler&#