// main.cpp
// 2. Initialize a whisper context from the given model file and context params.
struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
if (ctx == nullptr) {
fprintf(stderr, "error: failed to initialize whisper context\n");
// Distinct non-zero exit code: model could not be loaded.
return 3;
}
whisper_init_from_file_with_params_no_state
struct whisper_context * whisper_init_from_file_with_params_no_state ( const char * path_model, struct whisper_context_params params) {
WHISPER_LOG_INFO ( "%s: loading model from '%s'\n" , __func__ , path_model) ;
auto fin = std:: ifstream ( path_model, std:: ios:: binary) ;
whisper_model_loader loader = {
} ;
loader. context = & fin;
loader. read = [ ] ( void * ctx, void * output, size_t read_size) {
std:: ifstream * fin = ( std:: ifstream* ) ctx;
fin-> read ( ( char * ) output, read_size) ;
return read_size;
} ;
loader. eof = [ ] ( void * ctx) {
std:: ifstream * fin = ( std:: ifstream* ) ctx;
return fin-> eof ( ) ;
} ;
loader. close = [ ] ( void * ctx) {
std:: ifstream * fin = ( std:: ifstream* ) ctx;
fin-> close ( ) ;
} ;
auto ctx = whisper_init_with_params_no_state ( & loader, params) ;
if ( ctx) {
ctx-> path_model = path_model;
}
return ctx;
}
whisper_model_loader
// whisper.cpp-v1.5.0/whisper.h
// Model loader abstraction: lets whisper_model_load pull bytes from any
// backing source (file stream, memory buffer, network connection, ...)
// through three callbacks plus an opaque context pointer.
typedef struct whisper_model_loader {
void * context; // opaque state handed to every callback (e.g. a std::ifstream*); meaning depends on the loader implementation
size_t (*read)(void * ctx, void * output, size_t read_size); // read read_size bytes from the context into output; returns the byte count consumed
bool (*eof)(void * ctx); // true once the underlying source has reached its end
void (*close)(void * ctx); // release/close the underlying source
} whisper_model_loader;
whisper_init_with_params_no_state
// whisper.cpp-v1.5.0/whisper.h
// Various functions for loading a ggml whisper model.
// Allocate (almost) all memory needed for the model.
// Return NULL on failure
WHISPER_API struct whisper_context * whisper_init_from_file_with_params (const char * path_model, struct whisper_context_params params);
WHISPER_API struct whisper_context * whisper_init_from_buffer_with_params(void * buffer, size_t buffer_size, struct whisper_context_params params);
WHISPER_API struct whisper_context * whisper_init_with_params (struct whisper_model_loader * loader, struct whisper_context_params params);
// These are the same as the above, but the internal state of the context is not allocated automatically
// It is the responsibility of the caller to allocate the state using whisper_init_state() (#523)
WHISPER_API struct whisper_context * whisper_init_from_file_with_params_no_state (const char * path_model, struct whisper_context_params params);
WHISPER_API struct whisper_context * whisper_init_from_buffer_with_params_no_state(void * buffer, size_t buffer_size, struct whisper_context_params params);
WHISPER_API struct whisper_context * whisper_init_with_params_no_state (struct whisper_model_loader * loader, struct whisper_context_params params);
// Core no-state initializer: allocates a whisper_context and loads the model
// weights through the supplied loader. The loader is closed on BOTH the
// success and the failure path. Returns nullptr if the model fails to load.
struct whisper_context * whisper_init_with_params_no_state(struct whisper_model_loader * loader, struct whisper_context_params params) {
    ggml_time_init();

    whisper_context * ctx = new whisper_context;
    ctx->params = params;

    if (!whisper_model_load(loader, *ctx)) {
        // Release the data source before tearing the context down.
        loader->close(loader->context);
        WHISPER_LOG_ERROR("%s: failed to load model\n", __func__);
        delete ctx;
        return nullptr;
    }

    loader->close(loader->context);

    return ctx;
}
重要数据结构 whisper_context
whisper_model
// Complete weight set + metadata for a loaded Whisper model: hyper-parameters,
// mel filter bank, the shared encoder/decoder tensors, the per-layer blocks,
// and the ggml context / backend buffer that own the tensor storage.
struct whisper_model {
e_model type = MODEL_UNKNOWN;
whisper_hparams hparams;
whisper_filters filters;
// encoder.positional_embedding
struct ggml_tensor * e_pe;
// encoder.conv1
struct ggml_tensor * e_conv_1_w;
struct ggml_tensor * e_conv_1_b;
// encoder.conv2
struct ggml_tensor * e_conv_2_w;
struct ggml_tensor * e_conv_2_b;
// encoder.ln_post
struct ggml_tensor * e_ln_w;
struct ggml_tensor * e_ln_b;
// decoder.positional_embedding
struct ggml_tensor * d_pe;
// decoder.token_embedding
struct ggml_tensor * d_te;
// decoder.ln
struct ggml_tensor * d_ln_w;
struct ggml_tensor * d_ln_b;
// one entry per transformer block on each side
std::vector<whisper_layer_encoder> layers_encoder;
std::vector<whisper_layer_decoder> layers_decoder;
// ggml context that contains all the meta information about the model tensors
struct ggml_context * ctx;
// the model backend data is read-only and can be shared between processors
struct ggml_backend_buffer * buffer;
// tensors
int n_loaded; // presumably the count of tensors populated during load — confirm against whisper_model_load
std::map<std::string, struct ggml_tensor *> tensors; // name -> tensor lookup table
};
whisper_layer_encoder : whisper_model的编码器
// audio encoding layer
// Weights for one encoder transformer block: self-attention (with its two
// layer norms) followed by a two-layer MLP. Tensor names in the comments
// mirror the original OpenAI checkpoint keys.
struct whisper_layer_encoder {
// encoder.blocks.*.attn_ln
struct ggml_tensor * attn_ln_0_w;
struct ggml_tensor * attn_ln_0_b;
// encoder.blocks.*.attn.out
struct ggml_tensor * attn_ln_1_w;
struct ggml_tensor * attn_ln_1_b;
// encoder.blocks.*.attn.query
struct ggml_tensor * attn_q_w;
struct ggml_tensor * attn_q_b;
// encoder.blocks.*.attn.key
// note: no attn_k_b field — the key projection appears to carry no bias
struct ggml_tensor * attn_k_w;
// encoder.blocks.*.attn.value
struct ggml_tensor * attn_v_w;
struct ggml_tensor * attn_v_b;
// encoder.blocks.*.mlp_ln
struct ggml_tensor * mlp_ln_w;
struct ggml_tensor * mlp_ln_b;
// encoder.blocks.*.mlp.0
struct ggml_tensor * mlp_0_w;
struct ggml_tensor * mlp_0_b;
// encoder.blocks.*.mlp.2
struct ggml_tensor * mlp_1_w;
struct ggml_tensor * mlp_1_b;
};
whisper_layer_decoder : whisper_model的解码器
// token decoding layer
// Weights for one decoder transformer block: masked self-attention,
// cross-attention over the encoder output, then a two-layer MLP.
// Tensor names in the comments mirror the original OpenAI checkpoint keys.
struct whisper_layer_decoder {
// decoder.blocks.*.attn_ln
struct ggml_tensor * attn_ln_0_w;
struct ggml_tensor * attn_ln_0_b;
// decoder.blocks.*.attn.out
struct ggml_tensor * attn_ln_1_w;
struct ggml_tensor * attn_ln_1_b;
// decoder.blocks.*.attn.query
struct ggml_tensor * attn_q_w;
struct ggml_tensor * attn_q_b;
// decoder.blocks.*.attn.key
// note: no attn_k_b field — the key projection appears to carry no bias
struct ggml_tensor * attn_k_w;
// decoder.blocks.*.attn.value
struct ggml_tensor * attn_v_w;
struct ggml_tensor * attn_v_b;
// decoder.blocks.*.cross_attn_ln
struct ggml_tensor * cross_attn_ln_0_w;
struct ggml_tensor * cross_attn_ln_0_b;
// decoder.blocks.*.cross_attn.out
struct ggml_tensor * cross_attn_ln_1_w;
struct ggml_tensor * cross_attn_ln_1_b;
// decoder.blocks.*.cross_attn.query
struct ggml_tensor * cross_attn_q_w;
struct ggml_tensor * cross_attn_q_b;
// decoder.blocks.*.cross_attn.key
// note: no cross_attn_k_b field — the cross-attention key projection also has no bias
struct ggml_tensor * cross_attn_k_w;
// decoder.blocks.*.cross_attn.value
struct ggml_tensor * cross_attn_v_w;
struct ggml_tensor * cross_attn_v_b;
// decoder.blocks.*.mlp_ln
struct ggml_tensor * mlp_ln_w;
struct ggml_tensor * mlp_ln_b;
// decoder.blocks.*.mlp.0
struct ggml_tensor * mlp_0_w;
struct ggml_tensor * mlp_0_b;
// decoder.blocks.*.mlp.2
struct ggml_tensor * mlp_1_w;
struct ggml_tensor * mlp_1_b;
};
whisper_model_load函数
begin
static bool whisper_model_load ( struct whisper_model_loader * loader, whisper_context & wctx) {
WHISPER_LOG_INFO ( "%s: loading model\n" , __func__ ) ;
const int64_t t_start_us = ggml_time_us ( ) ;
wctx. t_start_us = t_start_us;
auto & model = wctx. model;
auto & vocab = wctx. vocab;
{
uint32_t magic;
read_safe ( loader, magic) ;
if ( magic != GGML_FILE_MAGIC) {
WHISPER_LOG_ERROR ( "%s: invalid model data (bad magic)\n" , __func__ ) ;
return false ;
}
}
{
auto & hparams = model. hparams;
read_safe ( loader, hparams. n_vocab) ;
read_safe ( loader, hparams. n_audio_ctx) ;
read_safe ( loader, hparams. n_audio_state) ;
read_safe ( loader, hparams. n_audio_head) ;
read_safe ( loader, hparams. n_audio_layer) ;
read_safe ( loader, hparams. n_text_ctx) ;
read_safe ( loader, hparams. n_text_state) ;
read_safe ( loader, hparams. n_text_head) ;
read_safe ( loader, hparams. n_text_layer) ;
read_safe ( loader, hparams. n_mels) ;
read_safe ( loader, hparams. ftype) ;
assert ( hparams. n_text_state == hparams. n_audio_state) ;
std:: string mver = "" ;
if ( hparams. n_audio_layer == 4 ) {
model. type = e_model:: MODEL_TINY;
}
if ( hparams. n_audio_layer == 6 ) {
model. type = e_model:: MODEL_BASE;
}
if ( hparams. n_audio_layer == 12 ) {
model. type = e_model:: MODEL_SMALL;
}
if ( hparams. n_audio_layer == 24 ) {
model. type = e_model:: MODEL_MEDIUM;
}
if ( hparams. n_audio_layer == 32 ) {
model. type = e_model:: MODEL_LARGE;
if ( hparams. n_vocab == 51866 ) {
mver = " v3" ;
}
}
const int32_t qntvr = hparams. ftype / GGML_QNT_VERSION_FACTOR;
hparams. ftype %= GGML_QNT_VERSION_FACTOR;
wctx. wtype = ggml_ftype_to_ggml_type ( ( ggml_ftype) ( model. hparams. ftype) ) ;
if ( wctx. wtype == GGML_TYPE_COUNT) {
WHISPER_LOG_ERROR ( "%s: invalid model (bad ftype value %d)\n" , __func__ , model. hparams. ftype) ;
return false ;
}
WHISPER_LOG_INFO ( "%s: n_vocab = %d\n" , __func__ , hparams. n_vocab) ;
WHISPER_LOG_INFO ( "%s: n_audio_ctx = %d\n" , __func__ , hparams. n_audio_ctx) ;
WHISPER_LOG_INFO ( "%s: n_audio_state = %d\n" , __func__ , hparams. n_audio_state) ;
WHISPER_LOG_INFO ( "%s: n_audio_head = %d\n" , __func__ , hparams. n_audio_head) ;
WHISPER_LOG_INFO ( "%s: n_audio_layer = %d\n" , __func__ , hparams. n_audio_layer) ;
WHISPER_LOG_INFO ( "%s: n_text_ctx = %d\n" , __func__ , hparams. n_text_ctx) ;
WHISPER_LOG_INFO ( "%s: n_text_state = %d\n" , __func__ , hparams. n_text_state) ;
WHISPER_LOG_INFO ( "%s: n_text_head = %d\n" , __func__ , hparams. n_text_head) ;
WHISPER_LOG_INFO ( "%s: n_text_layer = %d\n" , __func__ , hparams. n_text_layer) ;
WHISPER_LOG_INFO ( "%s: n_mels = %d\n" , __func__ , hparams. n_mels) ;
WHISPER_LOG_INFO ( "%s: ftype = %d\n" , __func__ , model. hparams. ftype) ;
WHISPER_LOG_INFO ( "%s: qntvr = %d\n" , __func__ , qntvr) ;
WHISPER_LOG_INFO ( "%s: type = %d (%s%s)\n" , __func__ , model. type, g_model_name. at ( model. type) . c_str ( ) , mver. c_str ( ) ) ;
}
{
auto & filters = wctx. model. filters;
read_safe ( loader, filters. n_mel) ;
read_safe ( loader, filters. n_fft) ;
filters. data. resize ( filters. n_mel * filters. n_fft) ;
loader-> read ( loader-> context, filters. data. data ( ) , filters. data. size ( ) * sizeof ( float ) ) ;
BYTESWAP_FILTERS ( filters) ;
}
{