简化代码
- 代码中ggml_tensor比较重要的三个属性为:
- 原始数据 : void * data
- 维度向量 : ne[20] :
.ne = {3, 2, 1, 1}, 表示一个3列 2 行的矩阵 - 步长向量 : nb[20]:
.nb = {sizeof(float), sizeof(float) * 3,0, 0}, 因为数据被统一转换为了void指针,任何Tensor的数据都是一个void*的一维数组,只是长度不同。所以设置各维度字节步长,对于区分每个数的具体位置十分重要。 - 代码通过不断执行
ggml_vec_dot_f32,将从乘数得到的两个向量片段点积,然后通过维度和步长将结果放到结果的data数据正确的位置上。
#include <stdio.h>
#include <stdlib.h> // malloc
#include <cstdint> // int64_t
#include <cassert> // assert
#define restrict
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))
enum ggml_type { GGML_TYPE_F32 = 0, GGML_TYPE_F16 = 1, GGML_TYPE_COUNT,};
enum ggml_op { GGML_OP_NONE = 0, GGML_MAX_OP_PARAMS};
enum ggml_task_type {GGML_TASK_INIT ,GGML_TASK_COMP ,GGML_TASK_FINALIZE };
struct ggml_compute_params {
enum ggml_task_type type;
};
// n-dimensional tensor
struct ggml_tensor {
enum ggml_type type;
int64_t ne[20]; // int64_t ne[GGML_MAX_DIMS]; // number of elements
size_t nb[20]; // size_t nb[GGML_MAX_DIMS]; // stride in bytes
void * data;
};
inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) {
float sumf = 0.0;// ggml_float sumf = 0.0;
for (int i = 0; i < n; ++i) {// n是向量长度
sumf += x[i]*y[i];
}
printf("%.9f \n", sumf);
*s = sumf;
}
void ggml_compute_forward_mul_mat_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
// int64_t t0 = ggml_perf_time_us();
// UNUSED(t0);
// 分别获取源张量src0在四个维度上的元素数量(ne代表number of elements,即元素个数维度相关的属性),后续用于维度相关的判断和计算
const int ne00 = src0->ne[0];
const int ne01 = src0->ne[1];
const int ne02 = src0->ne[2];
const int ne03 = src0->ne[3];
// 同样,获取另一个源张量src1在四个维度上的元素数量,用于后续的处理和维度检查等操作
const int ne10 = src1->ne[0];
const int ne11 = src1->ne[1];
const int ne12 = src1->ne[2];
const int ne13 = src1->ne[3];
// 获取目标张量dst在四个维度上的元素数量,并计算出dst张量总的元素数量,方便后续整体的计算或者遍历等操作
const int ne0 = dst->ne[0];
const int ne1 = dst->ne[1];
const int ne2 = dst->ne[2];
const int ne3 = dst->ne[3];
const int ne = ne0*ne1*ne2*ne3;
// 获取源张量src0在四个维度上每个元素“步长”所占字节数(nb可能代表number of bytes,字节数维度相关属性)
const int nb00 = src0->nb[0];
const int nb01 = src0->nb[1];
const int nb02 = src0->nb[2];
const int nb03 = src0->nb[3];
// 获取源张量src1在四个维度上每个元素“步长”所占字节数
const int nb10 = src1->nb[0];
const int nb11 = src1->nb[1];
const int nb12 = src1->nb[2];
const int nb13 = src1->nb[3];
// 获取目标张量dst在四个维度上每个元素“步长”所占字节数
const int nb0 = dst->nb[0];
const int nb1 = dst->nb[1];
const int nb2 = dst->nb[2];
const int nb3 = dst->nb[3];
assert(ne02 == ne12);
assert(ne03 == ne13);
assert(ne2 == ne12);
assert(ne3 == ne13);
// TODO: we don't support permuted src0
assert(nb00 == sizeof(float) || nb01 == sizeof(float));
// dst cannot be transposed or permuted
assert(nb0 == sizeof(float));
assert(nb0 <= nb1);
assert(nb1 <= nb2);
assert(nb2 <= nb3);
assert(ne0 == ne01);
assert(ne1 == ne11);
assert(ne2 == ne02);
assert(ne3 == ne03);
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
#endif
if (nb01 >= nb00) {// 当src0张量是未转置情况(nb01 >= nb00)
assert(nb10 == sizeof(float));
// total rows in src0 src0(第一个乘数张量)总的行数(nr)
const int nr = ne01*ne02*ne03;
const int ir0 = 0;
const int ir1 = nr;
for (int ir = ir0; ir < ir1; ++ir) {
// src0 indices
const int i03 = ir/(ne02*ne01);// 第三维度(最后一个维度)上的索引
const int i02 = (ir - i03*ne02*ne01)/ne01; // 第二维度上的索引
const int i01 = (ir - i03*ne02*ne01 - i02*ne01); // 第一维度上的索引
for (int ic = 0; ic < ne11; ++ic) { // ne11第二个乘数张量的第一维度,一共需要计算ir1*ne11个向量乘法
// src1 indices
const int i13 = i03;
const int i12 = i02;
const int i11 = ic;
// dst indices
const int i0 = i01;
const int i1 = i11;
const int i2 = i02;
const int i3 = i03;
printf("%d,%d : ",i0,i1);
ggml_vec_dot_f32(ne00,
(float *) ((char *) dst->data + (i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3)),
(float *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)),
(float *) ((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13)));
}
}
} else {
// TODO: do not support transposed src1
}
}
/*
1.5, 1.0, 1.0, 1.0 1.0 5.5 4.5
0.0, 1.0, 1.0, * 2.0 1.0 = 4.0 3.0
0.0, 1.0, 1.0, 2.0 2.0 4.0 3.0
*/
int main() {
// Data points as mentioned in the comment
float x[] = {1.5, 1.0, 1.0,
0.0, 1.0,1.0,
0.0, 1.0,1.0, }; // 3*3矩阵
float y[] = {1.0, 2.0, 2.0,
1.0, 1.0,2.0, }; // 3*2矩阵的转置
float res[6]; // 3*2矩阵
// 设置src0张量的属性
ggml_tensor src0 = (struct ggml_tensor) {
.type = GGML_TYPE_F32,
.ne = {3, 3, 1, 1},
.nb = {sizeof(float), sizeof(float) * 3, 0, 0}, // 正确设置各维度字节步长
.data = (void *)x,
};
// 设置src1张量的属性
ggml_tensor src1 = (struct ggml_tensor) {
.type = GGML_TYPE_F32,
.ne = {3, 2, 1, 1}, // 3列 2 行
.nb = {sizeof(float), sizeof(float) * 3,0, 0}, // 设置各维度字节步长
.data = (void *)y,
};
// 设置result张量的属性,将结果存储位置指向res数组
ggml_tensor result = (struct ggml_tensor) {
.type = GGML_TYPE_F32,
.ne = {3, 2, 1, 1},
.nb = {sizeof(float), sizeof(float) * 3, sizeof(float) * 6, sizeof(float) * 6}, // 设置各维度字节步长
.data = (void *)res,
};
struct ggml_compute_params params = {
.type = GGML_TASK_COMP,
};
ggml_compute_forward_mul_mat_f32(¶ms, &src0, &src1, &result); // 调用函数进行矩阵乘法计算
// 输出结果验证
for (int i = 0; i < sizeof(res)/sizeof(float); ++i) {
printf("%.9f ", res[i]);
}
printf("\n");
return 0;
}
接近原版的单个可执行项目:
#include <stdio.h>
#include <stdlib.h> // malloc
#include <cstdint> // int64_t
#include <cassert> // assert
#include <string.h> // memset 内存块设置值
#define restrict
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))
enum ggml_type { GGML_TYPE_F32 = 0, GGML_TYPE_F16 = 1, GGML_TYPE_COUNT,};
enum ggml_op { GGML_OP_NONE = 0, GGML_MAX_OP_PARAMS};
enum ggml_task_type {GGML_TASK_INIT ,GGML_TASK_COMP,GGML_TASK_FINALIZE };
struct ggml_compute_params {
enum ggml_task_type type;
int ith, nth;
// work buffer for all threads
size_t wsize;
void * wdata;
};
// n-dimensional tensor
struct ggml_tensor {
enum ggml_type type;
// GGML_DEPRECATED(enum ggml_backend_type backend, "use the buffer type to find the storage location of the tensor");
struct ggml_backend_buffer * buffer;
int64_t ne[20]; // int64_t ne[GGML_MAX_DIMS]; // number of elements
size_t nb[20]; // size_t nb[GGML_MAX_DIMS]; // stride in bytes:
// nb[0] = ggml_type_size(type)
// nb[1] = nb[0] * (ne[0] / ggml_blck_size(type)) + padding
// nb[i] = nb[i-1] * ne[i-1]
// compute data
enum ggml_op op;
// op params - allocated as int32_t for alignment
int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
int32_t flags;
struct ggml_tensor * src[3]; // src[GGML_MAX_SRC];
// source tensor and offset for views
struct ggml_tensor * view_src;
size_t view_offs;
void * data;
char name[3]; // name[GGML_MAX_NAME];
void * extra; // extra things e.g. for ggml-cuda.cu
char padding[8];
};
inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; }
inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; }
inline static void ggml_vec_acc1_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] += v; }
inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; }
inline static void ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; }
inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; }
inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }
inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) {
float sumf = 0.0;// ggml_float sumf = 0.0;
#ifdef __ARM_NEON
#elif defined(__AVX2__)
#elif defined(__AVX__)
#elif defined(__wasm_simd128__)
#else
// scalar
for (int i = 0; i < n; ++i) {// n是向量长度
sumf += x[i]*y[i];
}
#endif
*s = sumf;
}
void ggml_compute_forward_mul_mat_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
// int64_t t0 = ggml_perf_time_us();
// UNUSED(t0);
// 分别获取源张量src0在四个维度上的元素数量(ne代表number of elements,即元素个数维度相关的属性),后续用于维度相关的判断和计算
const int ne00 = src0->ne[0];
const int ne01 = src0->ne[1];
const int ne02 = src0->ne[2];
const int ne03 = src0->ne[3];
// 同样,获取另一个源张量src1在四个维度上的元素数量,用于后续的处理和维度检查等操作
const int ne10 = src1->ne[0];
const int ne11 = src1->ne[1];
const int ne12 = src1->ne[2];
const int ne13 = src1->ne[3];
// 获取目标张量dst在四个维度上的元素数量,并计算出dst张量总的元素数量,方便后续整体的计算或者遍历等操作
const int ne0 = dst->ne[0];
const int ne1 = dst->ne[1];
const int ne2 = dst->ne[2];
const int ne3 = dst->ne[3];
const int ne = ne0*ne1*ne2*ne3;
// 获取源张量src0在四个维度上每个元素“步长”所占字节数(nb可能代表number of bytes,字节数维度相关属性)
const int nb00 = src0->nb[0];
const int nb01 = src0->nb[1];
const int nb02 = src0->nb[2];
const int nb03 = src0->nb[3];
// 获取源张量src1在四个维度上每个元素“步长”所占字节数
const int nb10 = src1->nb[0];
const int nb11 = src1->nb[1];
const int nb12 = src1->nb[2];
const int nb13 = src1->nb[3];
// 获取目标张量dst在四个维度上每个元素“步长”所占字节数
const int nb0 = dst->nb[0];
const int nb1 = dst->nb[1];
const int nb2 = dst->nb[2];
const int nb3 = dst->nb[3];
// 从传入的计算参数结构体中获取当前线程索引(ith)和总的线程数量(nth),可能用于多线程相关的计算划分等情况
const int ith = params->ith;
const int nth = params->nth;
assert(ne02 == ne12);
assert(ne03 == ne13);
assert(ne2 == ne12);
assert(ne3 == ne13);
// TODO: we don't support permuted src0
assert(nb00 == sizeof(float) || nb01 == sizeof(float));
// dst cannot be transposed or permuted
assert(nb0 == sizeof(float));
assert(nb0 <= nb1);
assert(nb1 <= nb2);
assert(nb2 <= nb3);
assert(ne0 == ne01);
assert(ne1 == ne11);
assert(ne2 == ne02);
assert(ne3 == ne03);
// nb01 >= nb00 - src0 is not transposed
// compute by src0 rows
//
// nb00 < nb01 - src0 is transposed
// compute by src0 columns
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
#endif
if (params->type == GGML_TASK_INIT) {
if (nb01 >= nb00) {
return; // 当传入的计算参数类型为GGML_TASK_INIT时,如果src0张量按前面规则判断为未转置(nb01 >= nb00),则直接返回不做其他操作。
}
// TODO: fix this memset (wsize is overestimated)
memset(params->wdata, 0, params->wsize);// 若src0是转置情况(nb01 < nb00),则使用memset将参数中的工作数据区域(wdata)清零(wsize可能被高估了,此内存设置操作可能存在优化空间)
return;
}
if (params->type == GGML_TASK_FINALIZE) {
if (nb01 >= nb00) {
return;// 当传入的计算参数类型为GGML_TASK_FINALIZE时,如果src0张量是未转置情况(nb01 >= nb00)直接返回。
}
// TODO: fix this memset (wsize is overestimated)
//assert(params->wsize == (ggml_nbytes(dst) + CACHE_LINE_SIZE)*nth);
float * const wdata = (float *)params->wdata; // params->wdata; 对于src0转置情况(nb01 < nb00),获取工作数据指针(wdata)
// cols per thread
const int dc = (ne + nth - 1)/nth; // 计算每个线程负责的列数(dc)以及本线程负责的列范围(ic0到ic1)
// col range for this thread
const int ic0 = dc*ith;
const int ic1 = MIN(ic0 + dc, ne);
ggml_vec_cpy_f32(ic1 - ic0, (float *) dst->data + ic0, wdata + ic0);// 将目标张量dst对应本线程负责的列范围的数据复制到工作数据区域(wdata)
for (int k = 1; k < nth; k++) {
// 循环调用ggml_vec_acc_f32函数,将其他线程对应的工作数据累加到本线程负责的目标张量区域,最后返回
ggml_vec_acc_f32(ic1 - ic0, (float *) dst->data + ic0, wdata + (ne + 16)*k + ic0); // CACHE_LINE_SIZE_F32 = 16
}
return;
}
if (nb01 >= nb00) {// 当src0张量是未转置情况(nb01 >= nb00)
// TODO: do not support transposed src1
assert(nb10 == sizeof(float));// 4 bytes 检查src1在某个维度上元素所占字节数符合单精度浮点数大小要求
// parallelize by src0 rows using ggml_vec_dot_f32
// total rows in src0 src0总的行数(nr)
const int nr = ne01*ne02*ne03;
// rows per thread 每个线程负责的行数(dr)
const int dr = (nr + nth - 1)/nth;
// row range for this thread 本线程负责的行范围(ir0到ir1)
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
for (int ir = ir0; ir < ir1; ++ir) {
// src0 indices
const int i03 = ir/(ne02*ne01);
const int i02 = (ir - i03*ne02*ne01)/ne01;
const int i01 = (ir - i03*ne02*ne01 - i02*ne01);
for (int ic = 0; ic < ne11; ++ic) {
// src1 indices
const int i13 = i03;
const int i12 = i02;
const int i11 = ic;
// dst indices
const int i0 = i01;
const int i1 = i11;
const int i2 = i02;
const int i3 = i03;
ggml_vec_dot_f32(ne00,
(float *) ((char *) dst->data + (i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3)),
(float *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)),
(float *) ((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13)));
}
}
} else {
}
//int64_t t1 = ggml_perf_time_us();
//static int64_t acc = 0;
//acc += t1 - t0;
//if (t1 - t0 > 10) {
// printf("\n");
// printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03);
// printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03);
// printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13);
// printf("nb10 = %5d, nb11 = %5d, nb12 = %5d, nb13 = %5d\n", nb10, nb11, nb12, nb13);
// printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc);
//}
}
// struct ggml_init_params params = {
// .mem_size = 16*1024*1024,
// .mem_buffer = NULL,
// };
//
// // memory allocation happens here
// struct ggml_context * ctx = ggml_init(params);
//
// struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
//
// ggml_set_param(ctx, x); // x is an input variable
// struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
// struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
// struct ggml_tensor * x2 = ggml_mul(ctx, x, x);
int main() {
// Data points as mentioned in the comment
float x[] = {1, 0, 0, 1}; // 2*2矩阵
float y[] = {1, 2, 1, 1}; // 2*2矩阵
float res[4]; // 2*2矩阵
// 设置src0张量的属性
ggml_tensor src0 = (struct ggml_tensor) {
.type = GGML_TYPE_F32,
.ne = {2, 2, 1, 1},
.nb = {sizeof(float), sizeof(float) * 2, 0, 0}, // 正确设置各维度字节步长
.data = (void *)x,
};
// 设置src1张量的属性
ggml_tensor src1 = (struct ggml_tensor) {
.type = GGML_TYPE_F32,
.ne = {2, 2, 1, 1},
.nb = {sizeof(float), sizeof(float) * 2, 0, 0}, // 正确设置各维度字节步长
.data = (void *)y,
};
// 设置result张量的属性,将结果存储位置指向res数组
ggml_tensor result = (struct ggml_tensor) {
.type = GGML_TYPE_F32,
.ne = {2, 2, 1, 1},
.nb = {sizeof(float), sizeof(float) * 2, sizeof(float) * 2, sizeof(float) * 2}, // 正确设置各维度字节步长
.data = (void *)res,
};
// 合理初始化计算参数结构体,根据涉及的数据量等情况设置工作缓冲区大小等参数
size_t work_buffer_size = sizeof(float) * 4; // 根据实际运算情况预估一个合适的工作缓冲区大小,这里简单按结果矩阵元素个数 * 单元素字节数设置
void *work_buffer = malloc(work_buffer_size);
if (!work_buffer) {
perror("malloc failed");
return -1;
}
struct ggml_compute_params params = {
.type = GGML_TASK_COMP,
.ith = 0,
.nth = 1,
.wsize = work_buffer_size,
.wdata = work_buffer
};
ggml_compute_forward_mul_mat_f32(¶ms, &src0, &src1, &result); // 调用函数进行矩阵乘法计算
// // 切换计算参数类型为GGML_TASK_FINALIZE并再次调用函数完成最终的结果整合等操作
// params.type = GGML_TASK_FINALIZE;
// ggml_compute_forward_mul_mat_f32(¶ms, &src0, &src1, &result);
// 输出结果验证
for (int i = 0; i < 4; ++i) {
printf("%.0f ", res[i]);
}
printf("\n");
free(work_buffer); // 释放工作缓冲区内存
return 0;
}