webassembly003 ggml GGML Tensor Library part-4 矩阵乘法的实现

简化代码

代码中ggml_tensor比较重要的三个属性为：
原始数据 ： void * data
维度向量 ： ne[20] ：.ne = {3, 2, 1, 1}, 表示一个3列 2 行的矩阵
步长向量 ： nb[20]：
.nb = {sizeof(float), sizeof(float) * 3,0, 0}, 因为数据被统一转换为了void指针，任何Tensor的数据都是一个void*的一维数组，只是长度不同。所以设置各维度字节步长，对于区分每个数的具体位置十分重要。
代码通过不断执行ggml_vec_dot_f32，将从乘数得到的两个向量片段点积，然后通过维度和步长将结果放到结果的data数据正确的位置上。

#include <stdio.h>
#include <stdlib.h> // malloc
#include <cstdint> // int64_t
#include <cassert> // assert

#define restrict
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))

enum ggml_type { GGML_TYPE_F32     = 0, GGML_TYPE_F16     = 1,  GGML_TYPE_COUNT,};
enum ggml_op   { GGML_OP_NONE      = 0, GGML_MAX_OP_PARAMS};
enum ggml_task_type {GGML_TASK_INIT ,GGML_TASK_COMP ,GGML_TASK_FINALIZE };

struct ggml_compute_params {
    enum ggml_task_type type;
};


// n-dimensional tensor
struct ggml_tensor {
    enum ggml_type type;
    int64_t ne[20]; // int64_t ne[GGML_MAX_DIMS]; // number of elements
    size_t  nb[20]; // size_t  nb[GGML_MAX_DIMS]; // stride in bytes
    void * data;
};


inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) {
    float sumf = 0.0;// ggml_float sumf = 0.0;
    for (int i = 0; i < n; ++i) {// n是向量长度
        sumf += x[i]*y[i];
    }
    printf("%.9f \n", sumf);
    *s = sumf;
}



void ggml_compute_forward_mul_mat_f32(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
        const struct ggml_tensor * src1,
              struct ggml_tensor * dst) {
    // int64_t t0 = ggml_perf_time_us();
    // UNUSED(t0);

    // 分别获取源张量src0在四个维度上的元素数量（ne代表number of elements，即元素个数维度相关的属性），后续用于维度相关的判断和计算
    const int ne00 = src0->ne[0];
    const int ne01 = src0->ne[1];
    const int ne02 = src0->ne[2];
    const int ne03 = src0->ne[3];
    // 同样，获取另一个源张量src1在四个维度上的元素数量，用于后续的处理和维度检查等操作
    const int ne10 = src1->ne[0];
    const int ne11 = src1->ne[1];
    const int ne12 = src1->ne[2];
    const int ne13 = src1->ne[3];
    // 获取目标张量dst在四个维度上的元素数量，并计算出dst张量总的元素数量，方便后续整体的计算或者遍历等操作
    const int ne0  = dst->ne[0];
    const int ne1  = dst->ne[1];
    const int ne2  = dst->ne[2];
    const int ne3  = dst->ne[3];
    const int ne   = ne0*ne1*ne2*ne3;
    // 获取源张量src0在四个维度上每个元素“步长”所占字节数（nb可能代表number of bytes，字节数维度相关属性）
    const int nb00 = src0->nb[0];
    const int nb01 = src0->nb[1];
    const int nb02 = src0->nb[2];
    const int nb03 = src0->nb[3];
    // 获取源张量src1在四个维度上每个元素“步长”所占字节数
    const int nb10 = src1->nb[0];
    const int nb11 = src1->nb[1];
    const int nb12 = src1->nb[2];
    const int nb13 = src1->nb[3];
    // 获取目标张量dst在四个维度上每个元素“步长”所占字节数
    const int nb0  = dst->nb[0];
    const int nb1  = dst->nb[1];
    const int nb2  = dst->nb[2];
    const int nb3  = dst->nb[3];

    assert(ne02 == ne12);
    assert(ne03 == ne13);
    assert(ne2  == ne12);
    assert(ne3  == ne13);

    // TODO: we don't support permuted src0
    assert(nb00 == sizeof(float) || nb01 == sizeof(float));

    // dst cannot be transposed or permuted
    assert(nb0 == sizeof(float));
    assert(nb0 <= nb1);
    assert(nb1 <= nb2);
    assert(nb2 <= nb3);

    assert(ne0 == ne01);
    assert(ne1 == ne11);
    assert(ne2 == ne02);
    assert(ne3 == ne03);


    #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
    #endif

    if (nb01 >= nb00) {// 当src0张量是未转置情况（nb01 >= nb00） 
    
        assert(nb10 == sizeof(float));
        // total rows in src0  src0(第一个乘数张量)总的行数（nr）
        const int nr = ne01*ne02*ne03;

        const int ir0 = 0;
        const int ir1 = nr;

        for (int ir = ir0; ir < ir1; ++ir) {
            // src0 indices
            const int i03 = ir/(ne02*ne01);// 第三维度（最后一个维度）上的索引
            const int i02 = (ir - i03*ne02*ne01)/ne01; // 第二维度上的索引
            const int i01 = (ir - i03*ne02*ne01 - i02*ne01); // 第一维度上的索引

            for (int ic = 0; ic < ne11; ++ic) { // ne11第二个乘数张量的第一维度，一共需要计算ir1*ne11个向量乘法
                // src1 indices
                const int i13 = i03;
                const int i12 = i02;
                const int i11 = ic;

                // dst indices
                const int i0 = i01;
                const int i1 = i11;
                const int i2 = i02;
                const int i3 = i03;
                
                printf("%d,%d : ",i0,i1);
                ggml_vec_dot_f32(ne00,
                        (float *) ((char *)  dst->data + (i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 
                        (float *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)),
                        (float *) ((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13)));
            }
        }
    } else {
        // TODO: do not support transposed src1
    }

}

/*                 
1.5, 1.0, 1.0,         1.0  1.0     5.5  4.5
0.0, 1.0, 1.0,   *     2.0  1.0  =  4.0  3.0
0.0, 1.0, 1.0,         2.0  2.0     4.0  3.0
                 
*/

int main() {
    // Data points as mentioned in the comment
    float x[] = {1.5, 1.0, 1.0, 
                 0.0, 1.0,1.0, 
                 0.0, 1.0,1.0, }; // 3*3矩阵
                 
    float y[] = {1.0, 2.0, 2.0, 
                 1.0, 1.0,2.0, }; // 3*2矩阵的转置
    float res[6]; // 3*2矩阵

    // 设置src0张量的属性
    ggml_tensor src0 = (struct ggml_tensor) {
      .type = GGML_TYPE_F32,
      .ne = {3, 3, 1, 1},
      .nb = {sizeof(float), sizeof(float) * 3, 0, 0},  // 正确设置各维度字节步长
      .data = (void *)x,
    };

    // 设置src1张量的属性
    ggml_tensor src1 = (struct ggml_tensor) {
      .type = GGML_TYPE_F32,
      .ne = {3, 2, 1, 1}, // 3列 2 行
      .nb = {sizeof(float), sizeof(float) * 3,0, 0},  // 设置各维度字节步长
      .data = (void *)y,
    };

    // 设置result张量的属性，将结果存储位置指向res数组
    ggml_tensor result = (struct ggml_tensor) {
      .type = GGML_TYPE_F32,
      .ne = {3, 2, 1, 1},
      .nb = {sizeof(float), sizeof(float) * 3, sizeof(float) * 6, sizeof(float) * 6},  // 设置各维度字节步长
      .data = (void *)res,
    };

    struct ggml_compute_params params = {
      .type = GGML_TASK_COMP,
    };
    ggml_compute_forward_mul_mat_f32(&params, &src0, &src1, &result);  // 调用函数进行矩阵乘法计算

    
    // 输出结果验证
    for (int i = 0; i < sizeof(res)/sizeof(float); ++i) {
        printf("%.9f ", res[i]);
    }
    printf("\n");

    return 0;
}

接近原版的单个可执行项目：

这里简化了多线程，实际上只有一个线程，也同时简化了ggml_compute_params ，没有使用 work buffer。
关于线程的部分实现细节，可以查看webassembly003 ggml GGML Tensor Library part-3

#include <stdio.h>
#include <stdlib.h> // malloc
#include <cstdint> // int64_t
#include <cassert> // assert
#include <string.h> // memset 内存块设置值

#define restrict
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))

enum ggml_type { GGML_TYPE_F32     = 0, GGML_TYPE_F16     = 1,  GGML_TYPE_COUNT,};
enum ggml_op   { GGML_OP_NONE      = 0, GGML_MAX_OP_PARAMS};
enum ggml_task_type {GGML_TASK_INIT ,GGML_TASK_COMP,GGML_TASK_FINALIZE };

struct ggml_compute_params {
    enum ggml_task_type type;
    int ith, nth;
    // work buffer for all threads
    size_t wsize;
    void * wdata;
};


// n-dimensional tensor
struct ggml_tensor {
    enum ggml_type type;
    // GGML_DEPRECATED(enum ggml_backend_type backend, "use the buffer type to find the storage location of the tensor");

    struct ggml_backend_buffer * buffer;

    int64_t ne[20]; // int64_t ne[GGML_MAX_DIMS]; // number of elements
    size_t  nb[20]; // size_t  nb[GGML_MAX_DIMS]; // stride in bytes:
                                // nb[0] = ggml_type_size(type)
                                // nb[1] = nb[0]   * (ne[0] / ggml_blck_size(type)) + padding
                                // nb[i] = nb[i-1] * ne[i-1]

    // compute data
    enum ggml_op op;

    // op params - allocated as int32_t for alignment
    int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];

    int32_t flags;

    struct ggml_tensor * src[3]; // src[GGML_MAX_SRC];

    // source tensor and offset for views
    struct ggml_tensor * view_src;
    size_t               view_offs;

    void * data;

    char name[3]; // name[GGML_MAX_NAME];

    void * extra; // extra things e.g. for ggml-cuda.cu

    char padding[8];
};



inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i] + y[i]; }
inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x)                  { for (int i = 0; i < n; ++i) y[i] += x[i];        }
inline static void ggml_vec_acc1_f32(const int n, float * y, const float   v)                  { for (int i = 0; i < n; ++i) y[i] += v;           }
inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i] - y[i]; }
inline static void ggml_vec_set_f32 (const int n, float * x, const float   v)                  { for (int i = 0; i < n; ++i) x[i]  = v;           }
inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x)                  { for (int i = 0; i < n; ++i) y[i]  = x[i];        }
inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x)                  { for (int i = 0; i < n; ++i) y[i]  = -x[i];       }
inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i]*y[i];   }
inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i]/y[i];   }

inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }

inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) {
    float sumf = 0.0;// ggml_float sumf = 0.0;
    #ifdef __ARM_NEON
    #elif defined(__AVX2__)
    #elif defined(__AVX__)
    #elif defined(__wasm_simd128__)
    #else
    // scalar
    for (int i = 0; i < n; ++i) {// n是向量长度
        sumf += x[i]*y[i];
    }
    #endif

    *s = sumf;
}



void ggml_compute_forward_mul_mat_f32(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
        const struct ggml_tensor * src1,
              struct ggml_tensor * dst) {
    // int64_t t0 = ggml_perf_time_us();
    // UNUSED(t0);

    // 分别获取源张量src0在四个维度上的元素数量（ne代表number of elements，即元素个数维度相关的属性），后续用于维度相关的判断和计算
    const int ne00 = src0->ne[0];
    const int ne01 = src0->ne[1];
    const int ne02 = src0->ne[2];
    const int ne03 = src0->ne[3];
    // 同样，获取另一个源张量src1在四个维度上的元素数量，用于后续的处理和维度检查等操作
    const int ne10 = src1->ne[0];
    const int ne11 = src1->ne[1];
    const int ne12 = src1->ne[2];
    const int ne13 = src1->ne[3];
    // 获取目标张量dst在四个维度上的元素数量，并计算出dst张量总的元素数量，方便后续整体的计算或者遍历等操作
    const int ne0  = dst->ne[0];
    const int ne1  = dst->ne[1];
    const int ne2  = dst->ne[2];
    const int ne3  = dst->ne[3];
    const int ne   = ne0*ne1*ne2*ne3;
    // 获取源张量src0在四个维度上每个元素“步长”所占字节数（nb可能代表number of bytes，字节数维度相关属性）
    const int nb00 = src0->nb[0];
    const int nb01 = src0->nb[1];
    const int nb02 = src0->nb[2];
    const int nb03 = src0->nb[3];
    // 获取源张量src1在四个维度上每个元素“步长”所占字节数
    const int nb10 = src1->nb[0];
    const int nb11 = src1->nb[1];
    const int nb12 = src1->nb[2];
    const int nb13 = src1->nb[3];
    // 获取目标张量dst在四个维度上每个元素“步长”所占字节数
    const int nb0  = dst->nb[0];
    const int nb1  = dst->nb[1];
    const int nb2  = dst->nb[2];
    const int nb3  = dst->nb[3];
    // 从传入的计算参数结构体中获取当前线程索引（ith）和总的线程数量（nth），可能用于多线程相关的计算划分等情况
    const int ith = params->ith;
    const int nth = params->nth;

    assert(ne02 == ne12);
    assert(ne03 == ne13);
    assert(ne2  == ne12);
    assert(ne3  == ne13);

    // TODO: we don't support permuted src0
    assert(nb00 == sizeof(float) || nb01 == sizeof(float));

    // dst cannot be transposed or permuted
    assert(nb0 == sizeof(float));
    assert(nb0 <= nb1);
    assert(nb1 <= nb2);
    assert(nb2 <= nb3);

    assert(ne0 == ne01);
    assert(ne1 == ne11);
    assert(ne2 == ne02);
    assert(ne3 == ne03);

    // nb01 >= nb00 - src0 is not transposed
    //   compute by src0 rows
    //
    // nb00 <  nb01 - src0 is transposed
    //   compute by src0 columns

    #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
    #endif
   
    
    if (params->type == GGML_TASK_INIT) {
        if (nb01 >= nb00) {
            return; // 当传入的计算参数类型为GGML_TASK_INIT时，如果src0张量按前面规则判断为未转置（nb01 >= nb00），则直接返回不做其他操作。
        }

        // TODO: fix this memset (wsize is overestimated)
        memset(params->wdata, 0, params->wsize);// 若src0是转置情况（nb01 < nb00），则使用memset将参数中的工作数据区域（wdata）清零（wsize可能被高估了，此内存设置操作可能存在优化空间）
        return;
    }

    if (params->type == GGML_TASK_FINALIZE) {
        if (nb01 >= nb00) {
            return;// 当传入的计算参数类型为GGML_TASK_FINALIZE时，如果src0张量是未转置情况（nb01 >= nb00）直接返回。
        }

        // TODO: fix this memset (wsize is overestimated)
        //assert(params->wsize == (ggml_nbytes(dst) + CACHE_LINE_SIZE)*nth);

        float * const wdata = (float *)params->wdata; // params->wdata; 对于src0转置情况（nb01 < nb00），获取工作数据指针（wdata）

        // cols per thread
        const int dc = (ne + nth - 1)/nth; // 计算每个线程负责的列数（dc）以及本线程负责的列范围（ic0到ic1）

        // col range for this thread
        const int ic0 = dc*ith;
        const int ic1 = MIN(ic0 + dc, ne);

        ggml_vec_cpy_f32(ic1 - ic0, (float *) dst->data + ic0, wdata + ic0);// 将目标张量dst对应本线程负责的列范围的数据复制到工作数据区域（wdata）

        for (int k = 1; k < nth; k++) {
            // 循环调用ggml_vec_acc_f32函数，将其他线程对应的工作数据累加到本线程负责的目标张量区域，最后返回
            ggml_vec_acc_f32(ic1 - ic0, (float *) dst->data + ic0, wdata + (ne + 16)*k + ic0); // CACHE_LINE_SIZE_F32 = 16
        }

        return;
    }

    if (nb01 >= nb00) {// 当src0张量是未转置情况（nb01 >= nb00） 
        // TODO: do not support transposed src1
        assert(nb10 == sizeof(float));// 4 bytes 检查src1在某个维度上元素所占字节数符合单精度浮点数大小要求

        // parallelize by src0 rows using ggml_vec_dot_f32

        // total rows in src0  src0总的行数（nr）
        const int nr = ne01*ne02*ne03;

        // rows per thread 每个线程负责的行数（dr）
        const int dr = (nr + nth - 1)/nth;

        // row range for this thread  本线程负责的行范围（ir0到ir1）
        const int ir0 = dr*ith;
        const int ir1 = MIN(ir0 + dr, nr);

        for (int ir = ir0; ir < ir1; ++ir) {
            // src0 indices
            const int i03 = ir/(ne02*ne01);
            const int i02 = (ir - i03*ne02*ne01)/ne01;
            const int i01 = (ir - i03*ne02*ne01 - i02*ne01);

            for (int ic = 0; ic < ne11; ++ic) {
                // src1 indices
                const int i13 = i03;
                const int i12 = i02;
                const int i11 = ic;

                // dst indices
                const int i0 = i01;
                const int i1 = i11;
                const int i2 = i02;
                const int i3 = i03;

                ggml_vec_dot_f32(ne00,
                        (float *) ((char *)  dst->data + (i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3)),
                        (float *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)),
                        (float *) ((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13)));
            }
        }
    } else {

    }

    //int64_t t1 = ggml_perf_time_us();
    //static int64_t acc = 0;
    //acc += t1 - t0;
    //if (t1 - t0 > 10) {
    //    printf("\n");
    //    printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03);
    //    printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03);
    //    printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13);
    //    printf("nb10 = %5d, nb11 = %5d, nb12 = %5d, nb13 = %5d\n", nb10, nb11, nb12, nb13);

    //    printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc);
    //}
}

//       struct ggml_init_params params = {
//           .mem_size   = 16*1024*1024,
//           .mem_buffer = NULL,
//       };
//
//       // memory allocation happens here
//       struct ggml_context * ctx = ggml_init(params);
//
//       struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
//
//       ggml_set_param(ctx, x); // x is an input variable

//       struct ggml_tensor * a  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
//       struct ggml_tensor * b  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
//       struct ggml_tensor * x2 = ggml_mul(ctx, x, x);

int main() {
    // Data points as mentioned in the comment
    float x[] = {1, 0, 0, 1}; // 2*2矩阵
    float y[] = {1, 2, 1, 1}; // 2*2矩阵
    float res[4]; // 2*2矩阵

   // 设置src0张量的属性
    ggml_tensor src0 = (struct ggml_tensor) {
      .type = GGML_TYPE_F32,
      .ne = {2, 2, 1, 1},
      .nb = {sizeof(float), sizeof(float) * 2, 0, 0},  // 正确设置各维度字节步长
      .data = (void *)x,
    };

    // 设置src1张量的属性
    ggml_tensor src1 = (struct ggml_tensor) {
      .type = GGML_TYPE_F32,
      .ne = {2, 2, 1, 1},
      .nb = {sizeof(float), sizeof(float) * 2, 0, 0},  // 正确设置各维度字节步长
      .data = (void *)y,
    };

    // 设置result张量的属性，将结果存储位置指向res数组
    ggml_tensor result = (struct ggml_tensor) {
      .type = GGML_TYPE_F32,
      .ne = {2, 2, 1, 1},
      .nb = {sizeof(float), sizeof(float) * 2, sizeof(float) * 2, sizeof(float) * 2},  // 正确设置各维度字节步长
      .data = (void *)res,
    };

    // 合理初始化计算参数结构体，根据涉及的数据量等情况设置工作缓冲区大小等参数
    size_t work_buffer_size = sizeof(float) * 4;  // 根据实际运算情况预估一个合适的工作缓冲区大小，这里简单按结果矩阵元素个数 * 单元素字节数设置
    void *work_buffer = malloc(work_buffer_size);
    if (!work_buffer) {
        perror("malloc failed");
        return -1;
    }

    struct ggml_compute_params params = {
      .type = GGML_TASK_COMP,
      .ith = 0,
      .nth = 1,
      .wsize = work_buffer_size,
      .wdata = work_buffer
    };
    ggml_compute_forward_mul_mat_f32(&params, &src0, &src1, &result);  // 调用函数进行矩阵乘法计算

    // // 切换计算参数类型为GGML_TASK_FINALIZE并再次调用函数完成最终的结果整合等操作
    // params.type = GGML_TASK_FINALIZE;
    // ggml_compute_forward_mul_mat_f32(&params, &src0, &src1, &result);

    // 输出结果验证
    for (int i = 0; i < 4; ++i) {
        printf("%.0f ", res[i]);
    }
    printf("\n");

    free(work_buffer);  // 释放工作缓冲区内存

    return 0;
}