Example Introduction
This example performs single-precision matrix multiplication (SGEMM) on the OpenCL device and compares the result against the same computation done on the host with the CBLAS library. Only the code related to OpenCL and TI devices is discussed; the remaining algorithmic logic is not examined in depth.
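For reference, the operation being verified is the standard BLAS SGEMM, C = alpha*A*B + beta*C, with column-major storage by default. Below is a minimal, unoptimized sketch of that computation (illustrative only; the host actually calls cblas_sgemm, and the device runs the tiled TI kernel listed later):

// Reference only: naive column-major SGEMM,
// C[MxN] = alpha * A[MxK] * B[KxN] + beta * C[MxN]
static void naive_sgemm(int M, int N, int K, float alpha,
                        const float *A, int lda,
                        const float *B, int ldb,
                        float beta, float *C, int ldc)
{
    for (int j = 0; j < N; ++j)            // columns of C
        for (int i = 0; i < M; ++i)        // rows of C
        {
            float acc = 0.0f;
            for (int p = 0; p < K; ++p)    // dot product A(i,:) . B(:,j)
                acc += A[p*lda + i] * B[j*ldb + p];
            C[j*ldc + i] = alpha * acc + beta * C[j*ldc + i];
        }
}

Assuming the host binary is named sgemm (as its usage text below suggests), a typical run would be sgemm -M 1024 -K 1024 -N 1024 -r, which multiplies random matrices on the DSPs and checks the result against the host CBLAS computation.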
Example Source Code
Host-Side Source Code
//main.cpp
#include <iostream>
#include <cstdlib>
#include <iomanip>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <math.h>
#include <signal.h>
#include <ocl_util.h>
#include "kernel.dsp_h"
#ifdef _TI_RTOS
#include "../rtos_main.c"
#if ti_sysbios_BIOS_version <= (0x65200)
#include <ti/sysbios/posix/time.h>
#else
#include <ti/posix/gcc/time.h>
#endif
#else
#include <time.h>
#endif
extern "C" {
#include "cblas.h"
}
using namespace std;
using namespace cl;
static double clock_diff (struct timespec *t1, struct timespec *t2)
{ return t2->tv_sec - t1->tv_sec + (t2->tv_nsec - t1->tv_nsec) / 1e9; }
void PrintMatrix(float *mat, int rows, int cols, enum CBLAS_ORDER mem_order);
void MatmulHost_ATLAS(enum CBLAS_ORDER mem_order,
const float* A, const float *B, float *C, int M, int N, int K,
float alpha, float beta);
void MatmulHost_loopnest(enum CBLAS_ORDER mem_order,
const float* A, const float *B, float *C, int M, int N, int K,
float alpha, float beta);
int CheckForErrors(const float *Mat, const float *Golden, int M, int N, int K,
enum CBLAS_ORDER mem_order);
/* ======================================================================== */
/* Global Variables */
/* ======================================================================== */
float alpha = 1.0f;
float beta = 0.0f;
enum CBLAS_ORDER order = CblasColMajor;
#ifndef _TI_RTOS
bool check = true;
#else
bool check = false;
#endif
bool random_in = false;
bool calc_check = false;
int M = 0;
int N = 0;
int K = 0;
int L2_BUF_SIZE = 0;
int MSMC_BUF_SIZE = 0;
int NUMAPANELS = 0;
int NUMBPANELS = 0;
int NUMCOMPUNITS = 0;
/* ======================================================================== */
/* Function Headers */
/* ======================================================================== */
void PrintUsageAndExit();
void HandleOptions(int argc, char* argv[]);
void SetSgemmParams(Device& device);
/* ======================================================================== */
/* MAIN */
/* ======================================================================== */
#ifdef _TI_RTOS
void ocl_main(UArg arg0, UArg arg1)
{
int argc = (int) arg0;
char **argv = (char **) arg1;
#else
#define RETURN(x) return x
int main(int argc, char *argv[])
{
#endif
/*-------------------------------------------------------------------------
* Catch termination signals to ensure dtors are called and the DSP is reset properly
*------------------------------------------------------------------------*/
signal(SIGABRT, exit);
signal(SIGTERM, exit);
int errs = 0;
/* ---------------------------------------------------------------- */
/* Handle command line options to get M,N,K */
/* ---------------------------------------------------------------- */
HandleOptions(argc,argv);
try
{
Context context(CL_DEVICE_TYPE_ACCELERATOR);
std::vector<Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
CommandQueue Q(context, devices[0]);
/*---------------------------------------------------------------------
* Determine platform, set sgemm blocking/tiling parameters
*--------------------------------------------------------------------*/
SetSgemmParams(devices[0]);
if (NUMCOMPUNITS == 0) RETURN(-1);
int VALRANGE = 17;
if (random_in)
{
srand(time(NULL));
alpha = (float) (rand() % VALRANGE + 1);
beta = (float) (rand() % VALRANGE + 1);
}
printf("C[%d,%d] = alpha * A[%d,%d] * B[%d,%d] + beta * C[%d,%d], %s\n",
M,N,M,K,K,N, M,N,
(order == CblasRowMajor ? "use row-major storage"
: "use col-major storage"));
printf("alpha=%f, beta=%f\n\n", alpha, beta);
double dsp_elapsed = 0;
double total_GFLOP = 2.0*M*N*K*1.0e-9;
/*---------------------------------------------------------------------
* Build kernel from pre-compiled and embedded binary
*--------------------------------------------------------------------*/
Program::Binaries binary(1, make_pair(kernel_dsp_bin,
sizeof(kernel_dsp_bin)));
Program program = Program(context, devices, binary);
program.build(devices);
/* ---------------------------------------------------------------- */
/* Allocate Buffers: */
/* ---------------------------------------------------------------- */
Buffer bufA (context, CL_MEM_READ_ONLY, M*K*sizeof(float));
Buffer bufB (context, CL_MEM_READ_ONLY, K*N*sizeof(float));
Buffer bufC (context, CL_MEM_READ_WRITE, M*N*sizeof(float));
Buffer *bufMsmc = NULL;
if (MSMC_BUF_SIZE != 0)
bufMsmc = new Buffer(context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI,
MSMC_BUF_SIZE);
else
bufMsmc = new Buffer(context, CL_MEM_READ_WRITE, 4); // dummy one
/* ---------------------------------------------------------------- */
/* Initialize input arrays with random test data. */
/* ---------------------------------------------------------------- */
float *A = (float*) Q.enqueueMapBuffer(bufA, CL_TRUE, CL_MAP_WRITE, 0,
M*K*sizeof(float));
float *B = (float*) Q.enqueueMapBuffer(bufB, CL_TRUE, CL_MAP_WRITE, 0,
K*N*sizeof(float));
float *C = (float*) Q.enqueueMapBuffer(bufC, CL_TRUE, CL_MAP_WRITE, 0,
M*N*sizeof(float));
float *gold = nullptr;
cout << "Generating Input Data ..." << flush;
for (int i = 0; i < M*K; ++i)
A[i] = random_in ? (float)(rand() % VALRANGE + 1) : 1 + (i & 7);
for (int i = 0; i < K*N; ++i)
B[i] = random_in ? (float)(rand() % VALRANGE + 1) : 1 + (i & 11);
for (int i = 0; i < M*N; ++i)
C[i] = random_in ? (float)(rand() % VALRANGE + 1) : 1 + (i & 5);
cout << "Complete" << endl;
if (check)
{
#ifndef _TI_RTOS
if ((gold = (float*) malloc(M*N*sizeof(float))) == NULL)
#else
if ((gold = (float*) __malloc_ddr(M*N*sizeof(float))) == NULL)
#endif
{
printf("Unable to allocate memory to verify results\n");
exit(-1);
}
memcpy(gold, C, M*N*sizeof(float));
}
PrintMatrix(A,M,K,order);
PrintMatrix(B,K,N,order);
if (random_in) PrintMatrix(C,M,N,order);
Q.enqueueUnmapMemObject(bufA, A);
Q.enqueueUnmapMemObject(bufB, B);
Q.enqueueUnmapMemObject(bufC, C);
Q.finish();
/*----------------------------------------------------------------------
* Device: Do A*B = C
*---------------------------------------------------------------------*/
Kernel kernel (program, "K_ocl_sgemm_dsp");
KernelFunctor matmul = kernel.bind(Q, NDRange(NUMCOMPUNITS), NDRange(1));
struct timespec t0,t1;
clock_gettime(CLOCK_MONOTONIC, &t0);
/*----------------------------------------------------------------------
* Convert RowMajor computation to ColumnMajor computation
* Fact: Mem_Layout(A_RowMajor) === Mem_Layout(Transpose(A)_ColMajor)
* Therefore: C_RowMajor = A_RowMajor * B_RowMajor
* C[mxn] = A[mxk] * B[kxn]
* can be computed as:
* Transpose(C)_ColMajor = Transpose(B)_ColMajor * Transpose(A)_ColMajor
* C'[nxm] = B'[nxk] * A'[kxm],
* where ptrC' === ptrC, ptrA' === ptrA, ptrB' === ptrB
* So, all we need to do is to: swap(m, n), swap(a, b)
* ld_Transpose(a)_col = lda_row = k,
* ld_Transpose(b)_col = ldb_row = n,
* ld_Transpose(c)_col = ldc_row = n,
*---------------------------------------------------------------------*/
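/*----------------------------------------------------------------------
 * For example, the row-major 2x3 matrix [1 2 3; 4 5 6] is stored in
 * memory as {1,2,3,4,5,6}, which is exactly the col-major storage of
 * its 3x2 transpose [1 4; 2 5; 3 6]. The buffers can therefore be
 * passed through unchanged; only m/n and A/B are swapped below.
 *---------------------------------------------------------------------*/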
if (order == CblasRowMajor)
matmul(N, M, K, alpha, bufB, N, bufA, K, beta, bufC, N,
NUMAPANELS, NUMBPANELS,
__local(L2_BUF_SIZE), *bufMsmc).wait();
else
matmul(M, N, K, alpha, bufA, M, bufB, K, beta, bufC, M,
NUMAPANELS, NUMBPANELS,
__local(L2_BUF_SIZE), *bufMsmc).wait();
clock_gettime(CLOCK_MONOTONIC, &t1);
dsp_elapsed = clock_diff (&t0, &t1);
double gflops = total_GFLOP/dsp_elapsed;
printf("%4d DSPs: %.3f Gflops (%.6f s) \n",
NUMCOMPUNITS, gflops, dsp_elapsed);
if (bufMsmc != NULL) delete bufMsmc;
/*----------------------------------------------------------------------
* If checking results against a host matmul.
* This can be time consuming for large sizes.
*---------------------------------------------------------------------*/
C = (float*) Q.enqueueMapBuffer(bufC, CL_TRUE, CL_MAP_READ, 0,
M*N*sizeof(float));
if (check)
{
A = (float*) Q.enqueueMapBuffer(bufA, CL_TRUE, CL_MAP_READ, 0,
M*K*sizeof(float));
B = (float*) Q.enqueueMapBuffer(bufB, CL_TRUE, CL_MAP_READ, 0,
K*N*sizeof(float));
clock_gettime(CLOCK_MONOTONIC, &t0);
MatmulHost_ATLAS(order, A, B, gold, M, N, K, alpha, beta);
clock_gettime(CLOCK_MONOTONIC, &t1);
dsp_elapsed = clock_diff (&t0, &t1);
double gflops = total_GFLOP/dsp_elapsed;
printf(" 1 CPU : %.3f Gflops (%.6f s) with ATLAS library\n",
gflops, dsp_elapsed);
Q.enqueueUnmapMemObject(bufA, A);
Q.enqueueUnmapMemObject(bufB, B);
PrintMatrix(gold,M,N,order);
errs = CheckForErrors(C, gold, M, N, K, order);
#ifndef _TI_RTOS
free(gold);
#else
__free_ddr(gold);
#endif
}
PrintMatrix(C,M,N,order);
Q.enqueueUnmapMemObject(bufC, C); Q.finish();
}
catch (Error& err)
{
cerr << "ERROR: " << err.what() << "(" << err.err() << ", "
<< ocl_decode_error(err.err()) << ")" << endl;
exit(-1);
}
RETURN(errs);
}
/******************************************************************************
* Supporting Functions
******************************************************************************/
void PrintUsageAndExit()
{
cout << "Matrix C[M,N] = A[M,K] * B [K,N]" << endl
<< "Default value of M,N,K is " << M << endl
<< "Usage: sgemm [options] " << endl
<< "Options: " << endl
<< "-M arg : Number of rows for array C and A" << endl
<< "-K arg : Number of cols for array A, rows for array B" << endl
<< "-N arg : Number of cols for array C and B" << endl
<< "-d : Do not check results against host computation" << endl
<< "-r : Generate random inputs" << endl
<< "-or : Use Row-Major storage (default is Col-Major)" << endl
<< "-h : Show help message"
<< endl;
exit(0);
}
void HandleOptions(int argc, char* argv[])
{
int c;
if (argc == 1) return;
while ((c = getopt (argc, argv, "o:M:K:N:hdrx")) != -1)
switch(c)
{
case 'o': order = (*optarg == 'r') ? CblasRowMajor
: CblasColMajor; break;
case 'M': M = abs(atoi(optarg)); break;
case 'K': K = abs(atoi(optarg)); break;
case 'N': N = abs(atoi(optarg)); break;
case 'h': PrintUsageAndExit(); break;
case 'd': check = false; break;
case 'r': random_in = true; break;
case 'x': calc_check = true; break;
default: PrintUsageAndExit();
}
}
void PrintMatrix(float *mat, int rows, int cols, enum CBLAS_ORDER mem_order)
{
if (rows > 64) return;
if (cols > 16) return;
int index;
for (int i=0; i<rows; i++)
{
for (int j=0; j<cols; j++)
{
if (mem_order == CblasRowMajor) index = i*cols + j;
else index = j*rows + i;
cout << setprecision(9) << setw(10)
<< mat[index] << " ";
}
cout << endl;
}
cout << endl;
}
#define EPSILON 0.01 // we have all integer inputs
int CheckForErrors(const float *Mat, const float *Golden, int M, int N, int K,
enum CBLAS_ORDER mem_order)
{
int num_errors = 0, i, j;
const int print_nerrors = 13;
int index;
for (i=0; i<M; i++)
for (j=0; j<N; j++)
{
if (mem_order == CblasRowMajor) index = i*N + j;
else index = j*M + i;
float delta = Golden[index] - Mat[index];
if (delta < -EPSILON || delta > EPSILON)
if ((num_errors += 1) < print_nerrors)
printf("Error [%d,%d]: %f <==> %f\n", i, j,
Golden[index], Mat[index]);
}
if (num_errors > 0)
cout << "FAIL with " << num_errors << " errors!" << endl;
else cout << "PASS!" << endl;
return num_errors;
}
void MatmulHost_ATLAS(enum CBLAS_ORDER mem_order,
const float*A, const float *B, float *C, int M, int N, int K,
float alpha, float beta)
{
#ifndef _TI_RTOS
if (mem_order == CblasRowMajor)
{
cblas_sgemm(mem_order, CblasNoTrans, CblasNoTrans,
M, N, K, alpha,
A, /* lda = */ K,
B, /* ldb = */ N,
beta,
C, /* ldc = */ N
);
} else {
cblas_sgemm(mem_order, CblasNoTrans, CblasNoTrans,
M, N, K, alpha,
A, /* lda = */ M,
B, /* ldb = */ K,
beta,
C, /* ldc = */ M
);
}
#endif
}
static cl_ulong roundDownPower2(cl_ulong value) // round down to a power of 2
{ return (value == 0) ? 0 : (cl_ulong)1 << ilogb(value); }
/*-----------------------------------------------------------------------------
* Check platform name, set sgemm blocking/tiling parameters accordingly
*----------------------------------------------------------------------------*/
void SetSgemmParams(Device& device)
{
int APanelSz = 8 << 10;
int BPanelSz = 16 << 10;
cl_ulong global_mem = 0;
cl_ulong l2_mem = 0;
cl_ulong msmc_mem = 0;
device.getInfo(CL_DEVICE_MAX_COMPUTE_UNITS, &NUMCOMPUNITS);
device.getInfo(CL_DEVICE_GLOBAL_MEM_SIZE, &global_mem);
device.getInfo(CL_DEVICE_LOCAL_MEM_SIZE, &l2_mem);
#ifdef CL_DEVICE_MSMC_MEM_SIZE_TI
device.getInfo(CL_DEVICE_MSMC_MEM_SIZE_TI, &msmc_mem);
#endif
global_mem = roundDownPower2(global_mem);
L2_BUF_SIZE = roundDownPower2(l2_mem);
MSMC_BUF_SIZE = roundDownPower2(msmc_mem);
/*----------------------------------------------------------------------
* How big a square matrix can we use? Note we need room for 3 of them (A, B, C)
*---------------------------------------------------------------------*/
if (!M && !N && !K)
{
M = N = K = roundDownPower2(sqrt(global_mem / 3 / sizeof(float)));
if (M >= 2048) M = N = K = 2048;
}
NUMAPANELS = L2_BUF_SIZE / 2 / APanelSz;
NUMBPANELS = L2_BUF_SIZE / 4 / BPanelSz;
if ((NUMCOMPUNITS * APanelSz * NUMAPANELS) > MSMC_BUF_SIZE)
MSMC_BUF_SIZE = 0;
else MSMC_BUF_SIZE = NUMCOMPUNITS * APanelSz * NUMAPANELS;
if (calc_check)
{
cout << "M,N,K = " << M << ", "
<< N << ", "
<< K << endl;
cout << "MSMC_BUF_SIZE = " << MSMC_BUF_SIZE << endl;
cout << "L2_BUF_SIZE = " << L2_BUF_SIZE << endl;
cout << "NUMAPANELS = " << NUMAPANELS << endl;
cout << "NUMBPANELS = " << NUMBPANELS << endl;
}
}
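To make the parameter derivation concrete, assume a device reporting 128KB of OpenCL local memory (the amount of L2 SRAM that sgemm.cpp below requires) and 8 compute units; the numbers are illustrative:

L2_BUF_SIZE = 128KB
NUMAPANELS  = 128KB / 2 / 8KB  = 8   (the packed A panel gets half of L2)
NUMBPANELS  = 128KB / 4 / 16KB = 2   (each of the two B buffers gets a quarter)
MSMC needed = NUMCOMPUNITS * APanelSz * NUMAPANELS = 8 * 8KB * 8 = 512KB

MSMC staging of A is used only if the device's power-of-two-rounded MSMC size can hold those 512KB; otherwise MSMC_BUF_SIZE is forced to 0 and the device code packs A panels directly from DDR instead.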
OpenCL Device-Side Source Code
#include "data.h"
#include "cblas.h"
// matrix multiplication API
void sgemm(
int m, int n, int k,
float alpha,
global float * a, int lda,
global float * b, int ldb,
float beta,
global float * c, int ldc,
int NUMAPANELS, int NUMBPANELS,
float* pL1, local float* pL2, global float* pMsmc, int tid);
kernel __attribute__((reqd_work_group_size(1,1,1))) void
K_ocl_sgemm_dsp(
int m, int n, int k,
float alpha,
global float *a, int lda,
global float *b, int ldb,
float beta,
global float *c, int ldc,
int NUMAPANELS, int NUMBPANELS,
local float *L2_buf, global float *Msmc_buf)
{
int chunks = get_global_size(0);
int id = get_global_id(0);
int mLocal = m < chunks ? 1 : m / chunks;
/* if not enough work for all cores, only first (chunks) cores compute */
if (m < chunks && id >= m) return;
int offset = mLocal * id;
/* if m > chunks and (chunks) does not divide (m) evenly,
* first (m % chunks) cores get one extra row to compute */
if (m > chunks)
{
mLocal += (id < (m % chunks) ? 1 : 0);
offset += (id < (m % chunks) ? id : (m % chunks));
}
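/* Example: m = 10 rows on chunks = 8 cores gives m/chunks = 1 and
 * m%chunks = 2, so cores 0 and 1 compute 2 rows each (row offsets 0 and 2)
 * and cores 2..7 compute 1 row each (row offsets 4,5,6,7,8,9). */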
float* L1_buf = (float*) __scratch_l1d_start(); // base address of the L1D SRAM region usable as scratch memory
if (Msmc_buf >= (global float*) 0x80000000) Msmc_buf = (global float*) 0; // address in DDR range means no real MSMC buffer was allocated
__cache_l1d_16k(); // configure L1D as a 16KB cache; the remainder serves as scratch memory
sgemm(mLocal, n, k,
alpha, a + offset, lda, b, ldb, beta, c + offset, ldc,
NUMAPANELS, NUMBPANELS,
L1_buf, L2_buf, Msmc_buf, __local_core_num());
__cache_l1d_all(); // configure all of L1D as cache again
}
Other Source Files
//sgemm.cpp
#define USE_EDMA 1
#include "data.h"
#include "sgemm_kernel.h"
#include <string.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include "dsp_c.h"
#include "dsp_edmamgr.h"
/*-----------------------------------------------------------------------------
* On KeyStone devices, MSMC is not cached in L2 and this macro could be
* #define DSP_wbInv_L2 __cache_l1d_flush. However, the performance delta
* is negligible. On Sitara (AM57) devices, the MSMC (or OCMC) is cached in
* L2 as well as L1D.
*----------------------------------------------------------------------------*/
#define DSP_wbInv_L2 __cache_l2_flush // write back and invalidate the L2 cache
#define DSP_wbInv_L1D __cache_l1d_flush // write back and invalidate the L1D cache
// sgemm interface
// Assuming COLUMN MAJOR, NO TRANSPOSE ON A AND B
// Computes C[MxN] = alpha * A[MxK] * B[KxN] + beta * C[MxN]
// Requirements: 16KB of L1 SRAM, passed in as pL1
// Requirements: 128KB of L2 SRAM, passed in as pL2
void sgemm(
const int m, const int n, const int k,
const float alpha,
float* restrict a, const int lda,
float* restrict b, const int ldb,
const float beta,
float* restrict c, const int ldc,
int NUMAPANELS, int NUMBPANELS,
float* restrict pL1, float* restrict pL2,
float* restrict pMsmc, int tid
)
{
if (m == 0 || n == 0 || ((alpha == 0.0f || k == 0) && beta == 1.0f)) return;
float __attribute__((aligned(8)))
ptrCTemp[CORE_PROCESS_ROWS*CORE_PROCESS_COLS];
int mXferIndex, nXferIndex;
int kIndex, kCnt, kCntNext;
int mIndex, mCnt, mCntNext;
int nIndex, nCnt, nCntNext/*, innerNCnt*/;
#if !(USE_EDMA)
int kCntPrev, nCntPrev, mCntPrev;
#endif
int innerIndex_m, innerIndex_n;
int flagLastK, flagLastM, flagLastN, flagLastMXfers, flagLastNXfers;
float * restrict ptrA, * restrict ptrB, * restrict ptrC;
float * restrict ptrASeg1, * restrict ptrASeg2;
float * restrict ptrBSeg1, * restrict ptrBSeg2;
short indexACurrent, indexANext, indexBCurrent, indexBNext;
float * restrict ptrCInternal;
int ldcInternal, i, j, nValid, mValid;
// partition in m dimension
int MPARTITION = (NUMAPANELS*CORE_PROCESS_ROWS);
// partition in n dimension
int NPARTITION = (NUMBPANELS*CORE_PROCESS_COLS);
if (pMsmc)
{
// Keep unpacked A panels in MSMC SRAM
ptrASeg1 = pMsmc+(tid)*MPARTITION*KPARTITION;
}
// Move packed A panel in L2 from DDR
ptrASeg2 = pL2;
// Keep B panel ping pong buffers in L2
ptrBSeg1 = ptrASeg2+(MPARTITION*KPARTITION);
ptrBSeg2 = ptrBSeg1+(NPARTITION*KPARTITION);
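/* On-chip buffer carve-up established above:
 *   ptrASeg1 (MSMC, per core): unpacked A staging, MPARTITION*KPARTITION floats
 *   ptrASeg2 (== pL2): packed A panel, MPARTITION*KPARTITION floats
 *   ptrBSeg1 (L2): B ping buffer, NPARTITION*KPARTITION floats
 *   ptrBSeg2 (L2): B pong buffer, NPARTITION*KPARTITION floats */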
/* Beta scaling of C */
if(beta != (float) 1.0) // only if scaling of C is needed
{
if(beta == (float) 0.0)
{
// zero out c column by column
for(nCnt = 0; nCnt < n; nCnt++)
memset(c + nCnt*ldc, 0, m * sizeof(float));
} // if(beta==0.0f)
else
{
// column by column multiplication
for(nCnt = 0; nCnt < n; nCnt++)
for(mCnt = 0; mCnt < m; mCnt++)
c[nCnt*ldc + mCnt] *= beta;
} // else
} // if(beta != 1.0f)
mXferIndex = 0;
nXferIndex = 0;
mCnt = (m < MPARTITION) ? m : MPARTITION;
kCnt = (k < KPARTITION) ? k : KPARTITION;
nCnt = (n < NPARTITION) ? n : NPARTITION;
#if USE_EDMA
/* Initialize EDMA Manager */
EdmaMgr_Handle chan0, chan1;
if (pMsmc != NULL) chan0 = __ocl_EdmaMgr_alloc_intrakernel(1);
chan1 = __ocl_EdmaMgr_alloc_intrakernel(1);
if ((pMsmc != NULL && !chan0) || !chan1)
{
printf("Failed to alloc edma handle.\n");
}
#endif
if (pMsmc)
{
// initiate first transfer of A to MSMC
#if USE_EDMA
EdmaMgr_copy2D2DSep(chan0,
a, /* src */
ptrASeg1, /* dst */
mCnt*sizeof(float), /* num_bytes */
kCnt, /* num_lines */
lda*sizeof(float), /* src_pitch */
MPARTITION*sizeof(float) /* dst_pitch */
); // a single 2D DMA transfer
DSP_wbInv_L2(); // ptrASeg1
#else
for (i = 0; i < kCnt; i++)
memcpy(ptrASeg1 + i * MPARTITION, a + i * lda, mCnt*sizeof(float));
#endif
}
// initiate first transfer of B to L2
#if USE_EDMA
EdmaMgr_copy2D2DSep(chan1,
b, /* src */
ptrBSeg1, /* dst */
kCnt*sizeof(float), /* num_bytes */
nCnt, /* num_lines */
ldb*sizeof(float), /* src_pitch */
KPARTITION*sizeof(float) /* dst_pitch */
);
DSP_wbInv_L1D(); // ptrBSeg1
#else
for (i = 0; i < nCnt; i++)
memcpy(ptrBSeg1 + i * KPARTITION, b + i * ldb, kCnt*sizeof(float));
#endif
indexACurrent=1;
indexANext=0;
indexBCurrent=1;
indexBNext=0;
for(kIndex=0; kIndex<k; kIndex+=KPARTITION) // partition in k dimension
{
nXferIndex = kIndex;
kCnt = ((k-kIndex) < KPARTITION) ? (k-kIndex) : KPARTITION;
kCntNext = ((k-kIndex-KPARTITION) < KPARTITION) ? (k-kIndex-KPARTITION) : KPARTITION;
flagLastK = ((kIndex+KPARTITION) < k) ? 0 : 1;
for(mIndex = 0; mIndex<m; mIndex+=MPARTITION) // partition in m dimension
{
mCnt = ((m-mIndex) < MPARTITION) ? (m-mIndex) : MPARTITION;
flagLastM = ((mIndex+MPARTITION)<m) ? 0 : 1;
flagLastMXfers = ((mIndex+2*MPARTITION)<m) ? 0 : 1;
mCntNext = ((m-mIndex-MPARTITION) < MPARTITION) ?
(m-mIndex-MPARTITION) : MPARTITION;
mCntNext = (mCntNext <= 0) ? (m < MPARTITION ? m : MPARTITION)
: mCntNext;
if(flagLastM) mCntNext = (m < MPARTITION) ? m : MPARTITION;
// bring in A into MSMC SRAM (a new parallel transfer)
indexACurrent = (indexACurrent+1) & 1;
indexANext = (indexANext+1) & 1;
// No need to memset invalid rows, because we select results
// memset((void *) ptrASeg2, 0,
// MPARTITION * KPARTITION * sizeof(float));
if (pMsmc)
{
#if USE_EDMA
EdmaMgr_wait(chan0); // wait for the DMA operation on this handle to complete
#endif
dataMoveA(ptrASeg2, ptrASeg1, mCnt, kCnt, MPARTITION);
}
else
{
dataMoveA(ptrASeg2, a+mXferIndex, mCnt, kCnt, lda);
}
mXferIndex += mCnt;
mXferIndex = (!flagLastM) ? mXferIndex: mXferIndex-m+kCnt*lda;
if (pMsmc)
{
if ((!flagLastM) || (!flagLastK))
{
if (mIndex == 0 || flagLastMXfers)
{
#if USE_EDMA
EdmaMgr_copy2D2DSep(chan0,
a+mXferIndex, /* src */
ptrASeg1, /* dst */
mCntNext*sizeof(float), /* num_bytes */
(flagLastM ? kCntNext : kCnt), /* num_lines */
lda*sizeof(float), /* src_pitch */
MPARTITION*sizeof(float) /* dst_pitch */
);
DSP_wbInv_L2(); // ptrASeg1
#else
kCntPrev = (flagLastM ? kCntNext : kCnt);
for (i = 0; i < kCntPrev; i++)
memcpy(ptrASeg1 + i * MPARTITION,
a+mXferIndex + i * lda,
mCntNext*sizeof(float));
mCntPrev = mCntNext;
#endif
}
else if (flagLastM)
{
#if USE_EDMA
EdmaMgr_copy2D2DSep(chan0,
a+mXferIndex, /* src */
ptrASeg1, /* dst */
mCntNext*sizeof(float), /* num_bytes */
kCntNext, /* num_lines */
lda*sizeof(float), /* src_pitch */
MPARTITION*sizeof(float) /* dst_pitch */
);
DSP_wbInv_L2(); // ptrASeg1
#else
kCntPrev = kCntNext;
for (i = 0; i < kCntPrev; i++)
memcpy(ptrASeg1 + i * MPARTITION,
a+mXferIndex + i * lda,
mCntNext*sizeof(float));
mCntPrev = mCntNext;
#endif
}
else
{
#if USE_EDMA
EdmaMgr_copyFast(chan0,
a+mXferIndex, /* src */
ptrASeg1 /* dst */
); // new src/dst only; reuses the remaining parameters from this handle's previous transfer
DSP_wbInv_L2(); // ptrASeg1
#else
for (i = 0; i < kCntPrev; i++)
memcpy(ptrASeg1 + i * MPARTITION,
a+mXferIndex + i * lda,
mCntPrev*sizeof(float));
#endif
}
}
}
for(nIndex = 0; nIndex<n; nIndex+=NPARTITION) // partition in n dimension
{
nCnt = ((n-nIndex) < NPARTITION) ? (n-nIndex) : NPARTITION;
nCntNext = ((n-nIndex-NPARTITION) < NPARTITION) ? (n-nIndex-NPARTITION) : NPARTITION;
nCntNext = (nCntNext <= 0) ? (n < NPARTITION ? n : NPARTITION)
: nCntNext;
flagLastN = ((nIndex+NPARTITION)<n) ? 0 : 1;
flagLastNXfers = ((nIndex+2*NPARTITION)<n) ? 0 : 1;
if(flagLastN) nCntNext = (n < NPARTITION) ? n : NPARTITION;
// bring in B into L1 SRAM (a new parallel transfer)
indexBCurrent = (indexBCurrent+1) & 1;
indexBNext = (indexBNext+1) & 1;
#if USE_EDMA
EdmaMgr_wait(chan1);
#endif
if((!flagLastM) || (!flagLastK) || (!flagLastN)) // don't carry out DMA for the last iteration
{
nXferIndex += nCnt*ldb;
nXferIndex = (!flagLastN) ? nXferIndex: kIndex;
nXferIndex = ((!flagLastN) || (!flagLastM)) ? nXferIndex: (kIndex+kCnt);
ptrB = (indexBNext == 0) ? ptrBSeg1: ptrBSeg2;
if (nIndex == 0 || flagLastNXfers)
{
#if USE_EDMA
EdmaMgr_copy2D2DSep(chan1,
b+nXferIndex, /* src */
ptrB, /* dst */
((flagLastM && flagLastN) ? kCntNext : kCnt)*sizeof(float), /* num_bytes */
nCntNext, /* num_lines */
ldb*sizeof(float), /* src_pitch */
KPARTITION*sizeof(float) /* dst_pitch */
);
DSP_wbInv_L1D(); // ptrB
#else
for (i = 0; i < nCntNext; i++)
memcpy(ptrB + i * KPARTITION,
b+nXferIndex + i * ldb,
((flagLastM && flagLastN)
? kCntNext : kCnt)*sizeof(float));
nCntPrev = nCntNext;
kCntPrev = (flagLastM && flagLastN) ? kCntNext : kCnt;
#endif
}
else if (flagLastM && flagLastN)
{
#if USE_EDMA
EdmaMgr_copy2D2DSep(chan1,
b+nXferIndex, /* src */
ptrB, /* dst */
kCntNext*sizeof(float), /* num_bytes */
nCntNext, /* num_lines */
ldb*sizeof(float), /* src_pitch */
KPARTITION*sizeof(float) /* dst_pitch */
);
DSP_wbInv_L1D(); // ptrB
#else
for (i = 0; i < nCntNext; i++)
memcpy(ptrB + i * KPARTITION,
b+nXferIndex + i * ldb,
kCntNext*sizeof(float));
nCntPrev = nCntNext;
kCntPrev = kCntNext;
#endif
}
else
{
#if USE_EDMA
EdmaMgr_copyFast(chan1,
b+nXferIndex, /* src */
ptrB /* dst */
);
DSP_wbInv_L1D(); // ptrB
#else
for (i = 0; i < nCntPrev; i++)
memcpy(ptrB + i * KPARTITION,
b+nXferIndex + i * ldb,
kCntPrev*sizeof(float));
#endif
}
}
// L2 memory assignment for B
ptrB = (indexBCurrent == 0) ? ptrBSeg1: ptrBSeg2;
// Corner case, zero out invalid cols in ptrB
if (flagLastN && (nCnt % CORE_PROCESS_COLS != 0))
{
for (i = 0; i < CORE_PROCESS_COLS -
nCnt%CORE_PROCESS_COLS; i++)
memset((void *) (ptrB + (i+nCnt)*KPARTITION), 0,
kCnt * sizeof(float));
}
for(innerIndex_n = 0; innerIndex_n<nCnt; innerIndex_n+=CORE_PROCESS_COLS)
{
dataMoveB(pL1, ptrB, kCnt);
ptrB += (CORE_PROCESS_COLS*KPARTITION);
// L2 memory assignment for A
ptrA = ptrASeg2;
// output memory assignment
ptrC= c + mIndex + (nIndex+innerIndex_n)*ldc;
for(innerIndex_m = 0; innerIndex_m<mCnt; innerIndex_m+=CORE_PROCESS_ROWS)
{
if (ldc % 2 != 0 ||
innerIndex_n + CORE_PROCESS_COLS > nCnt ||
innerIndex_m + CORE_PROCESS_ROWS > mCnt)
{
nValid = (nCnt - innerIndex_n > CORE_PROCESS_COLS)
? CORE_PROCESS_COLS : nCnt - innerIndex_n;
mValid = (mCnt - innerIndex_m > CORE_PROCESS_ROWS)
? CORE_PROCESS_ROWS : mCnt - innerIndex_m;
for (j = 0; j < nValid; j++)
for (i = 0; i < mValid; i++)
ptrCTemp[j*CORE_PROCESS_ROWS + i] =
ptrC[j*ldc + i];
ptrCInternal = ptrCTemp;
ldcInternal = CORE_PROCESS_ROWS;
}
else
{
ptrCInternal = ptrC;
ldcInternal = ldc;
}
// pre-fetch required A to L1 Cache
// 4xk * kx8 core matrix multiplications
__touch((const char *)ptrA,
CORE_PROCESS_ROWS * kCnt * sizeof(float));
sgemm_kernel(ptrA, pL1, ptrCInternal, alpha, kCnt,
ldcInternal);
if (ldc % 2 != 0 ||
innerIndex_n + CORE_PROCESS_COLS > nCnt ||
innerIndex_m + CORE_PROCESS_ROWS > mCnt)
{
for (j = 0; j < nValid; j++)
for (i = 0; i < mValid; i++)
ptrC[j*ldc + i] =
ptrCTemp[j*CORE_PROCESS_ROWS + i];
}
// advance the C write pointer to the next block of rows
ptrC += CORE_PROCESS_ROWS;
ptrA += (CORE_PROCESS_ROWS*KPARTITION);
} // inner loop m
} // inner loop n
} // n loop
} // m loop
} // k loop
#if USE_EDMA
if (pMsmc) EdmaMgr_free(chan0);
EdmaMgr_free(chan1);
#endif
}
// sgemm row major interface, calls sgemm in turn
// Requirements: 16KB of L1 SRAM, passed in as pL1
// Requirements: 128KB of L2 SRAM, passed in as pL2
void sgemm_rowmajor(
const int M, const int N, const int K,
const float alpha, float* restrict A, const int lda,
float* restrict B, const int ldb,
const float beta, float* restrict C, const int ldc,
int NUMAPANELS, int NUMBPANELS,
float* restrict pL1, float* restrict pL2,
float* restrict pMsmc, int tid)
{
/*----------------------------------------------------------------------
* Convert RowMajor computation to ColumnMajor computation
* Fact: Mem_Layout(A_RowMajor) === Mem_Layout(Transpose(A)_ColMajor)
* Therefore: C_RowMajor = A_RowMajor * B_RowMajor
* C[mxn] = A[mxk] * B[kxn]
* can be computed as:
* Transpose(C)_ColMajor = Transpose(B)_ColMajor * Transpose(A)_ColMajor
* C'[nxm] = B'[nxk] * A'[kxm],
* where ptrC' === ptrC, ptrA' === ptrA, ptrB' === ptrB
* So, all we need to do is to: swap(m, n), swap(a, b)
* ldA'_col = ldA_row = k,
* ldB'_col = ldB_row = n,
* ldC'_col = ldC_row = n,
*---------------------------------------------------------------------*/
sgemm(N, M, K, alpha, B, ldb, A, lda, beta, C, ldc,
NUMAPANELS, NUMBPANELS, pL1, pL2, pMsmc, tid);
}
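The performance pattern at the heart of sgemm above is double buffering ("ping-pong"): while the DSP computes out of one on-chip panel, EDMA fills the other, hiding DMA latency behind compute. Stripped of the tiling details, the pattern reduces to the minimal sketch below; fill_async, wait_fill and compute_panel are hypothetical stand-ins for EdmaMgr_copy2D2DSep/EdmaMgr_copyFast, EdmaMgr_wait and the sgemm_kernel inner loops, not the example's actual API:

// Hypothetical helpers; sketch only, not part of the example.
void fill_async(float *dst, const float *src, int nelems); // start an async copy
void wait_fill(void);                   // block until the outstanding copy lands
void compute_panel(const float *panel); // consume one resident panel

// Double-buffered panel pipeline: the DMA into one buffer
// overlaps the computation on the other.
void process_panels(const float *src, int npanels, int panel_elems,
                    float *ping, float *pong)
{
    float *buf[2] = { ping, pong };
    int cur = 0;
    fill_async(buf[cur], src, panel_elems);       // prime the first panel
    for (int p = 0; p < npanels; ++p)
    {
        wait_fill();                              // panel p is now resident
        if (p + 1 < npanels)                      // start the next transfer...
            fill_async(buf[cur ^ 1], src + (p + 1) * panel_elems, panel_elems);
        compute_panel(buf[cur]);                  // ...while computing on this one
        cur ^= 1;                                 // swap ping and pong
    }
}

This is what the indexBCurrent/indexBNext toggling between ptrBSeg1 and ptrBSeg2 implements for the B panels, and what chan0 does when it overlaps the next A transfer into MSMC with the current computation.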