An Important Op in TensorFlow: The Implementation of MatMul (Part 1)

This article walks through the MatMul operation in TensorFlow, looking at how its forward computation node and gradient computation node are implemented. The core of the forward implementation is core/kernels/matmul_op.cc, where the LaunchMatMul class template provides different implementations for different devices (CPU, GPU, and SYCL). The computation itself relies mainly on Eigen's matrix multiplication, and in a GPU environment it uses the cuBLAS library for efficiency. Understanding MatMul in depth is a good way to get a better grasp of how TensorFlow works.

The goal of this article is to use MatMul, a representative and relatively simple op in TensorFlow, as an example of how a node in the TensorFlow graph is implemented. In my opinion, ops are the core of TensorFlow: once you understand them, you have a much deeper understanding of TensorFlow as a whole.
Before reading this code, it is very helpful to go through the official documentation on adding a new op:
Chinese translation: http://www.tensorfly.cn/tfdoc/how_tos/adding_an_op.html
English original: http://www.tensorflow.org/how_tos/adding_an_op/index.html#adding-a-new-op
Once you have a basic idea of how a new op is added, let's look at a real, practical example: MatMul.
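Before diving into the kernel code, it is worth recalling the first step from the adding-an-op guide: declaring the op's interface with REGISTER_OP. For MatMul this declaration lives in core/ops/math_ops.cc. The snippet below is a sketch from memory rather than a verbatim copy, and the exact attribute and type list differs between TensorFlow versions:

REGISTER_OP("MatMul")
    .Input("a: T")
    .Input("b: T")
    .Output("product: T")
    .Attr("transpose_a: bool = false")
    .Attr("transpose_b: bool = false")
    .Attr("T: {half, float, double, int32, complex64, complex128}")
    .SetShapeFn(shape_inference::MatMulShape);  // infers the output shape from a and b

The declaration only fixes the op's inputs, outputs, and attributes; the actual computation is supplied separately by the kernels in matmul_op.cc, which is what the rest of this article is about.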
An op actually consists of two computation nodes: a forward computation node and a gradient computation node.
The forward computation node is implemented in core/kernels/matmul_op.cc in the source tree.
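The gradient node for MatMul is not defined in this C++ file; it is registered on the Python side (in tensorflow/python/ops/math_grad.py). Mathematically, for C = A * B the two input gradients are

    dL/dA = dL/dC * B^T
    dL/dB = A^T * dL/dC

so the gradient essentially reuses MatMul itself, just with the transpose flags flipped. This first part only covers the forward kernel; the contents of matmul_op.cc are listed below.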

/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// See docs in ../ops/math_ops.cc.

#define EIGEN_USE_THREADS

#include "tensorflow/core/kernels/matmul_op.h"

#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/kernels/fill_functor.h"

#if GOOGLE_CUDA
#include "cuda/include/cuda.h"
#include "tensorflow/core/platform/stream_executor.h"
#endif  // GOOGLE_CUDA

namespace tensorflow {

#if GOOGLE_CUDA

namespace {
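// Wraps a raw CUDA device pointer in a StreamExecutor DeviceMemory<T> handle,
// which is the form the cuBLAS-backed BLAS calls in the GPU launch path expect.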
template <typename T>
perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory) {
  perftools::gputools::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory));
  perftools::gputools::DeviceMemory<T> typed(wrapped);
  return typed;
}
}  // namespace

#endif  // GOOGLE_CUDA

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
#ifdef TENSORFLOW_USE_SYCL
typedef Eigen::SyclDevice SYCLDevice;
#endif  // TENSORFLOW_USE_SYCL

template <typename Device, typename T, bool USE_CUBLAS>
struct LaunchMatMul;

namespace {
// Converts a TensorFlow Tensor to an Eigen Matrix.
template <typename T>
Eigen::Map<
    const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
ToEigenMatrix(const Tensor& tensor) {
  auto matrix = tensor.matrix<T>();
  return Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>::Map(
      matrix.data(), matrix.dimension(0), matrix.dimension(1));
}

// Converts a TensorFlow Tensor to an Eigen Vector.
template <typename T>
Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, 1>> ToEigenVector(Tensor* tensor) {
  auto v = tensor->flat<T>();
  return Eigen::Matrix<T, Eigen::Dynamic, 1>::Map(v.data(), v.dimension(0));
}
template <typename T>
Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>> ToEigenVector(
    const Tensor& tensor) {
  auto v = tensor.flat<T>();
  return Eigen::Matrix<T, Eigen::Dynamic, 1>::Map(v.data(), v.dimension(0));
}
}  // namespace

// If either side can be represented as a vector, do an explicit vector
// matrix multiply and return true; else return false.
//
// Note: this uses plain Eigen and not Eigen Tensor because it is more
// efficient.
template <typename T>
bool ExplicitVectorMatrixOptimization(
    const Tensor& a, const Tensor& b,
    const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair,
    Tensor* out) {
  if (out->dim_size(0) == 1) {
    if (dim_pair[0].second == 0) {
      // Note: this case is optimized in Eigen Tensors.
      return false;
    } else {
      auto out_v = ToEigenVector<T>(out);
      auto a_v = ToEigenVector<T>(a);
      auto b_m = ToEigenMatrix<T>(b);
      out_v.noalias() = b_m * a_v;
    }
    return true;
  } else if (out->dim_size(1) == 1) {
    auto out_v = ToEigenVector<T>(out);
    auto a_m = ToEigenMatrix<T>(a);
    auto b_v = ToEigenVector<T>(b);
    if (dim_pair[0].first == 0) {
      out_v.noalias() = a_m.transpose() * b_v;
    } else {
      out_v.noalias() = a_m * b_v;
    }
    return true;
  }
  return false;
}
// Half is not supported.
template <>
bool ExplicitVectorMatrixOptimization<Eigen::half>(
    const Tensor& a, const Tensor& b,
    const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair,
    Tensor* out) {
  return false;
}

template <typename Device, typename T>
struct LaunchMatMulBase {
  static void launch(
      OpKernelContext* ctx, OpKernel* kernel, const Tensor& a, const Tensor& b,
      const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair,
      Tensor* out) {
#ifndef TENSORFLOW_USE_SYCL
    // An explicit vector-matrix multiply is much better optimized than an
    // implicit one and this is a bottleneck during non-batched inference.
    bool was_vector = ExplicitVectorMatrixOptimization<T>(a, b, dim_pair, out);
    if (!was_vector) {
#endif  // TENSORFLOW_USE_SYCL
      functor::MatMulFunctor<Device, T>()(ctx->eigen_device<Device>(),
                                          out->matrix<T>(), a.matrix<T>(),
                                          b.matrix<T>(), dim_pair);
#ifndef TENSORFLOW_USE_SYCL
    }
#endif  // TENSORFLOW_USE_SYCL
  }
};
// (Listing truncated here. The rest of matmul_op.cc contains the
// device-specific LaunchMatMul specializations, including the cuBLAS-based
// GPU path, the MatMulOp kernel class, the MatMulFunctor specializations,
// and the REGISTER_KERNEL_BUILDER registrations.)
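The functor::MatMulFunctor that the generic path falls back to is declared in core/kernels/matmul_op.h. On the CPU it reduces to an Eigen Tensor contraction; the following is a simplified sketch of that helper, not a verbatim copy of the header:

// Simplified sketch of the Eigen-based fallback in core/kernels/matmul_op.h.
template <typename Device, typename In0, typename In1, typename Out,
          typename DimPair>
void MatMul(const Device& d, Out out, In0 in0, In1 in1,
            const DimPair& dim_pair) {
  // dim_pair specifies which dimensions are contracted, so the
  // transpose_a / transpose_b cases are handled without materializing
  // an explicit transpose.
  out.device(d) = in0.contract(in1, dim_pair);
}

On the GPU side, the LaunchMatMul specialization with USE_CUBLAS instead wraps the input pointers with AsDeviceMemory and dispatches to StreamExecutor's cuBLAS GEMM; that code sits in the part of matmul_op.cc that follows the excerpt above.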