【BBuf的CUDA笔记】五,解读 PyTorch index_add 操作涉及的优化技术

部署运行你感兴趣的模型镜像

本文把pytorch index_add算子的代码抽取出来放在:https://github.com/BBuf/how-to-optim-algorithm-in-cuda/blob/master/indexing/index_add_cuda_pytorch_impl.cu 。如果不太熟悉PyTorch的话也可以直接看这个.cu文件,有问题请在这个repo提issue。

0x0. 前言

我们可以在 PyTorch 的文档中找到 torch.index_add_ 的定义(https://pytorch.org/docs/stable/generated/torch.Tensor.index_add_.html#torch.Tensor.index_add_):

在这里插入图片描述
简单来说就是我们需要根据index的索引完成对当前Tensor dim维度的inplace加和,注意被加数是由另外一个Tensor src决定的。在PyTorch的codebase中搜索index_add,我们可以发现这个操作应用得非常广泛,比如说作为as_strided算子的backward的一部分,作为一些sparse操作的一部分等等。我最近研究了一下,发现PyTorch对index_add算子的cuda kernel进行了较为精细的优化,主要有两个亮点,本篇文章就来学习一下。

顺便提一下,在PyTorch中index_add的cuda kernel实现在https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cuda/Indexing.cu#L712 ,如果你想自己详细读这个代码我建议先编译一下PyTorch再进行调试和阅读,编译PyTorch源码可以参考:https://github.com/BBuf/how-to-optim-algorithm-in-cuda/tree/master/how-to-complie-pytorch-from-source(这个也是参考PyTorch官方的教程,补充了几个报错的坑) 。

0x1. 亮点1: 按照index的元素个数派发不同的实现

PyTorch优化的出发点是,index_add操作中index这个Tensor是尤其重要,它决定了输入Tensor的哪些位置会被重新赋值,然后index的元素可多可少。如果使用同一套naive的计算逻辑可能会因为重复访问index导致全局内存访问过多,而如果index很大那么为了保证性能kernel又需要满足足够的并行度才可以。为了平衡这两种情况,PyTorch按照index的元素个数实现了2套kernel。这2套kernel的实现在:https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cuda/Indexing.cu#L576-L675 。然后根据index元素个数进行dispatch:https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cuda/Indexing.cu#L801-L829 。

在这里插入图片描述

我在 https://github.com/BBuf/how-to-optim-algorithm-in-cuda/blob/master/indexing/indexing_pytorch_explain.cu#L381-L505 这里以PyTorch文档展示的例子(https://pytorch.org/docs/stable/generated/torch.Tensor.index_add_.html#torch.Tensor.index_add_)为例记录了各个中间变量的值,并加上了一些方便理解的注释,感兴趣的可以查看。

我们这里展示一下当index的元素很少的时候的indexFuncSmallIndex kernel实现(代码中的设置是index元素个数少于16):

// 如果索引的数量很少,我们更喜欢使用这个Kernel来避免重新加载 index。
// 这个kernel实际上适用于几乎所有问题大小的选择,但如果选择的索引数量很大,
// 那么indexFuncLargeIndex Kernel是增加并行度的更好选择。
// 下面的innerSize就是输人的self张量忽略dim维度的切片大小,对于每一个indices[i],我们都要处理innerSize个元素的copy

// selfAddDim(dstAddDim) = 0
// sourceAddDim(srcAddDim) = 0
// sliceSize(innerSize) = 3
// selfAddDimSize(dstAddDimSize) = 5
// selfNumel(dstNumel) = 15
// selfInfo.sizes(dst): 1, 3, 
// selfInfo.strides(dst): 3, 1,
// sourceInfo.sizes(src): 1, 3, 
// sourceInfo.strides(src): 3, 1
// indexInfo.sizes(indices): 3, 
// indexInfo.strides(indices): 1,

template <typename T, typename IndicesType, typename IndexType, int DstDim, int SrcDim, int IdxDim,
          typename func_t>
__global__ void indexFuncSmallIndex(cuda::detail::TensorInfo<T, IndexType> dst,
                                    cuda::detail::TensorInfo<T, IndexType> src,
                                    cuda::detail::TensorInfo<IndicesType, IndexType> indices,
                                    int dstAddDim,
                                    int srcAddDim,
                                    IndexType innerSize,
                                    int64_t dstAddDimSize,
                                    int64_t dstNumel,
                                    const func_t& op,
                                    T alpha) {
  // In order to avoid reloading the index that we are copying, load
  // it once to handle all of the points that are being selected, so
  // it can be reused as much as possible. This kernel is chosen when
  // this is a good choice (small number of chosen indices), since
  // re-accessing indices in addition to src elements can be slow.
  // 为了避免重新加载我们正在复制的索引,加载一次以处理所有正在选择的点,以便尽可能地重复使用它。 
  // 当这是一个不错的选择(选择的索引数量很少)时,就会选择这个Kernel,
  // 因为除了 src 元素之外,重新访问索引可能很慢。
  for (IndexType srcIndex = 0; srcIndex < indices.sizes[0]; ++srcIndex) {
    // Lua indices begin at 1
    IndexType dstIndex =
        indices.data[cuda::detail::IndexToOffset<IndicesType, IndexType, IdxDim>::get(srcIndex, indices)];
    CUDA_KERNEL_ASSERT(dstIndex < dstAddDimSize);

    // We stride over the output ignoring the indexed dimension
    // (innerSize), whose offset calculation is handled differently
    for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
         linearIndex < innerSize;
         linearIndex += gridDim.x * blockDim.x) {
      IndexType dstOffset =
          cuda::detail::IndexToOffset<T, IndexType, DstDim>::get(linearIndex, dst);
      dstOffset += dstIndex * dst.strides[dstAddDim];

      IndexType srcOffset =
          cuda::detail::IndexToOffset<T, IndexType, SrcDim>::get(linearIndex, src);
      srcOffset += srcIndex * src.strides[srcAddDim];

      T val = src.data[srcOffset] * alpha;
      op(dst.data, dstOffset, dstNumel, &val);
    }

  }
}

我们可以看到首先有一个for (IndexType srcIndex = 0; srcIndex < indices.sizes[0]; ++srcIndex) 的循环来避免重复加载 index Tensor(这个时候index Tensor信息由indices管理),后续的实验结果也将证明这个优化在 index 元素个数比较小而 self Tensor 比较大的时候是有一定性能提升的。然后选定一个indices[i] 之后就启动一堆线程计算完这个indices[i]对应的 self Tensor的一个切片(linearIndex < innerSize)。

indexFuncLargeIndex Kernel我就不展示了,感兴趣的小伙伴可以直接阅读代码实现。

实现完这两个Kernel之后,我们可以在 https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cuda/Indexing.cu#L753-L778 这里看到PyTorch分别为这两个Kernel设置了不同的GridSize和BlockSize。

// selfAddDim = 0
// sourceAddDim = 0
// sliceSize = 3
// selfAddDimSize = 5
// selfNumel = 15

#define SMALL_INDEX(TENSOR_TYPE, INDICES_TYPE, TYPE, SELF_DIM, SOURCE_DIM, IDX_DIM)     \
  indexFuncSmallIndex<TENSOR_TYPE, INDICES_TYPE, TYPE, SELF_DIM, SOURCE_DIM, IDX_DIM>   \
    <<<smallIndexGrid, smallIndexBlock, 0, stream>>>(                                   \
      selfInfo, sourceInfo, indexInfo,                                                  \
      selfAddDim, sourceAddDim, sliceSize, selfAddDimSize,                              \
      selfNumel, reduce_add, alpha_value);                                              \
  C10_CUDA_KERNEL_LAUNCH_CHECK();

#define LARGE_INDEX(TENSOR_TYPE, INDICES_TYPE, TYPE,                        \
                    SELF_DIM, SOURCE_DIM, IDX_DIM, IDX_IS_MAJOR)            \
  indexFuncLargeIndex<TENSOR_TYPE, INDICES_TYPE, TYPE,                      \
                      SELF_DIM, SOURCE_DIM, IDX_DIM, IDX_IS_MAJOR>          \
    <<<largeIndexGrid, largeIndexBlock, 0, stream>>>(                       \
      selfInfo, sourceInfo, indexInfo,                                      \
      selfAddDim, sourceAddDim, sourceTotalSize,                            \
      (IDX_IS_MAJOR) ? sliceSize : numIndex,                                \
      selfAddDimSize, selfNumel, reduce_add, alpha_value);                  \
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  // small index以正在索引的每个切片的大小为基准来设定GridSize和BlockSize,同时要考虑到需要满足足够多的wave保证利用率
  const dim3 smallIndexGrid(std::min(ceil_div(sliceSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8)));
  const dim3 smallIndexBlock(std::min(sliceSize, (ptrdiff_t)128));

  // large index以source 张量的总大小为基准来设定GridSize和BlockSize,同时要考虑到需要满足足够多的wave保证利用率
  const dim3 largeIndexGrid(std::min(ceil_div(sourceTotalSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8)));
  const dim3 largeIndexBlock(std::min(sourceTotalSize, (ptrdiff_t)128));

对于index的元素个数比较小也就是smallIndex的情况,线程块的数量由sliceSize来决定,而对于index元素个数比较大也就是largeIndex的时候线程块的数量则由输入Tensor self的总元素数量来决定。我个人感觉这里设置GridSize和BlockSize还是存在一定问题的,在profile的时候ncu对于index比较小并且输入Tensor也不太大的情况会提示grid太小无法充分发挥并行性的问题。建议阅读https://mp.weixin.qq.com/s/1_ao9xM6Qk3JaavptChXew 这篇文章设置更合理的GridSize和BlocSize,或许可以提升smallIndex Kernel的性能。

比如index很小但是输入Tensor只有一个维度的情况下,这个时候PyTorch只会启动一个Block以及一个Thread,这显然是个bad case:

在这里插入图片描述

0x2. 亮点2: 维度压缩减少坐标映射的计算量

index_add里面的第二个优化亮点是对Tensor的维度压缩,对应代码的https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cuda/Indexing.cu#L793, https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cuda/Indexing.cu#L787 ,这个维度压缩是什么意思呢?

假设index_add操作的输入Tensor是三个维度假设形状为(32, 1024, 1024),而dim设置为0。那么在cuda Kernel中索引位置的时候是可以提前把dim后面的维度给合并起来的(这里使用TensorInfo数据结构来完成,其实本质上就是操作这个TensorInfo对象维护的Tensor的stride和size,具体可见这里的实现:https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/CollapseDims.h#L22),这样子原始的输入Tensor的形状就会变成(32, 1024)。这样在indexFuncSmallIndex和indexFuncLargeIndex Kernel里面做坐标映射的时候就可以降低计算量以及降低对全局内存的访问提升带宽。注意,这里的维度压缩也可以压缩dim之前的所有维度为一个维度,这样子最终kernel需要处理的self输入张量维度只有1,2,3三种情况。

虽然这个优化是算法层面的优化,但是也间接让cuda kernel的带宽进行了提升和计算量进行了下降。实际上这个思路也启发了我在oneflow中实现index_add的kernel,我也是间接做了维度压缩。以这个例子来说:

x = torch.randn(32, 1024, 1024).to("cuda")
t = torch.randn(15, 1024, 1024).to("cuda")
index = torch.randint(0, 32, (15,)).to("cuda")
x.index_add_(0, index, t)
torch.cuda.synchronize()

使用ncu在a100 pcie 40g上profile,我发现使用了维度压缩优化之后将这个cuda kernel从接近300+us的运行速度提升到了180+ us。

0x3. 实战性能表现

我这里对比了一下PyTorch的index_add和oneflow中index_add的性能表现。做性能profile的时候,我使用了以下脚本:

import torch

x = torch.randn(32*1024*1024).to("cuda")
t = torch.randn(15).to("cuda")
index = torch.randint(0, 1024, (15,)).to("cuda")
x.index_add_(0, index, t)
torch.cuda.synchronize()

x = torch.randn(32*1024, 1024).to("cuda")
t = torch.randn(15, 1024).to("cuda")
index = torch.randint(0, 1024, (15,)).to("cuda")
x.index_add_(0, index, t)
torch.cuda.synchronize()

x = torch.randn(32, 1024, 1024).to("cuda")
t = torch.randn(15, 1024, 1024).to("cuda")
index = torch.randint(0, 32, (15,)).to("cuda")
x.index_add_(0, index, t)
torch.cuda.synchronize()

x = torch.randn(32*1024*1024).to("cuda")
t = torch.randn(1024).to("cuda")
index = torch.randint(0, 1024, (1024,)).to("cuda")
x.index_add_(0, index, t)
torch.cuda.synchronize()

x = torch.randn(32*1024, 1024).to("cuda")
t = torch.randn(1024, 1024).to("cuda")
index = torch.randint(0, 1024, (1024,)).to("cuda")
x.index_add_(0, index, t)
torch.cuda.synchronize()

测试环境为 A100 PCIE 40G,测试结果如下:

框架self tensor的shapedimsource shapeindex shape速度
PyTorch(32 * 1024 *1024,)0(15)(15)17.15us
OneFlow(32 * 1024 *1024,)0(15)(15)12us
PyTorch(32 * 1024, 1024)0(15, 1024)(15)27.78us
OneFlow(32 * 1024, 1024,)0(15, 1024)(15)26.98us
PyTorch(32, 1024, 1024)0(15, 1024, 1024)(15)186.88us
OneFlow(32 * 1024 *1024,)0(15, 1024, 1024)(15)247.10us
PyTorch(32 * 1024 *1024,)0(1024)(1024)7.9us
OneFlow(32 * 1024 *1024,)0(1024)(1024)7.79us
PyTorch(32 * 1024, 1024,)0(1024, 1024)(1024)27.87us
OneFlow(32 * 1024, 1024,)0(1024, 1024)(1024)28.67us

整体来说 PyTorch 在 index Tensor元素很小,但Tensor很大的情况下相比于oneflow有一些性能提升,其它情况和 OneFlow 基本持平,也有一些case是慢于oneflow比如index很小但是输入Tensor只有一个维度的情况下,这个时候PyTorch只会启动一个Block以及一个Thread,这显然是个bad case。OneFlow的index_add实现在 https://github.com/Oneflow-Inc/oneflow/blob/master/oneflow/user/kernels/index_add_kernel.cu ,我们并没有针对 index 的大小来单独派发kernel,所以在某些case上性能暂时比PyTorch低一些,后续有需求的话可以继续优化下。

0x4. 总结

我这里相对粗糙的学习了一下调研PyTorch index_add这个算子的cuda实现的优化技术。但PyTorch的这个index_add实现仍然有一些改进空间,比如IndexToOffset的实现有取模操作,这个可以改成一次乘法和减法,可以节省计算指令。然后index_add 的两个kernel来说,GridSize和BlockSize并不是很合理,有改进空间。

0x5. 相关链接

  • https://github.com/pytorch/pytorch
  • https://github.com/Oneflow-Inc/oneflow/blob/master/oneflow/user/kernels/index_add_kernel.cu
  • https://github.com/BBuf/how-to-optim-algorithm-in-cuda

您可能感兴趣的与本文相关的镜像

PyTorch 2.6

PyTorch 2.6

PyTorch
Cuda

PyTorch 是一个开源的 Python 机器学习库,基于 Torch 库,底层由 C++ 实现,应用于人工智能领域,如计算机视觉和自然语言处理

int mqtt_hub_proxy_handler(struct mqtt_cloud_ctx *mqtt_cloud, struct child_dev_s *child_dev, int serviceType, int nodeOption, struct blob_buf *httpPayloadBuf, struct ubus_call *call) { int ret = -1; if(!mqtt_cloud) { return ret; } struct mqtt_node_value_s mqtt_node_value = {0}; char *topic = "/proxy"; char *payload = NULL; int payloadLen = 0; char clientToken[MQTT_CLIENT_TOKEN_LEN] = {0}; const char *key[] = {"deviceId", "user", "model", "hwVer", "fwId", "oemId", "fwVer", "deviceType", "deviceToken"}; const char *value[ARRAY_SIZE(key)] = {NULL}; struct blob_buf bBuf = {NULL}; void *tableReq = NULL, *tableBody = NULL; struct tpsocket_buf *queryParams = NULL; char *host = NULL; char *path = get_service_path(serviceType); struct device_params *device = NULL; char *validateToken = NULL; char *validateUrl = NULL; blobmsg_buf_init(&bBuf); if(child_dev) { device = &child_dev->device; validateToken = child_dev->validateDevToken; validateUrl = child_dev->validateUrl; } else { device = &mqtt_cloud->device; validateToken = mqtt_cloud->validateDevToken; validateUrl = mqtt_cloud->validateUrl; } if(serviceType == SERVICE_VALIDATE || serviceType == SERVICE_VALIDATE_UPDATE || serviceType == SERVICE_DEVICESECRET_APPLY || serviceType == SERVICE_DEVICESECRET_REFRESH) { validateUrl = mqtt_cloud->validateCfgUrl; } else if(serviceType == SERVICE_DEVICESECRET_CONFIRM) { validateUrl = mqtt_cloud->validateCfgUrl; } else if(!validateToken) { DBG_ERR("validate token error\n"); DBG_ERR("serviceType is %d\n", serviceType); goto end; } if(!device || !validateUrl) { DBG_ERR("device params or validateUrl error\n"); goto end; } value[0] = device->deviceId; value[1] = device->cloudUserName ? device->cloudUserName : ""; value[2] = device->deviceModel; value[3] = device->deviceHwVer; value[4] = device->fwId; value[5] = device->oemId; value[6] = device->fwVer; value[7] = device->deviceType; if(serviceType != SERVICE_VALIDATE && serviceType != SERVICE_VALIDATE_UPDATE && serviceType != SERVICE_DEVICESECRET_APPLY && \ serviceType != SERVICE_DEVICESECRET_REFRESH && serviceType != SERVICE_DEVICESECRET_CONFIRM) { value[8] = validateToken; } else { key[8] = NULL; } mqtt_cloud_generate_client_token(clientToken, sizeof(clientToken), mqtt_cloud->device.deviceMac, mqtt_cloud->mqtt_packet_id); blobmsg_add_string(&bBuf, "clientToken", clientToken); blobmsg_add_u64(&bBuf, "timestamp", get_timestamp()); tableReq = blobmsg_open_table(&bBuf, "request"); if(!tableReq) { DBG_ERR("blobmsg_open_table error\n"); goto end; } blobmsg_add_string(&bBuf, "path", path ? path : ""); queryParams = tpsocket_query_encode(key, value, ARRAY_SIZE(key)); if(!queryParams) { DBG_ERR("tpsocket_query_encode error\n"); blobmsg_close_table(&bBuf, tableReq); goto end; } blobmsg_add_string(&bBuf, "queryParams", tpbuf_data(queryParams)); tpbuf_free(queryParams); tpsocket_parse_url(validateUrl, &host, NULL, NULL, NULL); if(!host) { DBG_ERR("tpsocket_parse_url error\n"); blobmsg_close_table(&bBuf, tableReq); goto end; } blobmsg_add_string(&bBuf, "host", host); tableBody = blobmsg_open_table(&bBuf, "body"); if(!tableBody) { DBG_ERR("blobmsg_open_table error\n"); blobmsg_close_table(&bBuf, tableReq); goto end; } if(httpPayloadBuf) { int rem = 0; struct blob_attr *cur = NULL; blobmsg_for_each_attr(cur, httpPayloadBuf->head, rem) { blobmsg_add_field(&bBuf, blobmsg_type(cur), blobmsg_name(cur), blobmsg_data(cur), blobmsg_data_len(cur)); } } else { switch(serviceType) { case SERVICE_DEVICESECRET_APPLY: case SERVICE_DEVICESECRET_REFRESH: case SERVICE_DEVICESECRET_CONFIRM: { blobmsg_add_string(&bBuf, "deviceId", device->deviceId); if(mqtt_cloud->accountInfo.bindCode && mqtt_cloud->accountInfo.bindCode[0]) { blobmsg_add_string(&bBuf, "bindCode", mqtt_cloud->accountInfo.bindCode); blobmsg_add_string(&bBuf, "tcspVer", "1.3"); } else { blobmsg_add_string(&bBuf, "tcspVer", "1.2"); } blobmsg_add_string(&bBuf, "accountId", mqtt_cloud->accountInfo.accountid); blobmsg_add_string(&bBuf, "cloudUserName", mqtt_cloud->accountInfo.username); blobmsg_add_string(&bBuf, "deviceName", mqtt_cloud->device.deviceName); blobmsg_add_string(&bBuf, "deviceMac", device->deviceMac); blobmsg_add_string(&bBuf, "hwId", device->hwId); blobmsg_add_string(&bBuf, "deviceHwVer", device->deviceHwVer); blobmsg_add_string(&bBuf, "fwId", device->fwId); blobmsg_add_string(&bBuf, "deviceModel", device->deviceModel); blobmsg_add_string(&bBuf, "oemId", device->oemId); blobmsg_add_string(&bBuf, "fwVer", device->fwVer); blobmsg_add_string(&bBuf, "deviceType", device->deviceType); if (!device->alias) { blobmsg_add_string(&bBuf, "alias", ""); } else if (!child_dev) { char aliasBase64[128] = {0}; tpsocket_base64_encode((unsigned char *)device->alias, strlen(device->alias), (unsigned char *)aliasBase64); blobmsg_add_string(&bBuf, "alias", aliasBase64); } else { blobmsg_add_string(&bBuf, "alias", device->alias); } if(device->deviceSecret && device->deviceSecret[0] && serviceType != SERVICE_DEVICESECRET_APPLY) { blobmsg_add_string(&bBuf, serviceType == SERVICE_DEVICESECRET_REFRESH ? "currentDeviceSecret" : "deviceSecret", device->deviceSecret); } break; } case SERVICE_VALIDATE_UPDATE: case SERVICE_VALIDATE: { blobmsg_add_string(&bBuf, "deviceId", device->deviceId); blobmsg_add_string(&bBuf, "deviceMac", device->deviceMac); blobmsg_add_string(&bBuf, "hwId", device->hwId); if (!device->alias) { blobmsg_add_string(&bBuf, "alias", ""); } else if (!child_dev) { char aliasBase64[128] = {0}; tpsocket_base64_encode((unsigned char *)device->alias, strlen(device->alias), (unsigned char *)aliasBase64); blobmsg_add_string(&bBuf, "alias", aliasBase64); } else { blobmsg_add_string(&bBuf, "alias", device->alias); } blobmsg_add_string(&bBuf, "deviceName", device->deviceName); /*no matter for hub or child, isSupportTCSP is false*/ if(mqtt_cloud->isSupportTCSP == false) { blobmsg_add_u8(&bBuf, "isSupportTCSP", mqtt_cloud->isSupportTCSP); } /*for hub, should set bindCode if bindCode not NULL*/ if(!child_dev && mqtt_cloud->accountInfo.bindCode && mqtt_cloud->accountInfo.bindCode[0]) { blobmsg_add_string(&bBuf, "bindCode", mqtt_cloud->accountInfo.bindCode); } /*no matter for hub or child, should set deviceSecret if deviceSecret not NULL*/ if(device->deviceSecret && device->deviceSecret[0]) { blobmsg_add_string(&bBuf, "deviceSecret", device->deviceSecret); } break; } case SERVICE_BINDDEVICE: { blobmsg_add_string(&bBuf, "deviceId", device->deviceId); if(nodeOption == MQTT_NODE_OPTION_PROXY_HUB_CHECKBINDSTATUS) { blobmsg_add_string(&bBuf, "cloudUserName", mqtt_cloud->accountInfo.username); } else { blobmsg_add_string(&bBuf, "cloudUserName", mqtt_cloud->accountInfo.username); blobmsg_add_string(&bBuf, "cloudPassword", mqtt_cloud->accountInfo.password); } break; } case SERVICE_CHECKDEVICEBINDSTATUS: { blobmsg_add_string(&bBuf, "deviceId", device->deviceId); blobmsg_add_string(&bBuf, "accountId", mqtt_cloud->accountInfo.accountid); blobmsg_add_u8(&bBuf, "emailNeeded", true); break; } case SERVICE_UNBINDDEVICE: { blobmsg_add_string(&bBuf, "deviceId", device->deviceId); blobmsg_add_string(&bBuf, "cloudUserName", mqtt_cloud->accountInfo.username); break; } case SERVICE_UNBINDDEVICEWITHACCOUNTID: { blobmsg_add_string(&bBuf, "deviceId", device->deviceId); blobmsg_add_string(&bBuf, "accountId", mqtt_cloud->accountInfo.accountid); break; } //case SERVICE_BINDCODE_BIND: case SERVICE_BINDCODE_BINDDEVICE: { blobmsg_add_string(&bBuf, "deviceId", device->deviceId); blobmsg_add_string(&bBuf, "cloudUserName", mqtt_cloud->accountInfo.username); blobmsg_add_string(&bBuf, "cloudPassword", mqtt_cloud->accountInfo.password); break; } case SERVICE_BINDCODE_VERIFYBINDCODE: { blobmsg_add_string(&bBuf, "deviceId", device->deviceId); blobmsg_add_string(&bBuf, "bindCode", mqtt_cloud->accountInfo.bindCode); break; } case SERVICE_BINDCODE_CHECKDEVICEBINDSTATUS: { blobmsg_add_string(&bBuf, "deviceId", device->deviceId); blobmsg_add_string(&bBuf, "bindCode", mqtt_cloud->accountInfo.bindCode); break; } case SERVICE_BINDCODE_UNBINDDEVICE: { blobmsg_add_string(&bBuf, "deviceId", device->deviceId); blobmsg_add_string(&bBuf, "bindCode", mqtt_cloud->accountInfo.bindCode); break; } case SERVICE_UPLOADDEVICEINFO: { blobmsg_add_string(&bBuf, "deviceId", device->deviceId); blobmsg_add_string(&bBuf, "model", device->deviceModel); blobmsg_add_string(&bBuf, "hwVer", device->deviceHwVer); blobmsg_add_string(&bBuf, "hwId", device->hwId); blobmsg_add_string(&bBuf, "fwId", device->fwId); blobmsg_add_string(&bBuf, "oemId", device->oemId); blobmsg_add_string(&bBuf, "fwVer", device->fwVer); blobmsg_add_string(&bBuf, "deviceType", device->deviceType); blobmsg_add_string(&bBuf, "deviceName", device->deviceName); break; } case SERVICE_UPDATEALIAS: { blobmsg_add_string(&bBuf, "deviceId", device->deviceId); if (!device->alias) { blobmsg_add_string(&bBuf, "alias", ""); } else if (!child_dev) { char aliasBase64[128] = {0}; tpsocket_base64_encode((unsigned char *)device->alias, strlen(device->alias), (unsigned char *)aliasBase64); blobmsg_add_string(&bBuf, "alias", aliasBase64); } else { blobmsg_add_string(&bBuf, "alias", device->alias); } break; } case SERVICE_GETDSTRULE: { struct blob_buf tb = {NULL}; char *zoneId = NULL; blobmsg_buf_init(&tb); zoneId = blobuci_get(datetime, timezone, area, blobuci_get_string, &tb); blobmsg_add_string(&bBuf, "zoneId", zoneId ? zoneId : "UTC"); blob_buf_free(&tb); break; } case SERVICE_GETSECUREINTLFWLIST: case SERVICE_GETINTLFWLIST: { blobmsg_add_string(&bBuf, "deviceId", device->deviceId); blobmsg_add_string(&bBuf, "hwId", device->hwId); blobmsg_add_string(&bBuf, "fwId", device->fwId); blobmsg_add_string(&bBuf, "oemId", device->oemId); blobmsg_add_string(&bBuf, "devFwCurrentVer", device->fwVerShort); break; } case SERVICE_GETDEVICETOKENWITHSERVICEIDS: { void *array = NULL; unsigned int flag_type = 0; blobmsg_add_string(&bBuf, "deviceId", device->deviceId); if(NULL != (array = blobmsg_open_array(&bBuf, "serviceIds"))){ flag_type = mqtt_deviceType_to_flag_type(device->deviceType); mqtt_service_token_array(&bBuf, flag_type); blobmsg_close_array(&bBuf, array); } break; } case LOG_SERVICE_DEVICE_LOG: { blobmsg_add_string(&bBuf, "deviceId", device->deviceId); break; } case SERVICE_GETSERVICE_STATUSINFO: { void *array = NULL; if(NULL != (array = blobmsg_open_array(&bBuf, "serviceNameList"))) { blobmsg_add_string(&bBuf, NULL, "userExperienceImprove"); blobmsg_close_array(&bBuf, array); } blobmsg_add_string(&bBuf, "deviceId", device->deviceId); } break; default: break; } } blobmsg_close_table(&bBuf, tableBody); blobmsg_close_table(&bBuf, tableReq); payload = blobmsg_format_json(bBuf.head, true); if (!payload) { DBG_ERR("blobmsg_format_json error\n"); goto end; } if (strcasestr(payload, "password") || strcasestr(payload, "passwd") || strcasestr(payload, "pwd") || strcasestr(payload, "usr") || strcasestr(payload, "user") || strcasestr(payload, "account") || strcasestr(payload, "email")) { DBG_DBG("/proxy FILTER\n"); } else { DBG_DBG("/proxy: [%s]\n", payload); } payloadLen = strlen(payload); mqtt_node_value.nodePublishType = PUBLISH_WAIT_REPLY_DEV; mqtt_node_value.suffixPublishTopic = topic; mqtt_node_value.service_path = path; mqtt_node_value.clientToken = clientToken; mqtt_node_value.childTopic = false; mqtt_node_value.childDevId = child_dev && child_dev->device.deviceId ? child_dev->device.deviceId : NULL; mqtt_node_value.childDev = NULL; mqtt_node_value.nodeOption = nodeOption; mqtt_node_value.cb = mqtt_hub_proxy_handler_cb; mqtt_node_value.payload = payload; mqtt_node_value.payloadLen = payloadLen; if(NULL == mqtt_node_add(call, mqtt_cloud, &mqtt_node_value, MQTT_CHANNEL_TAPO)) { DBG_ERR("mqtt node add error\n"); goto end; } ret = 0; end: STRFREE(payload); STRFREE(host); blob_buf_free(&bBuf); return ret; } 梳理代码的逻辑。
最新发布
12-12
评论
成就一亿技术人!
拼手气红包6.0元
还能输入1000个字符
 
红包 添加红包
表情包 插入表情
 条评论被折叠 查看
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值