C++ AMP: passing tile_static (LDS) memory as a function parameter: 2 methods (array or pointer)

本文介绍了一个使用 C++ AMP 实现并行计算的例子,包括数组相加的方法和利用平铺进行并行计算的技术。展示了如何定义并行域、执行计算任务以及打印结果。
// cppamp1.cpp : Defines the entry point for the console application.
//

#include "stdafx.h"


#include <amp.h>
#include <iostream>
using namespace concurrency;

// Number of elements in the 1-D arrays processed by CppAmpMethod().
const int size = 5;

// Returns the number of times num must be halved (rounding up) until it
// drops to 1 or below, i.e. ceil(log2(num)) for num >= 1; returns 0 for
// num == 0 and num == 1. Callable from accelerator code.
inline unsigned long Log2 (unsigned long num) restrict(amp)
{
    unsigned long halvings = 0;
    // (remaining + 1) >> 1 is a halving that rounds up, so non powers of
    // two need one more step than the next lower power of two.
    for (unsigned long remaining = num; remaining > 1; remaining = (remaining + 1) >> 1) {
        ++halvings;
    }
    return halvings;
}

void CppAmpMethod() {
    int aCPP[] = {1, 2, 3, 4, 5};
    int bCPP[] = {6, 7, 8, 9, 10};
    int sumCPP[size];
    
    // Create C++ AMP objects.
    array_view<const int, 1> a(size, aCPP);
    array_view<const int, 1> b(size, bCPP);
    array_view<int, 1> sum(size, sumCPP);
    sum.discard_data();

    parallel_for_each( 
        // Define the compute domain, which is the set of threads that are created.
        sum.extent, 
        // Define the code to run on each thread on the accelerator.
        [=](index<1> idx) restrict(amp)
    {
        sum[idx] = a[idx] + b[idx];
    }
    );

    // Print the results. The expected output is "7, 9, 11, 13, 15".
    for (int i = 0; i < size; i++) {
        std::cout << sum[i] << "\n";
    }
}

// Per-element kernel helper: stores a[idx] + b[idx] + Log2(b[idx]) into
// sum[idx]. Callable from accelerator code.
void AddElements(index<1> idx, array_view<int, 1> sum, array_view<int, 1> a, array_view<int, 1> b) restrict(amp)
{
    // b[idx] feeds both the addition and the Log2 term; read it once.
    const int rhs = b[idx];
    sum[idx] = a[idx] + rhs + Log2(rhs);
}

void AddArraysWithFunction() {

    int aCPP[] = {1, 2, 3, 4, 5};
    int bCPP[] = {6, 7, 8, 9, 10};
    int sumCPP[5] = {0, 0, 0, 0, 0};

    array_view<int, 1> a(5, aCPP);
    array_view<int, 1> b(5, bCPP);
    array_view<int, 1> sum(5, sumCPP);

    parallel_for_each(
        sum.extent, 
        [=](index<1> idx) restrict(amp)
        {
            AddElements(idx, sum, a, b);
        }
    );

    for (int i = 0; i < 5; i++) {
        std::cout << sum[i] << "\n";
    }
}

//METHOD 1: tile_static int lds[2][3];

// Method 1: the caller's 2x3 tile buffer is received as a 2-D array
// parameter. Each thread stores an encoding of its global index in the
// buffer, waits on the tile barrier, then writes:
//   gid - the value it stored (global[0] | global[1] * 100)
//   tid - tile[0]  | tile[1]  * 10000
//   lid - local[0] | local[1] * 10000
// `input` is accepted but not used.
void Idx_TiledAmp(tiled_index<2,3> idx,array_view<int, 2> input, array_view<int, 2> gid, array_view<int, 2> tid, array_view<int, 2> lid,  int lds[2][3]) restrict(amp)
{
    // Position of this thread inside its tile.
    const int localRow = idx.local[0];
    const int localCol = idx.local[1];

    lds[localRow][localCol] = idx.global[0] | idx.global[1] * 100;
    idx.barrier.wait();

    gid[idx.global] = lds[localRow][localCol];
    tid[idx.global] = idx.tile[0] | idx.tile[1] * 10000;
    lid[idx.global] = localRow | localCol * 10000;
}

//METHOD 2: int* lds; the 2-D tile_static array is passed as a pointer to its first element and indexed as a 1-D array
// Method 2: the caller's 2x3 tile buffer is received as a flat int
// pointer. Each thread stores an encoding of its global index in its own
// slot, waits on the tile barrier, then writes:
//   gid - the value it stored (global[0] | global[1] * 100)
//   tid - tile[0]  | tile[1]  * 10000
//   lid - local[0] | local[1] * 10000
// `input` is accepted but not used.
void Idx_TiledAmp2(tiled_index<2,3> idx,array_view<int, 2> input, array_view<int, 2> gid, array_view<int, 2> tid, array_view<int, 2> lid,  int* lds) restrict(amp)
{
    // The buffer is a row-major int[2][3], so the row stride is the tile
    // width (3). A stride of 2 would map local (0,2) and (1,0) to the
    // same slot and make two threads overwrite each other.
    const int tileCols = 3;
    const int slot = idx.local[0] * tileCols + idx.local[1];

    lds[slot] = idx.global[0] | idx.global[1]* 100;
    idx.barrier.wait();
    gid[idx.global] = lds[slot];
    tid[idx.global] = idx.tile[0]   | idx.tile[1] * 10000;
    lid[idx.global] = idx.local[0]  | idx.local[1] * 10000;
}


void TiledAmp()
{
    // Sample data:
    int sampledata[] = {
        1, 2, 3, 4, 5, 6,
        11, 12, 13, 14, 15, 16,
        21, 22, 23, 24, 25, 26,
        31, 32, 33, 34, 35, 36,};

    // The tiles:
    // 2 2    9 7    1 4
    // 4 4    8 8    3 4
    //
    // 1 5    1 2    5 2
    // 6 8    3 2    7 2

    // Averages:
    int averagedata[] = { 
        0, 0, 0, 0, 0, 0, 
        0, 0, 0, 0, 0, 0, 
        0, 0, 0, 0, 0, 0, 
        0, 0, 0, 0, 0, 0, 
    };

    int gid_data[] = { 
        0, 0, 0, 0, 0, 0, 
        0, 0, 0, 0, 0, 0, 
        0, 0, 0, 0, 0, 0, 
        0, 0, 0, 0, 0, 0, 
    };


    int tid_data[] = { 
        0, 0, 0, 0, 0, 0, 
        0, 0, 0, 0, 0, 0, 
        0, 0, 0, 0, 0, 0, 
        0, 0, 0, 0, 0, 0, 
    };

    int lid_data[] = { 
        0, 0, 0, 0, 0, 0, 
        0, 0, 0, 0, 0, 0, 
        0, 0, 0, 0, 0, 0, 
        0, 0, 0, 0, 0, 0, 
    };

    array_view<int, 2> sample(4, 6, sampledata);
    array_view<int, 2> gid(4, 6, gid_data);
    array_view<int, 2> tid(4, 6, tid_data);
    array_view<int, 2> lid(4, 6, lid_data);

    array_view<int, 2> average(4, 6, averagedata);

    parallel_for_each(
        // Create threads for sample.extent and divide the extent into 2 x 2 tiles.
        sample.extent.tile<2,3>(),
        [=](tiled_index<2,3> idx) restrict(amp)
        {
            tile_static int sample2[2][3];
            //Idx_TiledAmp(idx, sample, gid, tid, lid, sample2);
            Idx_TiledAmp2(idx, sample, gid, tid, lid, &sample2[0][0]);
        }
    );

    std::cout << "sample\n";
    for (int i = 0; i < 4; i++) {
        for (int j = 0; j < 6; j++) {
            std::cout << sample(i,j) << " ";
        }
        std::cout << "\n";
    }
    std::cout << "gid\n";
    for (int i = 0; i < 4; i++) {
        for (int j = 0; j < 6; j++) {
            std::cout << gid(i,j) << " ";
        }
        std::cout << "\n";
    }

    std::cout << "\ntid\n";
    for (int i = 0; i < 4; i++) {
        for (int j = 0; j < 6; j++) {
            std::cout << tid(i,j) << " ";
        }
        std::cout << "\n";
    }

    std::cout << "\nlid\n";
    for (int i = 0; i < 4; i++) {
        for (int j = 0; j < 6; j++) {
            std::cout << lid(i,j) << " ";
        }
        std::cout << "\n";
    }
}


// Console entry point: runs the tiled demo, then blocks until two
// characters have been read from stdin before returning 0.
int _tmain(int argc, _TCHAR* argv[])
{
    // The other demos are kept for reference but are disabled.
    //CppAmpMethod();
    //AddArraysWithFunction();

    TiledAmp();

    // Two reads, as in the original; presumably this keeps the console
    // window open until the user presses a key. The characters read are
    // not used.
    getc(stdin);
    getc(stdin);
    return 0;
}

请仔细阅读和深度思考分析下面函数,绝对保持原始代码的处理流程和步骤不变, 绝对不要遗漏各种条件判断和标志位管理的处理和各种逻辑功能处理, 采用 google::protobuf::Descriptor 和 google::protobuf::Reflection 与C/C++11标准, 绝对不要输出简化代码和处理流程和步骤,推导并重构完整的可编译的所有函数的全部代码 1.保持所有原始功能不变 2.提高执行效率,降低计算复杂度 3.已经给定的结构体名字和元素不要更改,详细的中文注释 4.自动添加中文注释说明功能逻辑 5.不使用 auto,使用显式 for 循环 6.结构体采用32位定义 7.不要使用小函数,保持原始的函数定义 8.严格保持protobuf字段映射关系 函数中的 HDDMXng::Tile::Tile 映射为 message Tile { optional uint32 graphid = 1; optional sint32 tx = 2; optional sint32 ty = 3; optional uint32 firstsiteid = 4; optional string name = 5; } 将 _BYTE tile_msg[8] 映射为 HDDMXng::Tile tile_msg; void __fastcall HDDMTile::readme_pb(HDDMTile *this, std::istream *a2) { google::protobuf::Message *v2; // rdx int dwordC; // eax int v4; // edx _WORD tile_msg[10]; // [rsp+0h] [rbp-48h] BYREF int n0x1000000; // [rsp+14h] [rbp-34h] int dword8; // [rsp+18h] [rbp-30h] int v8; // [rsp+1Ch] [rbp-2Ch] std::string *v9; // [rsp+20h] [rbp-28h] if ( HDDMDeviceDump::useXngMarks ) std::istream::read(a2, HDDMDeviceDump::markBuffer, 4); HDDMXng::Tile::Tile((HDDMXng::Tile *)tile_msg); HDDMDevice::readMessage((HDDMDevice *)a2, (std::istream *)tile_msg, v2); LOWORD(this->tileCode) = tile_msg[8] &amp; 0x3FF | this->tileCode &amp; 0xFC00; dwordC = n0x1000000; if ( n0x1000000 > 0x1000000 ) { dwordC = n0x1000000 &amp; 0xFFFFFF; if ( n0x1000000 >> 24 == 1 ) dwordC = -dwordC; } this->tileCode2 = dwordC; v4 = v8; this->tileCode1 = dword8; this->tileCode = (v4 << 10) | this->tileCode &amp; 0x3FF; std::string::assign((std::string *)&amp;this->qword18, v9); HDDMXng::Tile::~Tile((HDDMXng::Tile *)tile_msg); } void __fastcall HDDMTile::writeme_pb(HDDMTile *this, HDDMTile *HDDMTile) { std::string *p__ZN6google8protobuf8internal12kEmptyStringE_1; // rdi const google::protobuf::Message *v3; // rdx __int16 tileCode; // [rsp+0h] [rbp-88h] char v5; // [rsp+1Fh] [rbp-69h] BYREF int tile_msg; // [rsp+20h] [rbp-68h] OVERLAPPED BYREF _DWORD tile_msg_16[8]; // [rsp+30h] [rbp-58h] BYREF std::string *p__ZN6google8protobuf8internal12kEmptyStringE; // 
[rsp+50h] [rbp-38h] int v9; // [rsp+5Ch] [rbp-2Ch] if ( HDDMDeviceDump::useXngMarks ) std::ostream::write((std::ostream *)HDDMTile, "TILE", 4); HDDMXng::Tile::Tile((HDDMXng::Tile *)tile_msg_16); tileCode = this->tileCode; v9 |= 0xFu; tile_msg_16[4] = tileCode &amp; 0x3FF; tile_msg_16[5] = this->tileCode2; tile_msg_16[6] = this->tileCode1; tile_msg_16[7] = (unsigned int)this->tileCode >> 10; HDDMTile::getName((HDDMTile *)&amp;tile_msg); v9 |= 0x10u; p__ZN6google8protobuf8internal12kEmptyStringE_1 = p__ZN6google8protobuf8internal12kEmptyStringE; if ( p__ZN6google8protobuf8internal12kEmptyStringE == (std::string *)&amp;google::protobuf::internal::kEmptyString ) { p__ZN6google8protobuf8internal12kEmptyStringE_1 = (std::string *)operator new(8u); *(_QWORD *)p__ZN6google8protobuf8internal12kEmptyStringE_1 = (char *)&amp;std::string::_Rep::_S_empty_rep_storage + 24; p__ZN6google8protobuf8internal12kEmptyStringE = p__ZN6google8protobuf8internal12kEmptyStringE_1; } std::string::assign(p__ZN6google8protobuf8internal12kEmptyStringE_1, (const std::string *)&amp;tile_msg); std::string::_Rep::_M_dispose(*(_QWORD *)&amp;tile_msg - 24LL, &amp;v5); HDDMDevice::writeMessage((HDDMDevice *)HDDMTile, (std::ostream *)tile_msg_16, v3); HDDMXng::Tile::~Tile((HDDMXng::Tile *)tile_msg_16); } __int64 __fastcall HDDMTile::print(HDDMTile *this, std::ostream *a2, const std::string *a3) { unsigned __int16 v4; // r13 __int64 v5; // r14 __int64 v6; // r14 __int64 v7; // rax unsigned int v8; // r13d unsigned __int8 v9; // r15 __int64 v10; // r14 __int64 v11; // r14 __int64 v12; // r15 __int64 v13; // r15 __int64 v14; // r14 __int64 v15; // r14 __int64 v16; // rax __int64 dword28; // r13 __int64 v18; // rbx __int64 v19; // rbx __int64 v20; // rbx __int64 v21; // rax unsigned int tileCode1; // [rsp+4h] [rbp-44h] unsigned __int8 v24; // [rsp+Bh] [rbp-3Dh] unsigned int tileCode2; // [rsp+Ch] [rbp-3Ch] v4 = this->tileCode &amp; 0x3FF; v5 = std::__ostream_insert<char,std::char_traits<char>>(a2, *(_QWORD 
*)a3, *(_QWORD *)(*(_QWORD *)a3 - 24LL)); std::__ostream_insert<char,std::char_traits<char>>(v5, "TILE : ", 7); v6 = std::__ostream_insert<char,std::char_traits<char>>(v5, this->qword18, *(_QWORD *)(this->qword18 - 24LL)); std::__ostream_insert<char,std::char_traits<char>>(v6, " m_graphid : ", 13); v7 = std::ostream::_M_insert<unsigned long>(v6, v4); std::endl<char,std::char_traits<char>>(v7); tileCode1 = this->tileCode1; tileCode2 = this->tileCode2; v8 = (unsigned int)this->tileCode >> 10; v9 = this->tileCode &amp; 0x1F; v24 = (LOWORD(this->tileCode) >> 6) &amp; 0xF; v10 = std::__ostream_insert<char,std::char_traits<char>>(a2, *(_QWORD *)a3, *(_QWORD *)(*(_QWORD *)a3 - 24LL)); std::__ostream_insert<char,std::char_traits<char>>(v10, "m_wasted : ", 11); v11 = std::ostream::_M_insert<unsigned long>(v10, v9); std::__ostream_insert<char,std::char_traits<char>>(v11, " m_tx : ", 8); v12 = std::ostream::operator<<(v11, tileCode2); std::__ostream_insert<char,std::char_traits<char>>(v12, " m_wasted1 : ", 13); v13 = std::ostream::operator<<(v12, 1); std::__ostream_insert<char,std::char_traits<char>>(v13, " m_deviceid : ", 14); v14 = std::ostream::_M_insert<unsigned long>(v13, v24); std::__ostream_insert<char,std::char_traits<char>>(v14, " m_firstsiteid : ", 17); v15 = std::ostream::_M_insert<unsigned long>(v14, v8); std::__ostream_insert<char,std::char_traits<char>>(v15, " m_ty : ", 8); v16 = std::ostream::operator<<(v15, tileCode1); std::endl<char,std::char_traits<char>>(v16); dword28 = (unsigned int)this->dword28; LOWORD(v15) = HDDMTile::getGridPointX(this); LOWORD(v13) = HDDMTile::getGridPointY(this); v18 = std::__ostream_insert<char,std::char_traits<char>>(a2, *(_QWORD *)a3, *(_QWORD *)(*(_QWORD *)a3 - 24LL)); std::__ostream_insert<char,std::char_traits<char>>(v18, "tilerow: ", 9); v19 = std::ostream::_M_insert<unsigned long>(v18, (unsigned __int16)v13); std::__ostream_insert<char,std::char_traits<char>>(v19, " tilecol : ", 11); v20 = std::ostream::_M_insert<unsigned 
long>(v19, (unsigned __int16)v15); std::__ostream_insert<char,std::char_traits<char>>(v20, " index : ", 9); v21 = std::ostream::_M_insert<unsigned long>(v20, dword28); return std::endl<char,std::char_traits<char>>(v21); }
11-07
评论
成就一亿技术人!
拼手气红包6.0元
还能输入1000个字符
 
红包 添加红包
表情包 插入表情
 条评论被折叠 查看
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值