#include <stdio.h>
#include <stdlib.h>
#include "dpc_t.cuh"
// Number of blocks
#define NUM_BLOCKS 1024
// Number of threads per block
#define NUM_COUNT 512
// Dimension of the dataset
#define DIMENSION 4
using namespace std;
/*---------------------- 核函数 ----------------------*/
static __global__ void find_cell_contents(const int N, int* cell_count, const int* cell_count_sum, int* cell_contents, float* d_data,
const int nx, const int ny, const int nz, const int nw, const double rc_inv, int M)
{
const int n1 = blockIdx.x * blockDim.x + threadIdx.x;
if (n1 < N) {
float4 temp_n1;
temp_n1.x = d_data[n1 * M + 0];
temp_n1.y = d_data[n1 * M + 1];
temp_n1.z = d_data[n1 * M + 2];
temp_n1.w = d_data[n1 * M + 3];
int cell_id;
find_cell_id(temp_n1.x, temp_n1.y, temp_n1.z, temp_n1.w, rc_inv, nx, ny, nz, nw, cell_id);
const int ind = atomicAdd(&cell_count[cell_id], 1);
cell_contents[cell_count_sum[cell_id] + ind] = n1;
}
}
void __global__ DPC_computeRho_kernel(float* d_data, int* d_NL, int N, int M, float dc)
{
int n1 = blockIdx.x * blockDim.x + threadIdx.x;
if (n1 < N)
{
float4 temp_n1;
temp_n1.x = d_data[n1 * M + 0];
temp_n1.y = d_data[n1 * M + 1];
temp_n1.z = d_data[n1 * M + 2];
temp_n1.w = d_data[n1 * M + 3];
for (int n2 = n1 + 1; n2 < N; n2++)
{
float distance_square = 0;
distance_square = distance_square + (temp_n1.x - d_data[n2 * M + 0]) * (temp_n1.x - d_data[n2 * M + 0]);
distance_square = distance_square + (temp_n1.y - d_data[n2 * M + 1]) * (temp_n1.y - d_data[n2 * M + 1]);
distance_square = distance_square + (temp_n1.z - d_data[n2 * M + 2]) * (temp_n1.z - d_data[n2 * M + 2]);
distance_square = distance_square + (temp_n1.w - d_data[n2 * M + 3]) * (temp_n1.w - d_data[n2 * M + 3]);
if (distance_square < dc)
{
//d_NL[n1]++;
//d_NL[n2]++;
atomicAdd(&d_NL[n1], 1);
atomicAdd(&d_NL[n2], 1);
}
}
}
}
void __global__ DPC_computeRho_kernel_s(float* d_data, int* d_NL, int N, int M, float dc)
{
// 当前块的起始索引
int block_start = blockIdx.x * blockDim.x;
int thread_id = threadIdx.x;
// 共享内存声明
__shared__ float4 shared_data[NUM_COUNT];
// 将当前块的数据加载到共享内存
if (block_start + thread_id < N) {
shared_data[thread_id] = *((float4*)(d_data + (block_start + thread_id) * M));
}
__syncthreads();
// 每个线程处理一个n1
int n1 = block_start + thread_id;
if (n1 >= N) return;
// 局部计数器减少全局原子操作
int local_count = 0;
float4 temp_n1 = *((float4*)(d_data + n1 * M));
// 处理块内数据点
for (int n2_in_block = 0; n2_in_block < blockDim.x; ++n2_in_block) {
int n2_global = block_start + n2_in_block;
if (n2_global <= n1 || n2_global >= N) continue;
float4 temp_n2 = shared_data[n2_in_block];
float dx = temp_n1.x - temp_n2.x;
float dy = temp_n1.y - temp_n2.y;
float dz = temp_n1.z - temp_n2.z;
float dw = temp_n1.w - temp_n2.w;
float dist_sq = dx * dx + dy * dy + dz * dz + dw * dw;
if (dist_sq < dc) {
local_count++;
}
}
// 处理后续数据块
for (int other_block = blockIdx.x + 1; other_block < gridDim.x; ++other_block) {
int other_start = other_block * blockDim.x;
// 加载其他块到共享内存
__shared__ float4 other_shared[NUM_COUNT];
if (other_start + thread_id < N) {
other_shared[thread_id] = *((float4*)(d_data + (other_start + thread_id) * M));
}
__syncthreads();
// 与当前块数据比较
for (int n2_in_other = 0; n2_in_other < blockDim.x; ++n2_in_other) {
int n2_global = other_start + n2_in_other;
if (n2_global >= N) continue;
float4 temp_n2 = other_shared[n2_in_other];
float dx = temp_n1.x - temp_n2.x;
float dy = temp_n1.y - temp_n2.y;
float dz = temp_n1.z - temp_n2.z;
float dw = temp_n1.w - temp_n2.w;
float dist_sq = dx * dx + dy * dy + dz * dz + dw * dw;
if (dist_sq < dc) {
local_count++;
}
}
__syncthreads();
}
// 最终原子操作
atomicAdd(&d_NL[n1], local_count);
}
void __global__ DPC_computeRho_kernel_s_2(float* d_data, int* d_NL, int N, int M, float dc)
{
// 修改点1:共享内存大小扩大一倍以存储两个点的数据
__shared__ float4 shared_data[NUM_COUNT * 2]; // 使用动态共享内存
// 修改点2:计算块起始索引时考虑双倍处理量
int block_start = blockIdx.x * (blockDim.x * 2);
int thread_id = threadIdx.x;
// 修改点3:每个线程处理两个相邻的n1点
int n1_1 = block_start + 2 * thread_id;
int n1_2 = n1_1 + 1;
// 修改点4:双点数据加载到共享内存
if (n1_1 < N) {
shared_data[2 * thread_id] = *((float4*)(d_data + n1_1 * M));
}
if (n1_2 < N) {
shared_data[2 * thread_id + 1] = *((float4*)(d_data + n1_2 * M));
}
__syncthreads();
// 修改点5:双点局部计数器
int local_count_1 = 0, local_count_2 = 0;
float4 temp_n1_1, temp_n1_2;
// 加载当前线程处理的两个点数据
if (n1_1 < N) temp_n1_1 = shared_data[2 * thread_id];
if (n1_2 < N) temp_n1_2 = shared_data[2 * thread_id + 1];
// 处理当前块内数据点(范围扩大为双倍)
for (int n2_in_block = 0; n2_in_block < 2 * blockDim.x; ++n2_in_block) {
int n2_global = block_start + n2_in_block;
if (n2_global >= N) continue;
float4 temp_n2 = shared_data[n2_in_block];
// 处理第一个点n1_1
if (n1_1 < N && n2_global > n1_1) {
float dx = temp_n1_1.x - temp_n2.x;
float dy = temp_n1_1.y - temp_n2.y;
float dz = temp_n1_1.z - temp_n2.z;
float dw = temp_n1_1.w - temp_n2.w;
float dist_sq = dx * dx + dy * dy + dz * dz + dw * dw;
if (dist_sq < dc) local_count_1++;
}
// 处理第二个点n1_2
if (n1_2 < N && n2_global > n1_2) {
float dx = temp_n1_2.x - temp_n2.x;
float dy = temp_n1_2.y - temp_n2.y;
float dz = temp_n1_2.z - temp_n2.z;
float dw = temp_n1_2.w - temp_n2.w;
float dist_sq = dx * dx + dy * dy + dz * dz + dw * dw;
if (dist_sq < dc) local_count_2++;
}
}
// 处理后续数据块(需要调整other_block计算)
for (int other_block = blockIdx.x + 1; other_block < gridDim.x; ++other_block) {
int other_start = other_block * (blockDim.x * 2);
// 修改点6:加载其他块的双点数据
//__shared__ float4 other_shared[NUM_COUNT * 2];
int load_pos = 2 * thread_id;
if (other_start + load_pos < N) {
shared_data[load_pos] = *((float4*)(d_data + (other_start + load_pos) * M));
}
if (other_start + load_pos + 1 < N) {
shared_data[load_pos + 1] = *((float4*)(d_data + (other_start + load_pos + 1) * M));
}
__syncthreads();
// 比较其他块中的点
for (int n2_in_other = 0; n2_in_other < 2 * blockDim.x; ++n2_in_other) {
int n2_global = other_start + n2_in_other;
if (n2_global >= N) continue;
float4 temp_n2 = shared_data[n2_in_other];
// 处理第一个点n1_1
if (n1_1 < N && n2_global > n1_1) {
float dx = temp_n1_1.x - temp_n2.x;
float dy = temp_n1_1.y - temp_n2.y;
float dz = temp_n1_1.z - temp_n2.z;
float dw = temp_n1_1.w - temp_n2.w;
float dist_sq = dx * dx + dy * dy + dz * dz + dw * dw;
if (dist_sq < dc) local_count_1++;
}
// 处理第二个点n1_2
if (n1_2 < N && n2_global > n1_2) {
float dx = temp_n1_2.x - temp_n2.x;
float dy = temp_n1_2.y - temp_n2.y;
float dz = temp_n1_2.z - temp_n2.z;
float dw = temp_n1_2.w - temp_n2.w;
float dist_sq = dx * dx + dy * dy + dz * dz + dw * dw;
if (dist_sq < dc) local_count_2++;
}
}
__syncthreads();
}
// 修改点7:双点原子操作
if (n1_1 < N) atomicAdd(&d_NL[n1_1], local_count_1);
if (n1_2 < N) atomicAdd(&d_NL[n1_2], local_count_2);
}
void __global__ DPC_computeRhoM_kernel(float* d_data, int* d_NL, int N, int M, float dc, int nx, int ny, int nz, int nw, float rc_inv,
const int* cell_counts, const int* cell_count_sum, const int* cell_contents)
{
const int n1 = blockIdx.x * blockDim.x + threadIdx.x;
int count = 0;
if (n1 < N)
{
float4 temp_n1;
temp_n1.x = d_data[n1 * M + 0];
temp_n1.y = d_data[n1 * M + 1];
temp_n1.z = d_data[n1 * M + 2];
temp_n1.w = d_data[n1 * M + 3];
int cell_id;
int cell_id_x;
int cell_id_y;
int cell_id_z;
int cell_id_w;
find_cell_id(temp_n1.x, temp_n1.y, temp_n1.z, temp_n1.w, rc_inv, nx, ny, nz, nw, cell_id_x, cell_id_y, cell_id_z, cell_id_w, cell_id);
for (int dw = -1; dw <= 1; ++dw) {
for (int dz = -1; dz <= 1; ++dz) {
for (int dy = -1; dy <= 1; ++dy) {
for (int dx = -1; dx <= 1; ++dx) {
int neighbor_cell = cell_id + dw * nx * ny * nz + dz * nx * ny + dy * nx + dx;
if (cell_id_x + dx < 0)
neighbor_cell += nx;
if (cell_id_x + dx >= nx)
neighbor_cell -= nx;
if (cell_id_y + dy < 0)
neighbor_cell += ny * nx;
if (cell_id_y + dy >= ny)
neighbor_cell -= ny * nx;
if (cell_id_z + dz < 0)
neighbor_cell += nz * ny * nx;
if (cell_id_z + dz >= nz)
neighbor_cell -= nz * ny * nx;
if (cell_id_z + dw < 0)
neighbor_cell += nz * ny * nx * nw;
if (cell_id_z + dw >= nz)
neighbor_cell -= nz * ny * nx * nw;
const int num_atoms_neighbor_cell = cell_counts[neighbor_cell];
const int num_atoms_previous_cells = cell_count_sum[neighbor_cell];
for (int m = 0; m < num_atoms_neighbor_cell; ++m) {
const int n2 = cell_contents[num_atoms_previous_cells + m];
if (n1 != n2) {
float x12 = temp_n1.x - d_data[n2 * M + 0];
float y12 = temp_n1.y - d_data[n2 * M + 1];
float z12 = temp_n1.z - d_data[n2 * M + 2];
float w12 = temp_n1.w - d_data[n2 * M + 3];
const float d2 = x12 * x12 + y12 * y12 + z12 * z12 + w12 * w12;
if (d2 < dc) {
atomicAdd(&d_NL[n1], 1);
atomicAdd(&d_NL[n2], 1);
}
}
}
}
}
}
}
}
}
void __global__ DPC_computeRhoD_kernel(float* d_data, int* d_NL, int N, int M, float dc, float* distance, float* min_dist, float* max_dist,int* min_idx)
{
int n1 = blockIdx.x * blockDim.x + threadIdx.x;
if (n1 < N)
{
float4 temp_n1;
temp_n1.x = d_data[n1 * M + 0];
temp_n1.y = d_data[n1 * M + 1];
temp_n1.z = d_data[n1 * M + 2];
temp_n1.w = d_data[n1 * M + 3];
float min = 99999999.9, max = 0;
int ti = 0, tj = 0;
for (int n2 = 0; n2 < N; n2++)
{
float distance_square = 0;
distance_square = distance_square + (temp_n1.x - d_data[n2 * M + 0]) * (temp_n1.x - d_data[n2 * M + 0]);
distance_square = distance_square + (temp_n1.y - d_data[n2 * M + 1]) * (temp_n1.y - d_data[n2 * M + 1]);
distance_square = distance_square + (temp_n1.z - d_data[n2 * M + 2]) * (temp_n1.z - d_data[n2 * M + 2]);
distance_square = distance_square + (temp_n1.w - d_data[n2 * M + 3]) * (temp_n1.w - d_data[n2 * M + 3]);
distance[n2] = distance_square;
if (distance_square < dc && n2>n1)
{
//d_NL[n1]++;
//d_NL[n2]++;
atomicAdd(&d_NL[n1], 1);
atomicAdd(&d_NL[n2], 1);
}
if (n2 == n1)continue;
if (distance[n2] > max)
{
max = distance[n2];
ti = n2;
}
if (distance[n2] < min)
{
min = distance[n2];
tj = n2;
}
}
min_dist[n1] = min;
min_idx[n1] = tj;
max_dist[n1] = max;
}
}
void __global__ DPC_computeDeltaAndNearestHigherD_kernel(float* d_delta, float* d_nearest_higher, int N, int M, int* d_indices, float* d_data)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < N)
{
int current_idx = d_indices[i];
float4 temp_n1;
temp_n1.x = d_data[current_idx * M + 0];
temp_n1.y = d_data[current_idx * M + 1];
temp_n1.z = d_data[current_idx * M + 2];
temp_n1.w = d_data[current_idx * M + 3];
float min_dist = FLT_MAX;
float max_dist = 0.0;
int nearest = -1;
for (int j = 0; j < i; ++j) {
int higher_idx = d_indices[j];
float distance_square = 0;
distance_square = distance_square + (temp_n1.x - d_data[higher_idx * M + 0]) * (temp_n1.x - d_data[higher_idx * M + 0]);
distance_square = distance_square + (temp_n1.y - d_data[higher_idx * M + 1]) * (temp_n1.y - d_data[higher_idx * M + 1]);
distance_square = distance_square + (temp_n1.z - d_data[higher_idx * M + 2]) * (temp_n1.z - d_data[higher_idx * M + 2]);
distance_square = distance_square + (temp_n1.w - d_data[higher_idx * M + 3]) * (temp_n1.w - d_data[higher_idx * M + 3]);
double dist = distance_square;
if (dist < min_dist) {
min_dist = dist;
nearest = higher_idx;
}
}
if (nearest == -1) {
// 处理密度最高的点
float max_dist = 0.0;
for (int k = 0; k < N; ++k) {
float distance_square = 0;
distance_square = distance_square + (temp_n1.x - d_data[k * M + 0]) * (temp_n1.x - d_data[k * M + 0]);
distance_square = distance_square + (temp_n1.y - d_data[k * M + 1]) * (temp_n1.y - d_data[k * M + 1]);
distance_square = distance_square + (temp_n1.z - d_data[k * M + 2]) * (temp_n1.z - d_data[k * M + 2]);
distance_square = distance_square + (temp_n1.w - d_data[k * M + 3]) * (temp_n1.w - d_data[k * M + 3]);
if (k != current_idx && distance_square > max_dist) {
max_dist = distance_square;
}
}
d_delta[current_idx] = max_dist;
d_nearest_higher[current_idx] = -1;
}
else {
d_delta[current_idx] = min_dist;
d_nearest_higher[current_idx] = nearest;
}
}
}
void __global__ DPC_computeDeltaAndNearestHigherR_kernel(float* d_delta, float* d_nearest_higher, int N, int M, int* d_indices, float* d_data,
float* min_dist, float* max_dist, int* min_idx)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < N)
{
int current_idx = d_indices[i];
float min_dist_r = FLT_MAX;
int nearest = min_idx[i];
min_dist_r = min_dist[i];
if (nearest == -1) {
// 处理密度最高的点
float max_dist_r = max_dist[i];
d_delta[current_idx] = max_dist_r;
d_nearest_higher[current_idx] = -1;
}
else {
d_delta[current_idx] = min_dist_r;
d_nearest_higher[current_idx] = nearest;
}
}
}
/*---------------------- 封装函数 ----------------------*/
extern "C"
void launch_kernel_DPC_computeRho(float* h_data, int* h_NL, int N, int M, float dc)
{
float dc2 = 1.0 * 1.0;
float* d_data;
int* d_NL;
cudaMalloc(&d_data, M * N * sizeof(float));
cudaMalloc(&d_NL, N * sizeof(int));
cudaMemcpy(d_data, h_data, M * N * sizeof(float), cudaMemcpyHostToDevice);
cudaDeviceSynchronize();
//DPC_computeRho_kernel << <dim3(NUM_BLOCKS, 1), dim3(NUM_COUNT, 1) >> > (d_data, d_NL, N, M, dc2);
DPC_computeRho_kernel_s << <dim3(NUM_BLOCKS, 1), dim3(NUM_COUNT, 1) >> > (d_data, d_NL, N, M, dc2);
//DPC_computeRho_kernel_s_2 << <dim3(NUM_BLOCKS / 2, 1), dim3(NUM_COUNT, 1) >> > (d_data, d_NL, N, M, dc2);
cudaMemcpy(h_NL, d_NL, N * sizeof(int), cudaMemcpyDeviceToHost);
cudaFree(d_NL);
cudaFree(d_data);
}
extern "C"
void launch_kernel_DPC_computeDeltaAndNearestHigherD(float* h_delta, float* h_nearest_higher, int N, int M, int* h_indices, float* h_data)
{
float* d_delta;
float* d_nearest_higher;
float* d_data;
int* d_indices;
cudaMalloc(&d_delta, N * sizeof(float));
cudaMalloc(&d_nearest_higher, N * sizeof(float));
cudaMalloc(&d_data, M * N * sizeof(float));
cudaMalloc(&d_indices, N * sizeof(int));
cudaMemcpy(d_data, h_data, M * N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_indices, h_indices, N * sizeof(int), cudaMemcpyHostToDevice);
cudaDeviceSynchronize();
//DPC_computeDeltaAndNearestHigherR_kernel << <dim3(NUM_BLOCKS, 1), dim3(NUM_COUNT, 1) >> > (d_delta, d_nearest_higher, N, M, d_indices, d_data);
DPC_computeDeltaAndNearestHigherD_kernel << <dim3(NUM_BLOCKS, 1), dim3(NUM_COUNT, 1) >> > (d_delta, d_nearest_higher, N, M, d_indices, d_data);
cudaMemcpy(h_delta, d_delta, N * sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(h_nearest_higher, d_nearest_higher, N * sizeof(int), cudaMemcpyDeviceToHost);
cudaFree(d_data);
cudaFree(d_delta);
cudaFree(d_nearest_higher);
cudaFree(d_indices);
}
extern "C"
void launch_thrust_findClusterCenters(float* h_rho_delta, int* ord_clusters, int N)
{
thrust::host_vector<MyStruct> data_host(N);
thrust::device_vector<MyStruct> data_device(N);
for (int i = 0; i < N; ++i) {
data_host[i] = { h_rho_delta[i], i};
}
data_device = data_host; // 将host端的数据拷贝到device端
thrust::sort(data_device.rbegin(), data_device.rend()); // 进行排序
data_host = data_device;
for (int i = 0; i < N; ++i) {
ord_clusters[i] = data_host[i].id;
}
}
extern "C"
void launch_thrust_assignClusters(float* h_ord_rho, int* h_ord_indices, int N)
{
thrust::host_vector<MyStruct> data_host(N);
thrust::device_vector<MyStruct> data_device(N);
for (int i = 0; i < N; ++i) {
data_host[i] = { h_ord_rho[i], i };
}
data_device = data_host; // 将host端的数据拷贝到device端
thrust::sort(data_device.rbegin(), data_device.rend()); // 进行排序
data_host = data_device;
for (int i = 0; i < N; ++i) {
h_ord_indices[i] = data_host[i].id;
}
}
extern "C"
void launch_kernel_DPC_computeRhoM(float* h_data, int* h_NL, int N, int M, int dc, int* h_max_min)
{
float dc2 = 1.0 * 1.0;
float rc_inv = 1.0 / 1.0;
int N_cells = h_max_min[0] * h_max_min[1] * h_max_min[2] * h_max_min[3];
float* d_data;
int* d_NL;
cudaMalloc(&d_data, M * N * sizeof(float));
cudaMalloc(&d_NL, N * sizeof(int));
cudaMemcpy(d_data, h_data, M * N * sizeof(float), cudaMemcpyHostToDevice);
int* d_cell_counts;
int* d_cell_count_sum;
int* d_cell_contents;
cudaMalloc(&d_cell_counts, N_cells * sizeof(int));
cudaMalloc(&d_cell_count_sum, N_cells * sizeof(int));
cudaMalloc(&d_cell_contents, N * sizeof(int));
cudaDeviceSynchronize();
find_cell_contents << <dim3(NUM_BLOCKS, 1), dim3(NUM_COUNT, 1) >> > (N, d_cell_counts, d_cell_count_sum, d_cell_contents,
d_data, h_max_min[0], h_max_min[1], h_max_min[2], h_max_min[3], rc_inv, M);
cudaDeviceSynchronize();
DPC_computeRhoM_kernel << <dim3(NUM_BLOCKS, 1), dim3(NUM_COUNT, 1) >> > (d_data, d_NL, N, M, dc2, h_max_min[0], h_max_min[1], h_max_min[2], h_max_min[3], rc_inv,
d_cell_counts, d_cell_count_sum, d_cell_contents);
cudaMemcpy(h_NL, d_NL, N * sizeof(int), cudaMemcpyDeviceToHost);
cudaFree(d_NL);
cudaFree(d_data);
}如何调整线程大小
最新发布