// TODO: add a feature that outputs the following outliers (noise points).
#include <fstream>
#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <exception>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>
#include <unordered_set>
#include <utility>
#include <vector>

#include <mpi.h>
/// One data sample plus the bookkeeping state used while clustering.
struct Point {
    std::vector<double> values;  // feature coordinates of the sample
    int cluster{-1};             // cluster id; -1 means noise (also the initial, unassigned state)
    bool visited{false};         // becomes true once the clustering pass has processed the point
};
/// Returns the Euclidean (L2) distance between the coordinates of two points.
///
/// The loop runs over the dimensions of `p1`, so `p2` must hold at least as
/// many values as `p1`; any extra values in `p2` are ignored.
///
/// @param p1 first point; its dimensionality drives the iteration
/// @param p2 second point
/// @return square root of the summed squared coordinate differences
double euclideanDistance(const Point& p1, const Point& p2) {
    const std::vector<double>& lhs = p1.values;
    const std::vector<double>& rhs = p2.values;

    double squaredSum = 0.0;
    for (size_t dim = 0; dim != lhs.size(); ++dim) {
        const double delta = lhs[dim] - rhs[dim];
        squaredSum += delta * delta;
    }
    return std::sqrt(squaredSum);
}
/// Collects the indices of every point in `points` that lies within `eps`
/// (inclusive) of `points[index]`, excluding `index` itself.
///
/// This is a brute-force scan over the whole vector.
///
/// @param points candidate set that is searched
/// @param index  position of the query point inside `points`
/// @param eps    neighbourhood radius
/// @return indices of the neighbours, in ascending order
std::vector<int> findLocalNeighbors(const std::vector<Point>& points, int index, double eps) {
    std::vector<int> withinEps;
    const int total = static_cast<int>(points.size());
    for (int candidate = 0; candidate < total; ++candidate) {
        if (candidate == index) {
            continue;  // a point is not its own neighbour
        }
        const double distance = euclideanDistance(points[index], points[candidate]);
        if (distance <= eps) {
            withinEps.push_back(candidate);
        }
    }
    return withinEps;
}
void expandCluster(std::vector<Point>& points, int index, std::vector<int>& neighbors,
int clusterId, double eps, int minPts) {
points[index].cluster = clusterId;
for (size_t i = 0; i < neighbors.size(); ++i) {
int neighborIdx = neighbors[i];
if (!points[neighborIdx].visited) {
points[neighborIdx].visited = true;
std::vector<int> newNeighbors = findLocalNeighbors(points, neighborIdx, eps);
if (newNeighbors.size() >= minPts) {
neighbors.insert(neighbors.end(), newNeighbors.begin(), newNeighbors.end());
}
}
if (points[neighborIdx].cluster == -1) {
points[neighborIdx].cluster = clusterId;
}
}
}
void parallelDBSCAN(std::vector<Point>& localPoints, double eps, int minPts, int rank, int size) {
int clusterId = rank * 100000; // 为每个进程分配唯一的clusterId范围
for (int i = 0; i < localPoints.size(); ++i) {
if (localPoints[i].visited) continue;
localPoints[i].visited = true;
std::vector<int> neighbors = findLocalNeighbors(localPoints, i, eps);
// TODO: 这里应该添加跨进程的邻居查找
// 当前简化版本只查找本地邻居
if (neighbors.size() < minPts) {
localPoints[i].cluster = -1; // 标记为噪声
}
else {
clusterId++;
expandCluster(localPoints, i, neighbors, clusterId, eps, minPts);
}
}
}
std::vector<Point> readCSV(const std::string& filename) {
std::vector<Point> points;
std::ifstream file(filename);
std::string line;
// 跳过标题行
std::getline(file, line);
while (std::getline(file, line)) {
std::istringstream iss(line);
std::string token;
Point point;
while (std::getline(iss, token, ',')) {
point.values.push_back(std::stod(token));
}
points.push_back(point);
}
return points;
}
int main(int argc, char** argv) {
MPI_Init(&argc, &argv);
int rank, size;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
// 参数设置
double eps = 0.5;
int minPts = 5;
std::vector<Point> allPoints;
int totalPoints = 0;
// 根进程读取数据
if (rank == 0) {
allPoints = readCSV("D:\\桌面\\并行程序\\naiveBayes\\naiveBayes\\iris.csv");
totalPoints = allPoints.size();
}
// 广播数据点总数
MPI_Bcast(&totalPoints, 1, MPI_INT, 0, MPI_COMM_WORLD);
// 测量并行运行时间
double startTime, endTime;
MPI_Barrier(MPI_COMM_WORLD);
startTime = MPI_Wtime();
// 计算每个进程处理的数据量
int baseCount = totalPoints / size;
int remainder = totalPoints % size;
int localCount = (rank < remainder) ? baseCount + 1 : baseCount;
// 准备分发数据
std::vector<int> counts(size);
std::vector<int> displs(size);
if (rank == 0) {
for (int i = 0; i < size; ++i) {
counts[i] = (i < remainder) ? baseCount + 1 : baseCount;
displs[i] = (i == 0) ? 0 : displs[i - 1] + counts[i - 1];
}
}
// 分发特征维度
int featureSize = 0;
if (rank == 0 && !allPoints.empty()) {
featureSize = allPoints[0].values.size();
}
MPI_Bcast(&featureSize, 1, MPI_INT, 0, MPI_COMM_WORLD);
// 准备发送和接收缓冲区
std::vector<double> sendBuffer;
if (rank == 0) {
sendBuffer.resize(totalPoints * featureSize);
for (int i = 0; i < totalPoints; ++i) {
for (int j = 0; j < featureSize; ++j) {
sendBuffer[i * featureSize + j] = allPoints[i].values[j];
}
}
}
std::vector<double> recvBuffer(localCount * featureSize);
MPI_Scatterv(sendBuffer.data(), counts.data(), displs.data(), MPI_DOUBLE,
recvBuffer.data(), localCount * featureSize, MPI_DOUBLE,
0, MPI_COMM_WORLD);
// 解包数据到本地Points
std::vector<Point> localPoints(localCount);
for (int i = 0; i < localCount; ++i) {
localPoints[i].values.resize(featureSize);
for (int j = 0; j < featureSize; ++j) {
localPoints[i].values[j] = recvBuffer[i * featureSize + j];
}
localPoints[i].cluster = -1;
localPoints[i].visited = false;
}
// 执行并行DBSCAN
parallelDBSCAN(localPoints, eps, minPts, rank, size);
// 收集聚类结果
std::vector<int> allClusters(totalPoints);
std::vector<int> localClusters(localCount);
for (int i = 0; i < localCount; ++i) {
localClusters[i] = localPoints[i].cluster;
}
// 调整counts和displs用于收集结果
for (int i = 0; i < size; ++i) {
counts[i] = (i < remainder) ? baseCount + 1 : baseCount;
displs[i] = (i == 0) ? 0 : displs[i - 1] + counts[i - 1];
}
MPI_Gatherv(localClusters.data(), localCount, MPI_INT,
allClusters.data(), counts.data(), displs.data(), MPI_INT,
0, MPI_COMM_WORLD);
// 测量结束时间
MPI_Barrier(MPI_COMM_WORLD);
endTime = MPI_Wtime();
double parallelTime = endTime - startTime;
// 主进程处理结果
if (rank == 0) {
// 合并聚类结果
for (int i = 0; i < totalPoints; ++i) {
allPoints[i].cluster = allClusters[i];
}
// 测量串行版本性能
auto startSerial = std::chrono::high_resolution_clock::now();
std::vector<int> serialLabels(totalPoints, 0);
if (!allPoints.empty()) {
std::vector<Point> serialPoints = allPoints;
for (auto& p : serialPoints) {
p.cluster = -1;
p.visited = false;
}
parallelDBSCAN(serialPoints, eps, minPts, 0, 1); // 用并行函数模拟串行执行
}
auto endSerial = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsedSerial = endSerial - startSerial;
double serialTime = elapsedSerial.count();
// 计算性能指标
double speedup = serialTime / parallelTime;
double efficiency = (speedup / size) * 100;
// 输出性能表格
std::cout << "\n性能统计:" << std::endl;
std::cout << "+------------+------------+------------+-----------------+" << std::endl;
std::cout << "| 进程数 | 运行时间(s)| 加速比 | 并行效率(%) |" << std::endl;
std::cout << "+------------+------------+------------+-----------------+" << std::endl;
printf("| %-10d | %-10.4f | %-10.4f | %-15.2f |\n",
size, parallelTime, speedup, efficiency);
std::cout << "+------------+------------+------------+-----------------+" << std::endl;
}
MPI_Finalize();
return 0;
}