以下是一个 C++ 示例代码,用于比较使用与不使用 AVX 指令集计算两个向量的余弦相似度(内积除以两个向量模长之积)时的执行时间:
#include <immintrin.h>

#include <chrono>
#include <cmath>
#include <cstddef>
#include <iostream>
#include <vector>
/**
 * Computes the cosine similarity of two vectors with plain scalar loops.
 *
 * @param emb1 First vector.
 * @param emb2 Second vector; must have the same number of elements as emb1.
 * @return dot(emb1, emb2) / (|emb1| * |emb2|), or -1 when the vectors are
 *         empty, differ in length, or either has a Euclidean norm of zero.
 *         Note that -1 is also the similarity of two exactly opposite
 *         vectors, so the error value is not distinguishable from that case.
 */
static double get_similarity_score(const std::vector<double>& emb1, const std::vector<double>& emb2) {
    // Equal sizes plus a non-empty emb1 implies emb2 is non-empty as well.
    if (emb1.size() != emb2.size() || emb1.empty()) {
        return -1;
    }

    double squared_norm1 = 0.0;
    for (const double value : emb1) {
        squared_norm1 += value * value;
    }
    double squared_norm2 = 0.0;
    for (const double value : emb2) {
        squared_norm2 += value * value;
    }

    const double norm1 = std::sqrt(squared_norm1);
    const double norm2 = std::sqrt(squared_norm2);
    if (norm1 == 0 || norm2 == 0) {
        return -1;  // the similarity is undefined for a zero vector
    }

    // Index with size_t so the comparison against size() is not mixed-sign
    // and the counter cannot overflow for very large vectors.
    double dot = 0.0;
    for (std::size_t i = 0; i < emb1.size(); ++i) {
        dot += emb1[i] * emb2[i];
    }
    return dot / norm1 / norm2;
}
#if defined(__GNUC__) || defined(__clang__)
// Enables AVX code generation for the annotated functions only, so GCC/Clang
// accept the intrinsics even when the translation unit is built without
// -mavx. The attribute is redundant (and harmless) when -mavx is given.
#define SIMILARITY_SCORE_TARGET_AVX __attribute__((target("avx")))
#else
#define SIMILARITY_SCORE_TARGET_AVX
#endif

// Adds up the four double lanes of an AVX register, starting from lane 0.
SIMILARITY_SCORE_TARGET_AVX
static double similarity_score_sum_lanes(__m256d lanes) {
    double buffer[4];
    _mm256_storeu_pd(buffer, lanes);
    double total = 0.0;
    for (const double lane : buffer) {
        total += lane;
    }
    return total;
}

/**
 * Computes the cosine similarity of two vectors using 256-bit AVX intrinsics.
 *
 * Elements are processed four at a time with unaligned loads; the dot product
 * and both squared norms are accumulated in the same pass. Elements left over
 * when the length is not a multiple of four are handled by a scalar loop.
 * Executing this function requires a CPU with AVX support.
 *
 * @param emb1 First vector.
 * @param emb2 Second vector; must have the same number of elements as emb1.
 * @return dot(emb1, emb2) / (|emb1| * |emb2|), or -1 when the vectors are
 *         empty, differ in length, or either has a Euclidean norm of zero.
 *         Note that -1 is also the similarity of two exactly opposite
 *         vectors, so the error value is not distinguishable from that case.
 */
SIMILARITY_SCORE_TARGET_AVX
static double get_similarity_score_avx(const std::vector<double>& emb1, const std::vector<double>& emb2) {
    // Equal sizes plus a non-empty emb1 implies emb2 is non-empty as well.
    if (emb1.size() != emb2.size() || emb1.empty()) {
        return -1;
    }

    // size_t arithmetic avoids narrowing size() to int and overflowing i * 4.
    const std::size_t size = emb1.size();
    const std::size_t vectorized_end = size - size % 4;
    const double* data1 = emb1.data();
    const double* data2 = emb2.data();

    __m256d acc_norm1 = _mm256_setzero_pd();
    __m256d acc_norm2 = _mm256_setzero_pd();
    __m256d acc_dot = _mm256_setzero_pd();
    for (std::size_t i = 0; i < vectorized_end; i += 4) {
        const __m256d v1 = _mm256_loadu_pd(data1 + i);
        const __m256d v2 = _mm256_loadu_pd(data2 + i);
        acc_dot = _mm256_add_pd(acc_dot, _mm256_mul_pd(v1, v2));
        acc_norm1 = _mm256_add_pd(acc_norm1, _mm256_mul_pd(v1, v1));
        acc_norm2 = _mm256_add_pd(acc_norm2, _mm256_mul_pd(v2, v2));
    }

    double squared_norm1 = similarity_score_sum_lanes(acc_norm1);
    double squared_norm2 = similarity_score_sum_lanes(acc_norm2);
    double dot = similarity_score_sum_lanes(acc_dot);

    // Scalar tail for the last size % 4 elements.
    for (std::size_t i = vectorized_end; i < size; ++i) {
        squared_norm1 += data1[i] * data1[i];
        squared_norm2 += data2[i] * data2[i];
        dot += data1[i] * data2[i];
    }

    const double norm1 = std::sqrt(squared_norm1);
    const double norm2 = std::sqrt(squared_norm2);
    if (norm1 == 0 || norm2 == 0) {
        return -1;  // the similarity is undefined for a zero vector
    }
    return dot / norm1 / norm2;
}
/**
 * Benchmark driver: fills two 64-element vectors with test data, runs the
 * scalar and the AVX similarity functions 10,000,000 times each, and prints
 * the last result and the elapsed wall time of each variant.
 *
 * @return 0 on completion.
 */
int main() {
    const int size = 64;
    const int iterations = 10000000;

    std::vector<double> emb1(size);
    std::vector<double> emb2(size);
    for (int i = 0; i < size; ++i) {
        emb1[i] = i + 1;
        emb2[i] = i + 2;
    }

    // steady_clock is guaranteed to be monotonic, which makes it the right
    // clock for measuring intervals; high_resolution_clock may be an alias of
    // the adjustable system clock.
    using Clock = std::chrono::steady_clock;

    // Time the scalar implementation.
    const auto startNormal = Clock::now();
    double normalRes = 0.0;
    for (int i = 0; i < iterations; ++i) {
        normalRes = get_similarity_score(emb1, emb2);
    }
    const auto endNormal = Clock::now();
    const std::chrono::duration<double> durationNormal = endNormal - startNormal;
    std::cout << "Normal similarity score result: " << normalRes << '\n';
    std::cout << "Normal similarity score time: " << durationNormal.count() << " seconds" << '\n';

    // Time the AVX implementation.
    const auto startAVX = Clock::now();
    double avxRes = 0.0;
    for (int i = 0; i < iterations; ++i) {
        avxRes = get_similarity_score_avx(emb1, emb2);
    }
    const auto endAVX = Clock::now();
    const std::chrono::duration<double> durationAVX = endAVX - startAVX;
    std::cout << "AVX similarity score result: " << avxRes << '\n';
    std::cout << "AVX similarity score time: " << durationAVX.count() << " seconds" << '\n';

    return 0;
}
执行结果是(注意编译时加了**-mavx**):
PS D:\cpp_test> g++ -mavx .\使用AVX给向量内积加速.cpp
PS D:\cpp_test> .\a.exe
Normal similarity score result: 0.999917
Normal similarity score time: 6.12244 seconds
AVX similarity score result: 0.999917
AVX similarity score time: 3.40937 seconds
在这个代码中,首先定义了两个函数,分别用普通的标量循环和 AVX 指令集计算两个向量的余弦相似度(内积除以两个向量模长之积)。然后在 main 函数中生成测试数据,分别对两种方法进行计时,并输出计算结果和执行时间。
请注意,要确保你的编译器和运行程序的 CPU 都支持 AVX 指令集,并在编译时开启相应的选项(例如对于 GCC,可以使用 -mavx 选项)。另外,上面的编译命令没有开启编译优化(例如 -O2),开启优化后两种方法的耗时差距可能会有所不同。