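报错:编译器在尝试内联(inline)被声明为 always_inline 的内建函数 _mm256_setzero_ps() 时,
发生了"目标特定选项不匹配"(target specific option mismatch)错误。
原因:调用该函数的代码在编译时没有启用 AVX 指令集。解决方法:编译时加上 -mavx(或 -march=native),
或者给使用 AVX 内建函数的函数加上 __attribute__((target("avx")))。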
error: inlining failed in call to always_inline '__m256 _mm256_setzero_ps()': target specific option mismatch
_mm256_setzero_ps (void)
^~~~~~~~~~~~~~~~~
AVX-based.cpp:27:24: note: called from here
sum = _mm256_setzero_ps();
~~~~~~~~~~~~~~~~~^~
原代码:
void avx_mul(int n, float a[][maxN], float b[][maxN], float c[][maxN])
{
__m256 t1, t2, sum;
__m128 s1, s2;
for (int i = 0; i < n; ++i) for (int j = 0; j < i; ++j)
swap(b[i][j], b[j][i]);
for (int i = 0; i < n; ++i) {
for (int j = 0; j < n; ++j) {
c[i][j] = 0.0;
sum = _mm256_setzero_ps();
for (int k = n - 8; k >= 0; k -= 8) { //sum every 8 elements
t1 = _mm256_loadu_ps(a[i] + k);
t2 = _mm256_loadu_ps(b[j] + k);
t1 = _mm256_mul_ps(t1, t2);
sum = _mm256_add_ps(sum, t1);
}s1 = _mm256_extractf128_ps(sum, 0); // s1=[a0,a1,a2,a3]
s2 = _mm256_extractf128_ps(sum, 1); // s2=[a4,a5,a6,a7]
s1 = _mm_hadd_ps(s1, s2); // s1=[a0+a1,a2+a3,a4+a5,a6+a7]
s1 = _mm_hadd_ps(s1, s1);
//s1=[a0+a1+a2+a3,a4+a5+a6+a7,a0+a1+a2+a3,a4+a5+a6+a7]
s1 = _mm_hadd_ps(s1, s1);
//s1=[a0+a1+a2+a3+a4+a5+a6+a7,...]
_mm_store_ss(c[i] + j, s1);
for (int k = (n % 8) - 1; k >= 0; --k) {
// handle the last n%8 elements
c[i][j] += a[i][k] * b[j][k];}}}
for (int i = 0; i < n; ++i) for (int j = 0; j < i; ++j)
swap(b[i][j], b[j][i]);}