c++ - An accumulated computing error in SSE version of algorithm of the sum of squared differences -
i trying optimize following code (sum of squared differences 2 arrays):
inline float square(float value) { return value*value; } float squareddifferencesum(const float * a, const float * b, size_t size) { float sum = 0; for(size_t = 0; < size; ++i) sum += square(a[i] - b[i]); return sum; }
so performed optimization using of sse instructions of cpu:
inline void squareddifferencesum(const float * a, const float * b, size_t i, __m128 & sum) { __m128 _a = _mm_loadu_ps(a + i); __m128 _b = _mm_loadu_ps(b + i); __m128 _d = _mm_sub_ps(_a, _b); sum = _mm_add_ps(sum, _mm_mul_ps(_d, _d)); } inline float extractsum(__m128 a) { float _a[4]; _mm_storeu_ps(_a, a); return _a[0] + _a[1] + _a[2] + _a[3]; } float squareddifferencesum(const float * a, const float * b, size_t size) { size_t = 0, alignedsize = size/4*4; __m128 sums = _mm_setzero_ps(); for(; < alignedsize; += 4) squareddifferencesum(a, b, i, sums); float sum = extractsum(sums); for(; < size; ++i) sum += square(a[i] - b[i]); return sum; }
this code works fine if size of arrays not large. if size big enough there large computing error between results given base function , optimized version. , have question: here bug in sse optimized code, leads computing error.
the error follows finite precision floating point numbers. each addition of 2 floating point numbers has computing error proportional difference between them. in scalar version of algorithm resulting sum greater each term (if size of arrays big enough of course). leads accumulation of big computing error.
in sse version of algorithm there 4 sums results accumulation. , difference between these sums , each term lesser in 4 times relative scalar code. leads lesser computing error.
there 2 ways solve error:
1) using of floating point numbers of double precision accumulating sum.
2) using of the kahan summation algorithm (also known compensated summation) reduces numerical error in total obtained adding sequence of finite precision floating point numbers, compared obvious approach.
https://en.wikipedia.org/wiki/kahan_summation_algorithm
with using of kahan summation algorithm scalar code like:
inline void kahansum(float value, float & sum, float & correction) { float term = value - correction; float temp = sum + term; correction = (temp - sum) - term; sum = temp; } float squareddifferencekahansum(const float * a, const float * b, size_t size) { float sum = 0, correction = 0; for(size_t = 0; < size; ++i) kahansum(square(a[i] - b[i]), sum, correction); return sum; }
and sse optimized code follow:
inline void squareddifferencekahansum(const float * a, const float * b, size_t i, __m128 & sum, __m128 & correction) { __m128 _a = _mm_loadu_ps(a + i); __m128 _b = _mm_loadu_ps(b + i); __m128 _d = _mm_sub_ps(_a, _b); __m128 term = _mm_sub_ps(_mm_mul_ps(_d, _d), correction); __m128 temp = _mm_add_ps(sum, term); correction = _mm_sub_ps(_mm_sub_ps(temp, sum), term); sum = temp; } float squareddifferencekahansum(const float * a, const float * b, size_t size) { size_t = 0, alignedsize = size/4*4; __m128 sums = _mm_setzero_ps(), corrections = _mm_setzero_ps(); for(; < alignedsize; += 4) squareddifferencekahansum(a, b, i, sums, corrections); float sum = extractsum(sums), correction = 0; for(; < size; ++i) kahansum(square(a[i] - b[i]), sum, correction); return sum; }
Comments
Post a Comment