From e305b72f68126aa8d1700b366eb5d3361d0c7fc5 Mon Sep 17 00:00:00 2001
From: Marc Glisse
Date: Fri, 25 Oct 2019 00:51:03 +0200
Subject: [PATCH] Extend the min trick to the other multiplication methods,
 change the default.

---
 Number_types/include/CGAL/Interval_nt.h | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/Number_types/include/CGAL/Interval_nt.h b/Number_types/include/CGAL/Interval_nt.h
index 968f9d75370..41859a8d287 100644
--- a/Number_types/include/CGAL/Interval_nt.h
+++ b/Number_types/include/CGAL/Interval_nt.h
@@ -539,9 +539,9 @@ private:
 #endif
   Internal_protector P;
 #ifdef CGAL_USE_SSE2
-# if 1
+# if !defined __SSE4_1__ && !defined __AVX__
   // Brutal, compute all products in all directions.
-  // The actual winner (by a hair) on recent hardware
+  // The actual winner (by a hair) on recent hardware before removing NaNs.
   __m128d aa = IA_opacify128_weak(a.simd()); // {-ai,as}
   __m128d bb = b.simd(); // {-bi,bs}
   __m128d m = _mm_set_sd(-0.); // {-0,+0}
@@ -569,13 +569,14 @@ private:
   __m128d r = _mm_max_pd (y1, y2);
   // __m128d r = _mm_max_pd(x1,_mm_max_pd(x2,_mm_max_pd(x3,_mm_min_pd(x4,big))));
   return IA (IA_opacify128(r));
-# elif 0
+# elif 1
   // we want to multiply ai,as with {ai<0?-bs:-bi,as<0?bi:bs}
   // we want to multiply as,ai with {as<0?-bs:-bi,ai<0?bi:bs}
   // requires SSE4 (otherwise use _mm_cmplt_pd, _mm_and_pd, _mm_andnot_pd and _mm_or_pd to avoid blendv)
   // probably faster on older hardware
   __m128d m = _mm_set_sd(-0.); // {-0,+0}
   __m128d m1 = _mm_set1_pd(-0.); // {-0,-0}
+  __m128d big = IA::largest().simd();
   __m128d aa = a.simd(); // {-ai,as}
   __m128d az = _mm_xor_pd(aa, m); // {ai,as}
   az = IA_opacify128_weak(az);
@@ -586,7 +587,9 @@ private:
   __m128d x = _mm_blendv_pd (bb, bp, az); // {ai<0?-bs:-bi,as<0?bi:bs}
   __m128d y = _mm_blendv_pd (bb, bp, azp); // {as<0?-bs:-bi,ai<0?bi:bs}
   __m128d p1 = _mm_mul_pd (az, x);
+  p1 = _mm_min_pd(p1,big); // no NaN
   __m128d p2 = _mm_mul_pd (azp, y);
+  p2 = _mm_min_pd(p2,big); // no NaN
   __m128d r = _mm_max_pd (p1, p2);
   return IA (IA_opacify128(r));
 # elif 0
@@ -594,6 +597,7 @@ private:
   // we want to multiply -as,ai with {as<0?bs:bi,ai>0?bs:bi}
   // slightly worse than the previous one
   __m128d m1 = _mm_set1_pd(-0.); // {-0,-0}
+  __m128d big = IA::largest().simd();
   __m128d aa = IA_opacify128_weak(a.simd()); // {-ai,as}
   __m128d ax = swap_m128d (aa); // {as,-ai}
   __m128d ap = _mm_xor_pd (ax, m1); // {-as,ai}
@@ -605,13 +609,16 @@ private:
   __m128d x = _mm_blendv_pd (bbs, bbi, aa); // {ai>0?bi:bs,as<0?bi:bs}
   __m128d y = _mm_blendv_pd (bbi, bbs, ax); // {as<0?bs:bi,ai>0?bs:bi}
   __m128d p1 = _mm_mul_pd (aa, x);
+  p1 = _mm_min_pd(p1,big); // no NaN
   __m128d p2 = _mm_mul_pd (ap, y);
+  p2 = _mm_min_pd(p2,big); // no NaN
   __m128d r = _mm_max_pd (p1, p2);
   return IA (IA_opacify128(r));
 # else
   // AVX version of the brutal method, same running time or slower
   __m128d aa = IA_opacify128_weak(a.simd()); // {-ai,as}
   __m128d bb = b.simd(); // {-bi,bs}
+  __m256d big = _mm256_set1_pd(std::numeric_limits<double>::infinity());
   __m128d m = _mm_set_sd(-0.); // {-0,+0}
   __m128d m1 = _mm_set1_pd(-0.); // {-0,-0}
   __m128d ax = swap_m128d (aa); // {as,-ai}
@@ -622,7 +629,9 @@ private:
   __m256d Y1 = _mm256_set_m128d(bz,bz); // {bi,bs,bi,bs}
   __m256d Y2 = _mm256_permute_pd(Y1,5); // {bs,bi,bs,bi}
   __m256d Z1 = _mm256_mul_pd(X,Y1);
+  Z1 = _mm256_min_pd(Z1,big); // no NaN
   __m256d Z2 = _mm256_mul_pd(X,Y2);
+  Z2 = _mm256_min_pd(Z2,big); // no NaN
   __m256d Z = _mm256_max_pd(Z1,Z2);
   __m128d z1 = _mm256_castpd256_pd128(Z);
   __m128d z2 = _mm256_extractf128_pd(Z,1);
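
Background on the "min trick" used above: the SSE/AVX min and max instructions return their second operand whenever an operand is NaN, so clamping each raw product with a min against a very large bound turns the NaN that 0 * inf can produce into a safe (over-estimated) upper bound before the final max. A minimal standalone sketch of that behaviour, assuming only SSE2 and standard headers (not CGAL code; plain +infinity stands in for IA::largest(), and p, big, out are illustrative names):

// Demonstrates the NaN removal the patch performs with _mm_min_pd:
// _mm_min_pd(NaN, y) returns y, so min(product, +inf) replaces a NaN
// product (e.g. 0 * inf) by +inf while leaving finite products unchanged.
#include <emmintrin.h> // SSE2
#include <cstdio>
#include <limits>

int main()
{
  const double inf = std::numeric_limits<double>::infinity();
  __m128d p   = _mm_set_pd(0.0 * inf, 3.0); // lanes {3.0, NaN}
  __m128d big = _mm_set1_pd(inf);           // clamp value; the patch uses IA::largest().simd()
  __m128d q   = _mm_min_pd(p, big);         // lanes {3.0, +inf}, no NaN left
  double out[2];
  _mm_storeu_pd(out, q);
  std::printf("%g %g\n", out[0], out[1]);   // prints "3 inf" instead of "3 nan"
  return 0;
}

The operand order matters: the possibly-NaN product must be the first argument, exactly as in p1 = _mm_min_pd(p1,big) in the patch, because _mm_min_pd(y, NaN) would return the NaN.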