Vector-Parallel Modernization Tim Prince PhD (ME) Intel Black Belt Software Developer Sept. 24, 2015.

2 Introduction This presentation shows how to optimize some difficult loops with Intel compilers for Fortran, C, and C++. Examples are selected from the classic vector benchmark: cases which are not optimized automatically by current Intel compilers but which exhibit good vector or parallel performance after modernization. This format shows a variety of applications and the relationship among the source languages.

3 Resolve suspected anti-dependence by omp4 equivalence(array(64),x(1)) ! this example has true anti-dependence: stores 64 elements beyond load ! illustrates a pitfall of equivalence and equivalent C pointer overlaps #if _OPENMP >= 201307 !$omp simd safelen(32) ! this allows 2 cache lines of read ahead for 32-bit data type #endif do i= 1,n-1 x(i+1)= array(i)+a(i) enddo

4 Anti-dependence example in C and C++ #define x ((real *)&cdata_1 + 63) i2 = *n - 1; #if _OPENMP >= 201307 #pragma omp simd safelen(32) #endif for (int i = 1; i <= i2; ++i) cdata_1.array[i] = x[i - 1] + a[i]; // C++ : // no pragma needed by g++ #pragma ivdep transform(&x[0],&x[i2],&a[1],&cdata_1.array[1],plus<real>());

5 False suspected anti-dependence do i= 1,n-1 if(a(i) < 0.)then ! order as in netlib public source !if(b(i) < 0.) a(i)= a(i)+c(i)*d(i) !b(i+1)= c(i)+d(i)*e(i) ! switch order to remove vector dependence b(i+1)= c(i)+d(i)*e(i) if(b(i) < 0.) a(i)= a(i)+c(i)*d(i) endif enddo

6 C false anti-dependence avoided // no pragma needed if pointers are qualified by __restrict for (int i = 1; i <= i2; ++i) if (a[i] < 0.f) { b[i + 1] = c[i] + d[i] * e[i]; if (b[i] < 0.f) a[i] += c[i] * d[i]; }

7 Optimize circular loop carried dependency (avoid f90) ! a(:n)= (b(:n)+cshift(b(:n),1)+cshift(b(:n),2))*.333 x= b(n) y= b(n-1) !$omp simd do i= 1,n a(i)= (b(i)+x+y)*.333 y= x x= b(i) enddo

8 C pragma circular dependency optimization x = b[*n]; y = b[*n - 1]; i2 = *n; #pragma omp simd for (int i = 1; i <= i2; ++i) { a[i] = (b[i] + x + y) *.333f; y = x; x = b[i]; }

9 Partial read after write dependency ! do i= 1,n! Hidden partial dependency ! x= a(n-i+1)+b(i)*c(i) ! a(i)= x-1.0 ! b(i)= x ! enddo ! resolve by separating the dependencies b(1:(n+1)/2)= a(n:n/2+1:-1)+b(1:(n+1)/2)*c(1:(n+1)/2) ! ifort fuses here at -O3 a(1:(n+1)/2)= b(1:(n+1)/2)-1.0 b((n+3)/2:n)= a(n/2:1:-1)+b((n+3)/2:n)*c((n+3)/2:n) a((n+3)/2:n)= b((n+3)/2:n)-1.0

10 Resolve false assumed WAR by C omp4 (explicit fusion creates false WAR dependence) #pragma omp simd for (int i= 1; i <= (i2+1)/2; ++i) a[i] = (b[i] = a[i2 - i + 1] + b[i] * c[i])- 1.f; #pragma omp simd for (int i= (i2+3)/2; i <= i2; ++i) a[i] = (b[i] = a[i2 - i + 1] + b[i] * c[i])- 1.f;

11 Vectorize by splitting search and compute ! i= 1 ! do while (a(i) >= 0.) ! Not vectorized ! a(i)= a(i)+b(i)*c(i) ! i= i+1 ! enddo ! no more old-fashioned explicit masking do i= 1,n if(a(i) < 0) exit enddo a(:i-1)= a(:i-1)+b(:i-1)*c(:i-1)

12 C vectorized linear search and compute i2 = *n; // first i has scope outside for for (i = 1; i <= i2; ++i) if (a[i] < 0.f) break; i2 = i - 1; // this one needs * __restrict a or pragma for (int i = 1; i <= i2; ++i) a[i] += b[i] * c[i];

13 Overcome “protects against exception” by taking arithmetic outside, and directive ! do i= 1,n ! if(d(i) < 0)then ! a(i)= a(i)+b(i)*c(i) ! else !if(d(i).ne.0)then ! a(i)= a(i)+c(i)*c(i) ! else ! a(i)= a(i)+b(i)*b(i) ! endif ! enddo !dir$ vector aligned a(:n)= a(:n)+merge(b(:n),c(:n),d(:n)<=0)*merge(c(:n),b(:n),d(:n)/=0)

14 C avoidance of “protects against exception” #pragma vector aligned // using __restrict (or another pragma) for (int i = 1; i <= i2; ++i) a[i] +=(d[i] <= 0.f?b[i]:c[i]) * (d[i]==0.f?b[i]:c[i]);

15 linear search not optimized ifort doesn’t resolve 2 level reduction max= aa(1,1) xindex= 1 yindex= 1 do j= 1,n do i= 1,n if(aa(i,j) > max)then max= aa(i,j) xindex= i yindex= j endif enddo

16 Parallel-vector linear search max_= aa(1,1) xindex=1 yindex=1 !$omp parallel do private(ml) if(n>103) reduction(max: max_) & !$omp& lastprivate(xindex,yindex) do j=1,n ml= maxloc(aa(:n,j),dim=1) if(aa(ml,j)>max_.or. aa(ml,j)==max_.and. j<yindex)then xindex= ml yindex= j max_=aa(ml,j) endif enddo

17 C parallel-vector linear search max__ = aa[aa_dim1 + 1]; xindex = yindex = 1; i2 = i3 = *n; #pragma omp parallel for if(i2 > 103) reduction(max: max__) lastprivate(xindex,yindex) for (int j = 1; j <= i2; ++j) { int indxj=0; float maxj=max__; #pragma omp simd reduction(max: maxj) lastprivate(indxj) for (int i = 1; i <= i3; ++i) if (aa[i + j * aa_dim1] > maxj){ maxj = aa[i + j * aa_dim1]; indxj = i; } if(maxj > max__) { // fixme: take care of the case of ties max__= maxj; xindex=indxj; yindex=j;}}

18 Parallel vector convolution #if defined __INTEL_COMPILER !$omp parallel do if(n>103) do i= 1,m a(i)= a(i)+dot_product(b(i:i+m-1),c(m:1:-1)) #else ! single thread version (slightly less accurate) Ifort auto-parallel does an array reduction, it’s OK for small no. cores do j= 1,m a(:m)= a(:m)+b(1+m-j:m+m-j)*c(j) #endif enddo

19 C parallel vector convolution #pragma omp parallel for if(i3 > 103) for (int i = 1; i <= i3; ++i) { float sum = 0; #pragma omp simd reduction(+: sum) for (int j = 1; j <= i2; ++j) sum += b[i + j - 1] * c[i2 - j + 1]; a[i] += sum; }

20 C++ parallel vector convolution // and here's a C++ version, which doesn't need AVX2 to optimize // reverse the vector which is used repeatedly vector<float> Cr(m); reverse_copy(&c[1],&c[i3]+1,Cr.begin()); // It won't optimize with /Qprotect-parens (investigation requested) #pragma omp parallel for if(i3 > 103) for (int i = 1; i <= i3; ++i) a[i] += inner_product(Cr.begin(),Cr.end(),&b[i],0.f);

21 False indexing dependency, no optimization k = 1 do 10 i = 1,n do 20 j = 2,n bb(i,j) = bb(i,j-1) + array(k) * cc(i,j) k = k + 1 20 continue k = k + 1 10 continue

22 Optimize by making inner loops independent !$omp parallel do private(k) if(n>103) do i= 1,n k= i*n+1-n do j= 2,n bb(i,j)= bb(i,j-1)+array(k)*cc(i,j) k= k+1 enddo ! version for single core do j= 2,n bb(:n,j)= bb(:n,j-1)+array(j-1:n*n+j-1:n)*cc(:n,j) enddo

23 Loop nesting not corrected due to indexing ! do 10 i = 1,n ! k = i*(i-1)/2+i ! do 20 j = i,n ! array(k) = array(k) + bb(i,j) ! k = k + j ! 20 continue ! 10 continue ! swap loops for inner loop data locality do j= 1,n k= j*(j-1)/2 array(k+1:k+j)= array(k+1:k+j)+bb(:j,j) enddo ! That's good enough for single CPU auto-parallel auto-vectorizer

24 another auto-renesting failure ! do 30 i = 2,n ! do 20 j = 2,n ! aa(i,j) = aa(i,j-1) + cc(i,j) ! 20 continue ! do 30 j = 2,n ! bb(i,j) = bb(i-1,j) + cc(i,j) ! 30 continue do j= 2,n do i= 2,n aa(i,j)= aa(i,j-1)+cc(i,j) bb(i,j)= bb(i-1,j)+cc(i,j) enddo

25 Explicit parallel C code #pragma omp parallel if(i2 > 53) { #pragma omp for nowait // setting up to proceed to the next loop when some cores finish here #pragma novector for (int j = 2; j <= i3; ++j) for (int i = 2; i <= i2; ++i) bb[i + j * bb_dim1] = bb[i - 1 + j*bb_dim1] + cc[i + j * cc_dim1]; #pragma omp for simd for (int i = 2; i <= i2; ++i) for (int j = 2; j <= i3; ++j) aa[i + j * aa_dim1] = aa[i + (j - 1) * aa_dim1] + cc[i + j * cc_dim1]; }

26 Fallacy: compilers always optimize out of loop ! do 10 i = 1,n-1 ! a(i) = b(i) + c(i) * d(i) ! b(i) = c(i) + b(i) ! a(i+1) = b(i) + a(i+1) * d(i) ! 10 continue do i= 1,n-1 a(i)= b(i)+c(i)*d(i) b(i)= c(i)+b(i) enddo a(n)= b(n-1)+a(n)*d(n-1)

27 Repeated update of 1D array in 2D loop ! do 10 i = 1,n ! do 20 j = 2,n ! a(j) = aa(i,j) - a(j-1) ! aa(i,j) = a(j) + bb(i,j) ! 20 continue ! 10 continue ! so store only the final values of a(:): do j= 2,n aa(:n,j)= aa(:n,j)+bb(:n,j)-a(j-1) a(j)=aa(n,j)-bb(n,j) enddo

28 Non-vectorizable 1D array removal ! loop swap should have been obvious do 10 i = 2,n do 20 j = 1,n a(i) = aa(i,j) - a(i-1) aa(i,j) = a(i) + bb(i,j) 20 continue 10 continue

29 it’s parallelizable, relatively tedious !$omp parallel if(n>103) !$omp do private(tmp) do j= 1,n-1 tmp= a(1) do i= 2,n tmp= aa(i,j)-tmp aa(i,j)= tmp+bb(i,j) enddo !$omp end do nowait !$omp single do i= 2,n a(i)= aa(i,n)-a(i-1) aa(i,n)= a(i)+bb(i,n) enddo !$omp end single !$omp end parallel

30 Showing the C with the final loop first #pragma omp parallel if(i2 > 103){ #pragma omp single for (int i = 2; i <= i2; ++i) { a[i] = aa[i + i3 * aa_dim1] - a[i - 1]; aa[i + i3 * aa_dim1] = a[i] + bb[i + i3 * bb_dim1]; } #pragma omp for nowait for (int j = 1; j < i3; ++j){ float tmp= a[1]; for (int i = 2; i <= i2; ++i) { tmp = aa[i + j * aa_dim1] - tmp; aa[i + j * aa_dim1] = tmp + bb[i + j * bb_dim1]; }}}

31 Save embarrassing one for last do 10 i = 1,n a(i) = a(1) 10 continue ! don't over-write the rhs in a potentially recursive manner ! (even by same value) unless by array assignment; a(:n)= a(1) In case you wondered, it’s the same between C and CEAN a[1:i2] = a[1]; // is OK even though over-writing a[1]

32 Conclusions Auto-vectorization often needs help. Parallelization often needs more explicit “modernization”. Use tools to identify where to “modernize”: VTune, Advisor.

