WHT Package Jeremy Johnson
Walsh-Hadamard Transform y = WHT_N · x, N = 2^n
WHT Algorithms Recursive Iterative General
Recursive Algorithm WHT_8 = (WHT_2 ⊗ I_4)(I_2 ⊗ (WHT_2 ⊗ I_2)(I_2 ⊗ WHT_2))
Iterative Algorithm WHT_8 = (WHT_2 ⊗ I_4)(I_2 ⊗ WHT_2 ⊗ I_2)(I_4 ⊗ WHT_2), where the base case is the 2×2 matrix WHT_2 = [[1, 1], [1, -1]]
WHT Implementation. Definition/formula: N = N_1 · N_2 ··· N_t, N_i = 2^{n_i}, and WHT_{2^n} = ∏_{i=1}^{t} ( I_{2^{n_1 + ··· + n_{i-1}}} ⊗ WHT_{2^{n_i}} ⊗ I_{2^{n_{i+1}} + ··· + n_t}} ). Notation: x_b^{M,s} = (x(b), x(b+s), …, x(b+(M-1)s)), a stride-s subvector of length M. Implementation (nested loop): R = N; S = 1; for i = t, …, 1: { R = R / N_i; for j = 0, …, R-1: for k = 0, …, S-1: x_{j·N_i·S + k}^{N_i, S} = WHT_{N_i} · x_{j·N_i·S + k}^{N_i, S}; S = S · N_i }
Ordered Partitions There is a 1-1 mapping from ordered partitions of n onto (n-1)-bit binary numbers. There are 2^(n-1) ordered partitions of n. Example: 162 = 1 0 1 0 0 0 1 0 → 1|1 1|1 1 1 1|1 1 → 1+2+4+2 = 9
Enumerating Partition Trees 00 01 01 3 3 3 2 1 2 1 1 1 10 10 11 3 3 1 2 3 1 2 1 1 1
Partition Trees 9 3 4 2 1 Left Recursive Right Recursive 4 1 3 2 4 1 3 Balanced 4 2 1 Iterative 4 1
WHT Package Püschel & Johnson (ICASSP ’00) Allows easy implementation of any of the possible WHT algorithms. Partition tree representation: W(n) = small[n] | split[W(n_1), …, W(n_t)]. Tools: Measure runtime of any algorithm; Measure hardware events; Search for good implementation (Dynamic programming, Evolutionary algorithm)
WHT Measure & Verify [jjohnson@tux64-12 wht]$ ./measure -w split[small[1],small[1],small[1]] 1.323223e-07 [jjohnson@tux64-12 wht]$ ./measure -w split[small[1],split[small[1],small[1]]] 2.181530e-07 [jjohnson@tux64-12 wht]$ ./measure -w small[3] 1.206994e-08 [jjohnson@tux64-12 wht]$ ./verify -w split[small[1],small[1],small[1]] correct [jjohnson@tux64-12 wht]$ ./verify -w split[small[1],split[small[1],small[1]]] [jjohnson@tux64-12 wht]$ ./verify -w small[3]
/* WHT Data Structure */
/* data type for the wht */
typedef struct wht {
    int type,                /* the method */
        N,                   /* signal length */
        n;                   /* N = 2^n */
    /* evaluate this node in place on x, reading/writing every S-th element */
    void (*apply)(struct wht *W, long S, wht_value *x);
    void (*deleteObj)(struct wht *W); /* delete is a C++ keyword */
    /* method-specific private data; only the "split" variant is shown here */
    union tagPriv {
        struct tagSplit {
            int nn;                            /* number of factors */
            int ns[SPLIT_MAX_FACTORS];         /* size of factors */
            struct wht *Ws[SPLIT_MAX_FACTORS]; /* the smaller wht's */
        } split;
    } priv;
} Wht;
/* Apply Split */
/*
 * Evaluate a split node: apply each child transform W->priv.split.Ws[i]
 * at the appropriate stride, realizing the tensor-product factorization
 * WHT_N = prod_i (I ⊗ WHT_{Ni} ⊗ I) in place on x (outer stride S).
 */
static void apply_split(Wht *W, long S, wht_value *x)
{
    long total, blocks, stride, size;
    long i, j, k;
    int nfac;

    nfac = W->priv.split.nn;
    /* defend against a malformed plan before touching the factor arrays */
    if (nfac > SPLIT_MAX_FACTORS) {
        Error_int2("%d > %d, too many factors", nfac, SPLIT_MAX_FACTORS);
    }

    total  = W->N;
    blocks = total; /* number of blocks remaining (R in the formula) */
    stride = 1;     /* accumulated inner stride (S in the formula) */

    for (i = nfac - 1; i >= 0; i--) {
        size = W->priv.split.ns[i];
        blocks /= size;
        for (j = 0; j < blocks; j++) {
            for (k = 0; k < stride; k++) {
                /* child sees stride S*stride; block origin is (k + j*size*stride)*S */
                wht_apply(W->priv.split.Ws[i], S * stride,
                          x + (k + j * size * stride) * S);
            }
        }
        stride *= size;
    }
}
/*
 * Code Generation
 * [jjohnson@tux64-12 wht]$ ./whtgen -n 3
 *
 * Machine-generated straight-line code for WHT_8 applied in place to
 * x[0], x[S], ..., x[7*S].  Three butterfly stages (size 2, 4, 8) are
 * fully unrolled into temporaries t0..t15.
 * W is unused here; it is kept for the uniform apply() signature.
 */
void apply_small3(Wht *W, long S, wht_value *x) {
wht_value t0; wht_value t1; wht_value t2; wht_value t3;
wht_value t4; wht_value t5; wht_value t6; wht_value t7;
wht_value t8; wht_value t9; wht_value t10; wht_value t11;
wht_value t12; wht_value t13; wht_value t14 ; wht_value t15;
/* stage 1+2: WHT_4 on the first half x[0..3] */
t0 = x[0 * S] + x[1 * S];
t1 = x[0 * S] - x[1 * S];
t2 = x[2 * S] + x[3 * S];
t3 = x[2 * S] - x[3 * S];
t4 = t0 + t2;
t6 = t0 - t2;
t5 = t1 + t3;
t7 = t1 - t3;
/* stage 1+2: WHT_4 on the second half x[4..7] */
t8 = x[4 * S] + x[5 * S];
t9 = x[4 * S] - x[5 * S];
t10 = x[6 * S] + x[7 * S];
t11 = x[6 * S] - x[7 * S];
t12 = t8 + t10;
t14 = t8 - t10;
t13 = t9 + t11;
t15 = t9 - t11;
/* stage 3: combine the two WHT_4 results and write back */
x[0 * S] = t4 + t12;
x[4 * S] = t4 - t12;
x[1 * S] = t5 + t13;
x[5 * S] = t5 - t13;
x[2 * S] = t6 + t14;
x[6 * S] = t6 - t14;
x[3 * S] = t7 + t15;
x[7 * S] = t7 - t15;
}
Code Generation [jjohnson@tux64-12 wht]$ ./whtgen -n 3 | ./simplop.pl t0 = x[0] + x[S]; t1 = x[0] - x[S]; t2 = x[2 * S] + x[3 * S]; t3 = x[2 * S] - x[3 * S]; t4 = t0 + t2; t6 = t0 - t2; t5 = t1 + t3; t7 = t1 - t3; t8 = x[4 * S] + x[5 * S]; t9 = x[4 * S] - x[5 * S]; t10 = x[6 * S] + x[7 * S]; t11 = x[6 * S] - x[7 * S]; t12 = t8 + t10; t14 = t8 - t10; t13 = t9 + t11; t15 = t9 - t11; x[0] = t4 + t12; x[4 * S] = t4 - t12; x[S] = t5 + t13; x[5 * S] = t5 - t13; x[2 * S] = t6 + t14; x[6 * S] = t6 - t14; x[3 * S] = t7 + t15; x[7 * S] = t7 - t15; }
whtgen void pushwht(long init, long size, long stride, long var) { long i; if (size == 2) { /* write from vector to temporaries */ push3( " t%ld = x[%ld * S] + x[%ld * S];\n", var, init, init + stride ); " t%ld = x[%ld * S] - x[%ld * S];\n", var + 1, }
whtgen else { /* write from temporaries to temporaries */ pushwht(init, size/2, stride, var); pushwht(init+size/2, size/2, stride, var + (mylog2(size) - 1) * size/2); var += (mylog2(size) - 1) * size; for (i = var; i < var+size/2; i++) { push3( " t%ld = t%ld + t%ld;\n", i, i - (mylog2(size) - 1) * size/2 - size/2, i - size/2 ); " t%ld = t%ld - t%ld;\n", i + size/2, ); } } }
WHT Verify static void apply_direct(Wht *W, long S, wht_value *x) { long N = W->N; int n = W->n; wht_value *y = (wht_value *) Malloc(N * sizeof(wht_value)); long i, j; /* matrix multiplication, result is in y */ for (i = 0; i < N; i++) { y[i] = 0; for (j = 0; j < N; j++) y[i] += wht_entry(n, i+1, j+1) * x[j*S]; } /* copy y to x with stride S */ for (i = 0; i < N; i++) x[i*S] = y[i];
WHT Entry int wht_entry(int k, long row, long col) { long h = (long) pow(2, k-1); int b = 0; if (k == 1) { return (row == 2 && col == 2) ? -1 : 1; } if (row > h) { row = row - h; b = 1; if (col > h) { col = col - h; if (b) return -wht_entry(k-1, row, col); return wht_entry(k-1, row, col);
Parse WHT static int parse_wht(char *in, Wht **W) { int len, n, nn; char *msg; Wht *Ws[SPLIT_MAX_FACTORS]; /* read the functor up to '[' */ len = 0; while ((in[len] != (char)0) && (in[len] != '[')) len++; if isFunctor("null") { require('['); read_int(n); require(']'); *W = wht_new_null(n); return len; }
Parse WHT else if isFunctor("direct") { require('['); read_int(n); require(']'); *W = wht_new_direct(n); return len; } else if isFunctor("small") { *W = wht_new_small(n); else if isFunctor("split") { require('['); nn = 0; read_wht(Ws[0]); nn++;; while (in[len] == ',') { if (nn == SPLIT_MAX_FACTORS) Error("too man arguments for split[ ] in wht_parse()"); require(','); read_wht(Ws[nn]); nn++; } require(']'); *W = wht_new_split(nn, Ws); return len;
Histogram (n = 16, 10,000 samples) Wide range in performance despite equal number of arithmetic operations (n·2^n flops). Pentium III consumes more run time (more pipeline stages). UltraSPARC II spans a larger range.
Algorithm Comparison
Dynamic Programming T_n = argmin over ordered partitions n_1 + … + n_t = n of Cost(split[T_{n_1}, …, T_{n_t}]), where T_n is the optimal tree of size n. This depends on the assumption that Cost only depends on the size of a tree and not where it is located. (true for IC, but false for runtime). For IC, the optimal tree is iterative with appropriate leaves. For runtime, DP is a good heuristic (used with binary trees).
Optimal Formulas UltraSPARC Pentium [1], [2], [3], [4], [5], [6] [7] [[4],[4]] [[5],[4]] [[5],[5]] [[5],[6]] [[2],[[5],[5]]] [[2],[[5],[6]]] [[2],[[2],[[5],[5]]]] [[2],[[2],[[5],[6]]]] [[2],[[2],[[2],[[5],[5]]]]] [[2],[[2],[[2],[[5],[6]]]]] [[2],[[2],[[2],[[2],[[5],[5]]]]]] [[2],[[2],[[2],[[2],[[5],[6]]]]]] [[2],[[2],[[2],[[2],[[2],[[5],[5]]]]]]] UltraSPARC [1], [2], [3], [4], [5], [6] [[3],[4]] [[4],[4]] [[4],[5]] [[5],[5]] [[5],[6]] [[4],[[4],[4]]] [[[4],[5]],[4]] [[4],[[5],[5]]] [[[5],[5]],[5]] [[[5],[5]],[6]] [[4],[[[4],[5]],[4]]] [[4],[[4],[[5],[5]]]] [[4],[[[5],[5]],[5]]] [[5],[[[5],[5]],[5]]]
Different Strides Dynamic programming assumption is not true. Execution time depends on stride.