Related Links:
http://gcc.gnu.org/onlinedocs/gcc-3.4.3/gcc/index.html
http://gnu.ghks.de/software/gcc/projects/tree-ssa/vectorization.html
http://gcc.gnu.org/onlinedocs/gcc-3.4.3/gcc/Vector-Extensions.html#Vector-Extensions
http://gcc.gnu.org/onlinedocs/gcc-3.4.3/gcc/X86-Built_002din-Functions.html
http://parallel.ru/docs/Intel/c_ug/index.htm
http://www.clifford.at/cfun/gccfeat/
http://ds9a.nl/gcc-simd/

--------------------------------------------------------
---------------- Listing 1: ----------------------------
---------------- The innocent current C code -----------
--------------------------------------------------------

for( i = 0; i < pnn->cOutput; i++ ) {

    float r = pnn->arOutputThreshold[ i ];

    for( j = 0; j < pnn->cHidden; j++ ) /* LOOP WAS VECTORIZED. */
        r += ar[ j ] * *prWeight++;

    arOutput[ i ] = sigmoid( -pnn->rBetaOutput * r );
}

---------------------------------------------------------
---------------- Listing 2: -----------------------------
---------------- gcc 3.4.1 result for 1 -----------------
---------------------------------------------------------

        movl    -16(%ebp), %ecx
        movl    4(%ecx), %edx
        xorl    %eax, %eax
        cmpl    $0, %edx
        jmp     L164
L168:
;;
;; Note that the "ss" variants only use the lowest of the 4 parallel
;; lanes. gnubg does not even try to work in parallel; it uses
;; SSE2 registers as if they were mere x87 ... no comment.
;; Also note it does not even try to align, neither to 4 nor to 16
;; bytes (16 is really fast).
;;
        movss   (%esi), %xmm3
        addl    $4, %esi
        mulss   (%ebx,%eax,4), %xmm3
        addl    $1, %eax
;;; Doh! Look at this, every interim result is stored in memory
        addss   -28(%ebp), %xmm3
        cmpl    %eax, %edx
;; ... and immediately read back! Does it make sense? Not a lot ...
        movss   %xmm3, -28(%ebp)
L164:
        jg      L168
        movl    $LC12, (%esp)
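By the way, alignment can be forced from the C side, which is what makes the fast "movaps"-style access legal in the first place (icc on Windows offers __declspec(align(16)) for the same purpose). A minimal sketch, assuming invented names; this is not gnubg code:

#include <stdlib.h>
#include <stdint.h>

/* Static data: just ask gcc for 16-byte alignment. */
static float arWeightAligned[128] __attribute__ ((aligned (16)));

/* Heap data: over-allocate by 15 bytes and round the pointer up.
 * The original pointer is returned through 'raw' so it can be freed. */
static float *alloc_aligned16(size_t nfloats, void **raw)
{
    *raw = malloc(nfloats * sizeof(float) + 15);
    if (*raw == NULL)
        return NULL;
    return (float *)(((uintptr_t)*raw + 15) & ~(uintptr_t)15);
}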
-----------------------------------------------------------
---------------- Listing 3: -------------------------------
---------------- icc 8.1.24 result for 1 ------------------
-----------------------------------------------------------

;;
;; Loop length zero? Just skip it all.
;;
$B6$47:                         ; Preds $B6$101
        mov       eax, DWORD PTR [ebp+8]                        ;532.18
        mov       edi, DWORD PTR [eax+4]                        ;532.18
        test      edi, edi                                      ;532.2
        jle       $B6$66        ; Prob 2%                       ;532.2
                                ; LOE ebx esi edi
;;
;; Zero storage for the result and check if the loop length is
;; too short for the "big iron" loops. If so, go to a simple,
;; single-step loop.
;; Analyze the loop length and branch to a "suitable" loop:
;;
;; Version 1:    Biggest iron, aligned and unrolled to step 16
;; Version 2:    Second biggest, unaligned and unrolled to step 16
;; Version 1/2': Cleanup for the step 16 versions
;; Version 3:    Single step for trailer (%4) cleanup and small loops
;;
$B6$48:                         ; Preds $B6$47
        xor       eax, eax                                      ;532.2
        mov       DWORD PTR [esp+32], eax                       ;532.2
        cmp       edi, 11                                       ;532.2
        jb        $B6$62        ; Prob 10%                      ;532.2
                                ; LOE ebx esi edi
$B6$49:                         ; Preds $B6$48
        mov       edx, DWORD PTR [ebp+16]                       ;453.59
        mov       ecx, edi                                      ;532.18
        mov       eax, esi                                      ;532.2
        and       eax, 15                                       ;532.2
        je        $B6$54        ; Prob 50%                      ;532.2
                                ; LOE eax edx ecx ebx esi edi
$B6$50:                         ; Preds $B6$49
        test      al, 3                                         ;532.2
        jne       $B6$62        ; Prob 1%                       ;532.2
                                ; LOE eax ebx esi edi
;;
;; Reasoning about which loop to use, based on address alignment of
;; the data and the length of the loop
;;
$B6$51:                         ; Preds $B6$50
        mov       ecx, DWORD PTR [ebp+16]                       ;532.2
        movss     xmm1, DWORD PTR [esp+48]                      ;532.2
        mov       DWORD PTR [esp+44], edi                       ;532.2
        mov       DWORD PTR [esp+40], ebx                       ;532.2
        mov       ebx, DWORD PTR [esp+32]                       ;532.2
        neg       eax                                           ;532.2
        lea       edx, DWORD PTR [eax+16]                       ;532.2
        mov       DWORD PTR [esp+36], edx                       ;532.2
        lea       edx, DWORD PTR [ecx+eax+16]                   ;532.2
        mov       eax, DWORD PTR [esp+36]                       ;532.2
        shr       eax, 2                                        ;532.2
        mov       ecx, edi                                      ;532.2
        sub       ecx, eax                                      ;532.2
        mov       DWORD PTR [esp+36], eax                       ;532.2
        mov       edi, eax                                      ;532.2
        mov       eax, DWORD PTR [ebp+16]                       ;532.2
;;
;; Pre-loop for alignment
;;
        ALIGN     4
                                ; LOE eax edx ecx ebx esi edi xmm1
$B6$52:                         ; Preds $B6$52 $B6$51
        movss     xmm0, DWORD PTR [eax+ebx*4]                   ;533.11
        mulss     xmm0, DWORD PTR [esi+ebx*4]                   ;533.22
        add       ebx, 1                                        ;532.2
        cmp       ebx, edi                                      ;532.2
        addss     xmm1, xmm0                                    ;533.6
        jb        $B6$52        ; Prob 90%                      ;532.2
                                ; LOE eax edx ecx ebx esi edi xmm1
$B6$53:                         ; Preds $B6$52
        mov       edi, DWORD PTR [esp+44]                       ;
        movss     DWORD PTR [esp+48], xmm1                      ;
        mov       DWORD PTR [esp+32], ebx                       ;
        mov       ebx, DWORD PTR [esp+40]                       ;
                                ; LOE edx ecx ebx esi edi
;;
;; We now know we need the "big iron" loops. Now check if we can
;; use a 16-byte aligned version (perfect!) or have to run unaligned.
;;
$B6$54:                         ; Preds $B6$53 $B6$49
        movss     xmm1, DWORD PTR [esp+48]                      ;533.6
        and       ecx, 7                                        ;532.2
        neg       ecx                                           ;532.2
        add       ecx, edi                                      ;532.2
        test      dl, 15                                        ;532.2
        pxor      xmm0, xmm0                                    ;533.6
        movss     xmm0, xmm1                                    ;533.6
        pxor      xmm1, xmm1                                    ;533.6
        jne       $B6$58        ; Prob 50%                      ;532.2
;;
;; Crunching loop #1: 2-fold unrolled and using "ps" variants, thus
;; 16 floats are crunched each loop (both multiply and sum). It's
;; all done in registers, memory access uses sophisticated addressing.
;;
;; Note the "movaps" access, so when we are here, we are sure that all
;; addresses are 16-byte aligned and we run at maximum speed (a for aligned).
;;
                                ; LOE ecx ebx esi edi xmm0 xmm1
$B6$55:                         ; Preds $B6$54
        mov       edx, DWORD PTR [esp+32]                       ;
        mov       eax, DWORD PTR [ebp+16]                       ;
        ALIGN     4
                                ; LOE eax edx ecx ebx esi edi xmm0 xmm1
$B6$56:                         ; Preds $B6$56 $B6$55
        movaps    xmm2, XMMWORD PTR [eax+edx*4]                 ;533.11
        movaps    xmm3, XMMWORD PTR [eax+edx*4+16]              ;533.11
        mulps     xmm2, XMMWORD PTR [esi+edx*4]                 ;533.22
        mulps     xmm3, XMMWORD PTR [esi+edx*4+16]              ;533.22
        addps     xmm0, xmm2                                    ;533.6
        addps     xmm1, xmm3                                    ;533.6
        add       edx, 8                                        ;532.2
        cmp       edx, ecx                                      ;532.2
        jb        $B6$56        ; Prob 97%                      ;532.2
                                ; LOE eax edx ecx ebx esi edi xmm0 xmm1
$B6$57:                         ; Preds $B6$56
        mov       DWORD PTR [esp+32], edx                       ;
        jmp       $B6$61        ; Prob 100%                     ;
        ALIGN     4
;;
;; Crunching loop #2:
;; Same construct as #1, but does not take aligned addresses for
;; granted ("movups", u for unaligned).
;;
                                ; LOE edx ebx esi edi dl dh xmm0 xmm1
$B6$58:                         ; Preds $B6$54
        mov       edx, DWORD PTR [esp+32]                       ;
        mov       eax, DWORD PTR [ebp+16]                       ;
;; Note, btw, how carefully icc aligns all instructions. gcc does not
;; really care ...
        ALIGN     4
                                ; LOE eax edx ecx ebx esi edi xmm0 xmm1
$B6$59:                         ; Preds $B6$59 $B6$58
        movups    xmm2, XMMWORD PTR [eax+edx*4]                 ;533.11
        movups    xmm3, XMMWORD PTR [eax+edx*4+16]              ;533.11
        mulps     xmm2, XMMWORD PTR [esi+edx*4]                 ;533.22
        mulps     xmm3, XMMWORD PTR [esi+edx*4+16]              ;533.22
        addps     xmm0, xmm2                                    ;533.6
        addps     xmm1, xmm3                                    ;533.6
        add       edx, 8                                        ;532.2
        cmp       edx, ecx                                      ;532.2
        jb        $B6$59        ; Prob 97%                      ;532.2
                                ; LOE eax edx ecx ebx esi edi xmm0 xmm1
$B6$60:                         ; Preds $B6$59
        mov       DWORD PTR [esp+32], edx                       ;
                                ; LOE edx ebx esi edi dl dh xmm0 xmm1
;;
;; "Big iron" crunch loop aftermath: transfer the results to memory
;;
$B6$61:                         ; Preds $B6$60 $B6$57
        addps     xmm0, xmm1                                    ;533.6
        mov       eax, edx                                      ;532.2
        cmp       eax, edi                                      ;532.2
        movaps    xmm1, xmm0                                    ;532.2
        movhlps   xmm1, xmm1                                    ;532.2
        addps     xmm0, xmm1                                    ;533.6
        movaps    xmm2, xmm0                                    ;532.2
        shufps    xmm2, xmm2, 1                                 ;532.2
        addss     xmm0, xmm2                                    ;533.6
        movss     DWORD PTR [esp+48], xmm0                      ;532.2
        jae       $B6$65        ; Prob 10%                      ;532.2
                                ; LOE ebx esi edi
;;
;; Crunching loop #3:
;; Single-float loop ("ss") to handle the last (<=3) numbers, if any,
;; cleaning up what was not suited for the heavy crunchers. Please note
;; that this loop is essentially what gcc produces for the whole job,
;; but faster, as it is all register-based and does not access memory
;; on every iteration ...
;;
$B6$62:                         ; Preds $B6$50 $B6$48 $B6$61
        movss     xmm1, DWORD PTR [esp+48]                      ;
        mov       edx, DWORD PTR [esp+32]                       ;
        mov       eax, DWORD PTR [ebp+16]                       ;
        ALIGN     4
                                ; LOE eax edx ebx esi edi xmm1
$B6$63:                         ; Preds $B6$63 $B6$62
        movss     xmm0, DWORD PTR [eax+edx*4]                   ;533.11
        mulss     xmm0, DWORD PTR [esi+edx*4]                   ;533.22
        add       edx, 1                                        ;532.2
        cmp       edx, edi                                      ;532.2
        addss     xmm1, xmm0                                    ;533.6
        jb        $B6$63        ; Prob 90%                      ;532.2
                                ; LOE eax edx ebx esi edi xmm1
;;
;; Done, store the result in memory
;;
$B6$64:                         ; Preds $B6$63
        movss     DWORD PTR [esp+48], xmm1                      ;
                                ; LOE ebx esi edi
$B6$65:                         ; Preds $B6$61 $B6$64
[...]
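Putting the commentary above into one picture: in plain C, the loop versioning icc generates corresponds roughly to the sketch below. This is a hedged reconstruction with invented names, not decompiled icc output; the real code additionally keeps two partial sums (xmm0/xmm1) to hide latency and splits the main loop into the aligned (movaps) and unaligned (movups) variants shown above.

#include <stdint.h>

float dot_versioned(const float *a, const float *w, int n)
{
    float r = 0.0f;
    int j = 0;

    if (n >= 11) {  /* "big iron" threshold, the "cmp edi, 11" above */
        /* Pre-loop: single steps until 'a' is 16-byte aligned.
         * (If 'a' is not even 4-byte aligned, icc bails out to the
         * scalar loop right away: "test al, 3".) */
        while (j < n && (((uintptr_t)(a + j)) & 15) != 0) {
            r += a[j] * w[j];
            j++;
        }
        /* Main loop: 8 floats per iteration, as in loops #1 and #2. */
        for (; j + 8 <= n; j += 8) {
            int k;
            for (k = 0; k < 8; k++)
                r += a[j + k] * w[j + k];
        }
    }

    /* Crunching loop #3: scalar cleanup for whatever remains. */
    for (; j < n; j++)
        r += a[j] * w[j];

    return r;
}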
-----------------------------------------------------------------------
---------------- Listing 4: -------------------------------------------
------------ GCC code using SSE intrinsics (not perfect) --------------
------------ See Listing 6 for an improved version in ICC syntax ------
-----------------------------------------------------------------------

;;;
;;; This is equivalent to
;;;
for( i = 0; i < pnn->cOutput; i++ ) {

    float r = pnn->arOutputThreshold[ i ];

    for( j = 0; j < pnn->cHidden; j++ ) /* LOOP WAS VECTORIZED. */
        r += ar[ j ] * *prWeight++;

    arOutput[ i ] = sigmoid( -pnn->rBetaOutput * r );
}
;;;
;;; End of quoted equivalent
;;;

#if defined(__GNUC_MINOR__) && (__GNUC__ >= 3) && (__GNUC_MINOR__ >= 1)

typedef int v4sf __attribute__ ((mode(V4SF))); // vector of four single floats

for( i = 0; i < pnn->cOutput; i++ ) {

    float r = pnn->arOutputThreshold[ i ];

    for( j = 0; j < pnn->cHidden; j++ ) { /* LOOP WAS VECTORIZED. */

        // Part 0: Calculate main and post-loop
        int rest = pnn->cHidden - pnn->cHidden%4;
        float *armainloopEnd = &ar[j] + (rest);
        int k;

        {
            float result[4];
            /* the vi are used much like the 8 xmmN registers */
            // Clear result to (0, 0, 0, 0)
            v4sf v2 = __builtin_ia32_xorps (v2, v2);

            // Part 1
            // Vector strip-mined loop (main loop, step 4)
            for (/*NOP*/ ; /*NOP*/ ; ar += 4, prWeight += 4 ) {
                v4sf v0, v1, v3;
                if (ar == armainloopEnd) {
                    __builtin_ia32_storeups((float *) &result, v2);
                    break;
                }
                v0 = __builtin_ia32_loadups(ar);
                v1 = __builtin_ia32_loadups(prWeight);
                v3 = __builtin_ia32_mulps(v0, v1);
                v2 = __builtin_ia32_addps(v2, v3);
            }
            r = r + result[0] + result[1] + result[2] + result[3];

            // Part 2
            // Scalar clean-up loop (step 1)
            for (k = rest ; k < pnn->cHidden ; k++ ) {
                r += ar[k] * *prWeight++;
            }
        }
    }

    arOutput[ i ] = sigmoid( -pnn->rBetaOutput * r );
}
#endif
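A hedged aside: instead of the raw __builtin_ia32_* calls, gcc also accepts the Intel-style intrinsics from <xmmintrin.h> (with -msse), which keeps the source portable between gcc and icc and maps to the same movups/mulps/addps instructions. Part 1 of the listing above would then read roughly like this sketch (function name and parameters invented for illustration):

#include <xmmintrin.h>

/* Strip-mined main loop from Listing 4, portable intrinsics version.
 * 'rest' is assumed to be a multiple of 4, as in Part 0 above. */
static float dot_part1(const float *ar, const float *prWeight, int rest)
{
    float result[4];
    __m128 v2 = _mm_setzero_ps();   /* no self-xor of an uninitialized
                                       variable needed here */
    const float *end = ar + rest;

    for ( ; ar != end; ar += 4, prWeight += 4)
        v2 = _mm_add_ps(v2, _mm_mul_ps(_mm_loadu_ps(ar),
                                       _mm_loadu_ps(prWeight)));

    _mm_storeu_ps(result, v2);
    return result[0] + result[1] + result[2] + result[3];
}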
----------------------------------------------------------------------
---------------- Listing 5: ------------------------------------------
----------------- GCC 3.4.1 assembly output for 4 --------------------
----------------------------------------------------------------------

[...]
;; for( j = 0; j < pnn->cHidden; j++ ) { /* LOOP WAS VECTORIZED. */
        movl    -44(%ebp), %eax
        movl    $0, -52(%ebp)
        movl    8(%eax), %edi
        testl   %edi, %edi
        jle     L149
L168:
        movl    -44(%ebp), %edx         ;; for( j = 0; j < pnn->cHidden; j++ ) {
        movl    -52(%ebp), %eax
        movl    40(%edx), %edi
        movl    4(%edx), %edx           ; pnn->cHidden
;;; Note: gcc is not totally hopeless, it keeps "float r"
;;; in a register all along the loop. Yipeee!
        movss   (%edi,%eax,4), %xmm2    ; float r = pnn->arOutputThreshold[ i ];
        xorl    %edi, %edi              ; j = 0
        cmpl    $0, %edx                ; pnn->cHidden == 0 ?
        jle     L151                    ; if so, exit inner loop
L196:
        leal    3(%edx), %eax
        cmpl    $-1, %edx
        movl    %edx, %ecx
        cmovle  %eax, %ecx
        movdqa  -72(%ebp), %xmm1
        andl    $-4, %ecx
        xorps   %xmm1, %xmm1
        leal    (%edi,%ecx), %eax
        movaps  %xmm1, -72(%ebp)
        leal    (%ebx,%eax,4), %eax
        jmp     L101
;;; Hand-knitted crunch loop, not perfect, but at least it does
;;; 4 floats in parallel. Could easily be made step 16 etc.
;;; by simply unrolling it.
L104:                                   ; Crunch loop (intrinsics)
        movups  (%ebx), %xmm5           ; v0 = __builtin_ia32_loadups(ar);
        movups  (%esi), %xmm6           ; v1 = __builtin_ia32_loadups(prWeight);
        movaps  -72(%ebp), %xmm4
        mulps   %xmm6, %xmm5            ; v3 = __builtin_ia32_mulps(v0, v1);
        addl    $16, %ebx               ; ar += 4, prWeight += 4
        addl    $16, %esi
        addps   %xmm5, %xmm4            ; v2 = __builtin_ia32_addps(v2, v3);
        movaps  %xmm4, -72(%ebp)
L101:
        cmpl    %eax, %ebx              ; for (/*NOP*/ ; /*NOP*/ ; ar += 4, prWeight += 4 ) {
        jne     L104
        movaps  -72(%ebp), %xmm3
        movl    %ecx, %eax
        movups  %xmm3, -40(%ebp)        ; __builtin_ia32_storeups((float *) &result, v2);
        addss   -40(%ebp), %xmm2        ; r = r + result[0] + result[1] + result[2] + result[3];
        addss   -36(%ebp), %xmm2
        addss   -32(%ebp), %xmm2
        addss   -28(%ebp), %xmm2
        jmp     L105
;;; Note gcc even without the intrinsics hint uses SSE math for the scalar
;;; loop, however in "ss" mode. It surely has to here :)
L108:
        movss   (%esi), %xmm7
        addl    $4, %esi
        mulss   (%ebx,%eax,4), %xmm7    ; ar[k] * *prWeight++;
        addl    $1, %eax
        addss   %xmm7, %xmm2            ; r += ^
L105:
        cmpl    %eax, %edx
        jg      L108                    ; for (k = rest ; k < pnn->cHidden ; k++ ) {
        addl    $1, %edi
        cmpl    %edi, %edx
        jg      L196
;; Exit of inner [j] loop, sigmoid to follow
L151:
        movl    -44(%ebp), %eax
        movss   24(%eax), %xmm0         ; pnn->rBetaOutput
        xorps   LC9, %xmm0              ; -pnn->rBetaOutput
        mulss   %xmm2, %xmm0            ; this is still r, isn't it amazing
        movss   %xmm0, (%esp)
        call    _sigmoid_original

----------------------------------------------------------------------
---------------- Listing 6: ------------------------------------------
----------------- ICC 8.1.24 code (intrinsics) -----------------------
------------------ (Inner loop only [j]) -----------------------------
----------------------------------------------------------------------

for( j = 0; j < pnn->cHidden; j++ ) {

    /* r += ar[ j ] * *prWeight++; */

    __m128 vec0, vec1, vec2;
    const int k = 0;

    vec0 = _mm_load_ps ((float *) ar);
    vec1 = _mm_load_ps ((float *) prWeight);
    vec2 = _mm_setzero_ps();

    // loop is fully unrolled for the N=128 case in this example
    // #pragma vector aligned (not needed if we are unrolled)
    // for (k = 0; k < pnn->cHidden/4; k += 16) { /* 128/4 = 32, OK! */
    //     vec0 = ;
    //     vec1 = ;

    vec0 = _mm_mul_ps(_mm_load_ps(ar + k*4), _mm_load_ps(prWeight + k*4));
    vec2 = _mm_add_ps(vec2, vec0);

    vec0 = _mm_load_ps(ar + (k+1)*4);
    vec1 = _mm_load_ps(prWeight + (k+1)*4);
    vec0 = _mm_mul_ps(vec0, vec1);
    vec2 = _mm_add_ps(vec2, vec0);

    vec0 = _mm_load_ps(ar + (k+2)*4);
    vec1 = _mm_load_ps(prWeight + (k+2)*4);
    vec0 = _mm_mul_ps(vec0, vec1);
    vec2 = _mm_add_ps(vec2, vec0);

    vec0 = _mm_load_ps(ar + (k+3)*4);
    vec1 = _mm_load_ps(prWeight + (k+3)*4);
    vec0 = _mm_mul_ps(vec0, vec1);
    vec2 = _mm_add_ps(vec2, vec0);

    vec0 = _mm_load_ps(ar + (k+4)*4);
    vec1 = _mm_load_ps(prWeight + (k+4)*4);
    vec0 = _mm_mul_ps(vec0, vec1);
    vec2 = _mm_add_ps(vec2, vec0);

    vec0 = _mm_load_ps(ar + (k+5)*4);
    vec1 = _mm_load_ps(prWeight + (k+5)*4);
    vec0 = _mm_mul_ps(vec0, vec1);
    vec2 = _mm_add_ps(vec2, vec0);

    vec0 = _mm_load_ps(ar + (k+6)*4);
    vec1 = _mm_load_ps(prWeight + (k+6)*4);
    vec0 = _mm_mul_ps(vec0, vec1);
    vec2 = _mm_add_ps(vec2, vec0);

    vec0 = _mm_load_ps(ar + (k+7)*4);
    vec1 = _mm_load_ps(prWeight + (k+7)*4);
    vec0 = _mm_mul_ps(vec0, vec1);
    vec2 = _mm_add_ps(vec2, vec0);

    // }

    /* r      = a       b       c       d
       swapLo = b       a       d       c
       sumLo  = a+b     b+a     c+d     d+c
       swapHi = c+d     c+d     a+b     a+b
       sum    = 4 copies of a+b+c+d
       input is vec2, aux are vec1 & vec0 */

    /* __m128 swapLo */ vec0 = _mm_shuffle_ps(vec2, vec2, _MM_SHUFFLE(2,3,0,1));
    /* __m128 sumLo  */ vec1 = _mm_add_ps(vec2, vec0);
    /* __m128 swapHi */ vec0 = _mm_shuffle_ps(vec1, vec1, _MM_SHUFFLE(1,1,3,3));
    /* __m128 sum    */ vec2 = _mm_add_ps(vec1, vec0);
    _mm_store_ss (&r, vec2);
}

arOutput[ i ] = sigmoid( -pnn->rBetaOutput * r );
}
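For readers who want to experiment outside the gnubg context, here is the same idea as a minimal, self-contained sketch (function name dot_sse invented): the main loop is not unrolled, but the shuffle-based horizontal sum at the end is exactly the one from Listing 6, and it shows up as "shufps ..., 177" and "shufps ..., 95" in Listing 7 below.

#include <xmmintrin.h>

/* Generalized form of Listing 6's kernel. Assumptions: 'a' and 'w'
 * are 16-byte aligned and 'n' is a multiple of 4. */
static float dot_sse(const float *a, const float *w, int n)
{
    __m128 acc = _mm_setzero_ps();
    __m128 t;
    float r;
    int k;

    for (k = 0; k < n; k += 4)
        acc = _mm_add_ps(acc, _mm_mul_ps(_mm_load_ps(a + k),
                                         _mm_load_ps(w + k)));

    /* Horizontal sum, all in registers: first swap within the pairs
     * (imm 177), then swap the pairs themselves (imm 95). */
    t = _mm_add_ps(acc, _mm_shuffle_ps(acc, acc, _MM_SHUFFLE(2,3,0,1)));
    t = _mm_add_ps(t,   _mm_shuffle_ps(t,   t,   _MM_SHUFFLE(1,1,3,3)));
    _mm_store_ss(&r, t);
    return r;
}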
----------------------------------------------------------------------
---------------- Listing 7: ------------------------------------------
----------------- ICC 8.1.24 code (result assembly for 6) ------------
------------------ (Inner loop only [j]) -----------------------------
----------------------------------------------------------------------

$B6$46:                         ; Preds $B6$45
        mov       edx, DWORD PTR [esp+44]                       ;
        mov       ecx, DWORD PTR [ebp+16]                       ;
        ALIGN     4
                                ; LOE eax edx ecx ebx esi edi
$B6$47:                         ; Preds $B6$48 $B6$46
        movaps    xmm0, XMMWORD PTR [ecx]                       ;552.34
        movaps    xmm1, XMMWORD PTR [ecx+16]                    ;554.23
        movaps    xmm2, XMMWORD PTR [ecx+32]                    ;558.23
        mulps     xmm0, XMMWORD PTR [edx]                       ;552.11
        mulps     xmm1, XMMWORD PTR [edx+16]                    ;556.11
        mulps     xmm2, XMMWORD PTR [edx+32]                    ;560.11
        movaps    xmm3, XMMWORD PTR [ecx+48]                    ;562.23
        movaps    xmm4, XMMWORD PTR [ecx+64]                    ;566.23
        movaps    xmm5, XMMWORD PTR [ecx+80]                    ;570.23
        mulps     xmm3, XMMWORD PTR [edx+48]                    ;564.11
        mulps     xmm4, XMMWORD PTR [edx+64]                    ;568.11
        mulps     xmm5, XMMWORD PTR [edx+80]                    ;572.11
        movaps    xmm6, XMMWORD PTR [ecx+96]                    ;574.23
        movaps    xmm7, XMMWORD PTR [ecx+112]                   ;578.23
        mulps     xmm6, XMMWORD PTR [edx+96]                    ;576.11
        mulps     xmm7, XMMWORD PTR [edx+112]                   ;580.11
        addps     xmm0, xmm1                                    ;557.11
        addps     xmm0, xmm2                                    ;561.11
        addps     xmm0, xmm3                                    ;565.11
        addps     xmm0, xmm4                                    ;569.11
        addps     xmm0, xmm5                                    ;573.11
        addps     xmm0, xmm6                                    ;577.11
        addps     xmm0, xmm7                                    ;581.11
        movaps    xmm1, xmm0                                    ;590.31
        shufps    xmm1, xmm1, 177                               ;590.31
        addps     xmm0, xmm1                                    ;591.30
        movaps    xmm1, xmm0                                    ;592.30
        shufps    xmm1, xmm1, 95                                ;592.30
        addps     xmm0, xmm1                                    ;593.27
        movss     DWORD PTR [esp+64], xmm0                      ;594.3
                                ; LOE eax edx ecx ebx esi edi
$B6$48:                         ; Preds $B6$47
        add       eax, 1                                        ;539.32
        cmp       eax, DWORD PTR [edi+4]                        ;539.2
        jl        $B6$47        ; Prob 90%                      ;539.2

------------------------------------------------------------------------
-------------- Listing 8 -----------------------------------------------
---------- Optimization attempt using fixed-length loops ---------------
---------- Body in Evaluate --------------------------------------------
------------------------------------------------------------------------

switch (pnn->cInput) {
case 250: {
    /* assert(pnn->cInput == 250 && pnn->cHidden == 128 && pnn->cOutput == 5
       && pnn->rBetaHidden == 0.1f && pnn->rBetaOutput == 1.0f); */
    EVALUATE_OPTIMIZED_255(250, 128, 5, 0.1, 1.0)
}
case 214: {
    /* assert(pnn->cInput == 214 && pnn->cHidden == 128 && pnn->cOutput == 5
       && pnn->rBetaHidden == 0.1f && pnn->rBetaOutput == 1.0f); */
    EVALUATE_OPTIMIZED(214, 128, 5, 0.1, 1.0)
}
case 200: {
    switch (pnn->cHidden) {
    case 5: {
        /* assert(pnn->cInput == 200 && pnn->cHidden == 5 && pnn->cOutput == 5
           && pnn->rBetaHidden == 0.1f && pnn->rBetaOutput == 1.0f); */
        EVALUATE_OPTIMIZED(200, 5, 5, 0.1, 1.0)
    }
    case 10: {
        /* assert(pnn->cInput == 200 && pnn->cHidden == 10 && pnn->cOutput == 5
           && pnn->rBetaHidden == 0.1f && pnn->rBetaOutput == 1.0f); */
        EVALUATE_OPTIMIZED(200, 10, 5, 0.1, 1.0)
    }
    }
}
default: {
    EVALUATE_OPTIMIZED(pnn->cInput, pnn->cHidden, pnn->cOutput,
                       pnn->rBetaHidden, pnn->rBetaOutput)
}
}
[...]
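The point of this dispatch (the macros themselves follow in Listing 9): when the trip counts are compile-time constants, the optimizer can fully unroll and vectorize each specialization, which is also what /Qunroll254 in Listing 11 plays into. A stripped-down sketch of the same idea, with all names invented rather than taken from gnubg:

/* Constant bounds let the compiler unroll the loop completely;
 * the default case keeps a generic fallback for odd sizes. */
#define DOT_FIXED(N, a, w, r)            \
    do {                                 \
        int j_;                          \
        for (j_ = 0; j_ < (N); j_++)     \
            (r) += (a)[j_] * (w)[j_];    \
    } while (0)

float evaluate_hidden(const float *a, const float *w, int cHidden)
{
    float r = 0.0f;
    switch (cHidden) {
    case 128: DOT_FIXED(128, a, w, r); break;  /* fully unrollable */
    case   5: DOT_FIXED(  5, a, w, r); break;
    default:  DOT_FIXED(cHidden, a, w, r); break;
    }
    return r;
}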
------------------------------------------------------------------------
-------------- Listing 9 -----------------------------------------------
---------- Optimization attempt using fixed-length loops ---------------
---------- Header with templates ---------------------------------------
------------------------------------------------------------------------

/*
 * neuralnet_optimized_weights_015.c
 *
 * by Ingo Macherius 20050222
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * $Id$
 */

// EVALUATE_OPTIMIZED(250, 128, 5, 0.1, 1.0)

#define EVALUATE_OPTIMIZED(INPUT, HIDDEN, OUTPUT, BETAHIDDEN, BETAOUTPUT) \
    int i, j;\
    float *prWeight;\
\
    /* Calculate activity at hidden nodes */\
    for( i = 0; i < HIDDEN; i++ )\
        ar[ i ] = pnn->arHiddenThreshold[ i ];\
\
    prWeight = pnn->arHiddenWeight;\
\
    for( i = 0; i < INPUT; i++ ) {\
        float const ari = arInput[ i ];\
\
        if( ari ) {\
            float *pr = ar;\
\
            if( ari == 1.0f )\
                for( j = HIDDEN; j; j-- )\
                    *pr++ += *prWeight++;\
            else\
                for( j = HIDDEN; j; j-- )\
                    *pr++ += *prWeight++ * ari;\
        } else\
            prWeight += HIDDEN;\
    }\
\
    if( saveAr)\
        memcpy( saveAr, ar, HIDDEN * sizeof( *saveAr));\
\
    for( i = 0; i < HIDDEN; i++ )\
        ar[ i ] = sigmoid( -BETAHIDDEN * ar[ i ] );\
\
    /* Calculate activity at output nodes */\
    prWeight = pnn->arOutputWeight;\
\
    for( i = 0; i < OUTPUT; i++ ) {\
        float r = pnn->arOutputThreshold[ i ];\
\
        for( j = 0; j < HIDDEN; j++ )\
            r += ar[ j ] * *prWeight++;\
\
        /* In Network 0.15, BETAOUTPUT is always 1.0, we do not need the multiplication */\
        /* arOutput[ i ] = sigmoid( -BETAOUTPUT * r ); */\
        arOutput[ i ] = sigmoid( -r );\
    }\
\
    return;

/* ---------------------- */

#define EVALUATE_FROM_BASE_OPTIMIZED(INPUT, HIDDEN, OUTPUT, BETAHIDDEN, BETAOUTPUT) \
    int i, j;\
    float *prWeight;\
\
    /* Calculate activity at hidden nodes */\
    /* for( i = 0; i < HIDDEN; i++ )\
        ar[ i ] = pnn->arHiddenThreshold[ i ]; */\
\
    prWeight = pnn->arHiddenWeight;\
\
    for( i = 0; i < INPUT; ++i ) {\
        float const ari = arInputDif[ i ];\
\
        if( ari ) {\
            float *pr = ar;\
\
            if( ari == 1.0f )\
                for( j = HIDDEN; j; j-- )\
                    *pr++ += *prWeight++;\
            else\
                if(ari == -1.0f)\
                    for(j = HIDDEN; j; j-- ) \
                        *pr++ -= *prWeight++;\
                else\
                    for( j = HIDDEN; j; j-- )\
                        *pr++ += *prWeight++ * ari;\
        } else\
            prWeight += HIDDEN;\
    }\
\
    for( i = 0; i < HIDDEN; i++ )\
        ar[ i ] = sigmoid( -BETAHIDDEN * ar[ i ] );\
\
    /* Calculate activity at output nodes */\
    prWeight = pnn->arOutputWeight;\
\
    for( i = 0; i < OUTPUT; i++ ) {\
        float r = pnn->arOutputThreshold[ i ];\
\
        for( j = 0; j < HIDDEN; j++ )\
            r += ar[ j ] * *prWeight++;\
\
        /* In Network 0.15, BETAOUTPUT is always 1.0, we do not need the multiplication */\
        /* arOutput[ i ] = sigmoid( -BETAOUTPUT * r ); */\
        arOutput[ i ] = sigmoid( -r );\
    }\
\
    return;
------------------------------------------------------------------------
-------------- Listing 10 ----------------------------------------------
---------- Heavily optimizing GCC 3.4 batchfile ------------------------
---------- (tested on cygwin with gcc 3.4.1) ---------------------------
------------------------------------------------------------------------

# gcc (GCC) 3.4.1 (cygming special)
# Copyright (C) 2004 Free Software Foundation, Inc.
# This is free software; see the source for copying conditions.  There is NO
# warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

# http://gcc.gnu.org/onlinedocs/gcc-3.4.3/gcc/i386-and-x86_002d64-Options.html#i386-and-x86_002d64-Options
export OPT_386="-march=i386 -mtune=pentium"
export OPT_SSE="-march=pentium3 -mtune=pentium3 -msse -mfpmath=sse"
export OPT_SSE2="-march=pentium4 -mtune=pentium4 -msse2 -mfpmath=sse"
export OPT_SSE3="-march=prescott -mtune=prescott -msse3 -mfpmath=sse"
export OPT_ALL_386="-maccumulate-outgoing-args -malign-double -fomit-frame-pointer -momit-leaf-frame-pointer"

# http://gcc.gnu.org/onlinedocs/gcc-3.4.3/gcc/Optimize-Options.html#Optimize-Options
export OPT_OPT="-O3 -minline-all-stringops"

# Check whether aggressive float optimization harms NN accuracy
export OPT_FLOAT="-ffast-math"

# http://gcc.gnu.org/onlinedocs/gcc-3.4.3/gcc/Code-Gen-Options.html#Code-Gen-Options
export OPT_GEN="-freg-struct-return"

export DEBUG_INLINE="-fno-inline"

export OPTCFLAGS="$OPT_SSE2 $OPT_ALL_386 $OPT_OPT $OPT_FLOAT $OPT_GEN $DEBUG_INLINE"
export CFLAGS="-I .. -I . -S $OPTCFLAGS"

echo gcc $CFLAGS neuralnet.c
gcc $CFLAGS neuralnet.c

------------------------------------------------------------------------
-------------- Listing 11 ----------------------------------------------
---------- Heavily optimizing ICC 8.1 batchfile ------------------------
---------- (tested on WinXPSP2, VS7.1, ICC8.1.24) ----------------------
------------------------------------------------------------------------

setlocal

rem *** Intel(R) C++ Compiler for 32-bit applications, Version 8.1
rem *** Build 20041019Z Package ID: W_CC_PU_8.1.020
rem *** Copyright (C) 1985-2004 Intel Corporation.  All rights reserved.

set VS=E:\Programme\Microsoft Visual Studio .NET 2003\Vc7\

set OPT=/c /Qvc7.1 /Qlocation,link,"%VS%\bin" /Ot /Oy /GT /G7 /GA /QaxN /QxN /Qparallel /GF /FD /MD /I ".." /I "." /I "%VS%\include" /I "E:\programme\Intel\MKL701\include" /nologo /W1 /D "WIN32" /D "NDEBUG" /D "_LIB" /D "HAVE_CONFIG_H=1" /D "_MBCS" /YX"StdAfx.h" /Fp"..\msdev\Intermediate\Release/lib.pch" /Fo"..\msdev\Intermediate\Release/" /Fd"..\msdev\Intermediate\Release/" /Gd /TC /Qunroll254

rem *** no inlining optimizations so the assembler output stays readable
set INLINE=/Ob2 /Oi
set INLINE=

icl %OPT% %INLINE% /S neuralnet.c

endlocal
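Finally, for anyone who wants to reproduce the comparisons: a tiny stand-alone harness (entirely invented, not gnubg code) to sanity-check a kernel and get a rough timing under the flags from Listings 10/11. Keep in mind that -ffast-math may change the result in the last bits, so compare against a reference sum rather than expecting bit-exact output.

#include <stdio.h>
#include <time.h>

/* Plain C kernel, the same shape as the inner loop of Listing 1. */
static float dot_scalar(const float *a, const float *w, int n)
{
    float r = 0.0f;
    int j;
    for (j = 0; j < n; j++)
        r += a[j] * w[j];
    return r;
}

int main(void)
{
    enum { N = 128, REPS = 1000000 };
    static float a[N], w[N];
    float r = 0.0f;
    int i;
    clock_t t0, t1;

    /* deterministic test data */
    for (i = 0; i < N; i++) {
        a[i] = (float)i / N;
        w[i] = (float)(N - i) / N;
    }

    t0 = clock();
    for (i = 0; i < REPS; i++)
        r += dot_scalar(a, w, N);   /* swap in the SSE kernel to compare */
    t1 = clock();

    printf("r = %f, %.2f s\n", r, (double)(t1 - t0) / CLOCKS_PER_SEC);
    return 0;
}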