Related Links:
http://gcc.gnu.org/onlinedocs/gcc-3.4.3/gcc/index.html
http://gnu.ghks.de/software/gcc/projects/tree-ssa/vectorization.html
http://gcc.gnu.org/onlinedocs/gcc-3.4.3/gcc/Vector-Extensions.html#Vector-Extensions
http://gcc.gnu.org/onlinedocs/gcc-3.4.3/gcc/X86-Built_002din-Functions.html
http://parallel.ru/docs/Intel/c_ug/index.htm
http://www.clifford.at/cfun/gccfeat/
http://ds9a.nl/gcc-simd/

--------------------------------------------------------
---------------- Listing 1: ----------------------------
---------------- The innocent current C code -----------
--------------------------------------------------------

for( i = 0; i < pnn->cOutput; i++ ) {

    float r = pnn->arOutputThreshold[ i ];

    for( j = 0; j < pnn->cHidden; j++ ) /* LOOP WAS VECTORIZED. */
        r += ar[ j ] * *prWeight++;

    arOutput[ i ] = sigmoid( -pnn->rBetaOutput * r );
}

---------------------------------------------------------
---------------- Listing 2: -----------------------------
---------------- gcc 3.4.1 result for 1 -----------------
---------------------------------------------------------

        movl    -16(%ebp), %ecx
        movl    4(%ecx), %edx
        xorl    %eax, %eax
        cmpl    $0, %edx
        jmp     L164
L168:
;;
;; Note that the "ss" variants only use the lowest of the 4 parallel
;; lanes. gnubg does not even try to work in parallel; it uses
;; SSE2 registers as if they were mere x87 ... no comment.
;; Also note it does not even try to align, neither to 4 nor to 16
;; bytes (16 is really fast).
;;
        movss   (%esi), %xmm3
        addl    $4, %esi
        mulss   (%ebx,%eax,4), %xmm3
        addl    $1, %eax
;;; Doh! Look at this, every interim result is stored in memory
        addss   -28(%ebp), %xmm3
        cmpl    %eax, %edx
;; ... and immediately read back! Does it make sense? Not a lot ...
        movss   %xmm3, -28(%ebp)
L164:
        jg      L168
        movl    $LC12, (%esp)
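By the way, alignment can be forced from the C side, which is what makes the fast "movaps"-style access legal in the first place (icc on Windows offers __declspec(align(16)) for the same purpose). A minimal sketch, assuming invented names; this is not gnubg code:

#include <stdlib.h>
#include <stdint.h>

/* Static data: just ask gcc for 16-byte alignment. */
static float arWeightAligned[128] __attribute__ ((aligned (16)));

/* Heap data: over-allocate by 15 bytes and round the pointer up.
 * The original pointer is returned through 'raw' so it can be freed. */
static float *alloc_aligned16(size_t nfloats, void **raw)
{
    *raw = malloc(nfloats * sizeof(float) + 15);
    if (*raw == NULL)
        return NULL;
    return (float *)(((uintptr_t)*raw + 15) & ~(uintptr_t)15);
}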
-----------------------------------------------------------
---------------- Listing 3: -------------------------------
---------------- icc 8.1.24 result for 1 ------------------
-----------------------------------------------------------

;;
;; Loop length zero? Just skip it all.
;;
$B6$47:                         ; Preds $B6$101
        mov       eax, DWORD PTR [ebp+8]                        ;532.18
        mov       edi, DWORD PTR [eax+4]                        ;532.18
        test      edi, edi                                      ;532.2
        jle       $B6$66        ; Prob 2%                       ;532.2
                                ; LOE ebx esi edi
;;
;; Zero storage for the result and check if the loop length is
;; too short for the "big iron" loops. If so, go to a simple,
;; single-step loop.
;; Analyze the loop length and branch to a "suitable" loop:
;;
;; Version 1:    Biggest iron, aligned and unrolled to step 16
;; Version 2:    Second biggest, unaligned and unrolled to step 16
;; Version 1/2': Cleanup for the step 16 versions
;; Version 3:    Single step for trailer (%4) cleanup and small loops
;;
$B6$48:                         ; Preds $B6$47
        xor       eax, eax                                      ;532.2
        mov       DWORD PTR [esp+32], eax                       ;532.2
        cmp       edi, 11                                       ;532.2
        jb        $B6$62        ; Prob 10%                      ;532.2
                                ; LOE ebx esi edi
$B6$49:                         ; Preds $B6$48
        mov       edx, DWORD PTR [ebp+16]                       ;453.59
        mov       ecx, edi                                      ;532.18
        mov       eax, esi                                      ;532.2
        and       eax, 15                                       ;532.2
        je        $B6$54        ; Prob 50%                      ;532.2
                                ; LOE eax edx ecx ebx esi edi
$B6$50:                         ; Preds $B6$49
        test      al, 3                                         ;532.2
        jne       $B6$62        ; Prob 1%                       ;532.2
                                ; LOE eax ebx esi edi
;;
;; Reasoning about which loop to use, based on address alignment of
;; the data and the length of the loop
;;
$B6$51:                         ; Preds $B6$50
        mov       ecx, DWORD PTR [ebp+16]                       ;532.2
        movss     xmm1, DWORD PTR [esp+48]                      ;532.2
        mov       DWORD PTR [esp+44], edi                       ;532.2
        mov       DWORD PTR [esp+40], ebx                       ;532.2
        mov       ebx, DWORD PTR [esp+32]                       ;532.2
        neg       eax                                           ;532.2
        lea       edx, DWORD PTR [eax+16]                       ;532.2
        mov       DWORD PTR [esp+36], edx                       ;532.2
        lea       edx, DWORD PTR [ecx+eax+16]                   ;532.2
        mov       eax, DWORD PTR [esp+36]                       ;532.2
        shr       eax, 2                                        ;532.2
        mov       ecx, edi                                      ;532.2
        sub       ecx, eax                                      ;532.2
        mov       DWORD PTR [esp+36], eax                       ;532.2
        mov       edi, eax                                      ;532.2
        mov       eax, DWORD PTR [ebp+16]                       ;532.2
;;
;; Pre-loop for alignment
;;
        ALIGN     4
                                ; LOE eax edx ecx ebx esi edi xmm1
$B6$52:                         ; Preds $B6$52 $B6$51
        movss     xmm0, DWORD PTR [eax+ebx*4]                   ;533.11
        mulss     xmm0, DWORD PTR [esi+ebx*4]                   ;533.22
        add       ebx, 1                                        ;532.2
        cmp       ebx, edi                                      ;532.2
        addss     xmm1, xmm0                                    ;533.6
        jb        $B6$52        ; Prob 90%                      ;532.2
                                ; LOE eax edx ecx ebx esi edi xmm1
$B6$53:                         ; Preds $B6$52
        mov       edi, DWORD PTR [esp+44]                       ;
        movss     DWORD PTR [esp+48], xmm1                      ;
        mov       DWORD PTR [esp+32], ebx                       ;
        mov       ebx, DWORD PTR [esp+40]                       ;
                                ; LOE edx ecx ebx esi edi
;;
;; We now know we need the "big iron" loops. Now check if we can
;; use a 16-byte aligned version (perfect!) or have to run unaligned.
;;
$B6$54:                         ; Preds $B6$53 $B6$49
        movss     xmm1, DWORD PTR [esp+48]                      ;533.6
        and       ecx, 7                                        ;532.2
        neg       ecx                                           ;532.2
        add       ecx, edi                                      ;532.2
        test      dl, 15                                        ;532.2
        pxor      xmm0, xmm0                                    ;533.6
        movss     xmm0, xmm1                                    ;533.6
        pxor      xmm1, xmm1                                    ;533.6
        jne       $B6$58        ; Prob 50%                      ;532.2
;;
;; Crunching loop #1: 2-fold unrolled and using "ps" variants, thus
;; 16 floats are crunched each loop (both multiply and sum). It's
;; all done in registers, memory access uses sophisticated addressing.
;;
;; Note the "movaps" access, so when we are here, we are sure that all
;; addresses are 16-byte aligned and we run at maximum speed (a for aligned).
;;
                                ; LOE ecx ebx esi edi xmm0 xmm1
$B6$55:                         ; Preds $B6$54
        mov       edx, DWORD PTR [esp+32]                       ;
        mov       eax, DWORD PTR [ebp+16]                       ;
        ALIGN     4
                                ; LOE eax edx ecx ebx esi edi xmm0 xmm1
$B6$56:                         ; Preds $B6$56 $B6$55
        movaps    xmm2, XMMWORD PTR [eax+edx*4]                 ;533.11
        movaps    xmm3, XMMWORD PTR [eax+edx*4+16]              ;533.11
        mulps     xmm2, XMMWORD PTR [esi+edx*4]                 ;533.22
        mulps     xmm3, XMMWORD PTR [esi+edx*4+16]              ;533.22
        addps     xmm0, xmm2                                    ;533.6
        addps     xmm1, xmm3                                    ;533.6
        add       edx, 8                                        ;532.2
        cmp       edx, ecx                                      ;532.2
        jb        $B6$56        ; Prob 97%                      ;532.2
                                ; LOE eax edx ecx ebx esi edi xmm0 xmm1
$B6$57:                         ; Preds $B6$56
        mov       DWORD PTR [esp+32], edx                       ;
        jmp       $B6$61        ; Prob 100%                     ;
        ALIGN     4
;;
;; Crunching loop #2:
;; Same construct as #1, but does not take aligned addresses for
;; granted ("movups", u for unaligned).
;;
                                ; LOE edx ebx esi edi dl dh xmm0 xmm1
$B6$58:                         ; Preds $B6$54
        mov       edx, DWORD PTR [esp+32]                       ;
        mov       eax, DWORD PTR [ebp+16]                       ;
;; Note, btw, how carefully icc aligns all instructions. gcc does not
;; really care ...
        ALIGN     4
                                ; LOE eax edx ecx ebx esi edi xmm0 xmm1
$B6$59:                         ; Preds $B6$59 $B6$58
        movups    xmm2, XMMWORD PTR [eax+edx*4]                 ;533.11
        movups    xmm3, XMMWORD PTR [eax+edx*4+16]              ;533.11
        mulps     xmm2, XMMWORD PTR [esi+edx*4]                 ;533.22
        mulps     xmm3, XMMWORD PTR [esi+edx*4+16]              ;533.22
        addps     xmm0, xmm2                                    ;533.6
        addps     xmm1, xmm3                                    ;533.6
        add       edx, 8                                        ;532.2
        cmp       edx, ecx                                      ;532.2
        jb        $B6$59        ; Prob 97%                      ;532.2
                                ; LOE eax edx ecx ebx esi edi xmm0 xmm1
$B6$60:                         ; Preds $B6$59
        mov       DWORD PTR [esp+32], edx                       ;
                                ; LOE edx ebx esi edi dl dh xmm0 xmm1
;;
;; "Big iron" crunch loop aftermath: transfer the results to memory
;;
$B6$61:                         ; Preds $B6$60 $B6$57
        addps     xmm0, xmm1                                    ;533.6
        mov       eax, edx                                      ;532.2
        cmp       eax, edi                                      ;532.2
        movaps    xmm1, xmm0                                    ;532.2
        movhlps   xmm1, xmm1                                    ;532.2
        addps     xmm0, xmm1                                    ;533.6
        movaps    xmm2, xmm0                                    ;532.2
        shufps    xmm2, xmm2, 1                                 ;532.2
        addss     xmm0, xmm2                                    ;533.6
        movss     DWORD PTR [esp+48], xmm0                      ;532.2
        jae       $B6$65        ; Prob 10%                      ;532.2
                                ; LOE ebx esi edi
;;
;; Crunching loop #3:
;; Single-float loop ("ss") to handle the last (<=3) numbers, if any,
;; cleaning up what was not suited for the heavy crunchers. Please note
;; that this loop is essentially what gcc produces for the whole job,
;; but faster, as it is all register-based and does not access memory
;; on every iteration ...
;;
$B6$62:                         ; Preds $B6$50 $B6$48 $B6$61
        movss     xmm1, DWORD PTR [esp+48]                      ;
        mov       edx, DWORD PTR [esp+32]                       ;
        mov       eax, DWORD PTR [ebp+16]                       ;
        ALIGN     4
                                ; LOE eax edx ebx esi edi xmm1
$B6$63:                         ; Preds $B6$63 $B6$62
        movss     xmm0, DWORD PTR [eax+edx*4]                   ;533.11
        mulss     xmm0, DWORD PTR [esi+edx*4]                   ;533.22
        add       edx, 1                                        ;532.2
        cmp       edx, edi                                      ;532.2
        addss     xmm1, xmm0                                    ;533.6
        jb        $B6$63        ; Prob 90%                      ;532.2
                                ; LOE eax edx ebx esi edi xmm1
;;
;; Done, store the result in memory
;;
$B6$64:                         ; Preds $B6$63
        movss     DWORD PTR [esp+48], xmm1                      ;
                                ; LOE ebx esi edi
$B6$65:                         ; Preds $B6$61 $B6$64
[...]
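Putting the commentary above into one picture: in plain C, the loop versioning icc generates corresponds roughly to the sketch below. This is a hedged reconstruction with invented names, not decompiled icc output; the real code additionally keeps two partial sums (xmm0/xmm1) to hide latency and splits the main loop into the aligned (movaps) and unaligned (movups) variants shown above.

#include <stdint.h>

float dot_versioned(const float *a, const float *w, int n)
{
    float r = 0.0f;
    int j = 0;

    if (n >= 11) {  /* "big iron" threshold, the "cmp edi, 11" above */
        /* Pre-loop: single steps until 'a' is 16-byte aligned.
         * (If 'a' is not even 4-byte aligned, icc bails out to the
         * scalar loop right away: "test al, 3".) */
        while (j < n && (((uintptr_t)(a + j)) & 15) != 0) {
            r += a[j] * w[j];
            j++;
        }
        /* Main loop: 8 floats per iteration, as in loops #1 and #2. */
        for (; j + 8 <= n; j += 8) {
            int k;
            for (k = 0; k < 8; k++)
                r += a[j + k] * w[j + k];
        }
    }

    /* Crunching loop #3: scalar cleanup for whatever remains. */
    for (; j < n; j++)
        r += a[j] * w[j];

    return r;
}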
-----------------------------------------------------------------------
---------------- Listing 4: -------------------------------------------
------------ GCC code using SSE intrinsics (not perfect) --------------
------------ See Listing 6 for an improved version in ICC syntax ------
-----------------------------------------------------------------------

;;;
;;; This is equivalent to
;;;
for( i = 0; i < pnn->cOutput; i++ ) {

    float r = pnn->arOutputThreshold[ i ];

    for( j = 0; j < pnn->cHidden; j++ ) /* LOOP WAS VECTORIZED. */
        r += ar[ j ] * *prWeight++;

    arOutput[ i ] = sigmoid( -pnn->rBetaOutput * r );
}
;;;
;;; End of quoted equivalent
;;;

#if defined(__GNUC_MINOR__) && (__GNUC__ >= 3) && (__GNUC_MINOR__ >= 1)

typedef int v4sf __attribute__ ((mode(V4SF))); // vector of four single floats

for( i = 0; i < pnn->cOutput; i++ ) {

    float r = pnn->arOutputThreshold[ i ];

    for( j = 0; j < pnn->cHidden; j++ ) { /* LOOP WAS VECTORIZED. */

        // Part 0: Calculate main and post-loop
        int rest = pnn->cHidden - pnn->cHidden%4;
        float *armainloopEnd = &ar[j] + (rest);
        int k;

        {
            float result[4];
            /* the vi are used much like the 8 xmmN registers */
            // Clear result to (0, 0, 0, 0)
            v4sf v2 = __builtin_ia32_xorps (v2, v2);

            // Part 1
            // Vector strip-mined loop (main loop, step 4)
            for (/*NOP*/ ; /*NOP*/ ; ar += 4, prWeight += 4 ) {
                v4sf v0, v1, v3;
                if (ar == armainloopEnd) {
                    __builtin_ia32_storeups((float *) &result, v2);
                    break;
                }
                v0 = __builtin_ia32_loadups(ar);
                v1 = __builtin_ia32_loadups(prWeight);
                v3 = __builtin_ia32_mulps(v0, v1);
                v2 = __builtin_ia32_addps(v2, v3);
            }
            r = r + result[0] + result[1] + result[2] + result[3];

            // Part 2
            // Scalar clean-up loop (step 1)
            for (k = rest ; k < pnn->cHidden ; k++ ) {
                r += ar[k] * *prWeight++;
            }
        }
    }

    arOutput[ i ] = sigmoid( -pnn->rBetaOutput * r );
}
#endif
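A hedged aside: instead of the raw __builtin_ia32_* calls, gcc also accepts the Intel-style intrinsics from <xmmintrin.h> (with -msse), which keeps the source portable between gcc and icc and maps to the same movups/mulps/addps instructions. Part 1 of the listing above would then read roughly like this sketch (function name and parameters invented for illustration):

#include <xmmintrin.h>

/* Strip-mined main loop from Listing 4, portable intrinsics version.
 * 'rest' is assumed to be a multiple of 4, as in Part 0 above. */
static float dot_part1(const float *ar, const float *prWeight, int rest)
{
    float result[4];
    __m128 v2 = _mm_setzero_ps();   /* no self-xor of an uninitialized
                                       variable needed here */
    const float *end = ar + rest;

    for ( ; ar != end; ar += 4, prWeight += 4)
        v2 = _mm_add_ps(v2, _mm_mul_ps(_mm_loadu_ps(ar),
                                       _mm_loadu_ps(prWeight)));

    _mm_storeu_ps(result, v2);
    return result[0] + result[1] + result[2] + result[3];
}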
----------------------------------------------------------------------
---------------- Listing 5: ------------------------------------------
----------------- GCC 3.4.1 assembly output for 4 --------------------
----------------------------------------------------------------------

[...]
;; for( j = 0; j < pnn->cHidden; j++ ) { /* LOOP WAS VECTORIZED. */
        movl    -44(%ebp), %eax
        movl    $0, -52(%ebp)
        movl    8(%eax), %edi
        testl   %edi, %edi
        jle     L149
L168:
        movl    -44(%ebp), %edx         ;; for( j = 0; j < pnn->cHidden; j++ ) {
        movl    -52(%ebp), %eax
        movl    40(%edx), %edi
        movl    4(%edx), %edx           ; pnn->cHidden
;;; Note: gcc is not totally hopeless, it keeps "float r"
;;; in a register all along the loop. Yipeee!
        movss   (%edi,%eax,4), %xmm2    ; float r = pnn->arOutputThreshold[ i ];
        xorl    %edi, %edi              ; j = 0
        cmpl    $0, %edx                ; pnn->cHidden == 0 ?
        jle     L151                    ; if so, exit inner loop
L196:
        leal    3(%edx), %eax
        cmpl    $-1, %edx
        movl    %edx, %ecx
        cmovle  %eax, %ecx
        movdqa  -72(%ebp), %xmm1
        andl    $-4, %ecx
        xorps   %xmm1, %xmm1
        leal    (%edi,%ecx), %eax
        movaps  %xmm1, -72(%ebp)
        leal    (%ebx,%eax,4), %eax
        jmp     L101
;;; Hand-knitted crunch loop, not perfect, but at least it does
;;; 4 floats in parallel. Could easily be made step 16 etc.
;;; by simply unrolling it.
L104:                                   ; Crunch loop (intrinsics)
        movups  (%ebx), %xmm5           ; v0 = __builtin_ia32_loadups(ar);
        movups  (%esi), %xmm6           ; v1 = __builtin_ia32_loadups(prWeight);
        movaps  -72(%ebp), %xmm4
        mulps   %xmm6, %xmm5            ; v3 = __builtin_ia32_mulps(v0, v1);
        addl    $16, %ebx               ; ar += 4, prWeight += 4
        addl    $16, %esi
        addps   %xmm5, %xmm4            ; v2 = __builtin_ia32_addps(v2, v3);
        movaps  %xmm4, -72(%ebp)
L101:
        cmpl    %eax, %ebx              ; for (/*NOP*/ ; /*NOP*/ ; ar += 4, prWeight += 4 ) {
        jne     L104
        movaps  -72(%ebp), %xmm3
        movl    %ecx, %eax
        movups  %xmm3, -40(%ebp)        ; __builtin_ia32_storeups((float *) &result, v2);
        addss   -40(%ebp), %xmm2        ; r = r + result[0] + result[1] + result[2] + result[3];
        addss   -36(%ebp), %xmm2
        addss   -32(%ebp), %xmm2
        addss   -28(%ebp), %xmm2
        jmp     L105
;;; Note gcc even without the intrinsics hint uses SSE math for the scalar
;;; loop, however in "ss" mode. It surely has to here :)
L108:
        movss   (%esi), %xmm7
        addl    $4, %esi
        mulss   (%ebx,%eax,4), %xmm7    ; ar[k] * *prWeight++;
        addl    $1, %eax
        addss   %xmm7, %xmm2            ; r += ^
L105:
        cmpl    %eax, %edx
        jg      L108                    ; for (k = rest ; k < pnn->cHidden ; k++ ) {
        addl    $1, %edi
        cmpl    %edi, %edx
        jg      L196
;; Exit of inner [j] loop, sigmoid to follow
L151:
        movl    -44(%ebp), %eax
        movss   24(%eax), %xmm0         ; pnn->rBetaOutput
        xorps   LC9, %xmm0              ; -pnn->rBetaOutput
        mulss   %xmm2, %xmm0            ; this is still r, isn't it amazing
        movss   %xmm0, (%esp)
        call    _sigmoid_original

----------------------------------------------------------------------
---------------- Listing 6: ------------------------------------------
----------------- ICC 8.1.24 code (intrinsics) -----------------------
------------------ (Inner loop only [j]) -----------------------------
----------------------------------------------------------------------

for( j = 0; j < pnn->cHidden; j++ ) {

    /* r += ar[ j ] * *prWeight++; */

    __m128 vec0, vec1, vec2;
    const int k = 0;

    vec0 = _mm_load_ps ((float *) ar);
    vec1 = _mm_load_ps ((float *) prWeight);
    vec2 = _mm_setzero_ps();

    // loop is fully unrolled for the N=128 case in this example
    // #pragma vector aligned (not needed if we are unrolled)
    // for (k = 0; k < pnn->cHidden/4; k += 16) { /* 128/4 = 32, OK! */
    //     vec0 = ;
    //     vec1 = ;

    vec0 = _mm_mul_ps(_mm_load_ps(ar + k*4), _mm_load_ps(prWeight + k*4));
    vec2 = _mm_add_ps(vec2, vec0);

    vec0 = _mm_load_ps(ar + (k+1)*4);
    vec1 = _mm_load_ps(prWeight + (k+1)*4);
    vec0 = _mm_mul_ps(vec0, vec1);
    vec2 = _mm_add_ps(vec2, vec0);

    vec0 = _mm_load_ps(ar + (k+2)*4);
    vec1 = _mm_load_ps(prWeight + (k+2)*4);
    vec0 = _mm_mul_ps(vec0, vec1);
    vec2 = _mm_add_ps(vec2, vec0);

    vec0 = _mm_load_ps(ar + (k+3)*4);
    vec1 = _mm_load_ps(prWeight + (k+3)*4);
    vec0 = _mm_mul_ps(vec0, vec1);
    vec2 = _mm_add_ps(vec2, vec0);

    vec0 = _mm_load_ps(ar + (k+4)*4);
    vec1 = _mm_load_ps(prWeight + (k+4)*4);
    vec0 = _mm_mul_ps(vec0, vec1);
    vec2 = _mm_add_ps(vec2, vec0);

    vec0 = _mm_load_ps(ar + (k+5)*4);
    vec1 = _mm_load_ps(prWeight + (k+5)*4);
    vec0 = _mm_mul_ps(vec0, vec1);
    vec2 = _mm_add_ps(vec2, vec0);

    vec0 = _mm_load_ps(ar + (k+6)*4);
    vec1 = _mm_load_ps(prWeight + (k+6)*4);
    vec0 = _mm_mul_ps(vec0, vec1);
    vec2 = _mm_add_ps(vec2, vec0);

    vec0 = _mm_load_ps(ar + (k+7)*4);
    vec1 = _mm_load_ps(prWeight + (k+7)*4);
    vec0 = _mm_mul_ps(vec0, vec1);
    vec2 = _mm_add_ps(vec2, vec0);

    // }

    /* r      = a       b       c       d
       swapLo = b       a       d       c
       sumLo  = a+b     b+a     c+d     d+c
       swapHi = c+d     c+d     a+b     a+b
       sum    = 4 copies of a+b+c+d
       input is vec2, aux are vec1 & vec0 */

    /* __m128 swapLo */ vec0 = _mm_shuffle_ps(vec2, vec2, _MM_SHUFFLE(2,3,0,1));
    /* __m128 sumLo  */ vec1 = _mm_add_ps(vec2, vec0);
    /* __m128 swapHi */ vec0 = _mm_shuffle_ps(vec1, vec1, _MM_SHUFFLE(1,1,3,3));
    /* __m128 sum    */ vec2 = _mm_add_ps(vec1, vec0);
    _mm_store_ss (&r, vec2);
}

arOutput[ i ] = sigmoid( -pnn->rBetaOutput * r );
}
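For readers who want to experiment outside the gnubg context, here is the same idea as a minimal, self-contained sketch (function name dot_sse invented): the main loop is not unrolled, but the shuffle-based horizontal sum at the end is exactly the one from Listing 6, and it shows up as "shufps ..., 177" and "shufps ..., 95" in Listing 7 below.

#include <xmmintrin.h>

/* Generalized form of Listing 6's kernel. Assumptions: 'a' and 'w'
 * are 16-byte aligned and 'n' is a multiple of 4. */
static float dot_sse(const float *a, const float *w, int n)
{
    __m128 acc = _mm_setzero_ps();
    __m128 t;
    float r;
    int k;

    for (k = 0; k < n; k += 4)
        acc = _mm_add_ps(acc, _mm_mul_ps(_mm_load_ps(a + k),
                                         _mm_load_ps(w + k)));

    /* Horizontal sum, all in registers: first swap within the pairs
     * (imm 177), then swap the pairs themselves (imm 95). */
    t = _mm_add_ps(acc, _mm_shuffle_ps(acc, acc, _MM_SHUFFLE(2,3,0,1)));
    t = _mm_add_ps(t,   _mm_shuffle_ps(t,   t,   _MM_SHUFFLE(1,1,3,3)));
    _mm_store_ss(&r, t);
    return r;
}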
----------------------------------------------------------------------
---------------- Listing 7: ------------------------------------------
----------------- ICC 8.1.24 code (result assembly for 6) ------------
------------------ (Inner loop only [j]) -----------------------------
----------------------------------------------------------------------

$B6$46:                         ; Preds $B6$45
        mov       edx, DWORD PTR [esp+44]                       ;
        mov       ecx, DWORD PTR [ebp+16]                       ;
        ALIGN     4
                                ; LOE eax edx ecx ebx esi edi
$B6$47:                         ; Preds $B6$48 $B6$46
        movaps    xmm0, XMMWORD PTR [ecx]                       ;552.34
        movaps    xmm1, XMMWORD PTR [ecx+16]                    ;554.23
        movaps    xmm2, XMMWORD PTR [ecx+32]                    ;558.23
        mulps     xmm0, XMMWORD PTR [edx]                       ;552.11
        mulps     xmm1, XMMWORD PTR [edx+16]                    ;556.11
        mulps     xmm2, XMMWORD PTR [edx+32]                    ;560.11
        movaps    xmm3, XMMWORD PTR [ecx+48]                    ;562.23
        movaps    xmm4, XMMWORD PTR [ecx+64]                    ;566.23
        movaps    xmm5, XMMWORD PTR [ecx+80]                    ;570.23
        mulps     xmm3, XMMWORD PTR [edx+48]                    ;564.11
        mulps     xmm4, XMMWORD PTR [edx+64]                    ;568.11
        mulps     xmm5, XMMWORD PTR [edx+80]                    ;572.11
        movaps    xmm6, XMMWORD PTR [ecx+96]                    ;574.23
        movaps    xmm7, XMMWORD PTR [ecx+112]                   ;578.23
        mulps     xmm6, XMMWORD PTR [edx+96]                    ;576.11
        mulps     xmm7, XMMWORD PTR [edx+112]                   ;580.11
        addps     xmm0, xmm1                                    ;557.11
        addps     xmm0, xmm2                                    ;561.11
        addps     xmm0, xmm3                                    ;565.11
        addps     xmm0, xmm4                                    ;569.11
        addps     xmm0, xmm5                                    ;573.11
        addps     xmm0, xmm6                                    ;577.11
        addps     xmm0, xmm7                                    ;581.11
        movaps    xmm1, xmm0                                    ;590.31
        shufps    xmm1, xmm1, 177                               ;590.31
        addps     xmm0, xmm1                                    ;591.30
        movaps    xmm1, xmm0                                    ;592.30
        shufps    xmm1, xmm1, 95                                ;592.30
        addps     xmm0, xmm1                                    ;593.27
        movss     DWORD PTR [esp+64], xmm0                      ;594.3
                                ; LOE eax edx ecx ebx esi edi
$B6$48:                         ; Preds $B6$47
        add       eax, 1                                        ;539.32
        cmp       eax, DWORD PTR [edi+4]                        ;539.2
        jl        $B6$47        ; Prob 90%                      ;539.2

------------------------------------------------------------------------
-------------- Listing 8 -----------------------------------------------
---------- Optimization attempt using fixed-length loops ---------------
---------- Body in Evaluate --------------------------------------------
------------------------------------------------------------------------

switch (pnn->cInput) {
case 250: {
    /* assert(pnn->cInput == 250 && pnn->cHidden == 128 && pnn->cOutput == 5
       && pnn->rBetaHidden == 0.1f && pnn->rBetaOutput == 1.0f); */
    EVALUATE_OPTIMIZED_255(250, 128, 5, 0.1, 1.0)
}
case 214: {
    /* assert(pnn->cInput == 214 && pnn->cHidden == 128 && pnn->cOutput == 5
       && pnn->rBetaHidden == 0.1f && pnn->rBetaOutput == 1.0f); */
    EVALUATE_OPTIMIZED(214, 128, 5, 0.1, 1.0)
}
case 200: {
    switch (pnn->cHidden) {
    case 5: {
        /* assert(pnn->cInput == 200 && pnn->cHidden == 5 && pnn->cOutput == 5
           && pnn->rBetaHidden == 0.1f && pnn->rBetaOutput == 1.0f); */
        EVALUATE_OPTIMIZED(200, 5, 5, 0.1, 1.0)
    }
    case 10: {
        /* assert(pnn->cInput == 200 && pnn->cHidden == 10 && pnn->cOutput == 5
           && pnn->rBetaHidden == 0.1f && pnn->rBetaOutput == 1.0f); */
        EVALUATE_OPTIMIZED(200, 10, 5, 0.1, 1.0)
    }
    }
}
default: {
    EVALUATE_OPTIMIZED(pnn->cInput, pnn->cHidden, pnn->cOutput,
                       pnn->rBetaHidden, pnn->rBetaOutput)
}
}
[...]
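The point of this dispatch (the macros themselves follow in Listing 9): when the trip counts are compile-time constants, the optimizer can fully unroll and vectorize each specialization, which is also what /Qunroll254 in Listing 11 plays into. A stripped-down sketch of the same idea, with all names invented rather than taken from gnubg:

/* Constant bounds let the compiler unroll the loop completely;
 * the default case keeps a generic fallback for odd sizes. */
#define DOT_FIXED(N, a, w, r)            \
    do {                                 \
        int j_;                          \
        for (j_ = 0; j_ < (N); j_++)     \
            (r) += (a)[j_] * (w)[j_];    \
    } while (0)

float evaluate_hidden(const float *a, const float *w, int cHidden)
{
    float r = 0.0f;
    switch (cHidden) {
    case 128: DOT_FIXED(128, a, w, r); break;  /* fully unrollable */
    case   5: DOT_FIXED(  5, a, w, r); break;
    default:  DOT_FIXED(cHidden, a, w, r); break;
    }
    return r;
}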
------------------------------------------------------------------------
-------------- Listing 9 -----------------------------------------------
---------- Optimization attempt using fixed-length loops ---------------
---------- Header with templates ---------------------------------------
------------------------------------------------------------------------

/*
 * neuralnet_optimized_weights_015.c
 *
 * by Ingo Macherius 20050222
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * $Id$
 */

// EVALUATE_OPTIMIZED(250, 128, 5, 0.1, 1.0)

#define EVALUATE_OPTIMIZED(INPUT, HIDDEN, OUTPUT, BETAHIDDEN, BETAOUTPUT) \
    int i, j;\
    float *prWeight;\
\
    /* Calculate activity at hidden nodes */\
    for( i = 0; i < HIDDEN; i++ )\
        ar[ i ] = pnn->arHiddenThreshold[ i ];\
\
    prWeight = pnn->arHiddenWeight;\
\
    for( i = 0; i < INPUT; i++ ) {\
        float const ari = arInput[ i ];\
\
        if( ari ) {\
            float *pr = ar;\
\
            if( ari == 1.0f )\
                for( j = HIDDEN; j; j-- )\
                    *pr++ += *prWeight++;\
            else\
                for( j = HIDDEN; j; j-- )\
                    *pr++ += *prWeight++ * ari;\
        } else\
            prWeight += HIDDEN;\
    }\
\
    if( saveAr)\
        memcpy( saveAr, ar, HIDDEN * sizeof( *saveAr));\
\
    for( i = 0; i < HIDDEN; i++ )\
        ar[ i ] = sigmoid( -BETAHIDDEN * ar[ i ] );\
\
    /* Calculate activity at output nodes */\
    prWeight = pnn->arOutputWeight;\
\
    for( i = 0; i < OUTPUT; i++ ) {\
        float r = pnn->arOutputThreshold[ i ];\
\
        for( j = 0; j < HIDDEN; j++ )\
            r += ar[ j ] * *prWeight++;\
\
        /* In Network 0.15, BETAOUTPUT is always 1.0, we do not need the multiplication */\
        /* arOutput[ i ] = sigmoid( -BETAOUTPUT * r ); */\
        arOutput[ i ] = sigmoid( -r );\
    }\
\
    return;

/* ---------------------- */

#define EVALUATE_FROM_BASE_OPTIMIZED(INPUT, HIDDEN, OUTPUT, BETAHIDDEN, BETAOUTPUT) \
    int i, j;\
    float *prWeight;\
\
    /* Calculate activity at hidden nodes */\
    /* for( i = 0; i < HIDDEN; i++ )\
        ar[ i ] = pnn->arHiddenThreshold[ i ]; */\
\
    prWeight = pnn->arHiddenWeight;\
\
    for( i = 0; i < INPUT; ++i ) {\
        float const ari = arInputDif[ i ];\
\
        if( ari ) {\
            float *pr = ar;\
\
            if( ari == 1.0f )\
                for( j = HIDDEN; j; j-- )\
                    *pr++ += *prWeight++;\
            else\
                if(ari == -1.0f)\
                    for(j = HIDDEN; j; j-- ) \
                        *pr++ -= *prWeight++;\
                else\
                    for( j = HIDDEN; j; j-- )\
                        *pr++ += *prWeight++ * ari;\
        } else\
            prWeight += HIDDEN;\
    }\
\
    for( i = 0; i < HIDDEN; i++ )\
        ar[ i ] = sigmoid( -BETAHIDDEN * ar[ i ] );\
\
    /* Calculate activity at output nodes */\
    prWeight = pnn->arOutputWeight;\
\
    for( i = 0; i < OUTPUT; i++ ) {\
        float r = pnn->arOutputThreshold[ i ];\
\
        for( j = 0; j < HIDDEN; j++ )\
            r += ar[ j ] * *prWeight++;\
\
        /* In Network 0.15, BETAOUTPUT is always 1.0, we do not need the multiplication */\
        /* arOutput[ i ] = sigmoid( -BETAOUTPUT * r ); */\
        arOutput[ i ] = sigmoid( -r );\
    }\
\
    return;
------------------------------------------------------------------------
-------------- Listing 10 ----------------------------------------------
---------- Heavily optimizing GCC 3.4 batchfile ------------------------
---------- (tested on cygwin with gcc 3.4.1) ---------------------------
------------------------------------------------------------------------

# gcc (GCC) 3.4.1 (cygming special)
# Copyright (C) 2004 Free Software Foundation, Inc.
# This is free software; see the source for copying conditions.  There is NO
# warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

# http://gcc.gnu.org/onlinedocs/gcc-3.4.3/gcc/i386-and-x86_002d64-Options.html#i386-and-x86_002d64-Options
export OPT_386="-march=i386 -mtune=pentium"
export OPT_SSE="-march=pentium3 -mtune=pentium3 -msse -mfpmath=sse"
export OPT_SSE2="-march=pentium4 -mtune=pentium4 -msse2 -mfpmath=sse"
export OPT_SSE3="-march=prescott -mtune=prescott -msse3 -mfpmath=sse"
export OPT_ALL_386="-maccumulate-outgoing-args -malign-double -fomit-frame-pointer -momit-leaf-frame-pointer"

# http://gcc.gnu.org/onlinedocs/gcc-3.4.3/gcc/Optimize-Options.html#Optimize-Options
export OPT_OPT="-O3 -minline-all-stringops"

# Check whether aggressive float optimization harms NN accuracy
export OPT_FLOAT="-ffast-math"

# http://gcc.gnu.org/onlinedocs/gcc-3.4.3/gcc/Code-Gen-Options.html#Code-Gen-Options
export OPT_GEN="-freg-struct-return"

export DEBUG_INLINE="-fno-inline"

export OPTCFLAGS="$OPT_SSE2 $OPT_ALL_386 $OPT_OPT $OPT_FLOAT $OPT_GEN $DEBUG_INLINE"
export CFLAGS="-I .. -I . -S $OPTCFLAGS"

echo gcc $CFLAGS neuralnet.c
gcc $CFLAGS neuralnet.c

------------------------------------------------------------------------
-------------- Listing 11 ----------------------------------------------
---------- Heavily optimizing ICC 8.1 batchfile ------------------------
---------- (tested on WinXPSP2, VS7.1, ICC8.1.24) ----------------------
------------------------------------------------------------------------

setlocal

rem *** Intel(R) C++ Compiler for 32-bit applications, Version 8.1
rem *** Build 20041019Z Package ID: W_CC_PU_8.1.020
rem *** Copyright (C) 1985-2004 Intel Corporation.  All rights reserved.

set VS=E:\Programme\Microsoft Visual Studio .NET 2003\Vc7\

set OPT=/c /Qvc7.1 /Qlocation,link,"%VS%\bin" /Ot /Oy /GT /G7 /GA /QaxN /QxN /Qparallel /GF /FD /MD /I ".." /I "." /I "%VS%\include" /I "E:\programme\Intel\MKL701\include" /nologo /W1 /D "WIN32" /D "NDEBUG" /D "_LIB" /D "HAVE_CONFIG_H=1" /D "_MBCS" /YX"StdAfx.h" /Fp"..\msdev\Intermediate\Release/lib.pch" /Fo"..\msdev\Intermediate\Release/" /Fd"..\msdev\Intermediate\Release/" /Gd /TC /Qunroll254

rem *** no inlining optimizations so the assembler output stays readable
set INLINE=/Ob2 /Oi
set INLINE=

icl %OPT% %INLINE% /S neuralnet.c

endlocal
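Finally, for anyone who wants to reproduce the comparisons: a tiny stand-alone harness (entirely invented, not gnubg code) to sanity-check a kernel and get a rough timing under the flags from Listings 10/11. Keep in mind that -ffast-math may change the result in the last bits, so compare against a reference sum rather than expecting bit-exact output.

#include <stdio.h>
#include <time.h>

/* Plain C kernel, the same shape as the inner loop of Listing 1. */
static float dot_scalar(const float *a, const float *w, int n)
{
    float r = 0.0f;
    int j;
    for (j = 0; j < n; j++)
        r += a[j] * w[j];
    return r;
}

int main(void)
{
    enum { N = 128, REPS = 1000000 };
    static float a[N], w[N];
    float r = 0.0f;
    int i;
    clock_t t0, t1;

    /* deterministic test data */
    for (i = 0; i < N; i++) {
        a[i] = (float)i / N;
        w[i] = (float)(N - i) / N;
    }

    t0 = clock();
    for (i = 0; i < REPS; i++)
        r += dot_scalar(a, w, N);   /* swap in the SSE kernel to compare */
    t1 = clock();

    printf("r = %f, %.2f s\n", r, (double)(t1 - t0) / CLOCKS_PER_SEC);
    return 0;
}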