bug-gnulib
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: ffsl: optimization for non-GCC


From: Bruno Haible
Subject: Re: ffsl: optimization for non-GCC
Date: Fri, 14 Oct 2011 22:24:50 +0200
User-agent: KMail/1.13.6 (Linux/2.6.37.6-0.5-desktop; KDE/4.6.0; x86_64; ; )

> 2011-10-13  Bruno Haible  <address@hidden>
> 
>       ffsl: Optimize on 32-bit platforms.
>       * lib/ffsl.h (FUNC): If TYPE has the same representation as 'int', just
>       use ffs() without a loop.

The other frequent case is that sizeof (TYPE) == 2 * sizeof (int).
This means, the loop is executed just twice. And in the last loop
round, we are making an unnecessary test: If j != 0 and the low 32 bits
of j are known to be 0, we know that the high 32 bits of j must be non-zero.
We don't need to test that.

So I
  - introduced a variable k that counts the number of loop rounds,
    so as to avoid the non-zero test in the last loop round,
  - unrolled the loop, extracting the first and last loop round,
    letting the compiler realize that the middle part of the loop
    is empty code,
  - realized that this code also works when sizeof (TYPE) == sizeof (int),
  - removed the workaround for the gcc warning, that no longer exists.

On MacOS X in 32-bit mode, disabling the GCC built-ins, the code for
ffsll.c got simplified from

=================================================================
.globl _ffsll
_ffsll:
        pushl   %esi
        movl    12(%esp), %ecx
        xorl    %eax, %eax
        movl    8(%esp), %edx
        movl    %ecx, %esi
        orl     %edx, %esi
        je      L6
        xorl    %esi, %esi
        testl   %edx, %edx
        movl    %edx, %eax
        jne     L5
        .align 4,0x90
L8:
        movl    %ecx, %edx
        addl    $32, %esi
        xorl    %ecx, %ecx
        testl   %edx, %edx
        movl    %edx, %eax
        je      L8
L5:
        bsfl    %eax, %eax
        movl    $-1, %edx
        cmove   %edx, %eax
        incl    %eax
        addl    %esi, %eax
L6:
        popl    %esi
        ret
=================================================================

to

=================================================================
.globl _ffsll
_ffsll:
        pushl   %esi
        movl    12(%esp), %edx
        xorl    %ecx, %ecx
        movl    8(%esp), %eax
        movl    %edx, %esi
        orl     %eax, %esi
        je      L4
        testl   %eax, %eax
        movl    %eax, %ecx
        jne     L9
        movl    %edx, %eax
        movl    $-1, %ecx
        bsfl    %eax, %eax
        cmove   %ecx, %eax
        incl    %eax
        leal    32(%eax), %ecx
L4:
        popl    %esi
        movl    %ecx, %eax
        ret
        .align 4,0x90
L9:
        bsfl    %ecx, %ecx
        movl    $-1, %eax
        popl    %esi
        cmove   %eax, %ecx
        incl    %ecx
        movl    %ecx, %eax
        ret
=================================================================

Yes, this is simpler: it contains only 2 conditional branches,
rather than 3.


2011-10-14  Bruno Haible  <address@hidden>

        ffsl: Optimize on 64-bit platforms.
        * lib/ffsl.h (FUNC): Omit a test from the last loop round. Do loop
        unrolling.

*** lib/ffsl.h.orig     Fri Oct 14 22:13:03 2011
--- lib/ffsl.h  Fri Oct 14 22:12:47 2011
***************
*** 34,67 ****
  #if (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)) && defined 
GCC_BUILTIN
    return GCC_BUILTIN (i);
  #else
!   if (sizeof (TYPE) == sizeof (int))
!     return ffs (i);
!   else
      {
!       unsigned TYPE j = i;
!       /* Split j into chunks, and look at one chunk after the other.  */
!       /* Define chunk_bits so as to avoid a GCC warning
!            "right shift count >= width of type"
!          if sizeof (TYPE) == sizeof (int).  */
!       enum
!         {
!           chunk_bits = (sizeof (TYPE) != sizeof (int)
!                         ? CHAR_BIT * sizeof (unsigned int)
!                         : 0)
!         };
!       int result = 0;
  
        /* It is tempting to write  if (!j)  here, but if we do this,
           Solaris 10/x86 "cc -O" miscompiles the code.  */
        if (!i)
          return 0;
!       while (1)
!         {
!           if ((unsigned int) j)
!             return result + ffs ((unsigned int) j);
!           j >>= chunk_bits;
!           result += chunk_bits;
!         }
      }
  #endif
  }
--- 34,64 ----
  #if (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)) && defined 
GCC_BUILTIN
    return GCC_BUILTIN (i);
  #else
!   unsigned TYPE j = i;
!   /* Split j into chunks, and look at one chunk after the other.  */
!   enum { chunk_bits = CHAR_BIT * sizeof (unsigned int) };
!   /* The number of chunks is ceil (sizeof (TYPE) / sizeof (unsigned int))
!      = (sizeof (TYPE) - 1) / sizeof (unsigned int) + 1. */
!   enum { chunk_count = (sizeof (TYPE) - 1) / sizeof (unsigned int) + 1 };
! 
!   if (chunk_count > 1)
      {
!       size_t k;
  
        /* It is tempting to write  if (!j)  here, but if we do this,
           Solaris 10/x86 "cc -O" miscompiles the code.  */
        if (!i)
          return 0;
!       /* Unroll the first loop round.  k = 0.  */
!       if ((unsigned int) j)
!         return ffs ((unsigned int) j);
!       /* Generic loop.  */
!       for (k = 1; k < chunk_count - 1; k++)
!         if ((unsigned int) (j >> (k * chunk_bits)) != 0)
!           return k * chunk_bits + ffs ((unsigned int) (j >> (k * 
chunk_bits)));
      }
+   /* Last loop round.  k = chunk_count - 1.  */
+   return (chunk_count - 1) * chunk_bits
+          + ffs ((unsigned int) (j >> ((chunk_count - 1) * chunk_bits)));
  #endif
  }

-- 
In memoriam Robert Dziekański <http://en.wikipedia.org/wiki/Robert_Dziekański>



reply via email to

[Prev in Thread] Current Thread [Next in Thread]