freetype-commit
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Git][freetype/freetype][master] [smooth] Remove SSE2.


From: Alexei Podtelezhnikov (@apodtele)
Subject: [Git][freetype/freetype][master] [smooth] Remove SSE2.
Date: Sun, 14 Jan 2024 13:03:56 +0000

Alexei Podtelezhnikov pushed to branch master at FreeType / FreeType

Commits:

  • 57617782
    by Alexei Podtelezhnikov (Алексей Подтележников) at 2024-01-14T13:03:51+00:00
    [smooth] Remove SSE2.
    
    As a result of commit 7b308a29dd10, the regular 64-bit code path is now
    faster than the SSE2 one.  Without SSE2, the rendering speed of script
    fonts at 64 ppem or larger improves by about 3%.  See merge request !314
    for the testing results.
    
    * src/smooth/ftgrays.c (gray_render_conic)[FT_INT64]: Remove SSE2 code.
    

1 changed file:

Changes:

  • src/smooth/ftgrays.c
    ... ... @@ -997,49 +997,12 @@ typedef ptrdiff_t FT_PtrDist;
    997 997
     #endif
    
    998 998
     
    
    999 999
       /*
    
    1000
    -   * Benchmarking shows that using DDA to flatten the quadratic Bézier arcs
    
    1001
    -   * is slightly faster in the following cases:
    
    1002
    -   *
    
    1003
    -   *   - When the host CPU is 64-bit.
    
    1004
    -   *   - When SSE2 SIMD registers and instructions are available (even on
    
    1005
    -   *     x86).
    
    1006
    -   *
    
    1007
    -   * For other cases, using binary splits is actually slightly faster.
    
    1008
    -   */
    
    1009
    -#if ( defined( __SSE2__ )                          ||   \
    
    1010
    -      defined( __x86_64__ )                        ||   \
    
    1011
    -      defined( _M_AMD64 )                          ||   \
    
    1012
    -      ( defined( _M_IX86_FP ) && _M_IX86_FP >= 2 ) ) && \
    
    1013
    -    !defined( __VMS )
    
    1014
    -#  define FT_SSE2 1
    
    1015
    -#else
    
    1016
    -#  define FT_SSE2 0
    
    1017
    -#endif
    
    1018
    -
    
    1019
    -#if FT_SSE2                || \
    
    1020
    -    defined( __aarch64__ ) || \
    
    1021
    -    defined( _M_ARM64 )
    
    1022
    -#  define BEZIER_USE_DDA  1
    
    1023
    -#else
    
    1024
    -#  define BEZIER_USE_DDA  0
    
    1025
    -#endif
    
    1026
    -
    
    1027
    -  /*
    
    1028
    -   * For now, the code that depends on `BEZIER_USE_DDA` requires `FT_Int64`
    
    1029
    -   * to be defined.  If `FT_INT64` is not defined, meaning there is no
    
    1030
    -   * 64-bit type available, disable it to avoid compilation errors.  See for
    
    1031
    -   * example https://gitlab.freedesktop.org/freetype/freetype/-/issues/1071.
    
    1000
    +   * For now, the code that uses DDA to render conic curves requires
    
    1001
    +   * `FT_Int64` to be defined.  See for example
    
    1002
    +   *    https://gitlab.freedesktop.org/freetype/freetype/-/issues/1071.
    
    1032 1003
        */
    
    1033
    -#if !defined( FT_INT64 )
    
    1034
    -#  undef BEZIER_USE_DDA
    
    1035
    -#  define BEZIER_USE_DDA  0
    
    1036
    -#endif
    
    1037 1004
     
    
    1038
    -#if BEZIER_USE_DDA
    
    1039
    -
    
    1040
    -#if FT_SSE2
    
    1041
    -#  include <emmintrin.h>
    
    1042
    -#endif
    
    1005
    +#ifdef FT_INT64
    
    1043 1006
     
    
    1044 1007
     #define LEFT_SHIFT( a, b )  (FT_Int64)( (FT_UInt64)(a) << (b) )
    
    1045 1008
     
    
    ... ... @@ -1151,61 +1114,6 @@ typedef ptrdiff_t FT_PtrDist;
    1151 1114
          *             = (B << (33 - N)) + (A << (32 - 2*N))
    
    1152 1115
          */
    
    1153 1116
     
    
    1154
    -#if FT_SSE2
    
    1155
    -    /* Experience shows that for small counts, SSE2 is actually slower. */
    
    1156
    -    if ( count > 4 )
    
    1157
    -    {
    
    1158
    -      union
    
    1159
    -      {
    
    1160
    -        struct { FT_Int64  ax, ay, bx, by; }  i;
    
    1161
    -        struct { __m128i  a, b; }  vec;
    
    1162
    -
    
    1163
    -      } u;
    
    1164
    -
    
    1165
    -      union
    
    1166
    -      {
    
    1167
    -        struct { FT_Int32  px_lo, px_hi, py_lo, py_hi; }  i;
    
    1168
    -        __m128i  vec;
    
    1169
    -
    
    1170
    -      } v;
    
    1171
    -
    
    1172
    -      __m128i  p, q, r;
    
    1173
    -
    
    1174
    -
    
    1175
    -      u.i.ax = ax;
    
    1176
    -      u.i.ay = ay;
    
    1177
    -      u.i.bx = bx;
    
    1178
    -      u.i.by = by;
    
    1179
    -
    
    1180
    -      q = _mm_load_si128( &u.vec.b );
    
    1181
    -      r = _mm_load_si128( &u.vec.a );
    
    1182
    -
    
    1183
    -      q = _mm_slli_epi64( q, shift + 17);
    
    1184
    -      r = _mm_slli_epi64( r, shift + shift );
    
    1185
    -      q = _mm_add_epi64( q, r );
    
    1186
    -      r = _mm_add_epi64( r, r );
    
    1187
    -
    
    1188
    -      v.i.px_lo = 0;
    
    1189
    -      v.i.px_hi = p0.x;
    
    1190
    -      v.i.py_lo = 0;
    
    1191
    -      v.i.py_hi = p0.y;
    
    1192
    -
    
    1193
    -      p = _mm_load_si128( &v.vec );
    
    1194
    -
    
    1195
    -      do
    
    1196
    -      {
    
    1197
    -        p = _mm_add_epi64( p, q );
    
    1198
    -        q = _mm_add_epi64( q, r );
    
    1199
    -
    
    1200
    -        _mm_store_si128( &v.vec, p );
    
    1201
    -
    
    1202
    -        gray_render_line( RAS_VAR_ v.i.px_hi, v.i.py_hi );
    
    1203
    -      } while ( --count );
    
    1204
    -
    
    1205
    -      return;
    
    1206
    -    }
    
    1207
    -#endif  /* FT_SSE2 */
    
    1208
    -
    
    1209 1117
         rx = LEFT_SHIFT( ax, shift + shift );
    
    1210 1118
         ry = LEFT_SHIFT( ay, shift + shift );
    
    1211 1119
     
    
    ... ... @@ -1230,7 +1138,7 @@ typedef ptrdiff_t FT_PtrDist;
    1230 1138
         } while ( --count );
    
    1231 1139
       }
    
    1232 1140
     
    
    1233
    -#else  /* !BEZIER_USE_DDA */
    
    1141
    +#else  /* !FT_INT64 */
    
    1234 1142
     
    
    1235 1143
       /*
    
    1236 1144
        * Note that multiple attempts to speed up the function below
    
    ... ... @@ -1324,7 +1232,7 @@ typedef ptrdiff_t FT_PtrDist;
    1324 1232
         } while ( --draw );
    
    1325 1233
       }
    
    1326 1234
     
    
    1327
    -#endif  /* !BEZIER_USE_DDA */
    
    1235
    +#endif  /* !FT_INT64 */
    
    1328 1236
     
    
    1329 1237
     
    
    1330 1238
       /*
    


  • reply via email to

    [Prev in Thread] Current Thread [Next in Thread]