freetype-commit
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Git][freetype/freetype][master] 2 commits: [smooth] Minor speedup to sm


From: David Turner (@david.freetype)
Subject: [Git][freetype/freetype][master] 2 commits: [smooth] Minor speedup to smooth rasterizer
Date: Thu, 15 Jul 2021 11:31:57 +0000

David Turner pushed to branch master at FreeType / FreeType

Commits:

2 changed files:

Changes:

  • ChangeLog
    1
    +2021-07-15  David Turner  <david@freetype.org>
    
    2
    +
    
    3
    +	[smooth] Implement Bezier quadratic arc flattenning with DDA
    
    4
    +
    
    5
    +	Benchmarking shows that this provides a very slighty performance
    
    6
    +	boost when rendering fonts with lots of quadratic bezier arcs,
    
    7
    +	compared to the recursive arc splitting, but only when SSE2 is
    
    8
    +	available, or on 64-bit CPUs.
    
    9
    +
    
    10
    +	* src/smooth/ftgrays.c (gray_render_conic): New implementation
    
    11
    +	based on DDA and optionally SSE2.
    
    12
    +
    
    13
    +2021-07-15  David Turner  <david@freetype.org>
    
    14
    +
    
    15
    +	[smooth] Minor speedup to smooth rasterizer
    
    16
    +
    
    17
    +	This speeds up the smooth rasterizer by avoiding a conditional
    
    18
    +	branches in the hot path.
    
    19
    +
    
    20
    +	* src/smooth/ftgrays.c: Define a null cell used to both as a
    
    21
    +	sentinel for all linked-lists, and to accumulate coverage and
    
    22
    +	area values for "out-of-bounds" cell positions without a
    
    23
    +	conditional check.
    
    24
    +
    
    1 25
     2021-07-15  David Turner  <david@freetype.org>
    
    2 26
     
    
    3 27
     	Replaces download-test-fonts.sh with download-test-fonts.py which
    

  • src/smooth/ftgrays.c
    ... ... @@ -479,19 +479,24 @@ typedef ptrdiff_t FT_PtrDist;
    479 479
       {
    
    480 480
         ft_jmp_buf  jump_buffer;
    
    481 481
     
    
    482
    -    TCoord  min_ex, max_ex;
    
    482
    +    TCoord  min_ex, max_ex;  /* min and max integer pixel coordinates */
    
    483 483
         TCoord  min_ey, max_ey;
    
    484
    +    TCoord  count_ey;        /* same as (max_ey - min_ey) */
    
    484 485
     
    
    485
    -    PCell       cell;
    
    486
    -    PCell*      ycells;
    
    487
    -    PCell       cells;
    
    488
    -    FT_PtrDist  max_cells;
    
    489
    -    FT_PtrDist  num_cells;
    
    486
    +    PCell       cell;        /* current cell                             */
    
    487
    +    PCell       cell_free;   /* call allocation next free slot           */
    
    488
    +    PCell       cell_limit;  /* cell allocation limit                    */
    
    490 489
     
    
    491
    -    TPos    x,  y;
    
    490
    +    PCell*      ycells;      /* array of cell linked-lists, one per      */
    
    491
    +							 /* vertical coordinate in the current band. */
    
    492 492
     
    
    493
    -    FT_Outline  outline;
    
    494
    -    TPixmap     target;
    
    493
    +    PCell       cells;       /* cell storage area     */
    
    494
    +    FT_PtrDist  max_cells;   /* cell storage capacity */
    
    495
    +
    
    496
    +    TPos        x,  y;       /* last point position */
    
    497
    +
    
    498
    +    FT_Outline  outline;     /* input outline */
    
    499
    +    TPixmap     target;      /* target pixmap */
    
    495 500
     
    
    496 501
         FT_Raster_Span_Func  render_span;
    
    497 502
         void*                render_span_data;
    
    ... ... @@ -502,21 +507,34 @@ typedef ptrdiff_t FT_PtrDist;
    502 507
     #pragma warning( pop )
    
    503 508
     #endif
    
    504 509
     
    
    505
    -
    
    506 510
     #ifndef FT_STATIC_RASTER
    
    507 511
     #define ras  (*worker)
    
    508 512
     #else
    
    509 513
       static gray_TWorker  ras;
    
    510 514
     #endif
    
    511 515
     
    
    512
    -#define FT_INTEGRATE( ras, a, b )                                       \
    
    513
    -           if ( ras.cell )                                              \
    
    514
    -             ras.cell->cover += (a), ras.cell->area += (a) * (TArea)(b)
    
    516
    +/* Return a pointer to the "null cell", used as a sentinel at the end   */
    
    517
    +/* of all ycells[] linked lists. Its x coordinate should be maximal     */
    
    518
    +/* to ensure no NULL checks are necessary when looking for an insertion */
    
    519
    +/* point in gray_set_cell(). Other loops should check the cell pointer  */
    
    520
    +/* with CELL_IS_NULL() to detect the end of the list.                   */
    
    521
    +#define NULL_CELL_PTR(ras)  (ras).cells
    
    522
    +
    
    523
    +/* The |x| value of the null cell. Must be the largest possible */
    
    524
    +/* integer value stored in a TCell.x field.                     */
    
    525
    +#define CELL_MAX_X_VALUE    INT_MAX
    
    526
    +
    
    527
    +/* Return true iff |cell| points to the null cell. */
    
    528
    +#define CELL_IS_NULL(cell)  ((cell)->x == CELL_MAX_X_VALUE)
    
    529
    +
    
    530
    +
    
    531
    +#define FT_INTEGRATE( ras, a, b )                                     \
    
    532
    +           ras.cell->cover += (a), ras.cell->area += (a) * (TArea)(b)
    
    515 533
     
    
    516 534
     
    
    517 535
       typedef struct gray_TRaster_
    
    518 536
       {
    
    519
    -    void*         memory;
    
    537
    +    void*  memory;
    
    520 538
     
    
    521 539
       } gray_TRaster, *gray_PRaster;
    
    522 540
     
    
    ... ... @@ -538,7 +556,7 @@ typedef ptrdiff_t FT_PtrDist;
    538 556
     
    
    539 557
           printf( "%3d:", y );
    
    540 558
     
    
    541
    -      for ( ; cell != NULL; cell = cell->next )
    
    559
    +      for ( ; !CELL_IS_NULL(cell); cell = cell->next )
    
    542 560
             printf( " (%3d, c:%4d, a:%6d)",
    
    543 561
                     cell->x, cell->cover, cell->area );
    
    544 562
           printf( "\n" );
    
    ... ... @@ -566,11 +584,12 @@ typedef ptrdiff_t FT_PtrDist;
    566 584
         /* Note that if a cell is to the left of the clipping region, it is    */
    
    567 585
         /* actually set to the (min_ex-1) horizontal position.                 */
    
    568 586
     
    
    569
    -    if ( ey >= ras.max_ey || ey < ras.min_ey || ex >= ras.max_ex )
    
    570
    -      ras.cell = NULL;
    
    587
    +    TCoord ey_index = ey - ras.min_ey;
    
    588
    +    if ( ey_index < 0 || ey_index >= ras.count_ey || ex >= ras.max_ex )
    
    589
    +      ras.cell = NULL_CELL_PTR(ras);
    
    571 590
         else
    
    572 591
         {
    
    573
    -      PCell*  pcell = ras.ycells + ey - ras.min_ey;
    
    592
    +      PCell*  pcell = ras.ycells + ey_index;
    
    574 593
           PCell   cell;
    
    575 594
     
    
    576 595
     
    
    ... ... @@ -580,7 +599,7 @@ typedef ptrdiff_t FT_PtrDist;
    580 599
           {
    
    581 600
             cell = *pcell;
    
    582 601
     
    
    583
    -        if ( !cell || cell->x > ex )
    
    602
    +        if ( cell->x > ex )
    
    584 603
               break;
    
    585 604
     
    
    586 605
             if ( cell->x == ex )
    
    ... ... @@ -589,11 +608,11 @@ typedef ptrdiff_t FT_PtrDist;
    589 608
             pcell = &cell->next;
    
    590 609
           }
    
    591 610
     
    
    592
    -      if ( ras.num_cells >= ras.max_cells )
    
    611
    +      /* insert new cell */
    
    612
    +      cell = ras.cell_free++;
    
    613
    +      if (cell >= ras.cell_limit)
    
    593 614
             ft_longjmp( ras.jump_buffer, 1 );
    
    594 615
     
    
    595
    -      /* insert new cell */
    
    596
    -      cell        = ras.cells + ras.num_cells++;
    
    597 616
           cell->x     = ex;
    
    598 617
           cell->area  = 0;
    
    599 618
           cell->cover = 0;
    
    ... ... @@ -974,6 +993,188 @@ typedef ptrdiff_t FT_PtrDist;
    974 993
     
    
    975 994
     #endif
    
    976 995
     
    
    996
    +/* Benchmarking shows that using DDA to flatten the quadratic bezier
    
    997
    + * arcs is slightly faster in the following cases:
    
    998
    + *
    
    999
    + *   - When the host CPU is 64-bit.
    
    1000
    + *   - When SSE2 SIMD registers and instructions are available (even on x86).
    
    1001
    + *
    
    1002
    + * For other cases, using binary splits is actually slightly faster.
    
    1003
    + */
    
    1004
    +#if defined(__SSE2__) || defined(__x86_64__) || defined(__aarch64__) || defined(_M_AMD64) || defined(_M_ARM64)
    
    1005
    +#define BEZIER_USE_DDA  1
    
    1006
    +#else
    
    1007
    +#define BEZIER_USE_DDA  0
    
    1008
    +#endif
    
    1009
    +
    
    1010
    +#if BEZIER_USE_DDA
    
    1011
    +
    
    1012
    +#include <emmintrin.h>
    
    1013
    +
    
    1014
    +  static void
    
    1015
    +  gray_render_conic( RAS_ARG_ const FT_Vector*  control,
    
    1016
    +                              const FT_Vector*  to )
    
    1017
    +  {
    
    1018
    +    FT_Vector  p0, p1, p2;
    
    1019
    +
    
    1020
    +    p0.x = ras.x;
    
    1021
    +    p0.y = ras.y;
    
    1022
    +    p1.x = UPSCALE( control->x );
    
    1023
    +    p1.y = UPSCALE( control->y );
    
    1024
    +    p2.x = UPSCALE( to->x );
    
    1025
    +    p2.y = UPSCALE( to->y );
    
    1026
    +
    
    1027
    +    /* short-cut the arc that crosses the current band */
    
    1028
    +    if ( ( TRUNC( p0.y ) >= ras.max_ey &&
    
    1029
    +           TRUNC( p1.y ) >= ras.max_ey &&
    
    1030
    +           TRUNC( p2.y ) >= ras.max_ey ) ||
    
    1031
    +         ( TRUNC( p0.y ) <  ras.min_ey &&
    
    1032
    +           TRUNC( p1.y ) <  ras.min_ey &&
    
    1033
    +           TRUNC( p2.y ) <  ras.min_ey ) )
    
    1034
    +    {
    
    1035
    +      ras.x = p2.x;
    
    1036
    +      ras.y = p2.y;
    
    1037
    +      return;
    
    1038
    +    }
    
    1039
    +
    
    1040
    +    TPos dx = FT_ABS( p0.x + p2.x - 2 * p1.x );
    
    1041
    +    TPos dy = FT_ABS( p0.y + p2.y - 2 * p1.y );
    
    1042
    +    if ( dx < dy )
    
    1043
    +      dx = dy;
    
    1044
    +
    
    1045
    +    if ( dx <= ONE_PIXEL / 4 )
    
    1046
    +    {
    
    1047
    +      gray_render_line( RAS_VAR_ p2.x, p2.y );
    
    1048
    +      return;
    
    1049
    +    }
    
    1050
    +
    
    1051
    +    /* We can calculate the number of necessary bisections because  */
    
    1052
    +    /* each bisection predictably reduces deviation exactly 4-fold. */
    
    1053
    +    /* Even 32-bit deviation would vanish after 16 bisections.      */
    
    1054
    +    int shift = 0;
    
    1055
    +    do
    
    1056
    +    {
    
    1057
    +      dx   >>= 2;
    
    1058
    +      shift += 1;
    
    1059
    +    }
    
    1060
    +    while (dx > ONE_PIXEL / 4);
    
    1061
    +
    
    1062
    +    /*
    
    1063
    +     * The (P0,P1,P2) arc equation, for t in [0,1] range:
    
    1064
    +     *
    
    1065
    +     * P(t) = P0*(1-t)^2 + P1*2*t*(1-t) + P2*t^2
    
    1066
    +     *
    
    1067
    +     * P(t) = P0 + 2*(P1-P0)*t + (P0+P2-2*P1)*t^2
    
    1068
    +     *      = P0 + 2*B*t + A*t^2
    
    1069
    +     *
    
    1070
    +     *    for A = P0 + P2 - 2*P1
    
    1071
    +     *    and B = P1 - P0
    
    1072
    +     *
    
    1073
    +     * Let's consider the difference when advancing by a small
    
    1074
    +     * parameter h:
    
    1075
    +     *
    
    1076
    +     *    Q(h,t) = P(t+h) - P(t) = 2*B*h + A*h^2 + 2*A*h*t
    
    1077
    +     *
    
    1078
    +     * And then its own difference:
    
    1079
    +     *
    
    1080
    +     *    R(h,t) = Q(h,t+h) - Q(h,t) = 2*A*h*h = R (constant)
    
    1081
    +     *
    
    1082
    +     * Since R is always a constant, it is possible to compute
    
    1083
    +     * successive positions with:
    
    1084
    +     *
    
    1085
    +     *     P = P0
    
    1086
    +     *     Q = Q(h,0) = 2*B*h + A*h*h
    
    1087
    +     *     R = 2*A*h*h
    
    1088
    +     *
    
    1089
    +     *   loop:
    
    1090
    +     *     P += Q
    
    1091
    +     *     Q += R
    
    1092
    +     *     EMIT(P)
    
    1093
    +     *
    
    1094
    +     * To ensure accurate results, perform computations on 64-bit
    
    1095
    +     * values, after scaling them by 2^32:
    
    1096
    +     *
    
    1097
    +     *     R << 32   = 2 * A << (32 - N - N)
    
    1098
    +     *               = A << (33 - 2 *N)
    
    1099
    +     *
    
    1100
    +     *     Q << 32   = (2 * B << (32 - N)) + (A << (32 - N - N))
    
    1101
    +     *               = (B << (33 - N)) + (A << (32 - N - N))
    
    1102
    +     */
    
    1103
    +#ifdef __SSE2__
    
    1104
    +    /* Experience shows that for small shift values, SSE2 is actually slower. */
    
    1105
    +    if (shift > 2) {
    
    1106
    +      union {
    
    1107
    +        struct { FT_Int64 ax, ay, bx, by; } i;
    
    1108
    +        struct { __m128i a, b; } vec;
    
    1109
    +      } u;
    
    1110
    +
    
    1111
    +      u.i.ax = p0.x + p2.x - 2 * p1.x;
    
    1112
    +      u.i.ay = p0.y + p2.y - 2 * p1.y;
    
    1113
    +      u.i.bx = p1.x - p0.x;
    
    1114
    +      u.i.by = p1.y - p0.y;
    
    1115
    +
    
    1116
    +      __m128i a = _mm_load_si128(&u.vec.a);
    
    1117
    +      __m128i b = _mm_load_si128(&u.vec.b);
    
    1118
    +
    
    1119
    +      __m128i r = _mm_slli_epi64(a, 33 - 2 * shift);
    
    1120
    +      __m128i q = _mm_slli_epi64(b, 33 - shift);
    
    1121
    +      __m128i q2 = _mm_slli_epi64(a, 32 - 2 * shift);
    
    1122
    +      q = _mm_add_epi64(q2, q);
    
    1123
    +
    
    1124
    +      union {
    
    1125
    +        struct { FT_Int32  px_lo, px_hi, py_lo, py_hi; } i;
    
    1126
    +        __m128i vec;
    
    1127
    +      } v;
    
    1128
    +      v.i.px_lo = 0;
    
    1129
    +      v.i.px_hi = p0.x;
    
    1130
    +      v.i.py_lo = 0;
    
    1131
    +      v.i.py_hi = p0.y;
    
    1132
    +
    
    1133
    +      __m128i p = _mm_load_si128(&v.vec);
    
    1134
    +
    
    1135
    +      for (unsigned count = (1u << shift); count > 0; count--) {
    
    1136
    +        p = _mm_add_epi64(p, q);
    
    1137
    +        q = _mm_add_epi64(q, r);
    
    1138
    +
    
    1139
    +        _mm_store_si128(&v.vec, p);
    
    1140
    +
    
    1141
    +        gray_render_line( RAS_VAR_ v.i.px_hi, v.i.py_hi);
    
    1142
    +      }
    
    1143
    +      return;
    
    1144
    +    }
    
    1145
    +#endif  /* !__SSE2__ */
    
    1146
    +    FT_Int64 ax = p0.x + p2.x - 2 * p1.x;
    
    1147
    +    FT_Int64 ay = p0.y + p2.y - 2 * p1.y;
    
    1148
    +    FT_Int64 bx = p1.x - p0.x;
    
    1149
    +    FT_Int64 by = p1.y - p0.y;
    
    1150
    +
    
    1151
    +    FT_Int64 rx = ax << (33 - 2 * shift);
    
    1152
    +    FT_Int64 ry = ay << (33 - 2 * shift);
    
    1153
    +
    
    1154
    +    FT_Int64 qx = (bx << (33 - shift)) + (ax << (32 - 2 * shift));
    
    1155
    +    FT_Int64 qy = (by << (33 - shift)) + (ay << (32 - 2 * shift));
    
    1156
    +
    
    1157
    +    FT_Int64 px = (FT_Int64)p0.x << 32;
    
    1158
    +    FT_Int64 py = (FT_Int64)p0.y << 32;
    
    1159
    +
    
    1160
    +	FT_UInt count = 1u << shift;
    
    1161
    +
    
    1162
    +    for (; count > 0; count--) {
    
    1163
    +      px += qx;
    
    1164
    +      py += qy;
    
    1165
    +      qx += rx;
    
    1166
    +      qy += ry;
    
    1167
    +
    
    1168
    +      gray_render_line( RAS_VAR_ (FT_Pos)(px >> 32), (FT_Pos)(py >> 32));
    
    1169
    +    }
    
    1170
    +  }
    
    1171
    +
    
    1172
    +#else  /* !BEZIER_USE_DDA */
    
    1173
    +
    
    1174
    +  /* Note that multiple attempts to speed up the function below
    
    1175
    +   * with SSE2 intrinsics, using various data layouts, have turned
    
    1176
    +   * out to be slower than the non-SIMD code below.
    
    1177
    +   */
    
    977 1178
       static void
    
    978 1179
       gray_split_conic( FT_Vector*  base )
    
    979 1180
       {
    
    ... ... @@ -1059,7 +1260,15 @@ typedef ptrdiff_t FT_PtrDist;
    1059 1260
         } while ( --draw );
    
    1060 1261
       }
    
    1061 1262
     
    
    1263
    +#endif  /* !BEZIER_USE_DDA */
    
    1062 1264
     
    
    1265
    +  /* For cubic bezier, binary splits are still faster than DDA
    
    1266
    +   * because the splits are adaptive to how quickly each sub-arc
    
    1267
    +   * approaches their chord trisection points.
    
    1268
    +   *
    
    1269
    +   * It might be useful to experiment with SSE2 to speed up
    
    1270
    +   * gray_split_cubic() though.
    
    1271
    +   */
    
    1063 1272
       static void
    
    1064 1273
       gray_split_cubic( FT_Vector*  base )
    
    1065 1274
       {
    
    ... ... @@ -1150,7 +1359,6 @@ typedef ptrdiff_t FT_PtrDist;
    1150 1359
         }
    
    1151 1360
       }
    
    1152 1361
     
    
    1153
    -
    
    1154 1362
       static int
    
    1155 1363
       gray_move_to( const FT_Vector*  to,
    
    1156 1364
                     gray_PWorker      worker )
    
    ... ... @@ -1218,7 +1426,7 @@ typedef ptrdiff_t FT_PtrDist;
    1218 1426
           unsigned char*  line = ras.target.origin - ras.target.pitch * y;
    
    1219 1427
     
    
    1220 1428
     
    
    1221
    -      for ( ; cell != NULL; cell = cell->next )
    
    1429
    +      for ( ; !CELL_IS_NULL(cell); cell = cell->next )
    
    1222 1430
           {
    
    1223 1431
             if ( cover != 0 && cell->x > x )
    
    1224 1432
             {
    
    ... ... @@ -1266,7 +1474,7 @@ typedef ptrdiff_t FT_PtrDist;
    1266 1474
           TArea   area;
    
    1267 1475
     
    
    1268 1476
     
    
    1269
    -      for ( ; cell != NULL; cell = cell->next )
    
    1477
    +      for ( ; !CELL_IS_NULL(cell); cell = cell->next )
    
    1270 1478
           {
    
    1271 1479
             if ( cover != 0 && cell->x > x )
    
    1272 1480
             {
    
    ... ... @@ -1646,8 +1854,8 @@ typedef ptrdiff_t FT_PtrDist;
    1646 1854
           FT_TRACE7(( "band [%d..%d]: %ld cell%s\n",
    
    1647 1855
                       ras.min_ey,
    
    1648 1856
                       ras.max_ey,
    
    1649
    -                  ras.num_cells,
    
    1650
    -                  ras.num_cells == 1 ? "" : "s" ));
    
    1857
    +                  ras.cell_free - ras.cells.,
    
    1858
    +                  ras.cell_free - ras.cells == 1 ? "" : "s" ));
    
    1651 1859
         }
    
    1652 1860
         else
    
    1653 1861
         {
    
    ... ... @@ -1690,8 +1898,18 @@ typedef ptrdiff_t FT_PtrDist;
    1690 1898
     
    
    1691 1899
         ras.cells     = buffer + n;
    
    1692 1900
         ras.max_cells = (FT_PtrDist)( FT_MAX_GRAY_POOL - n );
    
    1901
    +    ras.cell_limit = ras.cells + ras.max_cells;
    
    1693 1902
         ras.ycells    = (PCell*)buffer;
    
    1694 1903
     
    
    1904
    +	/* Initialize the null cell is at the start of the 'cells' array. */
    
    1905
    +	/* Note that this requires ras.cell_free initialization to skip   */
    
    1906
    +	/* over the first entry in the array.                             */
    
    1907
    +	PCell null_cell  = NULL_CELL_PTR(ras);
    
    1908
    +	null_cell->x     = CELL_MAX_X_VALUE;
    
    1909
    +	null_cell->area  = 0;
    
    1910
    +	null_cell->cover = 0;
    
    1911
    +	null_cell->next  = NULL;;
    
    1912
    +
    
    1695 1913
         for ( y = yMin; y < yMax; )
    
    1696 1914
         {
    
    1697 1915
           ras.min_ey = y;
    
    ... ... @@ -1705,15 +1923,17 @@ typedef ptrdiff_t FT_PtrDist;
    1705 1923
           do
    
    1706 1924
           {
    
    1707 1925
             TCoord  width = band[0] - band[1];
    
    1926
    +        TCoord  w;
    
    1708 1927
             int     error;
    
    1709 1928
     
    
    1929
    +        for (w = 0; w < width; ++w)
    
    1930
    +          ras.ycells[w] = null_cell;
    
    1710 1931
     
    
    1711
    -        FT_MEM_ZERO( ras.ycells, height * sizeof ( PCell ) );
    
    1712
    -
    
    1713
    -        ras.num_cells = 0;
    
    1714
    -        ras.cell      = NULL;
    
    1932
    +        ras.cell_free = ras.cells + 1;  /* NOTE: Skip over the null cell. */
    
    1933
    +        ras.cell      = null_cell;
    
    1715 1934
             ras.min_ey    = band[1];
    
    1716 1935
             ras.max_ey    = band[0];
    
    1936
    +        ras.count_ey  = width;
    
    1717 1937
     
    
    1718 1938
             error     = gray_convert_glyph_inner( RAS_VAR, continued );
    
    1719 1939
             continued = 1;
    


  • reply via email to

    [Prev in Thread] Current Thread [Next in Thread]