freetype-commit
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Git][freetype/freetype][gsoc-anurag-2022-final] [dense] Re-enable SIMD to work with fixed-point


From: Anurag Thakur (@AdbhutDev)
Subject: [Git][freetype/freetype][gsoc-anurag-2022-final] [dense] Re-enable SIMD to work with fixed-point
Date: Sat, 19 Nov 2022 07:37:23 +0000

Anurag Thakur pushed to branch gsoc-anurag-2022-final at FreeType / FreeType

Commits:

  • 763f1108
    by Anurag Thakur at 2022-11-19T13:05:14+05:30
    [dense] Re-enable SIMD to work with fixed-point
    
    * src/dense/ftdense.c: Use integer SIMD functions for accumulation
    
    * src/dense/ftdense.h: Change types of FT26D6, FT20D12 to better fit
    their usage
    

2 changed files:

Changes:

  • src/dense/ftdense.c
    ... ... @@ -24,7 +24,7 @@
    24 24
     
    
    25 25
     #if FT_SSE4_1
    
    26 26
     
    
    27
    -    #include <tmmintrin.h>
    
    27
    +    #include <immintrin.h>
    
    28 28
     
    
    29 29
     #endif
    
    30 30
     
    
    ... ... @@ -384,26 +384,47 @@ dense_render_glyph( dense_worker* worker, const FT_Bitmap* target )
    384 384
       unsigned char* dest     = target->buffer;
    
    385 385
       unsigned char* dest_end = target->buffer + worker->m_w * worker->m_h;
    
    386 386
     
    
    387
    -//#if FT_SSE4_1
    
    388
    -
    
    389
    -  // __m128 offset = _mm_setzero_ps();
    
    390
    -  // __m128i mask = _mm_set1_epi32(0x0c080400);
    
    391
    -  // __m128 sign_mask = _mm_set1_ps(-0.f);
    
    392
    -  // for (int i = 0; i < worker->m_h*worker->m_w; i += 4) {
    
    393
    -  //   __m128 x = _mm_load_ps(&source[i]);
    
    394
    -  //   x = _mm_add_ps(x, _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(x), 4)));
    
    395
    -  //   x = _mm_add_ps(x, _mm_shuffle_ps(_mm_setzero_ps(), x, 0x40));
    
    396
    -  //   x = _mm_add_ps(x, offset);
    
    397
    -  //   __m128 y = _mm_andnot_ps(sign_mask, x);  // fabs(x)
    
    398
    -  //   y = _mm_min_ps(y, _mm_set1_ps(1.0f));
    
    399
    -  //   y = _mm_mul_ps(y, _mm_set1_ps(255.0f));
    
    400
    -  //   __m128i z = _mm_cvtps_epi32(y);
    
    401
    -  //   z = _mm_shuffle_epi8(z, mask);
    
    402
    -  //   _mm_store_ss((float *)&dest[i], (__m128)z);
    
    403
    -  //   offset = _mm_shuffle_ps(x, x, _MM_SHUFFLE(3, 3, 3, 3));
    
    404
    -  // }
    
    405
    -
    
    406
    -//#else /* FT_SSE4_1 */
    
    387
    +#if FT_SSE4_1
    
    388
    +
    
    389
    +__m128i offset = _mm_setzero_si128();
    
    390
    +  __m128i mask   = _mm_set1_epi32( 0x0c080400 );
    
    391
    +
    
    392
    +  for (int i = 0; i < worker->m_h*worker->m_w; i += 4)
    
    393
    +  {
    
    394
    +    // load 4 floats from source
    
    395
    +
    
    396
    +    __m128i x = _mm_load_si128( (__m128i*)&source[i] );
    
    397
    +
    
    398
    +    x = _mm_add_epi32( x, _mm_slli_si128( x, 4 ) );
    
    399
    +
    
    400
    +    x = _mm_add_epi32(
    
    401
    +        x, _mm_castps_si128( _mm_shuffle_ps( _mm_setzero_ps(),
    
    402
    +                                             _mm_castsi128_ps( x ), 0x40 ) ) );
    
    403
    +
    
    404
    +    // add the prefsum of previous 4 floats to all current floats
    
    405
    +    x = _mm_add_epi32( x, offset );
    
    406
    +
    
    407
    +    // take absolute value
    
    408
    +    __m128i y = _mm_abs_epi32( x );  // fabs(x)
    
    409
    +
    
    410
    +    // cap max value to 1
    
    411
    +    y = _mm_min_epi32( y, _mm_set1_epi32( 4080 ) );
    
    412
    +
    
    413
    +    // reduce to 255
    
    414
    +    y = _mm_srli_epi32( y, 4 );
    
    415
    +
    
    416
    +    // shuffle
    
    417
    +    y = _mm_shuffle_epi8( y, mask );
    
    418
    +
    
    419
    +    _mm_store_ss( (float*)&dest[i], (__m128)y );
    
    420
    +
    
    421
    +    // store the current prefix sum in offset
    
    422
    +    offset = _mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( x ),
    
    423
    +                                               _mm_castsi128_ps( x ),
    
    424
    +                                               _MM_SHUFFLE( 3, 3, 3, 3 ) ) );
    
    425
    +  }
    
    426
    +
    
    427
    +#else /* FT_SSE4_1 */
    
    407 428
     
    
    408 429
       FT20D12 value = 0;
    
    409 430
     
    
    ... ... @@ -422,7 +443,7 @@ dense_render_glyph( dense_worker* worker, const FT_Bitmap* target )
    422 443
         dest++;
    
    423 444
       }
    
    424 445
     
    
    425
    -//#endif /* FT_SSE4_1 */
    
    446
    +#endif /* FT_SSE4_1 */
    
    426 447
     
    
    427 448
       free(worker->m_a);
    
    428 449
       return error;
    

  • src/dense/ftdense.h
    ... ... @@ -20,8 +20,8 @@ extern "C"
    20 20
     #endif
    
    21 21
     
    
    22 22
     
    
    23
    -  typedef signed long long FT26D6;            /* 26.6 fixed-point representation  */
    
    24
    -  typedef signed long long FT20D12;           /* 20.12 fixed-point representation  */
    
    23
    +  typedef signed long FT26D6;            /* 26.6 fixed-point representation  */
    
    24
    +  typedef signed int FT20D12;            /* 20.12 fixed-point representation  */
    
    25 25
     
    
    26 26
       typedef struct
    
    27 27
       {
    


  • reply via email to

    [Prev in Thread] Current Thread [Next in Thread]