Skip to content

Commit

Permalink
avx2: Improve mean color extractor
Browse files Browse the repository at this point in the history
Smaller (128-bit) loads perform slightly better than a 256-bit load followed
by extraction of each 128-bit half. Tested with GCC 13 on Haswell.

Cut down on code verbosity.
  • Loading branch information
hpjansson committed Apr 21, 2024
1 parent 0e7d5fb commit ac72d6f
Showing 1 changed file with 20 additions and 35 deletions.
55 changes: 20 additions & 35 deletions chafa/internal/chafa-avx2.c
Original file line number Diff line number Diff line change
/* NOTE(review): this listing is a rendered diff hunk, not compilable C.
 * Lines removed by the commit and lines added by it are interleaved with
 * their +/- markers stripped, which is why some names are declared twice
 * and two `for` headers appear back to back.
 *
 * calc_colors_avx2: both versions AND / ANDNOT the pixel data with
 * sym_mask_u32, widen the resulting bytes to 16-bit lanes, sum them into
 * two accumulators, fold each accumulator down to 64 bits and store the
 * two results in accums_out [0] and accums_out [1].
 *
 * The old/new attributions in the comments below are inferred from the
 * commit message ("smaller loads ... load + extract") and the _8x / _4x
 * naming -- confirm against the real diff before relying on them.
 */
Expand Up @@ -83,51 +83,36 @@ void
calc_colors_avx2 (const ChafaPixel *pixels, ChafaColorAccum *accums_out,
const guint32 *sym_mask_u32)
{
/* Presumably pre-commit: 256-bit read cursors and two-element accumulator
 * arrays (one element per 128-bit half of each load). */
const __m256i *pixels_8x_p = (const __m256i *) pixels;
const __m256i *sym_mask_8x_p = (const __m256i *) sym_mask_u32;
__m256i accum_fg [2] = { { 0 }, { 0 } };
__m256i accum_bg [2] = { { 0 }, { 0 } };
/* Presumably post-commit: 128-bit read cursors and a single 256-bit
 * accumulator for each of the two sums. */
const __m128i *pixels_4x_p = (const __m128i *) pixels;
const __m128i *sym_mask_4x_p = (const __m128i *) sym_mask_u32;
__m256i accum_fg = { 0 };
__m256i accum_bg = { 0 };
/* 128-bit temporaries used by the final fold in both variants. */
__m128i accum_fg_128;
__m128i accum_bg_128;
gint i;

/* Two loop headers: the first steps one 256-bit load per iteration
 * (N_PIXELS / 8), the second one 128-bit load per iteration (N_PIXELS / 4).
 * Presumably a pixel is 4 bytes, matching the _8x / _4x names -- TODO
 * confirm against ChafaPixel. */
for (i = 0; i < CHAFA_SYMBOL_N_PIXELS / 8; i++)
for (i = 0; i < CHAFA_SYMBOL_N_PIXELS / 4; i++)
{
/* Presumably pre-commit loop body: unaligned 256-bit loads of the pixels
 * and of the mask, each followed by a separate cursor increment. */
__m256i pixels_8x, sym_mask_8x;
__m256i p0, fg0, bg0;
__m256i p1, fg1, bg1;

pixels_8x = _mm256_loadu_si256 (pixels_8x_p);
pixels_8x_p++;

sym_mask_8x = _mm256_loadu_si256 (sym_mask_8x_p);
sym_mask_8x_p++;
/* Presumably post-commit loop body starts here. */
__m128i pixels_4x, sym_mask_4x;

/* andnot (a, b) computes (~a) & b, so p0 = pixels & ~mask and
 * p1 = pixels & mask. */
p0 = _mm256_andnot_si256 (sym_mask_8x, pixels_8x);
p1 = _mm256_and_si256 (sym_mask_8x, pixels_8x);
/* Unaligned 128-bit loads; the cursors advance as part of the same
 * expression. */
pixels_4x = _mm_loadu_si128 (pixels_4x_p++);
sym_mask_4x = _mm_loadu_si128 (sym_mask_4x_p++);

/* The "load + extract" form: each 128-bit half of p0 / p1 is extracted,
 * zero-extended from 16 bytes to 16 x 16-bit lanes, and accumulated.
 * Note the naming in this variant: p0 (pixels & ~mask) feeds accum_fg and
 * p1 (pixels & mask) feeds accum_bg. */
fg0 = _mm256_cvtepu8_epi16 (_mm256_extracti128_si256 (p0, 0));
fg1 = _mm256_cvtepu8_epi16 (_mm256_extracti128_si256 (p0, 1));
accum_fg [0] = _mm256_add_epi16 (accum_fg [0], fg0);
accum_fg [1] = _mm256_add_epi16 (accum_fg [1], fg1);

bg0 = _mm256_cvtepu8_epi16 (_mm256_extracti128_si256 (p1, 0));
bg1 = _mm256_cvtepu8_epi16 (_mm256_extracti128_si256 (p1, 1));
accum_bg [0] = _mm256_add_epi16 (accum_bg [0], bg0);
accum_bg [1] = _mm256_add_epi16 (accum_bg [1], bg1);
/* The "smaller loads" form: the masked 128-bit value is widened directly,
 * with no extract step. Here pixels & mask feeds accum_fg and
 * pixels & ~mask feeds accum_bg, i.e. the fg/bg names are swapped relative
 * to the variant above. */
accum_fg = _mm256_add_epi16 (accum_fg,
_mm256_cvtepu8_epi16 (_mm_and_si128 (sym_mask_4x, pixels_4x)));
accum_bg = _mm256_add_epi16 (accum_bg,
_mm256_cvtepu8_epi16 (_mm_andnot_si128 (sym_mask_4x, pixels_4x)));
}

/* Horizontal fold, 256 -> 128 bits: add the low and high 128-bit halves
 * lane by lane. The array-based variant first merges its two accumulators. */
accum_fg [0] = _mm256_add_epi16 (accum_fg [0], accum_fg [1]);
accum_fg_128 = _mm_add_epi16 (_mm256_extracti128_si256 (accum_fg [0], 0),
_mm256_extracti128_si256 (accum_fg [0], 1));
accum_bg_128 = _mm_add_epi16 (_mm256_extracti128_si256 (accum_bg, 0),
_mm256_extracti128_si256 (accum_bg, 1));
/* Fold 128 -> 64 bits and store through a guint64 view of accums_out.
 * The two halves are added as plain 64-bit integers, so the four 16-bit
 * fields stay independent only while no field's total exceeds 16 bits;
 * that depends on CHAFA_SYMBOL_N_PIXELS, which is not visible here.
 * Presumably ChafaColorAccum is 8 bytes (four 16-bit channels) -- TODO
 * confirm. */
((guint64 *) accums_out) [0] =
(guint64) _mm_extract_epi64 (accum_fg_128, 0)
+ (guint64) _mm_extract_epi64 (accum_fg_128, 1);

accum_bg [0] = _mm256_add_epi16 (accum_bg [0], accum_bg [1]);
accum_bg_128 = _mm_add_epi16 (_mm256_extracti128_si256 (accum_bg [0], 0),
_mm256_extracti128_si256 (accum_bg [0], 1));
((guint64 *) accums_out) [1] =
(guint64) _mm_extract_epi64 (accum_bg_128, 0)
+ (guint64) _mm_extract_epi64 (accum_bg_128, 1);

/* NOTE(review): if the attribution above is right, both variants store
 * the pixels & ~mask sum in slot [0] and the pixels & mask sum in slot [1];
 * only the local fg/bg naming changed. Verify against the real diff. */
accum_fg_128 = _mm_add_epi16 (_mm256_extracti128_si256 (accum_fg, 0),
_mm256_extracti128_si256 (accum_fg, 1));
((guint64 *) accums_out) [1] =
(guint64) _mm_extract_epi64 (accum_fg_128, 0)
+ (guint64) _mm_extract_epi64 (accum_fg_128, 1);
}

0 comments on commit ac72d6f

Please sign in to comment.