diff --git a/src/blitter/32bpp_anim_sse4.cpp b/src/blitter/32bpp_anim_sse4.cpp --- a/src/blitter/32bpp_anim_sse4.cpp +++ b/src/blitter/32bpp_anim_sse4.cpp @@ -290,14 +290,7 @@ bmcr_alpha_blend_single: for (uint x = (uint) bp->width / 2; x > 0; x--) { __m128i srcABCD = _mm_loadl_epi64((const __m128i*) src); __m128i dstABCD = _mm_loadl_epi64((__m128i*) dst); - __m128i srcAB = _mm_unpacklo_epi8(srcABCD, _mm_setzero_si128()); - __m128i dstAB = _mm_unpacklo_epi8(dstABCD, _mm_setzero_si128()); - __m128i alphaAB = _mm_shuffle_epi8(srcAB, a_cm); - alphaAB = _mm_srli_epi16(alphaAB, 2); // Reduce to 64 levels of shades so the max value fits in 16 bits. - __m128i nom = _mm_sub_epi16(tr_nom_base, alphaAB); - dstAB = _mm_mullo_epi16(dstAB, nom); - dstAB = _mm_srli_epi16(dstAB, 8); - dstAB = _mm_packus_epi16(dstAB, dstAB); + DARKEN_2(); _mm_storel_epi64((__m128i *) dst, dstAB); src += 2; dst += 2; @@ -308,14 +301,7 @@ bmcr_alpha_blend_single: if (bp->width & 1) { __m128i srcABCD = _mm_cvtsi32_si128(src->data); __m128i dstABCD = _mm_cvtsi32_si128(dst->data); - __m128i srcAB = _mm_unpacklo_epi8(srcABCD, _mm_setzero_si128()); - __m128i dstAB = _mm_unpacklo_epi8(dstABCD, _mm_setzero_si128()); - __m128i alphaAB = _mm_shuffle_epi8(srcAB, a_cm); - alphaAB = _mm_srli_epi16(alphaAB, 2); - __m128i nom = _mm_sub_epi16(tr_nom_base, alphaAB); - dstAB = _mm_mullo_epi16(dstAB, nom); - dstAB = _mm_srli_epi16(dstAB, 8); - dstAB = _mm_packus_epi16(dstAB, dstAB); + DARKEN_2(); dst->data = _mm_cvtsi128_si32(dstAB); if (src[0].a) anim[0] = 0; } diff --git a/src/blitter/32bpp_sse2.cpp b/src/blitter/32bpp_sse2.cpp --- a/src/blitter/32bpp_sse2.cpp +++ b/src/blitter/32bpp_sse2.cpp @@ -144,21 +144,11 @@ bmcr_alpha_blend_single: break; } case BM_TRANSPARENT: { - /* Make the current colour a bit more black, so it looks like this image is transparent. - * rgb = rgb * ((256/4) * 4 - (alpha/4)) / ((256/4) * 4) - */ + /* Make the current colour a bit more black, so it looks like this image is transparent. */ for (uint x = (uint) bp->width / 2; x > 0; x--) { __m128i srcABCD = _mm_loadl_epi64((const __m128i*) src); __m128i dstABCD = _mm_loadl_epi64((__m128i*) dst); - __m128i srcAB = _mm_unpacklo_epi8(srcABCD, _mm_setzero_si128()); - __m128i dstAB = _mm_unpacklo_epi8(dstABCD, _mm_setzero_si128()); - __m128i alphaAB = _mm_shufflelo_epi16(srcAB, 0x3F); - alphaAB = _mm_shufflehi_epi16(alphaAB, 0x3F); - alphaAB = _mm_srli_epi16(alphaAB, 2); // Reduce to 64 levels of shades so the max value fits in 16 bits. - __m128i nom = _mm_sub_epi16(tr_nom_base, alphaAB); - dstAB = _mm_mullo_epi16(dstAB, nom); - dstAB = _mm_srli_epi16(dstAB, 8); - dstAB = _mm_packus_epi16(dstAB, dstAB); + DARKEN_2(); _mm_storel_epi64((__m128i *) dst, dstAB); src += 2; dst += 2; @@ -166,15 +156,7 @@ bmcr_alpha_blend_single: if (bp->width & 1) { __m128i srcABCD = _mm_cvtsi32_si128(src->data); __m128i dstABCD = _mm_cvtsi32_si128(dst->data); - __m128i srcAB = _mm_unpacklo_epi8(srcABCD, _mm_setzero_si128()); - __m128i dstAB = _mm_unpacklo_epi8(dstABCD, _mm_setzero_si128()); - __m128i alphaAB = _mm_shufflelo_epi16(srcAB, 0x3F); - alphaAB = _mm_shufflehi_epi16(alphaAB, 0x3F); - alphaAB = _mm_srli_epi16(alphaAB, 2); - __m128i nom = _mm_sub_epi16(tr_nom_base, alphaAB); - dstAB = _mm_mullo_epi16(dstAB, nom); - dstAB = _mm_srli_epi16(dstAB, 8); - dstAB = _mm_packus_epi16(dstAB, dstAB); + DARKEN_2(); dst->data = _mm_cvtsi128_si32(dstAB); } break; diff --git a/src/blitter/32bpp_sse2.hpp b/src/blitter/32bpp_sse2.hpp --- a/src/blitter/32bpp_sse2.hpp +++ b/src/blitter/32bpp_sse2.hpp @@ -81,6 +81,19 @@ typedef union ALIGN(16) um128i { PACK_AB_WITHOUT_SATURATION(srcAB, srcABCD); \ } +/* Darken 2 pixels. + * rgb = rgb * ((256/4) * 4 - (alpha/4)) / ((256/4) * 4) + */ +#define DARKEN_2() \ + __m128i srcAB = _mm_unpacklo_epi8(srcABCD, _mm_setzero_si128()); \ + __m128i dstAB = _mm_unpacklo_epi8(dstABCD, _mm_setzero_si128()); \ + __m128i PUT_ALPHA_IN_FRONT_OF_RGB(srcAB, alphaAB); \ + alphaAB = _mm_srli_epi16(alphaAB, 2); /* Reduce to 64 levels of shades so the max value fits in 16 bits. */ \ + __m128i nom = _mm_sub_epi16(tr_nom_base, alphaAB); \ + dstAB = _mm_mullo_epi16(dstAB, nom); \ + dstAB = _mm_srli_epi16(dstAB, 8); \ + dstAB = _mm_packus_epi16(dstAB, dstAB); + /** Base methods for 32bpp SSE blitters. */ class Blitter_32bppSSE_Base { public: diff --git a/src/blitter/32bpp_sse4.cpp b/src/blitter/32bpp_sse4.cpp --- a/src/blitter/32bpp_sse4.cpp +++ b/src/blitter/32bpp_sse4.cpp @@ -188,20 +188,11 @@ bmcr_alpha_blend_single: } case BM_TRANSPARENT: { - /* Make the current colour a bit more black, so it looks like this image is transparent. - * rgb = rgb * ((256/4) * 4 - (alpha/4)) / ((256/4) * 4) - */ + /* Make the current colour a bit more black, so it looks like this image is transparent. */ for (uint x = (uint) bp->width / 2; x > 0; x--) { __m128i srcABCD = _mm_loadl_epi64((const __m128i*) src); __m128i dstABCD = _mm_loadl_epi64((__m128i*) dst); - __m128i srcAB = _mm_unpacklo_epi8(srcABCD, _mm_setzero_si128()); - __m128i dstAB = _mm_unpacklo_epi8(dstABCD, _mm_setzero_si128()); - __m128i alphaAB = _mm_shuffle_epi8(srcAB, a_cm); - alphaAB = _mm_srli_epi16(alphaAB, 2); // Reduce to 64 levels of shades so the max value fits in 16 bits. - __m128i nom = _mm_sub_epi16(tr_nom_base, alphaAB); - dstAB = _mm_mullo_epi16(dstAB, nom); - dstAB = _mm_srli_epi16(dstAB, 8); - dstAB = _mm_packus_epi16(dstAB, dstAB); + DARKEN_2(); _mm_storel_epi64((__m128i *) dst, dstAB); src += 2; dst += 2; @@ -209,14 +200,7 @@ bmcr_alpha_blend_single: if (bp->width & 1) { __m128i srcABCD = _mm_cvtsi32_si128(src->data); __m128i dstABCD = _mm_cvtsi32_si128(dst->data); - __m128i srcAB = _mm_unpacklo_epi8(srcABCD, _mm_setzero_si128()); - __m128i dstAB = _mm_unpacklo_epi8(dstABCD, _mm_setzero_si128()); - __m128i alphaAB = _mm_shuffle_epi8(srcAB, a_cm); - alphaAB = _mm_srli_epi16(alphaAB, 2); - __m128i nom = _mm_sub_epi16(tr_nom_base, alphaAB); - dstAB = _mm_mullo_epi16(dstAB, nom); - dstAB = _mm_srli_epi16(dstAB, 8); - dstAB = _mm_packus_epi16(dstAB, dstAB); + DARKEN_2(); dst->data = _mm_cvtsi128_si32(dstAB); } diff --git a/src/blitter/32bpp_ssse3.cpp b/src/blitter/32bpp_ssse3.cpp --- a/src/blitter/32bpp_ssse3.cpp +++ b/src/blitter/32bpp_ssse3.cpp @@ -188,20 +188,11 @@ bmcr_alpha_blend_single: } case BM_TRANSPARENT: { - /* Make the current colour a bit more black, so it looks like this image is transparent. - * rgb = rgb * ((256/4) * 4 - (alpha/4)) / ((256/4) * 4) - */ + /* Make the current colour a bit more black, so it looks like this image is transparent. */ for (uint x = (uint) bp->width / 2; x > 0; x--) { __m128i srcABCD = _mm_loadl_epi64((const __m128i*) src); __m128i dstABCD = _mm_loadl_epi64((__m128i*) dst); - __m128i srcAB = _mm_unpacklo_epi8(srcABCD, _mm_setzero_si128()); - __m128i dstAB = _mm_unpacklo_epi8(dstABCD, _mm_setzero_si128()); - __m128i alphaAB = _mm_shuffle_epi8(srcAB, a_cm); - alphaAB = _mm_srli_epi16(alphaAB, 2); // Reduce to 64 levels of shades so the max value fits in 16 bits. - __m128i nom = _mm_sub_epi16(tr_nom_base, alphaAB); - dstAB = _mm_mullo_epi16(dstAB, nom); - dstAB = _mm_srli_epi16(dstAB, 8); - dstAB = _mm_packus_epi16(dstAB, dstAB); + DARKEN_2(); _mm_storel_epi64((__m128i *) dst, dstAB); src += 2; dst += 2; @@ -209,14 +200,7 @@ bmcr_alpha_blend_single: if (bp->width & 1) { __m128i srcABCD = _mm_cvtsi32_si128(src->data); __m128i dstABCD = _mm_cvtsi32_si128(dst->data); - __m128i srcAB = _mm_unpacklo_epi8(srcABCD, _mm_setzero_si128()); - __m128i dstAB = _mm_unpacklo_epi8(dstABCD, _mm_setzero_si128()); - __m128i alphaAB = _mm_shuffle_epi8(srcAB, a_cm); - alphaAB = _mm_srli_epi16(alphaAB, 2); - __m128i nom = _mm_sub_epi16(tr_nom_base, alphaAB); - dstAB = _mm_mullo_epi16(dstAB, nom); - dstAB = _mm_srli_epi16(dstAB, 8); - dstAB = _mm_packus_epi16(dstAB, dstAB); + DARKEN_2(); dst->data = _mm_cvtsi128_si32(dstAB); } break;