Changeset - r21161:e80e20ffc783
[Not reviewed]
master
0 6 0
rubidium - 11 years ago 2014-01-13 17:54:24
rubidium@openttd.org
(svn r26247) -Fix [FS#5854, FS#5855]: Possible out-of-bounds reads with the SSE blitters (MJP)
6 files changed with 164 insertions and 208 deletions:
0 comments (0 inline, 0 general)
src/blitter/32bpp_anim_sse4.cpp
Show inline comments
 
@@ -80,10 +80,11 @@ inline void Blitter_32bppSSE4_Anim::Draw
 
					}
 

	
 
					case RM_WITH_SKIP: {
 
						uint32 mvX2 = *((uint32 *) const_cast<MapValue *>(src_mv));
 
						__m128i srcABCD = _mm_loadu_si128((const __m128i*) src);
 
						__m128i dstABCD = _mm_loadu_si128((__m128i*) dst);
 
						for (uint x = (uint) effective_width/2; x != 0; x--) {
 
							uint32 mvX2 = *((uint32 *) const_cast<MapValue *>(src_mv));
 
							__m128i srcABCD = _mm_loadl_epi64((const __m128i*) src);
 
							__m128i dstABCD = _mm_loadl_epi64((__m128i*) dst);
 

	
 
							/* Remap colours. */
 
							const byte m0 = mvX2;
 
							if (m0 >= PALETTE_ANIM_START) {
 
@@ -125,42 +126,32 @@ inline void Blitter_32bppSSE4_Anim::Draw
 
bmno_alpha_blend:
 
							ALPHA_BLEND_2(pack_low_cm);
 
bmno_full_opacity:
 
							srcABCD = _mm_blend_epi16(srcABCD, dstABCD, 0xF0);
 

	
 
							_mm_storel_epi64((__m128i *) dst, srcABCD);
 
bmno_full_transparency:
 
							src_mv += 2;
 
							mvX2 = *((uint32 *) const_cast<MapValue *>(src_mv));
 
							src += 2;
 
							anim += 2;
 
							dstABCD = _mm_loadu_si128((__m128i*) (dst+2));
 
							_mm_storeu_si128((__m128i *) dst, srcABCD);
 
							srcABCD = _mm_loadu_si128((const __m128i*) src);
 
							dst += 2;
 
							continue;
 

	
 
bmno_full_transparency:
 
							src_mv += 2;
 
							mvX2 = *((uint32 *) const_cast<MapValue *>(src_mv));
 
							dst += 2;
 
							src += 2;
 
							anim += 2;
 
							dstABCD = _mm_loadu_si128((__m128i*) dst);
 
							srcABCD = _mm_loadu_si128((const __m128i*) src);
 
						}
 

	
 
						if (bt_last == BT_ODD) {
 
							if (src->a == 0) {
 
							} else if (src->a == 255) {
 
								*anim = (uint16) mvX2;
 
								*dst = ((byte) mvX2 >= PALETTE_ANIM_START) ? AdjustBrightness(LookupColourInPalette((byte) mvX2), (byte) (mvX2 >> 8)) : *src;
 
								*anim = *(const uint16*) src_mv;
 
								*dst = (src_mv->m >= PALETTE_ANIM_START) ? AdjustBrightness(LookupColourInPalette(src_mv->m), src_mv->v) : *src;
 
							} else {
 
								*anim = 0;
 
								if ((byte) mvX2 >= PALETTE_ANIM_START) {
 
									ALIGN(16) Colour colour = AdjustBrightness(LookupColourInPalette((byte) mvX2), (byte) (mvX2 >> 8));
 
								__m128i srcABCD;
 
								__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
 
								if (src_mv->m >= PALETTE_ANIM_START) {
 
									Colour colour = AdjustBrightness(LookupColourInPalette(src_mv->m), src_mv->v);
 
									colour.a = src->a;
 
									srcABCD = _mm_load_si128((__m128i*) &colour);
 
									srcABCD = _mm_cvtsi32_si128(colour.data);
 
								} else {
 
									srcABCD = _mm_cvtsi32_si128(src->data);
 
								}
 
								ALPHA_BLEND_2(pack_low_cm);
 
								(*dst).data = EXTR32(srcABCD, 0);
 
								dst->data = _mm_cvtsi128_si32(srcABCD);
 
							}
 
						}
 
						break;
 
@@ -181,18 +172,18 @@ bmno_full_transparency:
 
						const int width_diff = si->sprite_width - bp->width;
 
						effective_width = bp->width - (int) src_rgba_line[0].data;
 
						const int delta_diff = (int) src_rgba_line[1].data - width_diff;
 
						const int nd = effective_width - delta_diff;
 
						effective_width = delta_diff > 0 ? nd : effective_width;
 
						const int new_width = effective_width - delta_diff;
 
						effective_width = delta_diff > 0 ? new_width : effective_width;
 
						if (effective_width <= 0) break;
 
						/* FALLTHROUGH */
 
					}
 

	
 
					case RM_WITH_SKIP: {
 
						uint32 mvX2 = *((uint32 *) const_cast<MapValue *>(src_mv));
 
						__m128i srcABCD = _mm_loadu_si128((const __m128i*) src);
 
						__m128i dstABCD = _mm_loadu_si128((__m128i*) dst);
 
						for (uint x = (uint) effective_width / 2; x != 0; x--) {
 
							uint32 mvX2 = *((uint32 *) const_cast<MapValue *>(src_mv));
 
							__m128i srcABCD = _mm_loadl_epi64((const __m128i*) src);
 
							__m128i dstABCD = _mm_loadl_epi64((__m128i*) dst);
 

	
 
						for (uint x = (uint) effective_width / 2; x != 0; x--) {
 
							/* Remap colours. */
 
							const uint m0 = (byte) mvX2;
 
							const uint r0 = remap[m0];
 
@@ -250,53 +241,40 @@ bmno_full_transparency:
 
bmcr_alpha_blend:
 
							ALPHA_BLEND_2(pack_low_cm);
 
bmcr_full_opacity:
 
							srcABCD = _mm_blend_epi16(srcABCD, dstABCD, 0xF0);
 

	
 
							src += 2;
 
							src_mv += 2;
 
							anim += 2;
 
							mvX2 = *((uint32 *) const_cast<MapValue *>(src_mv));
 
							dstABCD = _mm_loadu_si128((__m128i*) (dst+2));
 
							_mm_storeu_si128((__m128i *) dst, srcABCD);
 
							srcABCD = _mm_loadu_si128((const __m128i*) src);
 
							dst += 2;
 
							continue;
 

	
 
							_mm_storel_epi64((__m128i *) dst, srcABCD);
 
bmcr_full_transparency:
 
							src_mv += 2;
 
							mvX2 = *((uint32 *) const_cast<MapValue *>(src_mv));
 
							dst += 2;
 
							src += 2;
 
							anim += 2;
 
							dstABCD = _mm_loadu_si128((__m128i*) dst);
 
							srcABCD = _mm_loadu_si128((const __m128i*) src);
 
						}
 

	
 
						if (effective_width & 1) {
 
							/* In case the m-channel is zero, do not remap this pixel in any way. */
 
							if (src->a == 0) {
 
							} else if ((byte) mvX2 != 0) {
 
								const uint r = remap[(byte) mvX2];
 
								*anim = (src->a == 255) ? (r | ((uint16) mvX2 & 0xFF00)) : 0;
 
							__m128i srcABCD;
 
							if (src->a == 0) break;
 
							if (src_mv->m) {
 
								const uint r = remap[src_mv->m];
 
								*anim = (src->a == 255) ? r | ((uint16) src_mv->v << 8 ) : 0;
 
								if (r != 0) {
 
									Colour remapped_colour = AdjustBrightness(LookupColourInPalette(r), (byte) (mvX2 >> 8));
 
									Colour remapped_colour = AdjustBrightness(this->LookupColourInPalette(r), src_mv->v);
 
									if (src->a == 255) {
 
										*dst = remapped_colour;
 
									} else {
 
										remapped_colour.a = src->a;
 
										INSR32(remapped_colour.data, srcABCD, 0);
 
										srcABCD = _mm_cvtsi32_si128(remapped_colour.data);
 
										goto bmcr_alpha_blend_single;
 
									}
 
								}
 
							} else {
 
								*anim = 0;
 
								if (src->a == 255) {
 
									*dst = *src;
 
								} else {
 
								srcABCD = _mm_cvtsi32_si128(src->data);
 
								if (src->a < 255) {
 
bmcr_alpha_blend_single:
 
									__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
 
									ALPHA_BLEND_2(pack_low_cm);
 
									(*dst).data = EXTR32(srcABCD, 0);
 
								}
 
								dst->data = _mm_cvtsi128_si32(srcABCD);
 
							}
 
						}
 
						break;
 
@@ -309,29 +287,27 @@ bmcr_alpha_blend_single:
 

	
 
			case BM_TRANSPARENT: {
 
				/* Make the current colour a bit more black, so it looks like this image is transparent. */
 
				__m128i srcABCD = _mm_loadu_si128((const __m128i*) src);
 
				__m128i dstABCD = _mm_loadu_si128((__m128i*) dst);
 
				for (uint x = (uint) bp->width / 2; x > 0; x--) {
 
					__m128i srcABCD = _mm_loadl_epi64((const __m128i*) src);
 
					__m128i dstABCD = _mm_loadl_epi64((__m128i*) dst);
 
					__m128i srcAB = _mm_unpacklo_epi8(srcABCD, _mm_setzero_si128());
 
					__m128i dstAB = _mm_unpacklo_epi8(dstABCD, _mm_setzero_si128());
 
					__m128i dstCD = _mm_unpackhi_epi8(dstABCD, _mm_setzero_si128());
 
					__m128i alphaAB = _mm_shuffle_epi8(srcAB, a_cm);
 
					alphaAB = _mm_srli_epi16(alphaAB, 2); // Reduce to 64 levels of shades so the max value fits in 16 bits.
 
					__m128i nom = _mm_sub_epi16(tr_nom_base, alphaAB);
 
					dstAB = _mm_mullo_epi16(dstAB, nom);
 
					dstAB = _mm_srli_epi16(dstAB, 8);
 
					dstAB = _mm_packus_epi16(dstAB, dstCD);
 
					Colour *old_dst = dst;
 
					dstAB = _mm_packus_epi16(dstAB, dstAB);
 
					_mm_storel_epi64((__m128i *) dst, dstAB);
 
					src += 2;
 
					dst += 2;
 
					anim += 2;
 
					dstABCD = _mm_loadu_si128((__m128i*) dst);
 
					_mm_storeu_si128((__m128i *) old_dst, dstAB);
 
					srcABCD = _mm_loadu_si128((const __m128i*) src);
 
					if (src[-2].a) anim[-2] = 0;
 
					if (src[-1].a) anim[-1] = 0;
 
				}
 
				if (bp->width & 1) {
 
					__m128i srcABCD = _mm_cvtsi32_si128(src->data);
 
					__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
 
					__m128i srcAB = _mm_unpacklo_epi8(srcABCD, _mm_setzero_si128());
 
					__m128i dstAB = _mm_unpacklo_epi8(dstABCD, _mm_setzero_si128());
 
					__m128i alphaAB = _mm_shuffle_epi8(srcAB, a_cm);
 
@@ -340,7 +316,7 @@ bmcr_alpha_blend_single:
 
					dstAB = _mm_mullo_epi16(dstAB, nom);
 
					dstAB = _mm_srli_epi16(dstAB, 8);
 
					dstAB = _mm_packus_epi16(dstAB, dstAB);
 
					(*dst).data = EXTR32(dstAB, 0);
 
					dst->data = _mm_cvtsi128_si32(dstAB);
 
					if (src[0].a) anim[0] = 0;
 
				}
 
				break;
src/blitter/32bpp_sse2.cpp
Show inline comments
 
@@ -70,18 +70,18 @@ inline void Blitter_32bppSSE2::Draw(cons
 

	
 
					case RM_WITH_SKIP: {
 
						for (uint x = (uint) effective_width / 2; x > 0; x--) {
 
							__m128i srcABCD = _mm_loadu_si128((const __m128i*) src);
 
							__m128i dstABCD = _mm_loadu_si128((__m128i*) dst);
 
							__m128i srcABCD = _mm_loadl_epi64((const __m128i*) src);
 
							__m128i dstABCD = _mm_loadl_epi64((__m128i*) dst);
 
							ALPHA_BLEND_2();
 
							*(uint64*) dst = EXTR64(srcABCD, 0);
 
							_mm_storel_epi64((__m128i*) dst, srcABCD);
 
							src += 2;
 
							dst += 2;
 
						}
 
						if (bt_last == BT_ODD) {
 
							__m128i srcABCD = _mm_loadu_si128((const __m128i*) src);
 
							__m128i dstABCD = _mm_loadu_si128((__m128i*) dst);
 
							__m128i srcABCD = _mm_cvtsi32_si128(src->data);
 
							__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
 
							ALPHA_BLEND_2();
 
							(*dst).data = EXTR32(srcABCD, 0);
 
							dst->data = _mm_cvtsi128_si32(srcABCD);
 
						}
 
						break;
 
					}
 
@@ -99,8 +99,8 @@ inline void Blitter_32bppSSE2::Draw(cons
 
						const int width_diff = si->sprite_width - bp->width;
 
						effective_width = bp->width - (int) src_rgba_line[0].data;
 
						const int delta_diff = (int) src_rgba_line[1].data - width_diff;
 
						const int nd = effective_width - delta_diff;
 
						effective_width = delta_diff > 0 ? nd : effective_width;
 
						const int new_width = effective_width - delta_diff;
 
						effective_width = delta_diff > 0 ? new_width : effective_width;
 
						if (effective_width <= 0) break;
 
						/* FALLTHROUGH */
 
					}
 
@@ -108,30 +108,28 @@ inline void Blitter_32bppSSE2::Draw(cons
 
					case RM_WITH_SKIP: {
 
						const byte *remap = bp->remap;
 
						for (uint x = (uint) effective_width; x != 0; x--) {
 
							/* In case the m-channel is zero, do not remap this pixel in any way */
 
							if (src_mv->m == 0) {
 
								if (src->a < 255) {
 
									__m128i srcABCD = _mm_loadu_si128((const __m128i*) src);
 
									__m128i dstABCD = _mm_loadu_si128((__m128i*) dst);
 
									ALPHA_BLEND_2();
 
									(*dst).data = EXTR32(srcABCD, 0);
 
								} else {
 
									*dst = src->data;
 
								}
 
							} else {
 
							/* In case the m-channel is zero, do not remap this pixel in any way. */
 
							__m128i srcABCD;
 
							if (src_mv->m) {
 
								const uint r = remap[src_mv->m];
 
								if (r != 0) {
 
									Colour remapped_colour = AdjustBrightness(this->LookupColourInPalette(r), src_mv->v);
 
									if (src->a < 255) {
 
										__m128i srcABCD;
 
										__m128i dstABCD = _mm_loadu_si128((__m128i*) dst);
 
									if (src->a == 255) {
 
										*dst = remapped_colour;
 
									} else {
 
										remapped_colour.a = src->a;
 
										INSR32(remapped_colour.data, srcABCD, 0);
 
										ALPHA_BLEND_2();
 
										(*dst).data = EXTR32(srcABCD, 0);
 
									} else
 
										*dst = remapped_colour;
 
										srcABCD = _mm_cvtsi32_si128(remapped_colour.data);
 
										goto bmcr_alpha_blend_single;
 
									}
 
								}
 
							} else {
 
								srcABCD = _mm_cvtsi32_si128(src->data);
 
								if (src->a < 255) {
 
bmcr_alpha_blend_single:
 
									__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
 
									ALPHA_BLEND_2();
 
								}
 
								dst->data = _mm_cvtsi128_si32(srcABCD);
 
							}
 
							src_mv++;
 
							dst++;
 
@@ -149,27 +147,25 @@ inline void Blitter_32bppSSE2::Draw(cons
 
				/* Make the current colour a bit more black, so it looks like this image is transparent.
 
				 * rgb = rgb * ((256/4) * 4 - (alpha/4)) / ((256/4) * 4)
 
				 */
 
				__m128i srcABCD = _mm_loadu_si128((const __m128i*) src);
 
				__m128i dstABCD = _mm_loadu_si128((__m128i*) dst);
 
				for (uint x = (uint) bp->width / 2; x > 0; x--) {
 
					__m128i srcABCD = _mm_loadl_epi64((const __m128i*) src);
 
					__m128i dstABCD = _mm_loadl_epi64((__m128i*) dst);
 
					__m128i srcAB = _mm_unpacklo_epi8(srcABCD, _mm_setzero_si128());
 
					__m128i dstAB = _mm_unpacklo_epi8(dstABCD, _mm_setzero_si128());
 
					__m128i dstCD = _mm_unpackhi_epi8(dstABCD, _mm_setzero_si128());
 
					__m128i alphaAB = _mm_shufflelo_epi16(srcAB, 0x3F);
 
					alphaAB = _mm_shufflehi_epi16(alphaAB, 0x3F);
 
					alphaAB = _mm_srli_epi16(alphaAB, 2); // Reduce to 64 levels of shades so the max value fits in 16 bits.
 
					__m128i nom = _mm_sub_epi16(tr_nom_base, alphaAB);
 
					dstAB = _mm_mullo_epi16(dstAB, nom);
 
					dstAB = _mm_srli_epi16(dstAB, 8);
 
					dstAB = _mm_packus_epi16(dstAB, dstCD);
 
					Colour *old_dst = dst;
 
					dstAB = _mm_packus_epi16(dstAB, dstAB);
 
					_mm_storel_epi64((__m128i *) dst, dstAB);
 
					src += 2;
 
					dst += 2;
 
					dstABCD = _mm_loadu_si128((__m128i*) dst);
 
					_mm_storeu_si128((__m128i *) old_dst, dstAB);
 
					srcABCD = _mm_loadu_si128((const __m128i*) src);
 
				}
 
				if (bp->width & 1) {
 
					__m128i srcABCD = _mm_cvtsi32_si128(src->data);
 
					__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
 
					__m128i srcAB = _mm_unpacklo_epi8(srcABCD, _mm_setzero_si128());
 
					__m128i dstAB = _mm_unpacklo_epi8(dstABCD, _mm_setzero_si128());
 
					__m128i alphaAB = _mm_shufflelo_epi16(srcAB, 0x3F);
 
@@ -179,7 +175,7 @@ inline void Blitter_32bppSSE2::Draw(cons
 
					dstAB = _mm_mullo_epi16(dstAB, nom);
 
					dstAB = _mm_srli_epi16(dstAB, 8);
 
					dstAB = _mm_packus_epi16(dstAB, dstAB);
 
					(*dst).data = EXTR32(dstAB, 0);
 
					dst->data = _mm_cvtsi128_si32(dstAB);
 
				}
 
				break;
 
			}
 
@@ -345,7 +341,7 @@ inline Colour Blitter_32bppSSE2::AdjustB
 
IGNORE_UNINITIALIZED_WARNING_START
 
/* static */ Colour Blitter_32bppSSE2::ReallyAdjustBrightness(Colour colour, uint8 brightness)
 
{
 
	ALIGN(16) uint64 c16 = colour.b | (uint64) colour.g << 16 | (uint64) colour.r << 32;
 
	uint64 c16 = colour.b | (uint64) colour.g << 16 | (uint64) colour.r << 32;
 
	c16 *= brightness;
 
	uint64 c16_ob = c16; // Helps out of order execution.
 
	c16 /= DEFAULT_BRIGHTNESS;
 
@@ -357,12 +353,20 @@ IGNORE_UNINITIALIZED_WARNING_START
 

	
 
	const uint32 alpha32 = colour.data & 0xFF000000;
 
	__m128i ret;
 
#ifdef _SQ64
 
	ret = _mm_cvtsi64_si128(c16);
 
#else
 
	INSR64(c16, ret, 0);
 
#endif
 
	if (ob != 0) {
 
		/* Reduce overbright strength. */
 
		ob /= 2;
 
		__m128i ob128;
 
#ifdef _SQ64
 
		ob128 = _mm_cvtsi64_si128(ob | ob << 16 | ob << 32);
 
#else
 
		INSR64(ob | ob << 16 | ob << 32, ob128, 0);
 
#endif
 
		__m128i white = OVERBRIGHT_VALUE_MASK;
 
		__m128i c128 = ret;
 
		ret = _mm_subs_epu16(white, c128); /* PSUBUSW,   (255 - rgb) */
 
@@ -372,7 +376,7 @@ IGNORE_UNINITIALIZED_WARNING_START
 
	}
 

	
 
	ret = _mm_packus_epi16(ret, ret);      /* PACKUSWB, saturate and pack. */
 
	return alpha32 | EXTR32(ret, 0);
 
	return alpha32 | _mm_cvtsi128_si32(ret);
 
}
 
IGNORE_UNINITIALIZED_WARNING_STOP
 

	
src/blitter/32bpp_sse4.cpp
Show inline comments
 
@@ -76,24 +76,19 @@ inline void Blitter_32bppSSE4::Draw(cons
 
					}
 

	
 
					case RM_WITH_SKIP: {
 
						__m128i srcABCD = _mm_loadu_si128((const __m128i*) src);
 
						__m128i dstABCD = _mm_loadu_si128((__m128i*) dst);
 
						for (uint x = (uint) effective_width / 2; x > 0; x--) {
 
							__m128i srcABCD = _mm_loadl_epi64((const __m128i*) src);
 
							__m128i dstABCD = _mm_loadl_epi64((__m128i*) dst);
 
							ALPHA_BLEND_2(pack_low_cm);
 
							srcABCD = _mm_blend_epi16(srcABCD, dstABCD, 0xF0);
 
							Colour *old_dst = dst;
 
							_mm_storel_epi64((__m128i*) dst, srcABCD);
 
							src += 2;
 
							dst += 2;
 
							/* It is VERY important to read next data before it gets invalidated in cpu cache.
 
							 * And PEXTR latency is a real problem here.
 
							 */
 
							dstABCD = _mm_loadu_si128((__m128i*) dst);
 
							_mm_storeu_si128((__m128i *) old_dst, srcABCD);
 
							srcABCD = _mm_loadu_si128((const __m128i*) src);
 
						}
 
						if (bt_last == BT_ODD) {
 
							__m128i srcABCD = _mm_cvtsi32_si128(src->data);
 
							__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
 
							ALPHA_BLEND_2(pack_low_cm);
 
							*dst = (Colour) EXTR32(srcABCD, 0);
 
							dst->data = _mm_cvtsi128_si32(srcABCD);
 
						}
 
						break;
 
					}
 
@@ -112,18 +107,18 @@ inline void Blitter_32bppSSE4::Draw(cons
 
						const int width_diff = si->sprite_width - bp->width;
 
						effective_width = bp->width - (int) src_rgba_line[0].data;
 
						const int delta_diff = (int) src_rgba_line[1].data - width_diff;
 
						const int nd = effective_width - delta_diff;
 
						effective_width = delta_diff > 0 ? nd : effective_width;
 
						const int new_width = effective_width - delta_diff;
 
						effective_width = delta_diff > 0 ? new_width : effective_width;
 
						if (effective_width <= 0) break;
 
						/* FALLTHROUGH */
 
					}
 

	
 
					case RM_WITH_SKIP: {
 
						__m128i srcABCD = _mm_loadu_si128((const __m128i*) src);
 
						__m128i dstABCD = _mm_loadu_si128((__m128i*) dst);
 
						uint32 mvX2 = *((uint32 *) const_cast<MapValue *>(src_mv));
 
						for (uint x = (uint) effective_width / 2; x > 0; x--) {
 
							__m128i srcABCD = _mm_loadl_epi64((const __m128i*) src);
 
							__m128i dstABCD = _mm_loadl_epi64((__m128i*) dst);
 
							uint32 mvX2 = *((uint32 *) const_cast<MapValue *>(src_mv));
 

	
 
						for (uint x = (uint) effective_width / 2; x > 0; x--) {
 
							/* Remap colours. */
 
							if (mvX2 & 0x00FF00FF) {
 
								/* Written so the compiler uses CMOV. */
 
@@ -152,38 +147,35 @@ inline void Blitter_32bppSSE4::Draw(cons
 

	
 
							/* Blend colours. */
 
							ALPHA_BLEND_2(pack_low_cm);
 
							srcABCD = _mm_blend_epi16(srcABCD, dstABCD, 0xF0);
 
							Colour *old_dst = dst;
 
							_mm_storel_epi64((__m128i *) dst, srcABCD);
 
							dst += 2;
 
							src += 2;
 
							src_mv += 2;
 
							dstABCD = _mm_loadu_si128((__m128i*) dst);
 
							_mm_storeu_si128((__m128i *) old_dst, srcABCD);
 
							mvX2 = *((uint32 *) const_cast<MapValue *>(src_mv));
 
							srcABCD = _mm_loadu_si128((const __m128i*) src);
 
						}
 

	
 
						if (effective_width & 1) {
 
							/* In case the m-channel is zero, do not remap this pixel in any way. */
 
							if ((byte) mvX2 == 0) {
 
								if (src->a < 255) {
 
									ALPHA_BLEND_2(pack_low_cm);
 
									(*dst).data = EXTR32(srcABCD, 0);
 
								} else
 
									*dst = *src;
 
							} else {
 
								const uint r = remap[(byte) mvX2];
 
							__m128i srcABCD;
 
							if (src_mv->m) {
 
								const uint r = remap[src_mv->m];
 
								if (r != 0) {
 
									Colour remapped_colour = AdjustBrightness(this->LookupColourInPalette(r), (byte) (mvX2 >> 8));
 
									Colour remapped_colour = AdjustBrightness(this->LookupColourInPalette(r), src_mv->v);
 
									if (src->a == 255) {
 
										*dst = remapped_colour;
 
									} else {
 
										remapped_colour.a = src->a;
 
										INSR32(remapped_colour.data, srcABCD, 0);
 
										ALPHA_BLEND_2(pack_low_cm);
 
										(*dst).data = EXTR32(srcABCD, 0);
 
										srcABCD = _mm_cvtsi32_si128(remapped_colour.data);
 
										goto bmcr_alpha_blend_single;
 
									}
 
								}
 
							} else {
 
								srcABCD = _mm_cvtsi32_si128(src->data);
 
								if (src->a < 255) {
 
bmcr_alpha_blend_single:
 
									__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
 
									ALPHA_BLEND_2(pack_low_cm);
 
								}
 
								dst->data = _mm_cvtsi128_si32(srcABCD);
 
							}
 
						}
 
						break;
 
@@ -199,26 +191,24 @@ inline void Blitter_32bppSSE4::Draw(cons
 
				/* Make the current colour a bit more black, so it looks like this image is transparent.
 
				 * rgb = rgb * ((256/4) * 4 - (alpha/4)) / ((256/4) * 4)
 
				 */
 
				__m128i srcABCD = _mm_loadu_si128((const __m128i*) src);
 
				__m128i dstABCD = _mm_loadu_si128((__m128i*) dst);
 
				for (uint x = (uint) bp->width / 2; x > 0; x--) {
 
					__m128i srcABCD = _mm_loadl_epi64((const __m128i*) src);
 
					__m128i dstABCD = _mm_loadl_epi64((__m128i*) dst);
 
					__m128i srcAB = _mm_unpacklo_epi8(srcABCD, _mm_setzero_si128());
 
					__m128i dstAB = _mm_unpacklo_epi8(dstABCD, _mm_setzero_si128());
 
					__m128i dstCD = _mm_unpackhi_epi8(dstABCD, _mm_setzero_si128());
 
					__m128i alphaAB = _mm_shuffle_epi8(srcAB, a_cm);
 
					alphaAB = _mm_srli_epi16(alphaAB, 2); // Reduce to 64 levels of shades so the max value fits in 16 bits.
 
					__m128i nom = _mm_sub_epi16(tr_nom_base, alphaAB);
 
					dstAB = _mm_mullo_epi16(dstAB, nom);
 
					dstAB = _mm_srli_epi16(dstAB, 8);
 
					dstAB = _mm_packus_epi16(dstAB, dstCD);
 
					Colour *old_dst = dst;
 
					dstAB = _mm_packus_epi16(dstAB, dstAB);
 
					_mm_storel_epi64((__m128i *) dst, dstAB);
 
					src += 2;
 
					dst += 2;
 
					dstABCD = _mm_loadu_si128((__m128i*) dst);
 
					_mm_storeu_si128((__m128i *) old_dst, dstAB);
 
					srcABCD = _mm_loadu_si128((const __m128i*) src);
 
				}
 
				if (bp->width & 1) {
 
					__m128i srcABCD = _mm_cvtsi32_si128(src->data);
 
					__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
 
					__m128i srcAB = _mm_unpacklo_epi8(srcABCD, _mm_setzero_si128());
 
					__m128i dstAB = _mm_unpacklo_epi8(dstABCD, _mm_setzero_si128());
 
					__m128i alphaAB = _mm_shuffle_epi8(srcAB, a_cm);
 
@@ -227,7 +217,7 @@ inline void Blitter_32bppSSE4::Draw(cons
 
					dstAB = _mm_mullo_epi16(dstAB, nom);
 
					dstAB = _mm_srli_epi16(dstAB, 8);
 
					dstAB = _mm_packus_epi16(dstAB, dstAB);
 
					(*dst).data = EXTR32(dstAB, 0);
 
					dst->data = _mm_cvtsi128_si32(dstAB);
 
				}
 

	
 
				break;
 
@@ -290,7 +280,7 @@ inline Colour Blitter_32bppSSE4::AdjustB
 
IGNORE_UNINITIALIZED_WARNING_START
 
/* static */ Colour Blitter_32bppSSE4::ReallyAdjustBrightness(Colour colour, uint8 brightness)
 
{
 
	ALIGN(16) uint64 c16 = colour.b | (uint64) colour.g << 16 | (uint64) colour.r << 32;
 
	uint64 c16 = colour.b | (uint64) colour.g << 16 | (uint64) colour.r << 32;
 
	c16 *= brightness;
 
	uint64 c16_ob = c16; // Helps out of order execution.
 
	c16 /= DEFAULT_BRIGHTNESS;
 
@@ -317,7 +307,7 @@ IGNORE_UNINITIALIZED_WARNING_START
 
	}
 

	
 
	ret = _mm_packus_epi16(ret, ret);      /* PACKUSWB, saturate and pack. */
 
	return alpha32 | EXTR32(ret, 0);
 
	return alpha32 | _mm_cvtsi128_si32(ret);
 
}
 
IGNORE_UNINITIALIZED_WARNING_STOP
 

	
src/blitter/32bpp_ssse3.cpp
Show inline comments
 
@@ -50,7 +50,7 @@ inline void Blitter_32bppSSSE3::Draw(con
 

	
 
	/* Load these variables into register before loop. */
 
	const __m128i a_cm        = ALPHA_CONTROL_MASK;
 
	const __m128i pack_hi_cm  = PACK_HIGH_CONTROL_MASK;
 
	const __m128i pack_low_cm = PACK_LOW_CONTROL_MASK;
 
	const __m128i briAB_cm    = BRIGHTNESS_LOW_CONTROL_MASK;
 
	const __m128i div_cleaner = BRIGHTNESS_DIV_CLEANER;
 
	const __m128i ob_check    = OVERBRIGHT_PRESENCE_MASK;
 
@@ -79,27 +79,19 @@ inline void Blitter_32bppSSSE3::Draw(con
 
					}
 

	
 
					case RM_WITH_SKIP: {
 
						__m128i srcABCD = _mm_loadu_si128((const __m128i*) src);
 
						__m128i dstABCD = _mm_loadu_si128((__m128i*) dst);
 
						for (uint x = (uint) effective_width / 2; x > 0; x--) {
 
							ALPHA_BLEND_2(pack_hi_cm);
 
							/* With high repack, srcABCD have its 2 blended pixels like: [S0 S1 S2 S3] -> [-- -- BS0 BS1]
 
							 * dstABCD shuffled: [D0 D1 D2 D3] -> [D2 D3 D0 D0]
 
							 * PALIGNR takes what's in (): [-- -- (BS0 BS1] [D2 D3) D0 D0]
 
							 */
 
							dstABCD = _mm_shuffle_epi32(dstABCD, 0x0E);
 
							srcABCD = _mm_alignr_epi8(dstABCD, srcABCD, 8);
 
							Colour *old_dst = dst;
 
							__m128i srcABCD = _mm_loadl_epi64((const __m128i*) src);
 
							__m128i dstABCD = _mm_loadl_epi64((__m128i*) dst);
 
							ALPHA_BLEND_2(pack_low_cm);
 
							_mm_storel_epi64((__m128i*) dst, srcABCD);
 
							src += 2;
 
							dst += 2;
 
							/* It is VERY important to read next data before it gets invalidated in cpu cache. */
 
							dstABCD = _mm_loadu_si128((__m128i*) dst);
 
							_mm_storeu_si128((__m128i *) old_dst, srcABCD);
 
							srcABCD = _mm_loadu_si128((const __m128i*) src);
 
						}
 
						if (bt_last == BT_ODD) {
 
							ALPHA_BLEND_2(pack_hi_cm);
 
							(*dst).data = EXTR32(srcABCD, 2);
 
							__m128i srcABCD = _mm_cvtsi32_si128(src->data);
 
							__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
 
							ALPHA_BLEND_2(pack_low_cm);
 
							dst->data = _mm_cvtsi128_si32(srcABCD);
 
						}
 
						break;
 
					}
 
@@ -117,18 +109,18 @@ inline void Blitter_32bppSSSE3::Draw(con
 
						const int width_diff = si->sprite_width - bp->width;
 
						effective_width = bp->width - (int) src_rgba_line[0].data;
 
						const int delta_diff = (int) src_rgba_line[1].data - width_diff;
 
						const int nd = effective_width - delta_diff;
 
						effective_width = delta_diff > 0 ? nd : effective_width;
 
						const int new_width = effective_width - delta_diff;
 
						effective_width = delta_diff > 0 ? new_width : effective_width;
 
						if (effective_width <= 0) break;
 
						/* FALLTHROUGH */
 
					}
 

	
 
					case RM_WITH_SKIP: {
 
						__m128i srcABCD = _mm_loadu_si128((const __m128i*) src);
 
						__m128i dstABCD = _mm_loadu_si128((__m128i*) dst);
 
						uint32 mvX2 = *((uint32 *) const_cast<MapValue *>(src_mv));
 
						for (uint x = (uint) effective_width / 2; x > 0; x--) {
 
							__m128i srcABCD = _mm_loadl_epi64((const __m128i*) src);
 
							__m128i dstABCD = _mm_loadl_epi64((__m128i*) dst);
 
							uint32 mvX2 = *((uint32 *) const_cast<MapValue *>(src_mv));
 

	
 
						for (uint x = (uint) effective_width / 2; x > 0; x--) {
 
							/* Remap colours. */
 
							if (mvX2 & 0x00FF00FF) {
 
								/* Written so the compiler uses CMOV. */
 
@@ -139,7 +131,7 @@ inline void Blitter_32bppSSSE3::Draw(con
 
								Colour c0 = 0; // Use alpha of 0 to keep dst as is.
 
								c0 = r0 == 0 ? c0 : c0map;
 
								c0 = m0 != 0 ? c0 : src0;
 
								INSR32(c0.data, srcABCD, 0);
 
								srcABCD = _mm_cvtsi32_si128(c0.data);
 

	
 
								const Colour src1 = src[1];
 
								const uint m1 = (byte) (mvX2 >> 16);
 
@@ -156,40 +148,36 @@ inline void Blitter_32bppSSSE3::Draw(con
 
							}
 

	
 
							/* Blend colours. */
 
							ALPHA_BLEND_2(pack_hi_cm);
 
							dstABCD = _mm_shuffle_epi32(dstABCD, 0x0E);
 
							srcABCD = _mm_alignr_epi8(dstABCD, srcABCD, 8);
 
							Colour *old_dst = dst;
 
							ALPHA_BLEND_2(pack_low_cm);
 
							_mm_storel_epi64((__m128i *) dst, srcABCD);
 
							dst += 2;
 
							src += 2;
 
							src_mv += 2;
 
							dstABCD = _mm_loadu_si128((__m128i*) dst);
 
							_mm_storeu_si128((__m128i *) old_dst, srcABCD);
 
							mvX2 = *((uint32 *) const_cast<MapValue *>(src_mv));
 
							srcABCD = _mm_loadu_si128((const __m128i*) src);
 
						}
 

	
 
						if (effective_width & 1) {
 
							/* In case the m-channel is zero, do not remap this pixel in any way */
 
							if (src_mv->m == 0) {
 
								if (src->a < 255) {
 
									ALPHA_BLEND_2(pack_hi_cm);
 
									(*dst).data = EXTR32(srcABCD, 2);
 
								} else {
 
									*dst = src->data;
 
								}
 
							} else {
 
							/* In case the m-channel is zero, do not remap this pixel in any way. */
 
							__m128i srcABCD;
 
							if (src_mv->m) {
 
								const uint r = remap[src_mv->m];
 
								if (r != 0) {
 
									Colour remapped_colour = AdjustBrightness(this->LookupColourInPalette(r), src_mv->v);
 
									if (src->a < 255) {
 
									if (src->a == 255) {
 
										*dst = remapped_colour;
 
									} else {
 
										remapped_colour.a = src->a;
 
										INSR32(remapped_colour.data, srcABCD, 0);
 
										ALPHA_BLEND_2(pack_hi_cm);
 
										(*dst).data = EXTR32(srcABCD, 2);
 
									} else
 
										*dst = remapped_colour;
 
										srcABCD = _mm_cvtsi32_si128(remapped_colour.data);
 
										goto bmcr_alpha_blend_single;
 
									}
 
								}
 
							} else {
 
								srcABCD = _mm_cvtsi32_si128(src->data);
 
								if (src->a < 255) {
 
bmcr_alpha_blend_single:
 
									__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
 
									ALPHA_BLEND_2(pack_low_cm);
 
								}
 
								dst->data = _mm_cvtsi128_si32(srcABCD);
 
							}
 
						}
 
						break;
 
@@ -200,30 +188,29 @@ inline void Blitter_32bppSSSE3::Draw(con
 
				src_mv_line += si->sprite_width;
 
				break;
 
			}
 

	
 
			case BM_TRANSPARENT: {
 
				/* Make the current colour a bit more black, so it looks like this image is transparent.
 
				 * rgb = rgb * ((256/4) * 4 - (alpha/4)) / ((256/4) * 4)
 
				 */
 
				__m128i srcABCD = _mm_loadu_si128((const __m128i*) src);
 
				__m128i dstABCD = _mm_loadu_si128((__m128i*) dst);
 
				for (uint x = (uint) bp->width / 2; x > 0; x--) {
 
					__m128i srcABCD = _mm_loadl_epi64((const __m128i*) src);
 
					__m128i dstABCD = _mm_loadl_epi64((__m128i*) dst);
 
					__m128i srcAB = _mm_unpacklo_epi8(srcABCD, _mm_setzero_si128());
 
					__m128i dstAB = _mm_unpacklo_epi8(dstABCD, _mm_setzero_si128());
 
					__m128i dstCD = _mm_unpackhi_epi8(dstABCD, _mm_setzero_si128());
 
					__m128i alphaAB = _mm_shuffle_epi8(srcAB, a_cm);
 
					alphaAB = _mm_srli_epi16(alphaAB, 2); // Reduce to 64 levels of shades so the max value fits in 16 bits.
 
					__m128i nom = _mm_sub_epi16(tr_nom_base, alphaAB);
 
					dstAB = _mm_mullo_epi16(dstAB, nom);
 
					dstAB = _mm_srli_epi16(dstAB, 8);
 
					dstAB = _mm_packus_epi16(dstAB, dstCD);
 
					Colour *old_dst = dst;
 
					dstAB = _mm_packus_epi16(dstAB, dstAB);
 
					_mm_storel_epi64((__m128i *) dst, dstAB);
 
					src += 2;
 
					dst += 2;
 
					dstABCD = _mm_loadu_si128((__m128i*) dst);
 
					_mm_storeu_si128((__m128i *) old_dst, dstAB);
 
					srcABCD = _mm_loadu_si128((const __m128i*) src);
 
				}
 
				if (bp->width & 1) {
 
					__m128i srcABCD = _mm_cvtsi32_si128(src->data);
 
					__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
 
					__m128i srcAB = _mm_unpacklo_epi8(srcABCD, _mm_setzero_si128());
 
					__m128i dstAB = _mm_unpacklo_epi8(dstABCD, _mm_setzero_si128());
 
					__m128i alphaAB = _mm_shuffle_epi8(srcAB, a_cm);
 
@@ -232,7 +219,7 @@ inline void Blitter_32bppSSSE3::Draw(con
 
					dstAB = _mm_mullo_epi16(dstAB, nom);
 
					dstAB = _mm_srli_epi16(dstAB, 8);
 
					dstAB = _mm_packus_epi16(dstAB, dstAB);
 
					(*dst).data = EXTR32(dstAB, 0);
 
					dst->data = _mm_cvtsi128_si32(dstAB);
 
				}
 
				break;
 
			}
src/blitter/32bpp_ssse3.hpp
Show inline comments
 
@@ -47,8 +47,7 @@
 
	__m128i zero = _mm_setzero_si128(); \
 
	__m128i colAB = _mm_unpacklo_epi8(colourX2, zero); \
 
	\
 
	__m128i briAB; \
 
	INSR64(brightnessX2, briAB, 0); \
 
	__m128i briAB = _mm_cvtsi32_si128(brightnessX2); \
 
	briAB = _mm_shuffle_epi8(briAB, briAB_cm); /* DEFAULT_BRIGHTNESS in 0, 0x00 in 2. */ \
 
	colAB = _mm_mullo_epi16(colAB, briAB); \
 
	__m128i colAB_ob = _mm_srli_epi16(colAB, 8+7); \
src/video/win32_v.cpp
Show inline comments
 
@@ -1077,7 +1077,7 @@ static bool AllocateDibSection(int w, in
 
	bi->bmiHeader.biSize = sizeof(BITMAPINFOHEADER);
 

	
 
	bi->bmiHeader.biWidth = _wnd.width = w;
 
	bi->bmiHeader.biHeight = -(_wnd.height = h+1); // Allocate extra room to prevent out-of-bounds when SSE reads a 16B block at the end of the buffer.
 
	bi->bmiHeader.biHeight = -(_wnd.height = h);
 

	
 
	bi->bmiHeader.biPlanes = 1;
 
	bi->bmiHeader.biBitCount = BlitterFactory::GetCurrentBlitter()->GetScreenDepth();
0 comments (0 inline, 0 general)