Changeset - r21169:98d87aed29c1
[Not reviewed]
master
0 4 0
rubidium - 11 years ago 2014-01-13 18:05:47
rubidium@openttd.org
(svn r26255) -Codechange: improve performance of brightness adjustment (MJP)
4 files changed with 21 insertions and 22 deletions:
0 comments (0 inline, 0 general)
src/blitter/32bpp_sse2.cpp
Show inline comments
 
@@ -285,52 +285,42 @@ Sprite *Blitter_32bppSSE_Base::Encode(co
 
/** ReallyAdjustBrightness() is not called that often.
 
 * Inlining this function implies a far jump, which has a huge latency.
 
 */
 
inline Colour Blitter_32bppSSE2::AdjustBrightness(Colour colour, uint8 brightness)
 
{
 
	/* Shortcut for normal brightness. */
 
	if (brightness == DEFAULT_BRIGHTNESS) return colour;
 

	
 
	return Blitter_32bppSSE2::ReallyAdjustBrightness(colour, brightness);
 
}
 

	
 
IGNORE_UNINITIALIZED_WARNING_START
 
/**
 * Out-of-line part of AdjustBrightness(): scale r, g and b of a colour by
 * brightness / DEFAULT_BRIGHTNESS and, when a channel ends up above 255,
 * lighten all three channels by the summed excess ("overbright").
 * The alpha byte of the colour is passed through unchanged.
 *
 * NOTE(review): this listing appears to show removed and added lines of the
 * changeset together (two signatures, two definitions of 'ob', INSR64 next
 * to LOAD64), seemingly with the removed form first -- confirm against the
 * actual revision before reading it as one function body.
 *
 * @param colour     The colour to adjust.
 * @param brightness The brightness to apply.
 * @return The adjusted colour, carrying the alpha of \a colour.
 */
/* static */ Colour Blitter_32bppSSE2::ReallyAdjustBrightness(Colour colour, uint8 brightness)
 
Colour Blitter_32bppSSE2::ReallyAdjustBrightness(Colour colour, uint8 brightness)
 
{
 
	/* Put b, g and r in three 16 bit lanes (bits 0-15, 16-31 and 32-47) so a single multiplication scales all of them. */
	uint64 c16 = colour.b | (uint64) colour.g << 16 | (uint64) colour.r << 32;
 
	c16 *= brightness;
 
	uint64 c16_ob = c16; // Helps out of order execution.
 
	/* One division for all lanes; the mask below keeps 9 bits per lane.
	 * NOTE(review): this only works lane-wise if DEFAULT_BRIGHTNESS is a power of two
	 * (presumably 128, given the 508 maximum and the '8 + 7' shift below) -- confirm. */
	c16 /= DEFAULT_BRIGHTNESS;
 
	c16 &= 0x01FF01FF01FF;
 

	
 
	/* Sum overbright (maximum for each rgb is 508, 9 bits, -255 is changed in -256 so we just have to take the 8 lower bits into account). */
 
	/* Per lane: bit 15 of the undivided product is used as the "above 255" flag, widened to 0xFF,
	 * and used to keep the low 8 bits of the scaled value in the lanes that overflowed. */
	c16_ob = (((c16_ob >> (8 + 7)) & 0x0100010001) * 0xFF) & c16;
 
	/* Form that is halved later by 'ob /= 2'. */
	uint64 ob = (uint16) c16_ob + (uint16) (c16_ob >> 16) + (uint16) (c16_ob >> 32);
 
	/* Form that halves the summed overbright right away. */
	const uint ob = ((uint16) c16_ob + (uint16) (c16_ob >> 16) + (uint16) (c16_ob >> 32)) / 2;
 

	
 
	/* Alpha is the top byte of the 32 bit colour value; it is OR-ed back into the result at the end. */
	const uint32 alpha32 = colour.data & 0xFF000000;
 
	__m128i ret;
 
	/* Move the packed lanes into the low 64 bits of the SSE register (explicit form). */
#ifdef _SQ64
 
	ret = _mm_cvtsi64_si128(c16);
 
#else
 
	INSR64(c16, ret, 0);
 
#endif
 
	/* The same load, expressed through the LOAD64 macro. */
	LOAD64(c16, ret);
 
	if (ob != 0) {
 
		/* Reduce overbright strength. */
 
		ob /= 2;
 
		/* Build a register with ob in each of the three colour words (via a 64 bit integer). */
		__m128i ob128;
 
#ifdef _SQ64
 
		ob128 = _mm_cvtsi64_si128(ob | ob << 16 | ob << 32);
 
#else
 
		INSR64(ob | ob << 16 | ob << 32, ob128, 0);
 
#endif
 
		/* Load ob into word 0 (the rest of the register is zeroed) and copy it to words 1 and 2;
		 * shuffle control 0xC0 leaves word 3 as it was, i.e. zero. */
		__m128i ob128 = _mm_cvtsi32_si128(ob);
 
		ob128 = _mm_shufflelo_epi16(ob128, 0xC0);
 
		/* 255 in each of the three colour words. */
		__m128i white = OVERBRIGHT_VALUE_MASK;
 
		__m128i c128 = ret;
 
		ret = _mm_subs_epu16(white, c128); /* PSUBUSW,   (255 - rgb) */
 
		ret = _mm_mullo_epi16(ret, ob128); /* PMULLW, ob*(255 - rgb) */
 
		ret = _mm_srli_epi16(ret, 8);      /* PSRLW,  ob*(255 - rgb)/256 */
 
		ret = _mm_add_epi16(ret, c128);    /* PADDW,  ob*(255 - rgb)/256 + rgb */
 
	}
 

	
 
	ret = _mm_packus_epi16(ret, ret);      /* PACKUSWB, saturate and pack. */
 
	/* The low 32 bits now hold the three packed colour bytes; add the original alpha on top. */
	return alpha32 | _mm_cvtsi128_si32(ret);
 
}
 
IGNORE_UNINITIALIZED_WARNING_STOP
src/blitter/32bpp_sse2.hpp
Show inline comments
 
@@ -45,24 +45,30 @@ typedef union ALIGN(16) um128i {
 
/* 0x00FF in words 0-2 and 4-6, zero in words 3 and 7. */
#define OVERBRIGHT_VALUE_MASK       _mm_setr_epi8(-1,  0, -1,  0, -1,  0,  0,  0, -1,  0, -1,  0, -1,  0,  0,  0)
 
/* NOTE(review): looks like a byte-shuffle control that copies bytes 0-1 into words 0-2 and bytes 2-3 into words 4-6 -- confirm at the use site. */
#define OVERBRIGHT_CONTROL_MASK     _mm_setr_epi8( 0,  1,  0,  1,  0,  1,  7,  7,  2,  3,  2,  3,  2,  3,  7,  7)
 
/* 256 in all eight words. */
#define TRANSPARENT_NOM_BASE        _mm_setr_epi16(256, 256, 256, 256, 256, 256, 256, 256)
 

	
 
/* Read the m_rank-th 32 bit, respectively 64 bit, element of a register through the um128i union. */
#define EXTR32(m_from, m_rank) (*(um128i*) &m_from).m128i_u32[m_rank]
 
#define EXTR64(m_from, m_rank) (*(um128i*) &m_from).m128i_u64[m_rank]
 
/* Write a 32 bit value as two 16 bit inserts: the low half into word 2 * m_rank, the high half into the word after it. */
#define INSR32(m_val, m_into, m_rank) { \
 
	(*(um128i*) &m_into).m128i = _mm_insert_epi16((*(um128i*) &m_into).m128i, m_val, (m_rank)*2); \
 
	(*(um128i*) &m_into).m128i = _mm_insert_epi16((*(um128i*) &m_into).m128i, (m_val) >> 16, (m_rank)*2 + 1); \
 
}
 
/* Write a 64 bit value straight into the m_rank-th 64 bit element of the um128i union. */
#define INSR64(m_val, m_into, m_rank) (*(um128i*) &m_into).m128i_u64[m_rank] = (m_val)
 

	
 
/* Load a 64 bit integer into the low half of a register.
 * With _SQ64 defined this uses _mm_cvtsi64_si128, which also zeroes the upper half;
 * otherwise INSR64 writes element 0 only and leaves the upper half of m_into untouched.
 * Note that the _SQ64 variant already ends in a semicolon. */
#ifdef _SQ64
 
	#define LOAD64(m_val, m_into) m_into = _mm_cvtsi64_si128(m_val);
 
#else
 
	#define LOAD64(m_val, m_into) INSR64(m_val, m_into, 0)
 
#endif
 

	
 
/* PUT_ALPHA_IN_FRONT_OF_RGB is redefined in 32bpp_ssse3.hpp. */
 
/* Shuffle control 0x3F copies word 3 of each half into words 0-2 and word 0 into word 3. */
#define PUT_ALPHA_IN_FRONT_OF_RGB(m_from, m_into) \
 
	m_into = _mm_shufflelo_epi16(m_from, 0x3F); /* PSHUFLW, put alpha1 in front of each rgb1 */ \
 
	m_into = _mm_shufflehi_epi16(m_into, 0x3F); /* PSHUFHW, put alpha2 in front of each rgb2 */
 

	
 
/* PACK_AB_WITHOUT_SATURATION is redefined in 32bpp_ssse3.hpp. */
 
/* Modifies m_from and relies on a variable named clear_hi being in scope where the macro is expanded. */
#define PACK_AB_WITHOUT_SATURATION(m_from, m_into) \
 
	m_from = _mm_and_si128(m_from, clear_hi);  /* PAND, wipe high bytes to keep low bytes when packing */ \
 
	m_into = _mm_packus_epi16(m_from, m_from); /* PACKUSWB, pack 2 colours (with saturation) */
 

	
 
/* Alpha blend 2 pixels. */
 
#define ALPHA_BLEND_2() { \
src/blitter/32bpp_sse4.cpp
Show inline comments
 
@@ -223,44 +223,42 @@ void Blitter_32bppSSE4::Draw(Blitter::Bl
 
}
 

	
 
/** Same code as seen in 32bpp_sse2.cpp but some macros are not the same. */
 
inline Colour Blitter_32bppSSE4::AdjustBrightness(Colour colour, uint8 brightness)
 
{
 
	/* Shortcut for normal brightness. */
 
	if (brightness == DEFAULT_BRIGHTNESS) return colour;
 

	
 
	return Blitter_32bppSSE4::ReallyAdjustBrightness(colour, brightness);
 
}
 

	
 
IGNORE_UNINITIALIZED_WARNING_START
 
/**
 * Out-of-line part of AdjustBrightness(): scale r, g and b of a colour by
 * brightness / DEFAULT_BRIGHTNESS and, when a channel ends up above 255,
 * lighten all three channels by the summed excess ("overbright").
 * The alpha byte of the colour is passed through unchanged.
 *
 * NOTE(review): this listing appears to show removed and added lines of the
 * changeset together (two signatures, two definitions of 'ob', INSR64 next
 * to LOAD64), seemingly with the removed form first -- confirm against the
 * actual revision before reading it as one function body.
 *
 * @param colour     The colour to adjust.
 * @param brightness The brightness to apply.
 * @return The adjusted colour, carrying the alpha of \a colour.
 */
/* static */ Colour Blitter_32bppSSE4::ReallyAdjustBrightness(Colour colour, uint8 brightness)
 
Colour Blitter_32bppSSE4::ReallyAdjustBrightness(Colour colour, uint8 brightness)
 
{
 
	/* Put b, g and r in three 16 bit lanes (bits 0-15, 16-31 and 32-47) so a single multiplication scales all of them. */
	uint64 c16 = colour.b | (uint64) colour.g << 16 | (uint64) colour.r << 32;
 
	c16 *= brightness;
 
	uint64 c16_ob = c16; // Helps out of order execution.
 
	/* One division for all lanes; the mask below keeps 9 bits per lane.
	 * NOTE(review): this only works lane-wise if DEFAULT_BRIGHTNESS is a power of two
	 * (presumably 128, given the 508 maximum and the '8 + 7' shift below) -- confirm. */
	c16 /= DEFAULT_BRIGHTNESS;
 
	c16 &= 0x01FF01FF01FF;
 

	
 
	/* Sum overbright (maximum for each rgb is 508, 9 bits, -255 is changed in -256 so we just have to take the 8 lower bits into account). */
 
	/* Per lane: bit 15 of the undivided product is used as the "above 255" flag, widened to 0xFF,
	 * and used to keep the low 8 bits of the scaled value in the lanes that overflowed. */
	c16_ob = (((c16_ob >> (8 + 7)) & 0x0100010001) * 0xFF) & c16;
 
	/* Form that is halved later by 'ob /= 2'. */
	uint64 ob = (uint16) c16_ob + (uint16) (c16_ob >> 16) + (uint16) (c16_ob >> 32);
 
	/* Form that halves the summed overbright right away. */
	const uint ob = ((uint16) c16_ob + (uint16) (c16_ob >> 16) + (uint16) (c16_ob >> 32)) / 2;
 

	
 
	/* Alpha is the top byte of the 32 bit colour value; it is OR-ed back into the result at the end. */
	const uint32 alpha32 = colour.data & 0xFF000000;
 
	__m128i ret;
 
	/* Move the packed lanes into the low 64 bits of the SSE register, through INSR64 ... */
	INSR64(c16, ret, 0);
 
	/* ... and the same load through LOAD64. */
	LOAD64(c16, ret);
 
	if (ob != 0) {
 
		/* Reduce overbright strength. */
 
		ob /= 2;
 
		/* Build a register with ob in each of the three colour words (via a 64 bit integer). */
		__m128i ob128;
 
		INSR64(ob | ob << 16 | ob << 32, ob128, 0);
 
		/* Load ob into word 0 (the rest of the register is zeroed) and copy it to words 1 and 2;
		 * shuffle control 0xC0 leaves word 3 as it was, i.e. zero. */
		__m128i ob128 = _mm_cvtsi32_si128(ob);
 
		ob128 = _mm_shufflelo_epi16(ob128, 0xC0);
 
		/* 255 in each of the three colour words. */
		__m128i white = OVERBRIGHT_VALUE_MASK;
 
		__m128i c128 = ret;
 
		ret = _mm_subs_epu16(white, c128); /* PSUBUSW,   (255 - rgb) */
 
		ret = _mm_mullo_epi16(ret, ob128); /* PMULLW, ob*(255 - rgb) */
 
		ret = _mm_srli_epi16(ret, 8);      /* PSRLW,  ob*(255 - rgb)/256 */
 
		ret = _mm_add_epi16(ret, c128);    /* PADDW,  ob*(255 - rgb)/256 + rgb */
 
	}
 

	
 
	ret = _mm_packus_epi16(ret, ret);      /* PACKUSWB, saturate and pack. */
 
	/* The low 32 bits now hold the three packed colour bytes; add the original alpha on top. */
	return alpha32 | _mm_cvtsi128_si32(ret);
 
}
 
IGNORE_UNINITIALIZED_WARNING_STOP
src/blitter/32bpp_sse4.hpp
Show inline comments
 
@@ -26,24 +26,29 @@ IGNORE_UNINITIALIZED_WARNING_START
 
/* Replace INSR64 (and, without _SQ64, LOAD64) by variants built on the epi32/epi64 insert intrinsics.
 * NOTE(review): _SQ64 presumably marks builds where the 64 bit intrinsics are available -- confirm where it is defined. */
#ifdef _SQ64
 
	#undef INSR64
 
	/* Insert the whole 64 bit value with a single _mm_insert_epi64. */
	#define INSR64(m_val, m_into, m_rank) (*(um128i*) &m_into).m128i = _mm_insert_epi64((*(um128i*) &m_into).m128i, m_val, m_rank)
 
#else
 
	/* Gives access to the two 32 bit halves of a 64 bit value; 'low' being the low half relies on little-endian layout. */
	typedef union { uint64 u64; struct _u32 { uint32 low, high; } u32; } u6432;
 
	#undef INSR64
 
	/* Insert a 64 bit value as two 32 bit inserts, at elements 2 * m_rank and 2 * m_rank + 1. */
	#define INSR64(m_val, m_into, m_rank) { \
 
		u6432 v; \
 
		v.u64 = m_val; \
 
		(*(um128i*) &m_into).m128i = _mm_insert_epi32((*(um128i*) &m_into).m128i, v.u32.low, (m_rank)*2); \
 
		(*(um128i*) &m_into).m128i = _mm_insert_epi32((*(um128i*) &m_into).m128i, v.u32.high, (m_rank)*2 + 1); \
 
	}
 

	
 
	#undef LOAD64
 
	/* Load a 64 bit value: _mm_cvtsi32_si128 sets the low 32 bits and zeroes the rest, INSR32 then adds the upper 32 bits at element 1.
	 * NOTE(review): expands to two statements without enclosing braces, so it must not be the sole body of an unbraced if/else. */
	#define LOAD64(m_val, m_into) \
 
		m_into = _mm_cvtsi32_si128(m_val); \
 
		INSR32((m_val) >> 32, m_into, 1);
 
#endif
 
IGNORE_UNINITIALIZED_WARNING_STOP
 

	
 
/** The SSE4 32 bpp blitter (without palette animation). */
 
class Blitter_32bppSSE4 : public Blitter_32bppSSSE3 {
 
public:
 
	Colour AdjustBrightness(Colour colour, uint8 brightness);
 
	static Colour ReallyAdjustBrightness(Colour colour, uint8 brightness);
 

	
 
	/* virtual */ void Draw(Blitter::BlitterParams *bp, BlitterMode mode, ZoomLevel zoom);
 
	template <BlitterMode mode, Blitter_32bppSSE_Base::ReadMode read_mode, Blitter_32bppSSE_Base::BlockType bt_last>
 
	void Draw(const Blitter::BlitterParams *bp, ZoomLevel zoom);
0 comments (0 inline, 0 general)