ClanLib  2.3.7
blit_argb8_sse.h
Go to the documentation of this file.
1 /*
2 ** ClanLib SDK
3 ** Copyright (c) 1997-2011 The ClanLib Team
4 **
5 ** This software is provided 'as-is', without any express or implied
6 ** warranty. In no event will the authors be held liable for any damages
7 ** arising from the use of this software.
8 **
9 ** Permission is granted to anyone to use this software for any purpose,
10 ** including commercial applications, and to alter it and redistribute it
11 ** freely, subject to the following restrictions:
12 **
13 ** 1. The origin of this software must not be misrepresented; you must not
14 ** claim that you wrote the original software. If you use this software
15 ** in a product, an acknowledgment in the product documentation would be
16 ** appreciated but is not required.
17 ** 2. Altered source versions must be plainly marked as such, and must not be
18 ** misrepresented as being the original software.
19 ** 3. This notice may not be removed or altered from any source distribution.
20 **
21 ** Note: Some of the libraries ClanLib may link to may have additional
22 ** requirements or restrictions.
23 **
24 ** File Author(s):
25 **
26 ** Magnus Norddahl
27 */
28 
31 
32 #pragma once
33 
34 #include "api_swrender.h"
35 
36 #if defined(__GNUC__) && !defined(__SSE2__)
37 // Do not attempt to compile SSE2 code if the compiler does not support it
38 #else
39 
40 #ifndef CL_DISABLE_SSE2
41 
42 #include <emmintrin.h>
43 
48 {
50 public:
51  static void copy_pixels(unsigned int *dest, const unsigned int *src);
52  static void load_pixel(__m128i &xmm, const unsigned int &pixel);
53  static void load_pixels(__m128i &xmm, const unsigned int *pixels);
54  static void load_pixels(__m128i &xmm, const unsigned int &p1, unsigned int &p2);
55  static void load_pixel_linear(__m128i &xmm, const unsigned int &p1, const unsigned int &p2, const unsigned int &p3, const unsigned int &p4, unsigned int ifracx, unsigned int ifracy);
56  static void set_one(__m128i &xmm);
57  static void set_half(__m128i &xmm);
58  static void set_color(__m128i &xmm, unsigned short red, unsigned short green, unsigned short blue, unsigned short alpha);
59  static void set_color(__m128i &xmm, unsigned short r1, unsigned short g1, unsigned short b1, unsigned short a1, unsigned short r2, unsigned short g2, unsigned short b2, unsigned short a2);
60 
61 #ifdef _MSC_VER
62  static void multiply_color(__m128i &src, __m128i &primcolor);
63 #else
64  // Fix to compile on gcc
65  static void multiply_color(__m128i &src, __m128i primcolor);
66 #endif
67  static void blend_normal(__m128i &dest, __m128i &src, __m128i &one, __m128i &half);
68  static void blend_premultiplied(__m128i &dest, __m128i &src, __m128i &one, __m128i &half);
69  static void blend_lcd(__m128i &dest, __m128i &src, __m128i &one, __m128i &half, __m128i &color);
70  static void store_pixel(unsigned int &pixel, __m128i &xmm);
71  static void store_pixels(unsigned int *pixels, __m128i &xmm);
72 
73  static void pixels_to_channels(__m128i &red, __m128i &green, __m128i &blue, __m128i &alpha, const __m128i &src0, const __m128i &src1);
74  static void channels_to_pixels(__m128i &dest0, __m128i &dest1, __m128i &red, __m128i &green, __m128i &blue, __m128i &alpha);
75 // static void sample_nearest(__m128i &out0, __m128i tx, __m128i ty, const unsigned int *data, int width);
76 };
77 
78 inline void CL_BlitARGB8SSE::copy_pixels(unsigned int *dest, const unsigned int *src)
79 {
80  __m128i src0;
81  src0 = _mm_loadl_epi64((const __m128i *) src);
82  _mm_storel_epi64((__m128i *) dest, src0);
83 }
84 
85 inline void CL_BlitARGB8SSE::load_pixel(__m128i &xmm, const unsigned int &pixel)
86 {
87  xmm = _mm_cvtsi32_si128(pixel);
88  xmm = _mm_unpacklo_epi8(xmm, _mm_setzero_si128());
89 }
90 
91 inline void CL_BlitARGB8SSE::load_pixels(__m128i &xmm, const unsigned int *pixels)
92 {
93  xmm = _mm_loadl_epi64((const __m128i *) pixels);
94  xmm = _mm_unpacklo_epi8(xmm, _mm_setzero_si128());
95 }
96 
97 inline void CL_BlitARGB8SSE::load_pixels(__m128i &xmm, const unsigned int &p1, unsigned int &p2)
98 {
99  xmm = _mm_set_epi32(0, 0, p2, p1);
100  xmm = _mm_unpacklo_epi8(xmm, _mm_setzero_si128());
101 }
102 
103 inline void CL_BlitARGB8SSE::load_pixel_linear(__m128i &xmm, const unsigned int &pixel1, const unsigned int &pixel2, const unsigned int &pixel3, const unsigned int &pixel4, unsigned int ifracx, unsigned int ifracy)
104 {
105  __m128i src0, src1, src2, src3;
106  __m128i frac0, frac1, frac2, frac3;
107  __m128i fracx, inv_fracx, fracy, inv_fracy;
108  __m128i half = _mm_set1_epi16(64);
109  fracx = _mm_set1_epi16(ifracx);
110  fracy = _mm_set1_epi16(ifracy);
111  inv_fracx = _mm_set1_epi16(0x80-ifracx);
112  inv_fracy = _mm_set1_epi16(0x80-ifracy);
113  frac0 = _mm_srli_epi16(_mm_mullo_epi16(inv_fracx, inv_fracy), 7);
114  frac1 = _mm_srli_epi16(_mm_mullo_epi16(fracx, inv_fracy), 7);
115  frac2 = _mm_srli_epi16(_mm_mullo_epi16(inv_fracx, fracy), 7);
116  frac3 = _mm_srli_epi16(_mm_mullo_epi16(fracx, fracy), 7);
117  src0 = _mm_mullo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(pixel1), _mm_setzero_si128()), frac0);
118  src1 = _mm_mullo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(pixel2), _mm_setzero_si128()), frac1);
119  src2 = _mm_mullo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(pixel3), _mm_setzero_si128()), frac2);
120  src3 = _mm_mullo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(pixel4), _mm_setzero_si128()), frac3);
121  xmm = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(_mm_add_epi16(_mm_add_epi16(src0, src1), src2), src3), half), 7);
122 }
123 
124 inline void CL_BlitARGB8SSE::set_one(__m128i &xmm)
125 {
126  xmm = _mm_set1_epi16(0x0100);
127 }
128 
129 inline void CL_BlitARGB8SSE::set_half(__m128i &xmm)
130 {
131  xmm = _mm_set1_epi16(0x007f);
132 }
133 
134 inline void CL_BlitARGB8SSE::set_color(__m128i &xmm, unsigned short red, unsigned short green, unsigned short blue, unsigned short alpha)
135 {
136  xmm = _mm_set_epi16(alpha, red, green, blue, alpha, red, green, blue);
137 }
138 
139 inline void CL_BlitARGB8SSE::set_color(__m128i &xmm, unsigned short r1, unsigned short g1, unsigned short b1, unsigned short a1, unsigned short r2, unsigned short g2, unsigned short b2, unsigned short a2)
140 {
141  xmm = _mm_set_epi16(a2, r2, g2, b2, a1, r1, g1, b1);
142 }
143 
144 #ifdef _MSC_VER
145 inline void CL_BlitARGB8SSE::multiply_color(__m128i &src, __m128i &primcolor)
146 {
147  src = _mm_mullo_epi16(src, primcolor);
148  src = _mm_srli_epi16(src, 8);
149 }
150 #else
151  // For some reason "primcolor" cannot be a reference on gcc
152 inline void CL_BlitARGB8SSE::multiply_color(__m128i &src, __m128i primcolor)
153 {
154  src = _mm_mullo_epi16(src, primcolor);
155  src = _mm_srli_epi16(src, 8);
156 }
157 #endif
158 
// Macro form of CL_BlitARGB8SSE::multiply_color: multiplies each 16-bit lane
// of src by the matching lane of primcolor, keeps the low 16 bits of the
// product and shifts them right by 8. src must be an assignable __m128i.
#define cl_blitargb8sse_multiply_color(src, primcolor) \
{ \
    (src) = _mm_srli_epi16(_mm_mullo_epi16((src), (primcolor)), 8); \
}
164 
165 inline void CL_BlitARGB8SSE::blend_normal(__m128i &dest, __m128i &src, __m128i &one, __m128i &half)
166 {
167  __m128i src_alpha, invsrc_alpha;
168 
169  src_alpha = src;
170  src_alpha = _mm_shufflelo_epi16(src_alpha, 0xff);
171  src_alpha = _mm_shufflehi_epi16(src_alpha, 0xff);
172 
173  invsrc_alpha = _mm_sub_epi16(one, src_alpha);
174 
175  src = _mm_mullo_epi16(src, src_alpha);
176  dest = _mm_mullo_epi16(dest, invsrc_alpha);
177 
178  dest = _mm_add_epi16(dest, src);
179  dest = _mm_add_epi16(dest, half); // round up
180  dest = _mm_srli_epi16(dest, 8);
181 }
182 
// Macro form of CL_BlitARGB8SSE::blend_normal.
//
// Blends src over dest for two pixels at once (lanes 0-3 and lanes 4-7):
//   dest = (src*alpha + dest*(one - alpha) + half) >> 8
// where alpha is lane 3 / lane 7 of src. dest and src must be assignable
// __m128i values; src is overwritten with src*alpha as a side effect.
//
// The temporaries carry a cl_bn_ prefix so that a caller variable named
// src_alpha or invsrc_alpha can safely be passed as an argument, and every
// argument is parenthesised.
#define cl_blitargb8sse_blend_normal(dest, src, one, half) \
{ \
    __m128i cl_bn_alpha_, cl_bn_inv_alpha_; \
\
    cl_bn_alpha_ = _mm_shufflelo_epi16((src), 0xff); \
    cl_bn_alpha_ = _mm_shufflehi_epi16(cl_bn_alpha_, 0xff); \
\
    cl_bn_inv_alpha_ = _mm_sub_epi16((one), cl_bn_alpha_); \
\
    (src) = _mm_mullo_epi16((src), cl_bn_alpha_); \
    (dest) = _mm_mullo_epi16((dest), cl_bn_inv_alpha_); \
\
    (dest) = _mm_add_epi16((dest), (src)); \
    (dest) = _mm_add_epi16((dest), (half)); \
    (dest) = _mm_srli_epi16((dest), 8); \
}
200 
201 inline void CL_BlitARGB8SSE::blend_premultiplied(__m128i &dest, __m128i &src, __m128i &one, __m128i &half)
202 {
203  __m128i src_alpha, invsrc_alpha;
204 
205  src_alpha = src;
206  src_alpha = _mm_shufflelo_epi16(src_alpha, 0xff);
207  src_alpha = _mm_shufflehi_epi16(src_alpha, 0xff);
208 
209  invsrc_alpha = _mm_sub_epi16(one, src_alpha);
210 
211  dest = _mm_mullo_epi16(dest, invsrc_alpha);
212  dest = _mm_add_epi16(dest, half); // round up
213  dest = _mm_srli_epi16(dest, 8);
214  dest = _mm_add_epi16(dest, src);
215 }
216 
217 inline void CL_BlitARGB8SSE::blend_lcd(__m128i &dest, __m128i &src, __m128i &one, __m128i &half, __m128i &color)
218 {
219  __m128i invsrc;
220  invsrc = _mm_sub_epi16(one, _mm_add_epi16(_mm_srli_epi16(src, 7), src));
221 
222  dest = _mm_add_epi16(_mm_mullo_epi16(src, color), _mm_mullo_epi16(dest, invsrc));
223  dest = _mm_add_epi16(dest, half); // round up
224  dest = _mm_srli_epi16(dest, 8);
225 }
226 
227 inline void CL_BlitARGB8SSE::store_pixel(unsigned int &pixel, __m128i &xmm)
228 {
229  xmm = _mm_packus_epi16(xmm, _mm_setzero_si128());
230  pixel = _mm_cvtsi128_si32(xmm);
231 }
232 
233 inline void CL_BlitARGB8SSE::store_pixels(unsigned int *pixels, __m128i &xmm)
234 {
235  xmm = _mm_packus_epi16(xmm, _mm_setzero_si128());
236  _mm_storel_epi64((__m128i *) pixels, xmm);
237 }
238 
239 inline void CL_BlitARGB8SSE::pixels_to_channels(__m128i &red, __m128i &green, __m128i &blue, __m128i &alpha, const __m128i &src0, const __m128i &src1)
240 {
241  __m128i alpha_mask = _mm_set1_epi32(0xff000000);
242  __m128i red_mask = _mm_set1_epi32(0x00ff0000);
243  __m128i green_mask = _mm_set1_epi32(0x0000ff00);
244  __m128i blue_mask = _mm_set1_epi32(0x000000ff);
245 
246  alpha = _mm_srli_si128(_mm_and_si128(alpha_mask, src0), 1);
247  alpha = _mm_or_si128(alpha, _mm_srli_si128(_mm_and_si128(alpha_mask, src1), 3));
248 
249  red = _mm_and_si128(red_mask, src0);
250  red = _mm_or_si128(red, _mm_srli_si128(_mm_and_si128(red_mask, src1), 2));
251 
252  green = _mm_slli_si128(_mm_and_si128(green_mask, src0), 1);
253  green = _mm_or_si128(green, _mm_srli_si128(_mm_and_si128(green_mask, src1), 1));
254 
255  blue = _mm_slli_si128(_mm_and_si128(blue_mask, src0), 2);
256  blue = _mm_or_si128(blue, _mm_and_si128(blue_mask, src1));
257 }
258 
259 inline void CL_BlitARGB8SSE::channels_to_pixels(__m128i &dest0, __m128i &dest1, __m128i &red, __m128i &green, __m128i &blue, __m128i &alpha)
260 {
261  __m128i alpha_mask = _mm_set1_epi32(0xff000000);
262  __m128i red_mask = _mm_set1_epi32(0x00ff0000);
263  __m128i green_mask = _mm_set1_epi32(0x0000ff00);
264  __m128i blue_mask = _mm_set1_epi32(0x000000ff);
265 
266  dest0 = _mm_and_si128(alpha_mask, _mm_slli_si128(alpha, 1));
267  dest1 = _mm_and_si128(alpha_mask, _mm_slli_si128(alpha, 3));
268 
269  dest0 = _mm_or_si128(dest0, _mm_and_si128(red_mask, red));
270  dest1 = _mm_or_si128(dest1, _mm_and_si128(red_mask, _mm_slli_si128(red, 2)));
271 
272  dest0 = _mm_or_si128(dest0, _mm_and_si128(green_mask, _mm_srli_si128(green, 1)));
273  dest1 = _mm_or_si128(dest1, _mm_and_si128(green_mask, _mm_slli_si128(green, 1)));
274 
275  dest0 = _mm_or_si128(dest0, _mm_and_si128(blue_mask, _mm_srli_si128(blue, 2)));
276  dest1 = _mm_or_si128(dest1, _mm_and_si128(blue_mask, blue));
277 }
278 
// Compiler-specific spelling of "align this object to 16 bytes", needed
// because _mm_store_si128 requires a 16-byte aligned destination.
#ifdef _MSC_VER
#define cl_blitargb8sse_align16 __declspec(align(16))
#else
#define cl_blitargb8sse_align16 __attribute__ ((aligned(16)))
#endif

// Fetches four texels with nearest-neighbour sampling.
//
//   out0  - assignable __m128i receiving the four fetched 32-bit texels.
//   tx,ty - __m128i holding four 32-bit coordinates each; they are shifted
//           right arithmetically by 16 to get the integer texel position.
//   data  - indexable texel storage; texel (x, y) is read from data[x + y*width].
//   width - row pitch in texels.
//
// data and width are each evaluated four times. Both are parenthesised, so
// expressions such as `base + offset` or `w + 1` expand correctly, and the
// temporaries carry a cl_sn_ prefix so they cannot capture caller identifiers
// named x or y.
//
// NOTE(review): _mm_set_epi32 takes its arguments from the highest element to
// the lowest, so the texel addressed by element 0 of tx/ty ends up in element
// 3 of out0 (and vice versa). This ordering is kept from the original;
// confirm that callers expect it.
#define cl_blitargb8sse_sample_nearest(out0, tx, ty, data, width) \
{ \
    cl_blitargb8sse_align16 unsigned int cl_sn_x_[4], cl_sn_y_[4]; \
    _mm_store_si128((__m128i*) cl_sn_x_, _mm_srai_epi32((tx), 16)); \
    _mm_store_si128((__m128i*) cl_sn_y_, _mm_srai_epi32((ty), 16)); \
    (out0) = _mm_set_epi32( \
        (data)[cl_sn_x_[0]+cl_sn_y_[0]*(width)], \
        (data)[cl_sn_x_[1]+cl_sn_y_[1]*(width)], \
        (data)[cl_sn_x_[2]+cl_sn_y_[2]*(width)], \
        (data)[cl_sn_x_[3]+cl_sn_y_[3]*(width)]); \
}
300 
// Wraps four texture coordinates into the range [0, width) x [0, height).
//
//   tx, ty        - assignable __m128i, four signed 32-bit coordinates each;
//                   updated in place.
//   width, height - __m128i holding the wrap period for each element.
//
// Every negative element has the period added repeatedly until it is no
// longer negative; every element that is not below the period then has the
// period subtracted repeatedly until it is. The arguments are evaluated
// several times.
//
// This is a macro rather than an inline function because the original author
// found that the Visual C++ 2008 compiler did not optimise the inline-function
// version properly (possibly because of the branches or loops).
//
// NOTE(review): an element of width/height that is zero or negative makes the
// corresponding loop spin forever, since adding or subtracting it never moves
// the coordinate into range - callers must pass positive periods.
#define cl_blitargb8sse_texture_repeat(tx, ty, width, height) \
{ \
    for (;;) \
    { \
        const __m128i cl_tr_negative_ = _mm_cmplt_epi32((tx), _mm_setzero_si128()); \
        if (_mm_movemask_epi8(cl_tr_negative_) == 0) \
            break; \
        (tx) = _mm_add_epi32((tx), _mm_and_si128(cl_tr_negative_, (width))); \
    } \
    for (;;) \
    { \
        const __m128i cl_tr_inside_ = _mm_cmplt_epi32((tx), (width)); \
        if (_mm_movemask_epi8(cl_tr_inside_) == 0xffff) \
            break; \
        (tx) = _mm_sub_epi32((tx), _mm_andnot_si128(cl_tr_inside_, (width))); \
    } \
    for (;;) \
    { \
        const __m128i cl_tr_negative_ = _mm_cmplt_epi32((ty), _mm_setzero_si128()); \
        if (_mm_movemask_epi8(cl_tr_negative_) == 0) \
            break; \
        (ty) = _mm_add_epi32((ty), _mm_and_si128(cl_tr_negative_, (height))); \
    } \
    for (;;) \
    { \
        const __m128i cl_tr_inside_ = _mm_cmplt_epi32((ty), (height)); \
        if (_mm_movemask_epi8(cl_tr_inside_) == 0xffff) \
            break; \
        (ty) = _mm_sub_epi32((ty), _mm_andnot_si128(cl_tr_inside_, (height))); \
    } \
}
339 
340 #endif
341 #endif
342 
344