36 #if defined(__GNUC__) && !defined(__SSE2__)
40 #ifndef CL_DISABLE_SSE2
42 #include <emmintrin.h>
// Static SSE2 helper declarations. The out-of-line definition
// CL_BlitARGB8SSE::load_pixel_linear later in this file indicates that they
// belong to class CL_BlitARGB8SSE; the class header is not part of this excerpt.

// Copies pixel data from src to dest.
51 static void copy_pixels(
unsigned int *dest,
const unsigned int *src);
// Loads one packed 32-bit pixel into xmm.
52 static void load_pixel(__m128i &xmm,
const unsigned int &pixel);
// Loads pixels from memory into xmm.
53 static void load_pixels(__m128i &xmm,
const unsigned int *pixels);
// Loads the two pixels p1 and p2 into xmm.
// NOTE(review): p2 is a non-const reference while p1 is const; this looks
// unintentional - confirm against the definition.
54 static void load_pixels(__m128i &xmm,
const unsigned int &p1,
unsigned int &p2);
// Blends four pixels into xmm, weighted by the fractions ifracx and ifracy.
55 static void load_pixel_linear(__m128i &xmm,
const unsigned int &p1,
const unsigned int &p2,
const unsigned int &p3,
const unsigned int &p4,
unsigned int ifracx,
unsigned int ifracy);
// Fills xmm with a constant; presumably the "one" operand taken by the blend
// helpers - confirm against the definition.
56 static void set_one(__m128i &xmm);
// Fills xmm with one color, given as four 16-bit channel values.
58 static void set_color(__m128i &xmm,
unsigned short red,
unsigned short green,
unsigned short blue,
unsigned short alpha);
// Fills xmm with two colors: (r1, g1, b1, a1) and (r2, g2, b2, a2).
59 static void set_color(__m128i &xmm,
unsigned short r1,
unsigned short g1,
unsigned short b1,
unsigned short a1,
unsigned short r2,
unsigned short g2,
unsigned short b2,
unsigned short a2);
// Blends src into dest; `one` and `half` are constants supplied by the caller.
67 static void blend_normal(__m128i &dest, __m128i &src, __m128i &one, __m128i &half);
// Blend variant that takes an additional color operand.
69 static void blend_lcd(__m128i &dest, __m128i &src, __m128i &one, __m128i &half, __m128i &color);
// Writes the contents of xmm back to a single packed pixel.
70 static void store_pixel(
unsigned int &pixel, __m128i &xmm);
// Writes the contents of xmm back to pixels in memory.
71 static void store_pixels(
unsigned int *pixels, __m128i &xmm);
// Splits the pixels in src0 and src1 into separate red, green, blue and alpha registers.
73 static void pixels_to_channels(__m128i &red, __m128i &green, __m128i &blue, __m128i &alpha,
const __m128i &src0,
const __m128i &src1);
// Recombines separate channel registers into the pixel registers dest0 and dest1.
74 static void channels_to_pixels(__m128i &dest0, __m128i &dest1, __m128i &red, __m128i &green, __m128i &blue, __m128i &alpha);
// Fragment; the function header is not visible here (the identifiers match
// copy_pixels' parameters). Moves 64 bits from src to dest through the low half
// of an SSE register - two pixels, assuming unsigned int is 32 bits wide.
81 src0 = _mm_loadl_epi64((
const __m128i *) src);
82 _mm_storel_epi64((__m128i *) dest, src0);
// Fragment; the function header is not visible here (the identifiers match
// load_pixel's parameters). Puts the 32-bit pixel in the low dword of xmm with
// the rest zeroed, then zero-extends bytes to 16-bit lanes, so the pixel's four
// bytes end up in lanes 0-3 and lanes 4-7 are zero.
87 xmm = _mm_cvtsi32_si128(pixel);
88 xmm = _mm_unpacklo_epi8(xmm, _mm_setzero_si128());
// Fragment; the function header is not visible here (the identifiers match the
// pointer overload of load_pixels). Loads 64 bits from `pixels` and zero-extends
// each of the eight bytes to its own 16-bit lane.
93 xmm = _mm_loadl_epi64((
const __m128i *) pixels);
94 xmm = _mm_unpacklo_epi8(xmm, _mm_setzero_si128());
// Fragment; the function header is not visible here (the identifiers match the
// two-reference overload of load_pixels). _mm_set_epi32 lists elements from the
// highest dword down, so p1 lands in dword 0 and p2 in dword 1. After the
// unpack, p1's bytes occupy 16-bit lanes 0-3 and p2's bytes lanes 4-7.
99 xmm = _mm_set_epi32(0, 0, p2, p1);
100 xmm = _mm_unpacklo_epi8(xmm, _mm_setzero_si128());
// Computes a weighted blend of four packed pixels and leaves the result, one
// channel per 16-bit lane, in the low four lanes of xmm.
//
// The weights are 7-bit fixed-point values (0x80 represents 1.0):
//   pixel1: (0x80 - ifracx) * (0x80 - ifracy) >> 7
//   pixel2: ifracx          * (0x80 - ifracy) >> 7
//   pixel3: (0x80 - ifracx) * ifracy          >> 7
//   pixel4: ifracx          * ifracy          >> 7
// Each channel is then (sum of channel * weight + 64) >> 7.
//
// Assumes ifracx and ifracy lie in 0..0x80 so that every 16-bit product stays
// in range - TODO confirm against the callers.
103 inline void CL_BlitARGB8SSE::load_pixel_linear(__m128i &xmm,
const unsigned int &pixel1,
const unsigned int &pixel2,
const unsigned int &pixel3,
const unsigned int &pixel4,
unsigned int ifracx,
unsigned int ifracy)
105 __m128i src0, src1, src2, src3;
106 __m128i frac0, frac1, frac2, frac3;
107 __m128i fracx, inv_fracx, fracy, inv_fracy;
// Rounding term: half of the 7-bit divisor applied in the final shift.
108 __m128i half = _mm_set1_epi16(64);
// Broadcast the fractions and their complements to every 16-bit lane.
109 fracx = _mm_set1_epi16(ifracx);
110 fracy = _mm_set1_epi16(ifracy);
111 inv_fracx = _mm_set1_epi16(0x80-ifracx);
112 inv_fracy = _mm_set1_epi16(0x80-ifracy);
// Per-pixel weights: product of one x factor and one y factor, rescaled to 7 bits.
113 frac0 = _mm_srli_epi16(_mm_mullo_epi16(inv_fracx, inv_fracy), 7);
114 frac1 = _mm_srli_epi16(_mm_mullo_epi16(fracx, inv_fracy), 7);
115 frac2 = _mm_srli_epi16(_mm_mullo_epi16(inv_fracx, fracy), 7);
116 frac3 = _mm_srli_epi16(_mm_mullo_epi16(fracx, fracy), 7);
// Widen each pixel's bytes to 16-bit lanes and scale them by that pixel's weight.
117 src0 = _mm_mullo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(pixel1), _mm_setzero_si128()), frac0);
118 src1 = _mm_mullo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(pixel2), _mm_setzero_si128()), frac1);
119 src2 = _mm_mullo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(pixel3), _mm_setzero_si128()), frac2);
120 src3 = _mm_mullo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(pixel4), _mm_setzero_si128()), frac3);
// Sum the four weighted pixels, add the rounding term and drop the 7 fraction bits.
121 xmm = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(_mm_add_epi16(_mm_add_epi16(src0, src1), src2), src3), half), 7);
// Fragment; the function header is not visible here (presumably the body of
// set_one - confirm). Sets every 16-bit lane of xmm to 0x0100 (256).
126 xmm = _mm_set1_epi16(0x0100);
// Fragment; neither the function header nor its declaration is visible here.
// Sets every 16-bit lane of xmm to 0x007f (127).
// NOTE(review): presumably the `half` rounding operand taken by the blend
// helpers - confirm.
131 xmm = _mm_set1_epi16(0x007f);
// Fragment; the function header is not visible here (the identifiers match the
// four-channel set_color overload). _mm_set_epi16 lists lanes from highest to
// lowest, so lanes 0-3 hold blue, green, red, alpha and lanes 4-7 repeat the
// same color.
136 xmm = _mm_set_epi16(alpha, red, green, blue, alpha, red, green, blue);
// Fills xmm with two colors, one channel per 16-bit lane.
// _mm_set_epi16 lists lanes from highest to lowest, so lanes 0-3 hold
// b1, g1, r1, a1 and lanes 4-7 hold b2, g2, r2, a2.
139 inline void CL_BlitARGB8SSE::set_color(__m128i &xmm,
unsigned short r1,
unsigned short g1,
unsigned short b1,
unsigned short a1,
unsigned short r2,
unsigned short g2,
unsigned short b2,
unsigned short a2)
141 xmm = _mm_set_epi16(a2, r2, g2, b2, a1, r1, g1, b1);
// Fragment; the function header is not visible here. Multiplies each 16-bit
// lane of src by the matching lane of primcolor (keeping the low 16 bits of the
// product) and shifts the result right by 8.
147 src = _mm_mullo_epi16(src, primcolor);
148 src = _mm_srli_epi16(src, 8);
// Fragment; the function header is not visible here. Performs a per-lane
// 16-bit multiply of src by primcolor followed by a right shift by 8.
// NOTE(review): which function this belongs to cannot be determined from this
// excerpt.
154 src = _mm_mullo_epi16(src, primcolor);
155 src = _mm_srli_epi16(src, 8);
// Macro: multiplies each 16-bit lane of `src` by the matching lane of
// `primcolor` (low 16 bits of the product) and shifts the result right by 8.
// `src` is assigned to, so it must be a modifiable __m128i lvalue.
// NOTE(review): the last visible line still ends with a line continuation, so
// the end of the macro is not shown in this excerpt.
159 #define cl_blitargb8sse_multiply_color(src, primcolor) \
161 src = _mm_mullo_epi16(src, primcolor); \
162 src = _mm_srli_epi16(src, 8); \
167 __m128i src_alpha, invsrc_alpha;
// Fragment; the function header is not visible here (the identifiers match
// blend_normal's parameters). The statement that first assigns src_alpha is
// also not visible - presumably it is a copy of src; confirm.
// Broadcast lane 3 across lanes 0-3 and lane 7 across lanes 4-7.
170 src_alpha = _mm_shufflelo_epi16(src_alpha, 0xff);
171 src_alpha = _mm_shufflehi_epi16(src_alpha, 0xff);
173 invsrc_alpha = _mm_sub_epi16(one, src_alpha);
// dest = (src * alpha + dest * (one - alpha) + half) >> 8, per 16-bit lane.
175 src = _mm_mullo_epi16(src, src_alpha);
176 dest = _mm_mullo_epi16(dest, invsrc_alpha);
178 dest = _mm_add_epi16(dest, src);
179 dest = _mm_add_epi16(dest, half);
180 dest = _mm_srli_epi16(dest, 8);
// Macro form of the normal blend. Per 16-bit lane it computes
//   dest = (src * alpha + dest * (one - alpha) + half) >> 8
// where alpha is lane 3 broadcast over lanes 0-3 and lane 7 broadcast over
// lanes 4-7 of src_alpha. Both `dest` and `src` are overwritten.
// NOTE(review): the statement that first assigns src_alpha and the end of the
// macro are not visible in this excerpt.
183 #define cl_blitargb8sse_blend_normal(dest, src, one, half) \
185 __m128i src_alpha, invsrc_alpha; \
188 src_alpha = _mm_shufflelo_epi16(src_alpha, 0xff); \
189 src_alpha = _mm_shufflehi_epi16(src_alpha, 0xff); \
191 invsrc_alpha = _mm_sub_epi16(one, src_alpha); \
193 src = _mm_mullo_epi16(src, src_alpha); \
194 dest = _mm_mullo_epi16(dest, invsrc_alpha); \
196 dest = _mm_add_epi16(dest, src); \
197 dest = _mm_add_epi16(dest, half); \
198 dest = _mm_srli_epi16(dest, 8); \
203 __m128i src_alpha, invsrc_alpha;
// Fragment; neither the function header nor the first assignment to src_alpha
// is visible here. Unlike the normal blend, src is added without being scaled
// by alpha:
//   dest = ((dest * (one - alpha) + half) >> 8) + src
// NOTE(review): presumably a blend for premultiplied-alpha sources - confirm.
// Broadcast lane 3 across lanes 0-3 and lane 7 across lanes 4-7.
206 src_alpha = _mm_shufflelo_epi16(src_alpha, 0xff);
207 src_alpha = _mm_shufflehi_epi16(src_alpha, 0xff);
209 invsrc_alpha = _mm_sub_epi16(one, src_alpha);
211 dest = _mm_mullo_epi16(dest, invsrc_alpha);
212 dest = _mm_add_epi16(dest, half);
213 dest = _mm_srli_epi16(dest, 8);
214 dest = _mm_add_epi16(dest, src);
// Fragment; the function header and the declaration of invsrc are not visible
// here (the identifiers match blend_lcd's parameters). Every lane of src acts
// as its own coverage value:
//   invsrc = one - (src + (src >> 7))
//   dest   = (src * color + dest * invsrc + half) >> 8
// For values below 256 the (src >> 7) term adds 1 to lanes of 128 or more, so a
// lane value of 255 is treated as 256.
220 invsrc = _mm_sub_epi16(one, _mm_add_epi16(_mm_srli_epi16(src, 7), src));
222 dest = _mm_add_epi16(_mm_mullo_epi16(src, color), _mm_mullo_epi16(dest, invsrc));
223 dest = _mm_add_epi16(dest, half);
224 dest = _mm_srli_epi16(dest, 8);
// Fragment; the function header is not visible here (the identifiers match
// store_pixel's parameters). Packs the 16-bit lanes of xmm down to bytes with
// unsigned saturation and writes the low 32 bits to `pixel`. xmm itself is
// overwritten with the packed value.
229 xmm = _mm_packus_epi16(xmm, _mm_setzero_si128());
230 pixel = _mm_cvtsi128_si32(xmm);
// Fragment; the function header is not visible here (the identifiers match
// store_pixels' parameters). Packs the eight 16-bit lanes of xmm down to bytes
// with unsigned saturation and stores the resulting low 64 bits at `pixels`.
// xmm itself is overwritten with the packed value.
235 xmm = _mm_packus_epi16(xmm, _mm_setzero_si128());
236 _mm_storel_epi64((__m128i *) pixels, xmm);
// Fragment; the function header is not visible here (the identifiers match
// pixels_to_channels' parameters). Splits two registers of packed 32-bit pixels
// into one register per channel. In every 32-bit element of each output, the
// channel byte taken from src0 ends up in byte 2 (the low byte of the upper
// 16-bit half) and the channel byte taken from src1 in byte 0 (the low byte of
// the lower half).
// NOTE(review): 0xff000000 does not fit in a 32-bit int, so it reaches
// _mm_set1_epi32 through an implicit unsigned-to-int conversion.
241 __m128i alpha_mask = _mm_set1_epi32(0xff000000);
242 __m128i red_mask = _mm_set1_epi32(0x00ff0000);
243 __m128i green_mask = _mm_set1_epi32(0x0000ff00);
244 __m128i blue_mask = _mm_set1_epi32(0x000000ff);
// The shift counts below are whole-register byte shifts; the preceding mask
// keeps only one byte per element, so nothing leaks into a neighbouring element.
// Alpha: byte 3 -> byte 2 for src0, byte 3 -> byte 0 for src1.
246 alpha = _mm_srli_si128(_mm_and_si128(alpha_mask, src0), 1);
247 alpha = _mm_or_si128(alpha, _mm_srli_si128(_mm_and_si128(alpha_mask, src1), 3));
// Red: already in byte 2 for src0, byte 2 -> byte 0 for src1.
249 red = _mm_and_si128(red_mask, src0);
250 red = _mm_or_si128(red, _mm_srli_si128(_mm_and_si128(red_mask, src1), 2));
// Green: byte 1 -> byte 2 for src0, byte 1 -> byte 0 for src1.
252 green = _mm_slli_si128(_mm_and_si128(green_mask, src0), 1);
253 green = _mm_or_si128(green, _mm_srli_si128(_mm_and_si128(green_mask, src1), 1));
// Blue: byte 0 -> byte 2 for src0, already in byte 0 for src1.
255 blue = _mm_slli_si128(_mm_and_si128(blue_mask, src0), 2);
256 blue = _mm_or_si128(blue, _mm_and_si128(blue_mask, src1));
// Fragment; the function header is not visible here (the identifiers match
// channels_to_pixels' parameters). Rebuilds two registers of packed 32-bit
// pixels from per-channel registers. In every 32-bit element, dest0 takes the
// byte stored in byte 2 of each channel register and dest1 takes the byte
// stored in byte 0; only those low bytes of each 16-bit half are used.
// NOTE(review): 0xff000000 does not fit in a 32-bit int, so it reaches
// _mm_set1_epi32 through an implicit unsigned-to-int conversion.
261 __m128i alpha_mask = _mm_set1_epi32(0xff000000);
262 __m128i red_mask = _mm_set1_epi32(0x00ff0000);
263 __m128i green_mask = _mm_set1_epi32(0x0000ff00);
264 __m128i blue_mask = _mm_set1_epi32(0x000000ff);
// The shift counts below are whole-register byte shifts; the mask applied after
// each shift discards anything that crossed into a neighbouring element.
// Alpha goes to byte 3: from byte 2 for dest0, from byte 0 for dest1.
266 dest0 = _mm_and_si128(alpha_mask, _mm_slli_si128(alpha, 1));
267 dest1 = _mm_and_si128(alpha_mask, _mm_slli_si128(alpha, 3));
// Red goes to byte 2: already there for dest0, from byte 0 for dest1.
269 dest0 = _mm_or_si128(dest0, _mm_and_si128(red_mask, red));
270 dest1 = _mm_or_si128(dest1, _mm_and_si128(red_mask, _mm_slli_si128(red, 2)));
// Green goes to byte 1: from byte 2 for dest0, from byte 0 for dest1.
272 dest0 = _mm_or_si128(dest0, _mm_and_si128(green_mask, _mm_srli_si128(green, 1)));
273 dest1 = _mm_or_si128(dest1, _mm_and_si128(green_mask, _mm_slli_si128(green, 1)));
// Blue goes to byte 0: from byte 2 for dest0, already there for dest1.
275 dest0 = _mm_or_si128(dest0, _mm_and_si128(blue_mask, _mm_srli_si128(blue, 2)));
276 dest1 = _mm_or_si128(dest1, _mm_and_si128(blue_mask, blue));
281 #define cl_blitargb8sse_sample_nearest(out0, tx, ty, data, width) \
283 __declspec(align(16)) unsigned int x[4], y[4]; \
284 _mm_store_si128((__m128i*) x, _mm_srai_epi32(tx, 16)); \
285 _mm_store_si128((__m128i*) y, _mm_srai_epi32(ty, 16)); \
286 out0 = _mm_set_epi32(data[x[0]+y[0]*width], data[x[1]+y[1]*width], data[x[2]+y[2]*width], data[x[3]+y[3]*width]); \
291 #define cl_blitargb8sse_sample_nearest(out0, tx, ty, data, width) \
293 __attribute__ ((aligned(16))) unsigned int x[4], y[4]; \
294 _mm_store_si128((__m128i*) x, _mm_srai_epi32(tx, 16)); \
295 _mm_store_si128((__m128i*) y, _mm_srai_epi32(ty, 16)); \
296 out0 = _mm_set_epi32(data[x[0]+y[0]*width], data[x[1]+y[1]*width], data[x[2]+y[2]*width], data[x[3]+y[3]*width]); \
304 #define cl_blitargb8sse_texture_repeat(tx, ty, width, height) \
308 __m128i compare_result = _mm_cmplt_epi32(tx, _mm_setzero_si128()); \
309 if (_mm_movemask_epi8(compare_result)) \
310 tx = _mm_add_epi32(tx, _mm_and_si128(compare_result, width)); \
316 __m128i compare_result = _mm_cmplt_epi32(tx, width); \
317 if (_mm_movemask_epi8(compare_result)!=0xffff) \
318 tx = _mm_sub_epi32(tx, _mm_andnot_si128(compare_result, width)); \
324 __m128i compare_result = _mm_cmplt_epi32(ty, _mm_setzero_si128()); \
325 if (_mm_movemask_epi8(compare_result)) \
326 ty = _mm_add_epi32(ty, _mm_and_si128(compare_result, height)); \
332 __m128i compare_result = _mm_cmplt_epi32(ty, height); \
333 if (_mm_movemask_epi8(compare_result)!=0xffff) \
334 ty = _mm_sub_epi32(ty, _mm_andnot_si128(compare_result, height)); \