/*
 *
 *  rgb2rgb.c, Software RGB to RGB converter
 *  Written by Nick Kurshev.
 *  palette stuff & yuv stuff by Michael
 */
#include <inttypes.h>
#include "../config.h"
#include "rgb2rgb.h"
#include "../mmx_defs.h"

#ifdef HAVE_MMX
static const uint64_t mask32   __attribute__((aligned(8))) = 0x00FFFFFF00FFFFFFULL;
static const uint64_t mask24l  __attribute__((aligned(8))) = 0x0000000000FFFFFFULL;
static const uint64_t mask24h  __attribute__((aligned(8))) = 0x0000FFFFFF000000ULL;
static const uint64_t mask15b  __attribute__((aligned(8))) = 0x001F001F001F001FULL; /* 00000000 00011111  xxB */
static const uint64_t mask15rg __attribute__((aligned(8))) = 0x7FE07FE07FE07FE0ULL; /* 01111111 11100000  RGx */
static const uint64_t mask15s  __attribute__((aligned(8))) = 0xFFE0FFE0FFE0FFE0ULL;
#endif

void rgb24to32(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
  uint8_t *dest = dst;
  const uint8_t *s = src;
  const uint8_t *end;
#ifdef HAVE_MMX
  uint8_t *mm_end;
#endif
  end = s + src_size;
#ifdef HAVE_MMX
  __asm __volatile(PREFETCH"	%0"::"m"(*s):"memory");
  mm_end = (uint8_t*)((((unsigned long)end)/(MMREG_SIZE*2))*(MMREG_SIZE*2));
  __asm __volatile("movq	%0, %%mm7"::"m"(mask32):"memory");
  if(mm_end == end) mm_end -= MMREG_SIZE*2;
  while(s < mm_end)
  {
    __asm __volatile(
	PREFETCH"	32%1\n\t"
	"movd	%1, %%mm0\n\t"
	"movd	3%1, %%mm1\n\t"
	"movd	6%1, %%mm2\n\t"
	"movd	9%1, %%mm3\n\t"
	"punpckldq %%mm1, %%mm0\n\t"
	"punpckldq %%mm3, %%mm2\n\t"
	"pand	%%mm7, %%mm0\n\t"
	"pand	%%mm7, %%mm2\n\t"
	MOVNTQ"	%%mm0, %0\n\t"
	MOVNTQ"	%%mm2, 8%0"
	:"=m"(*dest)
	:"m"(*s)
	:"memory");
    dest += 16;
    s += 12;
  }
  __asm __volatile(SFENCE:::"memory");
  __asm __volatile(EMMS:::"memory");
#endif
  while(s < end)
  {
    *dest++ = *s++;
    *dest++ = *s++;
    *dest++ = *s++;
    *dest++ = 0;
  }
}

void rgb32to24(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
  uint8_t *dest = dst;
  const uint8_t *s = src;
  const uint8_t *end;
#ifdef HAVE_MMX
  uint8_t *mm_end;
#endif
  end = s + src_size;
#ifdef HAVE_MMX
  __asm __volatile(PREFETCH"	%0"::"m"(*s):"memory");
  mm_end = (uint8_t*)((((unsigned long)end)/(MMREG_SIZE*2))*(MMREG_SIZE*2));
  __asm __volatile(
	"movq	%0, %%mm7\n\t"
	"movq	%1, %%mm6"
	::"m"(mask24l),"m"(mask24h):"memory");
  if(mm_end == end) mm_end -= MMREG_SIZE*2;
  while(s < mm_end)
  {
    __asm __volatile(
	PREFETCH"	32%1\n\t"
	"movq	%1, %%mm0\n\t"
	"movq	8%1, %%mm1\n\t"
	"movq	%%mm0, %%mm2\n\t"
	"movq	%%mm1, %%mm3\n\t"
	"psrlq	$8, %%mm2\n\t"
	"psrlq	$8, %%mm3\n\t"
	"pand	%%mm7, %%mm0\n\t"
	"pand	%%mm7, %%mm1\n\t"
	"pand	%%mm6, %%mm2\n\t"
	"pand	%%mm6, %%mm3\n\t"
	"por	%%mm2, %%mm0\n\t"
	"por	%%mm3, %%mm1\n\t"
	MOVNTQ"	%%mm0, %0\n\t"
	MOVNTQ"	%%mm1, 6%0"
	:"=m"(*dest)
	:"m"(*s)
	:"memory");
    dest += 12;
    s += 16;
  }
  __asm __volatile(SFENCE:::"memory");
  __asm __volatile(EMMS:::"memory");
#endif
  while(s < end)
  {
    *dest++ = *s++;
    *dest++ = *s++;
    *dest++ = *s++;
    s++;
  }
}

/*
 Original by Strepto/Astral
 ported to gcc & bugfixed: A'rpi
 MMX2, 3DNOW optimization by Nick Kurshev
 32-bit C version, and the and&add trick by Michael Niedermayer
*/
void rgb15to16(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
#ifdef HAVE_MMX
  register const char* s=src+src_size;
  register char* d=dst+src_size;
  register int offs=-src_size;
  __asm __volatile(PREFETCH"	%0"::"m"(*(s+offs)));
  __asm __volatile(
	"movq	%0, %%mm4\n\t"
	::"m"(mask15s));
  while(offs<0)
  {
	__asm __volatile(
		PREFETCH"	32%1\n\t"
		"movq	%1, %%mm0\n\t"
		"movq	8%1, %%mm2\n\t"
		"movq	%%mm0, %%mm1\n\t"
		"movq	%%mm2, %%mm3\n\t"
		"pand	%%mm4, %%mm0\n\t"
		"pand	%%mm4, %%mm2\n\t"
		"paddw	%%mm1, %%mm0\n\t"
		"paddw	%%mm3, %%mm2\n\t"
		MOVNTQ"	%%mm0, %0\n\t"
		MOVNTQ"	%%mm2, 8%0"
		:"=m"(*(d+offs))
		:"m"(*(s+offs))
		);
	offs+=16;
  }
  __asm __volatile(SFENCE:::"memory");
  __asm __volatile(EMMS:::"memory");
#else
#if 0
   const uint16_t *s1=( uint16_t * )src;
   uint16_t *d1=( uint16_t * )dst;
   const uint16_t *e=(const uint16_t *)((const uint8_t *)s1+src_size);
   while( s1<e ){
     register int x=*( s1++ );
     /* rrrrrggggggbbbbb
        0rrrrrgggggbbbbb
        0111 1111 1110 0000 = 0x7FE0
        0000 0000 0001 1111 = 0x001F */
     *( d1++ )=( x&0x001F )|( ( x&0x7FE0 )<<1 );
   }
#else
	const unsigned *s1=( unsigned * )src;
	unsigned *d1=( unsigned * )dst;
	int i;
	int size= src_size>>2;
	for(i=0; i<size; i++)
	{
		register int x= s1[i];
//		d1[i] = x + (x&0x7FE07FE0); // faster, but needs the MSB of each pixel to be 0, which might not always be true
		d1[i] = (x&0x7FFF7FFF) + (x&0x7FE07FE0);

	}
#endif
#endif
}
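
/*
 A scalar sketch of the and&add trick mentioned above (hypothetical helper,
 guarded out like the reference code): adding the RG field (mask 0x7FE0) of a
 0rrrrrgggggbbbbb pixel to the pixel once more shifts R and G up by one bit
 while B stays put, giving rrrrrggggg0bbbbb with the new green LSB cleared.
*/
#if 0
static uint16_t rgb15to16_one_pixel(uint16_t x)
{
	/* (x&0x7FFF) + (x&0x7FE0) == (x&0x001F) | ((x&0x7FE0)<<1) */
	return (uint16_t)((x & 0x7FFF) + (x & 0x7FE0));
}
#endif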

/**
 * Palette is assumed to contain bgr32
 */
void palette8torgb32(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
{
	unsigned i;
	for(i=0; i<num_pixels; i++)
		((unsigned *)dst)[i] = ((unsigned *)palette)[ src[i] ];
}
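
/*
 Sketch of that palette layout (hypothetical helper, guarded out): each of the
 256 entries is assumed to occupy 4 bytes in B, G, R, 0 order, the same byte
 order palette8torgb24() below reads, so palette8torgb32() can copy one 32 bit
 word per pixel.
*/
#if 0
static void example_fill_bgr32_palette(uint8_t *palette,
	const uint8_t *r, const uint8_t *g, const uint8_t *b)
{
	unsigned i;
	for(i=0; i<256; i++)
	{
		palette[4*i+0]= b[i];
		palette[4*i+1]= g[i];
		palette[4*i+2]= r[i];
		palette[4*i+3]= 0;
	}
}
#endif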

/**
 * Palette is assumed to contain bgr32
 */
void palette8torgb24(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
{
	unsigned i;
/*
	writes 1 byte too much and might cause alignment issues on some architectures?
	for(i=0; i<num_pixels; i++)
		*(unsigned *)(&dst[i*3]) = ((unsigned *)palette)[ src[i] ];
*/
	for(i=0; i<num_pixels; i++)
	{
		//FIXME slow?
		dst[0]= palette[ src[i]*4+0 ];
		dst[1]= palette[ src[i]*4+1 ];
		dst[2]= palette[ src[i]*4+2 ];
		dst+= 3;
	}
}

void rgb32to16(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
	unsigned j,i,num_pixels=src_size/4;
	uint16_t *d = (uint16_t *)dst;
	for(i=0,j=0; j<num_pixels; i+=4,j++)
	{
		const int b= src[i+0];
		const int g= src[i+1];
		const int r= src[i+2];

		d[j]= (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
	}
}

void rgb32to15(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
	unsigned j,i,num_pixels=src_size/4;
	uint16_t *d = (uint16_t *)dst;
	for(i=0,j=0; j<num_pixels; i+=4,j++)
	{
		const int b= src[i+0];
		const int g= src[i+1];
		const int r= src[i+2];

		d[j]= (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
	}
}

void rgb24to16(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
	unsigned j,i,num_pixels=src_size/3;
	uint16_t *d = (uint16_t *)dst;
	for(i=0,j=0; j<num_pixels; i+=3,j++)
	{
		const int b= src[i+0];
		const int g= src[i+1];
		const int r= src[i+2];

		d[j]= (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
	}
}

void rgb24to15(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
	unsigned j,i,num_pixels=src_size/3;
	uint16_t *d = (uint16_t *)dst;
	for(i=0,j=0; j<num_pixels; i+=3,j++)
	{
		const int b= src[i+0];
		const int g= src[i+1];
		const int r= src[i+2];

		d[j]= (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
	}
}

/**
 * Palette is assumed to contain bgr16, see rgb32to16 to convert the palette
 */
void palette8torgb16(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
{
	unsigned i;
	for(i=0; i<num_pixels; i++)
		((uint16_t *)dst)[i] = ((uint16_t *)palette)[ src[i] ];
}
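
/*
 Usage sketch (hypothetical names, guarded out): the palette handed to
 palette8torgb16() must already be bgr16, so a 256 entry bgr32 palette is
 first packed down with rgb32to16() and can then be reused for every frame.
*/
#if 0
static void example_palette8torgb16(const uint8_t *src8, uint8_t *dst16,
	unsigned num_pixels, const uint8_t *palette_bgr32)
{
	uint8_t palette_bgr16[256*2];

	rgb32to16(palette_bgr32, palette_bgr16, 256*4);	/* 256 entries of 4 bytes */
	palette8torgb16(src8, dst16, num_pixels, palette_bgr16);
}
#endif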

/**
 * Palette is assumed to contain bgr15, see rgb32to15 to convert the palette
 */
void palette8torgb15(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
{
	unsigned i;
	for(i=0; i<num_pixels; i++)
		((uint16_t *)dst)[i] = ((uint16_t *)palette)[ src[i] ];
}
/**
 *
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 * problem for anyone then tell me, and I'll fix it)
 */
void yv12toyuy2(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
	int width, int height, int lumStride, int chromStride, int dstStride)
{
	int y;
	const int chromWidth= width>>1;
	for(y=0; y<height; y++)
	{
#ifdef HAVE_MMX
//FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely limited by memory anyway)
		asm volatile(
			"xorl %%eax, %%eax		\n\t"
			"1:				\n\t"
			PREFETCH" 32(%1, %%eax, 2)	\n\t"
			PREFETCH" 32(%2, %%eax)		\n\t"
			PREFETCH" 32(%3, %%eax)		\n\t"
			"movq (%2, %%eax), %%mm0	\n\t" // U(0)
			"movq %%mm0, %%mm2		\n\t" // U(0)
			"movq (%3, %%eax), %%mm1	\n\t" // V(0)
			"punpcklbw %%mm1, %%mm0		\n\t" // UVUV UVUV(0)
			"punpckhbw %%mm1, %%mm2		\n\t" // UVUV UVUV(8)

			"movq (%1, %%eax,2), %%mm3	\n\t" // Y(0)
			"movq 8(%1, %%eax,2), %%mm5	\n\t" // Y(8)
			"movq %%mm3, %%mm4		\n\t" // Y(0)
			"movq %%mm5, %%mm6		\n\t" // Y(8)
			"punpcklbw %%mm0, %%mm3		\n\t" // YUYV YUYV(0)
			"punpckhbw %%mm0, %%mm4		\n\t" // YUYV YUYV(4)
			"punpcklbw %%mm2, %%mm5		\n\t" // YUYV YUYV(8)
			"punpckhbw %%mm2, %%mm6		\n\t" // YUYV YUYV(12)

			MOVNTQ" %%mm3, (%0, %%eax, 4)	\n\t"
			MOVNTQ" %%mm4, 8(%0, %%eax, 4)	\n\t"
			MOVNTQ" %%mm5, 16(%0, %%eax, 4)	\n\t"
			MOVNTQ" %%mm6, 24(%0, %%eax, 4)	\n\t"

			"addl $8, %%eax			\n\t"
			"cmpl %4, %%eax			\n\t"
			" jb 1b				\n\t"
			::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "r" (chromWidth)
			: "%eax"
		);
#else
		int i;
		for(i=0; i<chromWidth; i++)
		{
			dst[4*i+0] = ysrc[2*i+0];
			dst[4*i+1] = usrc[i];
			dst[4*i+2] = ysrc[2*i+1];
			dst[4*i+3] = vsrc[i];
		}
#endif
		if(y&1)
		{
			usrc += chromStride;
			vsrc += chromStride;
		}
		ysrc += lumStride;
		dst += dstStride;
	}
#ifdef HAVE_MMX
asm(    EMMS" \n\t"
        SFENCE" \n\t"
        :::"memory");
#endif
}
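
/*
 Usage sketch (hypothetical buffer names, guarded out): packing a contiguous
 width x height YV12 image into YUY2 with plain strides; as noted above, width
 is assumed to be a multiple of 16 and height a multiple of 2.
*/
#if 0
static void example_yv12toyuy2(const uint8_t *y, const uint8_t *u, const uint8_t *v,
	uint8_t *yuy2, int width, int height)
{
	yv12toyuy2(y, u, v, yuy2, width, height,
		width,		/* lumStride: 1 luma byte per pixel */
		width/2,	/* chromStride: chroma is subsampled 2x horizontally */
		width*2);	/* dstStride: YUY2 packs 2 bytes per pixel */
}
#endif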

/**
 *
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 * problem for anyone then tell me, and I'll fix it)
 */
void yuy2toyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
	int width, int height, int lumStride, int chromStride, int srcStride)
{
	int y;
	const int chromWidth= width>>1;
	for(y=0; y<height; y+=2)
	{
#ifdef HAVE_MMX
		asm volatile(
			"xorl %%eax, %%eax		\n\t"
			"pcmpeqw %%mm7, %%mm7		\n\t"
			"psrlw $8, %%mm7		\n\t" // FF,00,FF,00...
			"1:				\n\t"
			PREFETCH" 64(%0, %%eax, 4)	\n\t"
			"movq (%0, %%eax, 4), %%mm0	\n\t" // YUYV YUYV(0)
			"movq 8(%0, %%eax, 4), %%mm1	\n\t" // YUYV YUYV(4)
			"movq %%mm0, %%mm2		\n\t" // YUYV YUYV(0)
			"movq %%mm1, %%mm3		\n\t" // YUYV YUYV(4)
			"psrlw $8, %%mm0		\n\t" // U0V0 U0V0(0)
			"psrlw $8, %%mm1		\n\t" // U0V0 U0V0(4)
			"pand %%mm7, %%mm2		\n\t" // Y0Y0 Y0Y0(0)
			"pand %%mm7, %%mm3		\n\t" // Y0Y0 Y0Y0(4)
			"packuswb %%mm1, %%mm0		\n\t" // UVUV UVUV(0)
			"packuswb %%mm3, %%mm2		\n\t" // YYYY YYYY(0)

			MOVNTQ" %%mm2, (%1, %%eax, 2)	\n\t"

			"movq 16(%0, %%eax, 4), %%mm1	\n\t" // YUYV YUYV(8)
			"movq 24(%0, %%eax, 4), %%mm2	\n\t" // YUYV YUYV(12)
			"movq %%mm1, %%mm3		\n\t" // YUYV YUYV(8)
			"movq %%mm2, %%mm4		\n\t" // YUYV YUYV(12)
			"psrlw $8, %%mm1		\n\t" // U0V0 U0V0(8)
			"psrlw $8, %%mm2		\n\t" // U0V0 U0V0(12)
			"pand %%mm7, %%mm3		\n\t" // Y0Y0 Y0Y0(8)
			"pand %%mm7, %%mm4		\n\t" // Y0Y0 Y0Y0(12)
			"packuswb %%mm2, %%mm1		\n\t" // UVUV UVUV(8)
			"packuswb %%mm4, %%mm3		\n\t" // YYYY YYYY(8)

			MOVNTQ" %%mm3, 8(%1, %%eax, 2)	\n\t"

			"movq %%mm0, %%mm2		\n\t" // UVUV UVUV(0)
			"movq %%mm1, %%mm3		\n\t" // UVUV UVUV(8)
			"psrlw $8, %%mm0		\n\t" // V0V0 V0V0(0)
			"psrlw $8, %%mm1		\n\t" // V0V0 V0V0(8)
			"pand %%mm7, %%mm2		\n\t" // U0U0 U0U0(0)
			"pand %%mm7, %%mm3		\n\t" // U0U0 U0U0(8)
			"packuswb %%mm1, %%mm0		\n\t" // VVVV VVVV(0)
			"packuswb %%mm3, %%mm2		\n\t" // UUUU UUUU(0)

			MOVNTQ" %%mm0, (%3, %%eax)	\n\t"
			MOVNTQ" %%mm2, (%2, %%eax)	\n\t"

			"addl $8, %%eax			\n\t"
			"cmpl %4, %%eax			\n\t"
			" jb 1b				\n\t"

			"1:				\n\t"
			PREFETCH" 64(%0, %%eax, 4)	\n\t"
			"movq (%0, %%eax, 4), %%mm0	\n\t" // YUYV YUYV(0)
			"movq 8(%0, %%eax, 4), %%mm1	\n\t" // YUYV YUYV(4)
			"movq 16(%0, %%eax, 4), %%mm2	\n\t" // YUYV YUYV(8)
			"movq 24(%0, %%eax, 4), %%mm3	\n\t" // YUYV YUYV(12)
			"pand %%mm7, %%mm0		\n\t" // Y0Y0 Y0Y0(0)
			"pand %%mm7, %%mm1		\n\t" // Y0Y0 Y0Y0(4)
			"pand %%mm7, %%mm2		\n\t" // Y0Y0 Y0Y0(8)
			"pand %%mm7, %%mm3		\n\t" // Y0Y0 Y0Y0(12)
			"packuswb %%mm1, %%mm0		\n\t" // YYYY YYYY(0)
			"packuswb %%mm3, %%mm2		\n\t" // YYYY YYYY(8)

			MOVNTQ" %%mm0, (%1, %%eax, 2)	\n\t"
			MOVNTQ" %%mm2, 8(%1, %%eax, 2)	\n\t"

			"addl $8, %%eax			\n\t"
			"cmpl %5, %%eax			\n\t"
			" jb 1b				\n\t"

			::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth), "m"(width)
			: "memory", "%eax"
		);
#else
		int i;
		for(i=0; i<chromWidth; i++)
		{
			ydst[2*i+0] 	= src[4*i+0];
			udst[i] 	= src[4*i+1];
			ydst[2*i+1] 	= src[4*i+2];
			vdst[i] 	= src[4*i+3];
		}
		ydst += lumStride;
		src  += srcStride;

		for(i=0; i<chromWidth; i++)
		{
			ydst[2*i+0] 	= src[4*i+0];
			ydst[2*i+1] 	= src[4*i+2];
		}
#endif
		udst += chromStride;
		vdst += chromStride;
		ydst += lumStride;
		src  += srcStride;
	}
#ifdef HAVE_MMX
asm(    EMMS" \n\t"
        SFENCE" \n\t"
        :::"memory");
#endif
}
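
/*
 Usage sketch (hypothetical buffer names, guarded out): splitting a packed
 YUY2 image back into Y, U and V planes with plain strides; the same
 width/height restrictions noted above apply.
*/
#if 0
static void example_yuy2toyv12(const uint8_t *yuy2, uint8_t *y, uint8_t *u, uint8_t *v,
	int width, int height)
{
	yuy2toyv12(yuy2, y, u, v, width, height,
		width,		/* lumStride */
		width/2,	/* chromStride */
		width*2);	/* srcStride: YUY2 packs 2 bytes per pixel */
}
#endif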