ijksdl: add yuv2rgb neon optimize

7fcda743 · Zhang Rui · 6800079e · 7fcda743 · 7fcda743 · 7fcda743
16 changed file
--- a/ijkmediaplayer/jni/Application.mk
+++ b/ijkmediaplayer/jni/Application.mk
@@ -30,7 +30,7 @@ APP_CFLAGS := -O3 -Wall -pipe \
    -Wno-psabi -Wa,--noexecstack \
    -DANDROID -DNDEBUG

-# -D__ARM_ARCH_5__ -D__ARM_ARCH_5E__ -D__ARM_ARCH_5T__ -D__ARM_ARCH_5TE__
+# APP_CFLAGS += -D__ARM_ARCH_5__ -D__ARM_ARCH_5E__ -D__ARM_ARCH_5T__ -D__ARM_ARCH_5TE__

 # armeabi-v7a
 APP_CFLAGS += -march=armv7-a -mfpu=vfpv3-d16 -mfloat-abi=softfp

--- a/ijkmediaplayer/jni/ijkplayer/ff_ffplay.c
+++ b/ijkmediaplayer/jni/ijkplayer/ff_ffplay.c
@@ -682,7 +682,7 @@ static int queue_picture(FFPlayer *ffp, AVFrame *src_frame, double pts, int64_t
        SDL_VoutLockYUVOverlay(vp->bmp);

        if (SDL_VoutFFmpeg_ConvertPicture(vp->bmp, vp->width, vp->height,
-            src_frame->format, src_frame->data, src_frame->linesize,
+            src_frame->format, (const uint8_t**)src_frame->data, src_frame->linesize,
            &is->img_convert_ctx, ffp->sws_flags) < 0) {
            fprintf(stderr, "Cannot initialize the conversion context\n");
            exit(1);

--- a/ijkmediaplayer/jni/ijkplayer/ff_ffplay_def.h
+++ b/ijkmediaplayer/jni/ijkplayer/ff_ffplay_def.h
@@ -464,8 +464,8 @@ inline static void ffp_reset_internal(FFPlayer *ffp)
    ffp->sar_den                = 0;

    // ffp->overlay_format         = SDL_FCC_YV12;
-    ffp->overlay_format         = SDL_FCC_RV16;
-    // ffp->overlay_format         = SDL_FCC_RV32;
+    // ffp->overlay_format         = SDL_FCC_RV16;
+    ffp->overlay_format         = SDL_FCC_RV32;

    ffp->last_error             = 0;
    ffp->prepared               = 0;

--- a/ijkmediaplayer/jni/ijksdl/Android.mk
+++ b/ijkmediaplayer/jni/ijksdl/Android.mk
@@ -36,6 +36,15 @@ LOCAL_SRC_FILES += ijksdl_vout.c

 LOCAL_SRC_FILES += ffmpeg/ijksdl_vout_overlay_ffmpeg.c

+ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
+LOCAL_CFLAGS += -DHAVE_NEON=1
+LOCAL_SRC_FILES += ffmpeg/abi_armv7a_neon/image_convert.c
+LOCAL_SRC_FILES += ffmpeg/abi_armv7a_neon/i420_rgb.S.arm.neon
+LOCAL_SRC_FILES += ffmpeg/abi_armv7a_neon/i420_rv16.S.arm.neon
+else
+LOCAL_SRC_FILES += ffmpeg/abi_all/image_convert.c
+endif
+
 LOCAL_SRC_FILES += android/android_audiotrack.c
 LOCAL_SRC_FILES += android/android_nativewindow.c
 LOCAL_SRC_FILES += android/ijksdl_android_jni.c
@@ -44,6 +53,9 @@ LOCAL_SRC_FILES += android/ijksdl_vout_android_nativewindow.c
 LOCAL_SRC_FILES += android/ijksdl_vout_android_surface.c

 LOCAL_SHARED_LIBRARIES := ffmpeg ijkutil
+LOCAL_STATIC_LIBRARIES := cpufeatures

 LOCAL_MODULE := ijksdl
 include $(BUILD_SHARED_LIBRARY)
+
+$(call import-module,android/cpufeatures)
--- a/ijkmediaplayer/jni/ijksdl/android/android_nativewindow.c
+++ b/ijkmediaplayer/jni/ijksdl/android/android_nativewindow.c
@@ -237,7 +237,7 @@ int sdl_native_window_display_l(ANativeWindow *native_window, SDL_VoutOverlay *o
    }

    if (voutDesc->hal_format != overlayDesc->hal_format) {
-        SDLTRACE("ANativeWindow_setBuffersGeometry: w=%d, h=%d, f=%.4s(0x%x) => w=%d, h=%d, f=%.4s(0x%x)",
+        ALOGD("ANativeWindow_setBuffersGeometry: w=%d, h=%d, f=%.4s(0x%x) => w=%d, h=%d, f=%.4s(0x%x)",
            curr_w, curr_h, (char*) &curr_format, curr_format,
            buff_w, buff_h, (char*) &overlay->format, overlay->format);
        retval = ANativeWindow_setBuffersGeometry(native_window, buff_w, buff_h, overlayDesc->hal_format);

--- a/ijkmediaplayer/jni/ijksdl/ffmpeg/abi_all/image_convert.c
+++ b/ijkmediaplayer/jni/ijksdl/ffmpeg/abi_all/image_convert.c
+/*****************************************************************************
+ * yuv_rgb.c : ARM NEONv1 YUV to RGB32 chroma conversion for VLC
+ *****************************************************************************
+ * Copyright (C) 2011 Sébastien Toque
+ *                    Rémi Denis-Courmont
+ * Copyright (C) 2013 Zhang Rui <bbcallen@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+ *****************************************************************************/
+
+#include "../ijksdl_image_convert.h"
+
+/*
+ * Fallback used on ABIs that have no accelerated converter.
+ *
+ * No argument is read and no pixel is written; the function always reports
+ * "not converted" by returning -1.
+ */
+int ijk_image_convert(int width, int height,
+    enum AVPixelFormat dst_format, uint8_t **dst_data, int *dst_linesize,
+    enum AVPixelFormat src_format, const uint8_t **src_data, int *src_linesize)
+{
+    const int not_converted = -1;
+
+    /* Nothing is consulted in this build. */
+    (void)width;
+    (void)height;
+    (void)dst_format;
+    (void)dst_data;
+    (void)dst_linesize;
+    (void)src_format;
+    (void)src_data;
+    (void)src_linesize;
+
+    return not_converted;
+}
+
--- a/ijkmediaplayer/jni/ijksdl/ffmpeg/abi_armv7a_neon/chroma_neon.h
+++ b/ijkmediaplayer/jni/ijksdl/ffmpeg/abi_armv7a_neon/chroma_neon.h
+/*****************************************************************************
+ * chroma_neon.h
+ *****************************************************************************
+ * Copyright (C) 2011 Rémi Denis-Courmont
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+ *****************************************************************************/
+
+/* Planes must start on a 16-bytes boundary. Pitches must be multiples of 16
+ * bytes even for subsampled components. */
+
+/* Planar picture buffer.
+ * Pitch corresponds to luminance component in bytes. Chrominance pitches are
+ * inferred from the color subsampling ratio. */
+struct yuv_planes
+{
+    void *y, *u, *v;
+    size_t pitch;
+};
+
+struct yuv_planes_in
+{
+    const void *y, *u, *v;
+    size_t pitch;
+};
+
+/* Packed picture buffer. Pitch is in bytes (_not_ pixels). */
+struct yuv_pack
+{
+    void *yuv;
+    size_t pitch;
+};
+
+/* I420 to RGBA conversion. */
+void i420_rgb_neon (struct yuv_pack *const out,
+                    const struct yuv_planes_in *const in,
+                    int width, int height) __asm__("i420_rgb_neon");
+
+/* I420 to RV16 conversion. */
+void i420_rv16_neon (struct yuv_pack *const out,
+                     const struct yuv_planes_in *const in,
+                     int width, int height) __asm__("i420_rv16_neon");
+
+/* NV21 to RGBA conversion. */
+void nv21_rgb_neon (struct yuv_pack *const out,
+                    const struct yuv_planes_in *const in,
+                    int width, int height) __asm__("nv21_rgb_neon");
+
+/* NV12 to RGBA conversion. */
+void nv12_rgb_neon (struct yuv_pack *const out,
+                    const struct yuv_planes_in *const in,
+                    int width, int height) __asm__("nv12_rgb_neon");
--- a/ijkmediaplayer/jni/ijksdl/ffmpeg/abi_armv7a_neon/i420_rgb.S
+++ b/ijkmediaplayer/jni/ijksdl/ffmpeg/abi_armv7a_neon/i420_rgb.S
+ @*****************************************************************************
+ @ i420_rgb.S : ARM NEONv1 I420 to RGB chroma conversion
+ @*****************************************************************************
+ @ Copyright (C) 2011 Sébastien Toque
+ @                    Rémi Denis-Courmont
+ @
+ @ This program is free software; you can redistribute it and/or modify it
+ @ under the terms of the GNU Lesser General Public License as published by
+ @ the Free Software Foundation; either version 2.1 of the License, or
+ @ (at your option) any later version.
+ @
+ @ This program is distributed in the hope that it will be useful,
+ @ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ @ GNU Lesser General Public License for more details.
+ @
+ @ You should have received a copy of the GNU Lesser General Public License
+ @ along with this program; if not, write to the Free Software Foundation,
+ @ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+ @****************************************************************************/
+
+	.syntax unified
+	.fpu neon
+	.text
+
+/* ARM */
+#define O1	r0
+#define O2	r1
+#define WIDTH	r2
+#define HEIGHT	r3
+#define Y1	r4
+#define Y2	r5
+#define U	r6
+#define V	r7
+#define YPITCH	r8
+#define OPAD	r10
+#define YPAD	r11
+#define COUNT	ip
+#define OPITCH	lr
+
+/* NEON */
+#define coefY	D0
+#define coefRV	D1
+#define coefGU	D2
+#define coefGV	D3
+#define coefBU	D4
+#define Rc	Q3
+#define Gc	Q4
+#define Bc	Q5
+
+#define u	D24
+#define v	D25
+#define y1	D18
+#define y2	D19
+
+#define chro_r	Q6
+#define chro_g	Q7
+#define chro_b	Q8
+#define lumi1	Q15
+#define lumi2	Q10
+#define red16_1		Q9
+#define green16_1	Q10
+#define blue16_1	Q11
+#define red16_2		Q12
+#define green16_2	Q13
+#define blue16_2	Q14
+
+/* The 8-bit results below alias the 16-bit temporaries above:
+ * D24/D25 = Q12, D26/D27 = Q13, D28/D29 = Q14, D30/D31 = Q15.
+ * In particular alpha1 (D27) and alpha2 (D31) are overwritten whenever
+ * Q13 or Q15 is written, so they must be reloaded before every store. */
+#define red1	D24
+#define green1	D25
+#define blue1	D26
+#define alpha1	D27
+#define red2	D28
+#define green2	D29
+#define blue2	D30
+#define alpha2	D31
+
+/* Signed 16-bit offsets broadcast into Rc, Gc and Bc (in that order). */
+coefficients:
+    .short  -15872
+    .short    4992
+    .short  -18432
+
+/*
+ * void i420_rgb_neon(struct yuv_pack *out, const struct yuv_planes_in *in,
+ *                    int width, int height)
+ *
+ * r0 = out {pointer, pitch}, r1 = in {y, u, v, pitch}, r2 = width, r3 = height.
+ * Converts 16 pixels of two adjacent rows per inner iteration and writes
+ * 4 bytes per pixel in R, G, B, 255 order. The width is rounded up to a
+ * multiple of 16 and rows are consumed in pairs. U and V advance by half the
+ * luma pitch per row pair.
+ */
+	.align 2
+	.global i420_rgb_neon
+	.type	i420_rgb_neon, %function
+i420_rgb_neon:
+	push		{r4-r8,r10-r11,lr}
+	vpush		{q4-q7}
+
+	/* load arguments */
+	ldmia		r0,	{O1, OPITCH}
+	ldmia		r1,	{Y1, U, V, YPITCH}
+
+	/* round the width to be a multiple of 16 */
+	ands		OPAD, WIDTH, #15
+	sub			WIDTH, WIDTH, OPAD
+	addne		WIDTH, WIDTH, #16
+
+	/* init constants (scale value by 64) */
+	vmov.u8		coefY, #74
+	vmov.u8		coefRV, #115
+	vmov.u8		coefGU, #14
+	vmov.u8		coefGV, #34
+	vmov.u8		coefBU, #135
+	adr			OPAD, coefficients
+	vld1.s16	{d6[], d7[]}, [OPAD]!
+	vld1.s16	{d8[], d9[]}, [OPAD]!
+	vld1.s16	{d10[], d11[]}, [OPAD]!
+	vmov.u8		alpha1, #255
+
+	/* init padding */
+	/* the flags set by this cmp are consumed by the first pass of loop_row */
+	cmp			HEIGHT,	#0
+	sub			OPAD,	OPITCH,	WIDTH, lsl #2
+	sub			YPAD,	YPITCH,	WIDTH
+
+loop_row:
+	movsgt	COUNT,	WIDTH
+	add		O2,	O1,	OPITCH
+	add		Y2,	Y1,	YPITCH
+	/* exit if all rows have been processed */
+	vpople	{q4-q7}
+	pople	{r4-r8,r10-r11,pc}
+
+loop_col:
+
+	/* Common U & V */
+
+	vld1.u8	{u}, [U,:64]!
+	vld1.u8	{v}, [V,:64]!
+
+	/* Y Top Row */
+	vld2.u8	{y1,y2}, [Y1,:128]!
+
+	vmull.u8	Q14, v, coefRV
+	vmull.u8	Q11, u, coefGU
+	vmull.u8	Q13, u, coefBU
+	vmlal.u8	Q11, v, coefGV
+
+	vmull.u8	lumi2, y2, coefY
+	vmull.u8	lumi1, y1, coefY
+	vadd.s16	chro_r, Rc, Q14
+	vadd.s16	chro_b, Bc, Q13
+	vsub.s16	chro_g, Gc, Q11
+
+	pld	[U]
+	pld	[V]
+
+	/* chrominance + luminance */
+	vqadd.s16	red16_2, lumi2, chro_r
+	vqadd.s16	blue16_2, lumi2, chro_b
+	vqadd.s16	green16_2, lumi2, chro_g
+	vqadd.s16	red16_1, lumi1, chro_r
+	vqadd.s16	green16_1, lumi1, chro_g
+	vqadd.s16	blue16_1, lumi1, chro_b
+
+	/* clamp (divide by 64) */
+	vqrshrun.s16	blue2, blue16_2, #6
+	vqrshrun.s16	red2, red16_2, #6
+	vqrshrun.s16	green2, green16_2, #6
+	vqrshrun.s16	red1, red16_1, #6
+	vqrshrun.s16	green1, green16_1, #6
+	vqrshrun.s16	blue1, blue16_1, #6
+
+	pld	[Y1]
+
+	/* Y Bottom Row */
+	vld2.u8	{y1,y2}, [Y2,:128]!
+
+	/* D27 still holds part of green16_2 and D31 part of lumi1: reload both
+	 * so that every pixel of the top row gets 255 in its fourth byte */
+	vmov.u8	alpha1, #255
+	vmov.u8	alpha2, #255
+	vzip.u8	red1, red2
+	vzip.u8	green1, green2
+	vzip.u8	blue1, blue2
+
+	vmull.u8	lumi2, y2, coefY
+	vst4.u8		{red1,green1,blue1,alpha1}, [O1,:128]!
+	vst4.u8		{red2,green2,blue2,alpha2}, [O1,:128]!
+
+	/* chrominance + luminance */
+	vmull.u8	lumi1, y1, coefY
+	vqadd.s16	red16_2, lumi2, chro_r
+	vqadd.s16	green16_2, lumi2, chro_g
+	vqadd.s16	blue16_2, lumi2, chro_b
+	vqadd.s16	red16_1, lumi1, chro_r
+	vqadd.s16	green16_1, lumi1, chro_g
+	vqadd.s16	blue16_1, lumi1, chro_b
+
+	/* clamp (divide by 64) */
+	vqrshrun.s16	blue2, blue16_2, #6
+	vqrshrun.s16	red2, red16_2, #6
+	vqrshrun.s16	green2, green16_2, #6
+	vqrshrun.s16	red1, red16_1, #6
+	vqrshrun.s16	green1, green16_1, #6
+	vqrshrun.s16	blue1, blue16_1, #6
+
+	pld	[Y2]
+
+	/* both were clobbered again by green16_2 (Q13) and lumi1 (Q15) */
+	vmov.u8	alpha1, #255
+	vmov.u8	alpha2, #255
+	vzip.u8	red1, red2
+	vzip.u8	green1, green2
+	vzip.u8	blue1, blue2
+
+	vst4.u8		{red1,green1,blue1,alpha1}, [O2,:128]!
+	vst4.u8		{red2,green2,blue2,alpha2}, [O2,:128]!
+
+	/* next columns (x16) */
+	subs	COUNT,	COUNT,	#16
+	bgt		loop_col
+
+	/* next rows (x2) */
+	/* subs sets the flags tested at loop_row; the adds leave them alone */
+	subs	HEIGHT,	#2
+	add		O1,	O2,	OPAD
+	add		Y1,	Y2,	YPAD
+	add		U,	U,	YPAD,	lsr #1
+	add		V,	V,	YPAD,	lsr #1
+	b		loop_row
--- a/ijkmediaplayer/jni/ijksdl/ffmpeg/abi_armv7a_neon/i420_rv16.S
+++ b/ijkmediaplayer/jni/ijksdl/ffmpeg/abi_armv7a_neon/i420_rv16.S
+ @*****************************************************************************
+ @ i420_rv16.S : ARM NEONv1 I420 to RV16 chroma conversion
+ @*****************************************************************************
+ @ Copyright (C) 2011 Sébastien Toque
+ @                    Rémi Denis-Courmont
+ @
+ @ This program is free software; you can redistribute it and/or modify it
+ @ under the terms of the GNU Lesser General Public License as published by
+ @ the Free Software Foundation; either version 2.1 of the License, or
+ @ (at your option) any later version.
+ @
+ @ This program is distributed in the hope that it will be useful,
+ @ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ @ GNU Lesser General Public License for more details.
+ @
+ @ You should have received a copy of the GNU Lesser General Public License
+ @ along with this program; if not, write to the Free Software Foundation,
+ @ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+ @****************************************************************************/
+
+	.syntax unified
+	.fpu neon
+	.text
+
+/* ARM */
+#define O1	r0
+#define O2	r1
+#define WIDTH	r2
+#define HEIGHT	r3
+#define Y1	r4
+#define Y2	r5
+#define U	r6
+#define V	r7
+#define YPITCH	r8
+#define OPAD	r10
+#define YPAD	r11
+#define COUNT	ip
+#define OPITCH	lr
+
+/* NEON */
+#define coefY	D0
+#define coefRV	D1
+#define coefGU	D2
+#define coefGV	D3
+#define coefBU	D4
+#define Rc	Q3
+#define Gc	Q4
+#define Bc	Q5
+
+#define u	D24
+#define v	D25
+#define y1	D18
+#define y2	D19
+
+#define chro_r	Q6
+#define chro_g	Q7
+#define chro_b	Q8
+#define lumi1	Q15
+#define lumi2	Q10
+#define red16_1		Q9
+#define green16_1	Q10
+#define blue16_1	Q11
+#define red16_2		Q12
+#define green16_2	Q13
+#define blue16_2	Q14
+
+#define red1	D25
+#define green1	D26
+#define blue1	D27
+#define red2	D29
+#define green2	D30
+#define blue2	D31
+
+/* out1h/out2h are the same registers as red1/red2: the high output byte is
+ * built in place on top of the red value. */
+#define out1l	D24
+#define out1h	D25
+#define out2l	D28
+#define out2h	D29
+
+/* Signed 16-bit offsets broadcast into Rc, Gc and Bc (in that order). */
+coefficients:
+    .short  -15872
+    .short    4992
+    .short  -18432
+
+/*
+ * void i420_rv16_neon(struct yuv_pack *out, const struct yuv_planes_in *in,
+ *                     int width, int height)
+ *
+ * r0 = out {pointer, pitch}, r1 = in {y, u, v, pitch}, r2 = width, r3 = height.
+ * Converts 16 pixels of two adjacent rows per inner iteration and writes
+ * 2 bytes per pixel: low byte = (G << 3) | (B >> 3), high byte = top five
+ * bits of R | (G >> 5). The width is rounded up to a multiple of 16 and rows
+ * are consumed in pairs. U and V advance by half the luma pitch per row pair.
+ */
+	.align 2
+	.global i420_rv16_neon
+	.type	i420_rv16_neon, %function
+i420_rv16_neon:
+	push		{r4-r8,r10-r11,lr}
+	vpush		{q4-q7}
+
+	/* load arguments */
+	ldmia		r0,	{O1, OPITCH}
+	ldmia		r1,	{Y1, U, V, YPITCH}
+
+	/* round the width to be a multiple of 16 */
+	ands		OPAD, WIDTH, #15
+	sub			WIDTH, WIDTH, OPAD
+	addne		WIDTH, WIDTH, #16
+
+	/* init constants (scale value by 64) */
+	vmov.u8		coefY, #74
+	vmov.u8		coefRV, #115
+	vmov.u8		coefGU, #14
+	vmov.u8		coefGV, #34
+	vmov.u8		coefBU, #135
+	adr			OPAD, coefficients
+	vld1.s16	{d6[], d7[]}, [OPAD]!
+	vld1.s16	{d8[], d9[]}, [OPAD]!
+	vld1.s16	{d10[], d11[]}, [OPAD]!
+
+	/* init padding */
+	/* the flags set by this cmp are consumed by the first pass of loop_row */
+	cmp			HEIGHT,	#0
+	sub			OPAD,	OPITCH,	WIDTH, lsl #1
+	sub			YPAD,	YPITCH,	WIDTH
+
+loop_row:
+	movsgt	COUNT,	WIDTH
+	add		O2,	O1,	OPITCH
+	add		Y2,	Y1,	YPITCH
+	/* exit if all rows have been processed */
+	vpople	{q4-q7}
+	pople	{r4-r8,r10-r11,pc}
+
+loop_col:
+
+	/* Common U & V */
+
+	vld1.u8	{u}, [U,:64]!
+	vld1.u8	{v}, [V,:64]!
+
+	/* Y Top Row */
+	vld2.u8	{y1,y2}, [Y1,:128]!
+
+	vmull.u8	Q14, v, coefRV
+	vmull.u8	Q11, u, coefGU
+	vmull.u8	Q13, u, coefBU
+	vmlal.u8	Q11, v, coefGV
+
+	vmull.u8	lumi2, y2, coefY
+	vmull.u8	lumi1, y1, coefY
+	vadd.s16	chro_r, Rc, Q14
+	vadd.s16	chro_b, Bc, Q13
+	vsub.s16	chro_g, Gc, Q11
+
+	pld	[U]
+	pld	[V]
+
+	/* chrominance + luminance */
+	vqadd.s16	red16_2, lumi2, chro_r
+	vqadd.s16	green16_2, lumi2, chro_g
+	vqadd.s16	blue16_2, lumi2, chro_b
+	vqadd.s16	red16_1, lumi1, chro_r
+	vqadd.s16	green16_1, lumi1, chro_g
+	vqadd.s16	blue16_1, lumi1, chro_b
+
+	/* clamp (divide by 64) */
+	vqrshrun.s16	green2, green16_2, #6
+	vqrshrun.s16	blue2, blue16_2, #6
+	vqrshrun.s16	red2, red16_2, #6
+	vqrshrun.s16	green1, green16_1, #6
+	vqrshrun.s16	red1, red16_1, #6
+	vqrshrun.s16	blue1, blue16_1, #6
+
+	pld	[Y1]
+
+	/* pack into RGB565 */
+	vshl.u8	out2l, green2, #3 // low 2a
+	vsri.u8	out2h, green2, #5 // high 2
+	vshl.u8	out1l, green1, #3 // low 1a
+	vsri.u8	out1h, green1, #5 // high 1
+	vsri.u8	out2l, blue2, #3 // low 2b
+	vsri.u8	out1l, blue1, #3 // low 1b
+
+	/* Y Bottom Row */
+	vld2.u8	{y1,y2}, [Y2,:128]!
+
+	/* Top Row output */
+	vzip.u8	out1h, out2h
+	vmull.u8	lumi2, y2, coefY
+	vzip.u8	out1l, out2l
+	vmull.u8	lumi1, y1, coefY
+	vst2.u8	{out1l, out1h}, [O1,:128]!
+	vst2.u8	{out2l, out2h}, [O1,:128]!
+
+	/* chrominance + luminance */
+	vqadd.s16	green16_2, lumi2, chro_g
+	vqadd.s16	red16_2, lumi2, chro_r
+	vqadd.s16	blue16_2, lumi2, chro_b
+	vqadd.s16	red16_1, lumi1, chro_r
+	vqadd.s16	green16_1, lumi1, chro_g
+	vqadd.s16	blue16_1, lumi1, chro_b
+
+	/* clamp (divide by 64) */
+	vqrshrun.s16	green2, green16_2, #6
+	vqrshrun.s16	blue2, blue16_2, #6
+	vqrshrun.s16	red2, red16_2, #6
+	vqrshrun.s16	green1, green16_1, #6
+	vqrshrun.s16	red1, red16_1, #6
+	vqrshrun.s16	blue1, blue16_1, #6
+
+	/* prefetch the bottom-row pointer; the top-row one was hinted above */
+	pld	[Y2]
+
+	/* pack into RGB565 */
+	vshl.u8	out2l, green2, #3 // low 2a
+	vsri.u8	out2h, green2, #5 // high 2
+	vshl.u8	out1l, green1, #3 // low 1a
+	vsri.u8	out1h, green1, #5 // high 1
+	vsri.u8	out2l, blue2, #3 // low 2b
+	vsri.u8	out1l, blue1, #3 // low 1b
+
+	vzip.u8	out1h, out2h
+	vzip.u8	out1l, out2l
+	vst2.u8	{out1l, out1h}, [O2,:128]!
+	vst2.u8	{out2l, out2h}, [O2,:128]!
+
+	/* next columns (x16) */
+	subs	COUNT,	COUNT,	#16
+	bgt		loop_col
+
+	/* next rows (x2) */
+	/* subs sets the flags tested at loop_row; the adds leave them alone */
+	subs	HEIGHT,	#2
+	add		O1,	O2,	OPAD
+	add		Y1,	Y2,	YPAD
+	add		U,	U,	YPAD,	lsr #1
+	add		V,	V,	YPAD,	lsr #1
+	b		loop_row
--- a/ijkmediaplayer/jni/ijksdl/ffmpeg/abi_armv7a_neon/image_convert.c
+++ b/ijkmediaplayer/jni/ijksdl/ffmpeg/abi_armv7a_neon/image_convert.c
+/*****************************************************************************
+ * yuv_rgb.c : ARM NEONv1 YUV to RGB32 chroma conversion for VLC
+ *****************************************************************************
+ * Copyright (C) 2011 Sébastien Toque
+ *                    Rémi Denis-Courmont
+ * Copyright (C) 2013 Zhang Rui <bbcallen@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+ *****************************************************************************/
+
+#include "../ijksdl_image_convert.h"
+
+#include <cpu-features.h>
+#include "chroma_neon.h"
+
+/*
+ * Hand an I420 picture to the NEON 32-bit RGB routine.
+ *
+ * Only plane 0 of the destination and the luma pitch of the source are
+ * forwarded; the assembly derives everything else from those.
+ */
+static void ijk_i420_rgb32_neon(int width, int height, uint8_t **dst_data, int *dst_linesize, const uint8_t **src_data, int *src_linesize)
+{
+    struct yuv_planes_in planes;
+    struct yuv_pack packed;
+
+    planes.y     = src_data[0];
+    planes.u     = src_data[1];
+    planes.v     = src_data[2];
+    planes.pitch = src_linesize[0];
+
+    packed.yuv   = dst_data[0];
+    packed.pitch = dst_linesize[0];
+
+    i420_rgb_neon(&packed, &planes, width, height);
+}
+
+/*
+ * Hand an I420 picture to the NEON RV16 (16 bits per pixel) routine.
+ *
+ * Only plane 0 of the destination and the luma pitch of the source are
+ * forwarded; the assembly derives everything else from those.
+ */
+static void ijk_i420_rgb16_neon(int width, int height, uint8_t **dst_data, int *dst_linesize, const uint8_t **src_data, int *src_linesize)
+{
+    struct yuv_planes_in planes;
+    struct yuv_pack packed;
+
+    planes.y     = src_data[0];
+    planes.u     = src_data[1];
+    planes.v     = src_data[2];
+    planes.pitch = src_linesize[0];
+
+    packed.yuv   = dst_data[0];
+    packed.pitch = dst_linesize[0];
+
+    i420_rv16_neon(&packed, &planes, width, height);
+}
+
+// FIXME: the NV12/NV21 wrappers stay compiled out until sample streams are
+// available to check them against.
+#if 0
+/* Hand an NV21 picture to the NEON 32-bit RGB routine. */
+static void ijk_nv21_rgb32_neon(int width, int height, uint8_t **dst_data, int *dst_linesize, const uint8_t **src_data, int *src_linesize)
+{
+    struct yuv_planes_in planes;
+    struct yuv_pack packed;
+
+    planes.y     = src_data[0];
+    planes.u     = src_data[1];
+    planes.v     = src_data[2];
+    planes.pitch = src_linesize[0];
+
+    packed.yuv   = dst_data[0];
+    packed.pitch = dst_linesize[0];
+
+    nv21_rgb_neon(&packed, &planes, width, height);
+}
+
+/* Hand an NV12 picture to the NEON 32-bit RGB routine. */
+static void ijk_nv12_rgb32_neon(int width, int height, uint8_t **dst_data, int *dst_linesize, const uint8_t **src_data, int *src_linesize)
+{
+    struct yuv_planes_in planes;
+    struct yuv_pack packed;
+
+    planes.y     = src_data[0];
+    planes.u     = src_data[1];
+    planes.v     = src_data[2];
+    planes.pitch = src_linesize[0];
+
+    packed.yuv   = dst_data[0];
+    packed.pitch = dst_linesize[0];
+
+    nv12_rgb_neon(&packed, &planes, width, height);
+}
+#endif
+
+/*
+ * Check the buffer layout the NEON I420 routines take for granted.
+ *
+ * The assembly rounds the width up to a multiple of 16 pixels, converts rows
+ * two at a time, uses 128-bit aligned accesses on the luma and output rows,
+ * 64-bit aligned accesses on the chroma rows, and advances U and V by half
+ * the luma pitch. A picture that does not fit would be read or written out of
+ * bounds, fault on alignment, or come out garbled, so it is rejected here.
+ *
+ * Returns non-zero when the layout is usable.
+ */
+static int ijk_i420_neon_layout_ok(int width, int height, int dst_bytes_per_pixel,
+    uint8_t **dst_data, int *dst_linesize,
+    const uint8_t **src_data, int *src_linesize)
+{
+    if (width <= 0 || height <= 0 || (height & 1))
+        return 0;
+
+    /* 64-bit arithmetic so that huge widths cannot overflow the checks. */
+    int64_t padded_width = ((int64_t)width + 15) & ~(int64_t)15;
+    if ((int64_t)src_linesize[0] < padded_width ||
+        (int64_t)dst_linesize[0] < padded_width * dst_bytes_per_pixel)
+        return 0;
+
+    /* Chroma pitches are never passed down: they must be luma pitch / 2. */
+    if ((int64_t)src_linesize[1] * 2 != (int64_t)src_linesize[0] ||
+        src_linesize[2] != src_linesize[1])
+        return 0;
+
+    uintptr_t align16 = (uintptr_t)dst_data[0] | (uintptr_t)src_data[0] |
+                        (uintptr_t)dst_linesize[0] | (uintptr_t)src_linesize[0];
+    uintptr_t align8  = (uintptr_t)src_data[1] | (uintptr_t)src_data[2];
+
+    return (align16 & 15) == 0 && (align8 & 7) == 0;
+}
+
+/*
+ * Convert a picture with the NEON routines when that is possible.
+ *
+ * Supported: AV_PIX_FMT_YUV420P to AV_PIX_FMT_RGB565 or AV_PIX_FMT_0BGR32.
+ *
+ * Returns 0 when the picture has been converted into dst_data, -1 when
+ * nothing was done (unsupported formats, CPU without NEON, or a buffer
+ * layout the assembly cannot handle).
+ */
+int ijk_image_convert(int width, int height,
+    enum AVPixelFormat dst_format, uint8_t **dst_data, int *dst_linesize,
+    enum AVPixelFormat src_format, const uint8_t **src_data, int *src_linesize)
+{
+    int dst_bytes_per_pixel;
+
+    if (src_format != AV_PIX_FMT_YUV420P)
+        return -1;
+
+    switch (dst_format) {
+    case AV_PIX_FMT_RGB565:
+        dst_bytes_per_pixel = 2;
+        break;
+    case AV_PIX_FMT_0BGR32:
+        dst_bytes_per_pixel = 4;
+        break;
+    default:
+        return -1;
+    }
+
+    /* The NEON bit itself must be set: testing (ARMv7 | NEON) as one mask
+     * would let an ARMv7 CPU without NEON through and run NEON instructions
+     * on it. */
+    if ((android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON) == 0)
+        return -1;
+
+    if (!ijk_i420_neon_layout_ok(width, height, dst_bytes_per_pixel,
+                                 dst_data, dst_linesize, src_data, src_linesize))
+        return -1;
+
+    if (dst_format == AV_PIX_FMT_RGB565)
+        ijk_i420_rgb16_neon(width, height, dst_data, dst_linesize, src_data, src_linesize);
+    else
+        ijk_i420_rgb32_neon(width, height, dst_data, dst_linesize, src_data, src_linesize);
+
+    return 0;
+}
+
--- a/ijkmediaplayer/jni/ijksdl/ffmpeg/abi_armv7a_neon/nv12_rgb.S
+++ b/ijkmediaplayer/jni/ijksdl/ffmpeg/abi_armv7a_neon/nv12_rgb.S
+ @*****************************************************************************
+ @ nv12_rgb.S : ARM NEONv1 NV12 to RGB chroma conversion
+ @*****************************************************************************
+ @ Copyright (C) 2011 Sébastien Toque
+ @                    Rémi Denis-Courmont
+ @
+ @ This program is free software; you can redistribute it and/or modify it
+ @ under the terms of the GNU Lesser General Public License as published by
+ @ the Free Software Foundation; either version 2.1 of the License, or
+ @ (at your option) any later version.
+ @
+ @ This program is distributed in the hope that it will be useful,
+ @ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ @ GNU Lesser General Public License for more details.
+ @
+ @ You should have received a copy of the GNU Lesser General Public License
+ @ along with this program; if not, write to the Free Software Foundation,
+ @ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+ @****************************************************************************/
+
+	.syntax unified
+	.fpu neon
+	.text
+
+/* ARM */
+#define O1	r0
+#define O2	r1
+#define WIDTH	r2
+#define HEIGHT	r3
+#define Y1	r4
+#define Y2	r5
+#define U	r6
+#define V	r7
+#define YPITCH	r8
+#define OPAD	r10
+#define YPAD	r11
+#define COUNT	ip
+#define OPITCH	lr
+
+/* NEON */
+#define coefY	D0
+#define coefRV	D1
+#define coefGU	D2
+#define coefGV	D3
+#define coefBU	D4
+#define Rc	Q3
+#define Gc	Q4
+#define Bc	Q5
+
+/* u/v share D24/D25 with red1/green1 and y1/y2 share D28/D29 with
+ * red2/green2: the inputs are dead by the time the outputs are written. */
+#define u	D24
+#define v	D25
+#define y1	D28
+#define y2	D29
+
+#define chro_r	Q6
+#define chro_g	Q7
+#define chro_b	Q8
+#define red		Q9
+#define green	Q10
+#define blue	Q11
+#define lumi	Q15
+
+#define red1	D24
+#define green1	D25
+#define blue1	D26
+#define alpha1	D27
+#define red2	D28
+#define green2	D29
+#define blue2	D30
+#define alpha2	D31
+
+/* Signed 16-bit offsets broadcast into Rc, Gc and Bc (in that order). */
+coefficients:
+    .short  -15872
+    .short    4992
+    .short  -18432
+
+/*
+ * void nv12_rgb_neon(struct yuv_pack *out, const struct yuv_planes_in *in,
+ *                    int width, int height)
+ *
+ * r0 = out {pointer, pitch}, r1 = in {y, u, v, pitch}, r2 = width, r3 = height.
+ * Converts 16 pixels of two adjacent rows per inner iteration and writes
+ * 4 bytes per pixel in R, G, B, 255 order. The interleaved chroma plane is
+ * read through the U pointer only; V is loaded from the argument block but
+ * never used. The width is rounded up to a multiple of 16 and rows are
+ * consumed in pairs.
+ */
+	.align 2
+	.global nv12_rgb_neon
+	.type	nv12_rgb_neon, %function
+nv12_rgb_neon:
+	push		{r4-r8,r10-r11,lr}
+	vpush		{q4-q7}
+
+	/* load arguments */
+	ldmia		r0,	{O1, OPITCH}
+	ldmia		r1,	{Y1, U, V, YPITCH}
+
+	/* round the width to be a multiple of 16 */
+	ands		OPAD, WIDTH, #15
+	sub			WIDTH, WIDTH, OPAD
+	addne		WIDTH, WIDTH, #16
+
+	/* init constants (scale value by 64) */
+	vmov.u8		coefY, #74
+	vmov.u8		coefRV, #115
+	vmov.u8		coefGU, #14
+	vmov.u8		coefGV, #34
+	vmov.u8		coefBU, #135
+	adr			OPAD, coefficients
+	vld1.s16	{d6[], d7[]}, [OPAD]!
+	vld1.s16	{d8[], d9[]}, [OPAD]!
+	vld1.s16	{d10[], d11[]}, [OPAD]!
+	/* alpha1 (D27) is not written again inside the loop, so it is set once */
+	vmov.u8		alpha1, #255
+
+	/* init padding */
+	/* the flags set by this cmp are consumed by the first pass of loop_row */
+	cmp			HEIGHT,	#0
+	sub			OPAD,	OPITCH,	WIDTH, lsl #2
+	sub			YPAD,	YPITCH,	WIDTH
+
+loop_row:
+	movsgt	COUNT,	WIDTH
+	add		O2,	O1,	OPITCH
+	add		Y2,	Y1,	YPITCH
+	/* exit if all rows have been processed */
+	vpople	{q4-q7}
+	pople	{r4-r8,r10-r11,pc}
+
+loop_col:
+
+	/* Common U & V */
+
+	/* de-interleave 16 chroma bytes: first byte of each pair into u, second into v */
+	vld2.u8	{u,v}, [U,:128]!
+
+	vmull.u8	chro_r, v, coefRV
+	vmull.u8	chro_g, u, coefGU
+	vmlal.u8	chro_g, v, coefGV
+	vmull.u8	chro_b, u, coefBU
+
+	vadd.s16	chro_r, Rc, chro_r
+	vsub.s16	chro_g, Gc, chro_g
+	vadd.s16	chro_b, Bc, chro_b
+
+	pld	[U]
+
+	/* Y Top Row */
+	vld2.u8	{y1,y2}, [Y1,:128]!
+
+	/* y1 : chrominance + luminance, then clamp (divide by 64) */
+	vmull.u8	lumi, y1, coefY
+	vqadd.s16	red, lumi, chro_r
+	vqadd.s16	green, lumi, chro_g
+	vqadd.s16	blue, lumi, chro_b
+	vqrshrun.s16	red1, red, #6
+	vqrshrun.s16	green1, green, #6
+	vqrshrun.s16	blue1, blue, #6
+
+	/* y2 : chrominance + luminance, then clamp (divide by 64) */
+	vmull.u8	lumi, y2, coefY
+	vqadd.s16	red, lumi, chro_r
+	vqadd.s16	green, lumi, chro_g
+	vqadd.s16	blue, lumi, chro_b
+	vqrshrun.s16	red2, red, #6
+	vqrshrun.s16	green2, green, #6
+	vqrshrun.s16	blue2, blue, #6
+
+	pld	[Y1]
+
+	/* alpha2 (D31) is the high half of lumi (Q15): reload it after each use */
+	vmov.u8	alpha2, #255
+	vzip.u8	red1, red2
+	vzip.u8	green1, green2
+	vzip.u8	blue1, blue2
+
+	vst4.u8		{red1,green1,blue1,alpha1}, [O1,:128]!
+	vst4.u8		{red2,green2,blue2,alpha2}, [O1,:128]!
+
+	/* Y Bottom Row */
+	vld2.u8	{y1,y2}, [Y2,:128]!
+
+	/* y1 : chrominance + luminance, then clamp (divide by 64) */
+	vmull.u8	lumi, y1, coefY
+	vqadd.s16	red, lumi, chro_r
+	vqadd.s16	green, lumi, chro_g
+	vqadd.s16	blue, lumi, chro_b
+	vqrshrun.s16	red1, red, #6
+	vqrshrun.s16	green1, green, #6
+	vqrshrun.s16	blue1, blue, #6
+
+	/* y2 : chrominance + luminance, then clamp (divide by 64) */
+	vmull.u8	lumi, y2, coefY
+	vqadd.s16	red, lumi, chro_r
+	vqadd.s16	green, lumi, chro_g
+	vqadd.s16	blue, lumi, chro_b
+	vqrshrun.s16	red2, red, #6
+	vqrshrun.s16	green2, green, #6
+	vqrshrun.s16	blue2, blue, #6
+
+	pld	[Y2]
+
+	vmov.u8	alpha2, #255
+	vzip.u8	red1, red2
+	vzip.u8	green1, green2
+	vzip.u8	blue1, blue2
+
+	vst4.u8		{red1,green1,blue1,alpha1}, [O2,:128]!
+	vst4.u8		{red2,green2,blue2,alpha2}, [O2,:128]!
+
+	/* next columns (x16) */
+	subs	COUNT,	COUNT,	#16
+	bgt		loop_col
+
+	/* next rows (x2) */
+	/* subs sets the flags tested at loop_row; the adds leave them alone */
+	subs	HEIGHT,	#2
+	add		O1,	O2,	OPAD
+	add		Y1,	Y2,	YPAD
+	add		U,	U,	YPAD
+	b		loop_row
--- a/ijkmediaplayer/jni/ijksdl/ffmpeg/abi_armv7a_neon/nv21_rgb.S
+++ b/ijkmediaplayer/jni/ijksdl/ffmpeg/abi_armv7a_neon/nv21_rgb.S
+ @*****************************************************************************
+ @ nv21_rgb.S : ARM NEONv1 NV21 to RGB chroma conversion
+ @*****************************************************************************
+ @ Copyright (C) 2011 Sébastien Toque
+ @                    Rémi Denis-Courmont
+ @
+ @ This program is free software; you can redistribute it and/or modify it
+ @ under the terms of the GNU Lesser General Public License as published by
+ @ the Free Software Foundation; either version 2.1 of the License, or
+ @ (at your option) any later version.
+ @
+ @ This program is distributed in the hope that it will be useful,
+ @ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ @ GNU Lesser General Public License for more details.
+ @
+ @ You should have received a copy of the GNU Lesser General Public License
+ @ along with this program; if not, write to the Free Software Foundation,
+ @ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+ @****************************************************************************/
+
+	.syntax unified
+	.fpu neon
+	.text
+
+/* ARM */
+#define O1	r0
+#define O2	r1
+#define WIDTH	r2
+#define HEIGHT	r3
+#define Y1	r4
+#define Y2	r5
+#define U	r6
+#define V	r7
+#define YPITCH	r8
+#define OPAD	r10
+#define YPAD	r11
+#define COUNT	ip
+#define OPITCH	lr
+
+/* NEON */
+#define coefY	D0
+#define coefRV	D1
+#define coefGU	D2
+#define coefGV	D3
+#define coefBU	D4
+#define Rc	Q3
+#define Gc	Q4
+#define Bc	Q5
+
+/* u/v share D24/D25 with red1/green1 and y1/y2 share D28/D29 with
+ * red2/green2: the inputs are dead by the time the outputs are written. */
+#define u	D24
+#define v	D25
+#define y1	D28
+#define y2	D29
+
+#define chro_r	Q6
+#define chro_g	Q7
+#define chro_b	Q8
+#define red		Q9
+#define green	Q10
+#define blue	Q11
+#define lumi	Q15
+
+#define red1	D24
+#define green1	D25
+#define blue1	D26
+#define alpha1	D27
+#define red2	D28
+#define green2	D29
+#define blue2	D30
+#define alpha2	D31
+
+/* Signed 16-bit offsets broadcast into Rc, Gc and Bc (in that order). */
+coefficients:
+    .short  -15872
+    .short    4992
+    .short  -18432
+
+/*
+ * void nv21_rgb_neon(struct yuv_pack *out, const struct yuv_planes_in *in,
+ *                    int width, int height)
+ *
+ * r0 = out {pointer, pitch}, r1 = in {y, u, v, pitch}, r2 = width, r3 = height.
+ * Converts 16 pixels of two adjacent rows per inner iteration and writes
+ * 4 bytes per pixel in R, G, B, 255 order. The interleaved chroma plane is
+ * read through the U pointer only; V is loaded from the argument block but
+ * never used. Compared with nv12_rgb_neon the two de-interleaved chroma
+ * registers swap roles. The width is rounded up to a multiple of 16 and rows
+ * are consumed in pairs.
+ */
+	.align 2
+	.global nv21_rgb_neon
+	.type	nv21_rgb_neon, %function
+nv21_rgb_neon:
+	push		{r4-r8,r10-r11,lr}
+	vpush		{q4-q7}
+
+	/* load arguments */
+	ldmia		r0,	{O1, OPITCH}
+	ldmia		r1,	{Y1, U, V, YPITCH}
+
+	/* round the width to be a multiple of 16 */
+	ands		OPAD, WIDTH, #15
+	sub			WIDTH, WIDTH, OPAD
+	addne		WIDTH, WIDTH, #16
+
+	/* init constants (scale value by 64) */
+	vmov.u8		coefY, #74
+	vmov.u8		coefRV, #115
+	vmov.u8		coefGU, #14
+	vmov.u8		coefGV, #34
+	vmov.u8		coefBU, #135
+	adr			OPAD, coefficients
+	vld1.s16	{d6[], d7[]}, [OPAD]!
+	vld1.s16	{d8[], d9[]}, [OPAD]!
+	vld1.s16	{d10[], d11[]}, [OPAD]!
+	/* alpha1 (D27) is not written again inside the loop, so it is set once */
+	vmov.u8		alpha1, #255
+
+	/* init padding */
+	/* the flags set by this cmp are consumed by the first pass of loop_row */
+	cmp			HEIGHT,	#0
+	sub			OPAD,	OPITCH,	WIDTH, lsl #2
+	sub			YPAD,	YPITCH,	WIDTH
+
+loop_row:
+	movsgt	COUNT,	WIDTH
+	add		O2,	O1,	OPITCH
+	add		Y2,	Y1,	YPITCH
+	/* exit if all rows have been processed */
+	vpople	{q4-q7}
+	pople	{r4-r8,r10-r11,pc}
+
+loop_col:
+
+	/* Common U & V */
+
+	/* de-interleave 16 chroma bytes; here the first byte of each pair (in
+	 * register u) feeds the red term and the second (in v) the blue term */
+	vld2.u8	{u,v}, [U,:128]!
+
+	vmull.u8	chro_r, u, coefRV
+	vmull.u8	chro_g, v, coefGU
+	vmlal.u8	chro_g, u, coefGV
+	vmull.u8	chro_b, v, coefBU
+
+	vadd.s16	chro_r, Rc, chro_r
+	vsub.s16	chro_g, Gc, chro_g
+	vadd.s16	chro_b, Bc, chro_b
+
+	pld	[U]
+
+	/* Y Top Row */
+	vld2.u8	{y1,y2}, [Y1,:128]!
+
+	/* y1 : chrominance + luminance, then clamp (divide by 64) */
+	vmull.u8	lumi, y1, coefY
+	vqadd.s16	red, lumi, chro_r
+	vqadd.s16	green, lumi, chro_g
+	vqadd.s16	blue, lumi, chro_b
+	vqrshrun.s16	red1, red, #6
+	vqrshrun.s16	green1, green, #6
+	vqrshrun.s16	blue1, blue, #6
+
+	/* y2 : chrominance + luminance, then clamp (divide by 64) */
+	vmull.u8	lumi, y2, coefY
+	vqadd.s16	red, lumi, chro_r
+	vqadd.s16	green, lumi, chro_g
+	vqadd.s16	blue, lumi, chro_b
+	vqrshrun.s16	red2, red, #6
+	vqrshrun.s16	green2, green, #6
+	vqrshrun.s16	blue2, blue, #6
+
+	pld	[Y1]
+
+	/* alpha2 (D31) is the high half of lumi (Q15): reload it after each use */
+	vmov.u8	alpha2, #255
+	vzip.u8	red1, red2
+	vzip.u8	green1, green2
+	vzip.u8	blue1, blue2
+
+	vst4.u8		{red1,green1,blue1,alpha1}, [O1,:128]!
+	vst4.u8		{red2,green2,blue2,alpha2}, [O1,:128]!
+
+	/* Y Bottom Row */
+	vld2.u8	{y1,y2}, [Y2,:128]!
+
+	/* y1 : chrominance + luminance, then clamp (divide by 64) */
+	vmull.u8	lumi, y1, coefY
+	vqadd.s16	red, lumi, chro_r
+	vqadd.s16	green, lumi, chro_g
+	vqadd.s16	blue, lumi, chro_b
+	vqrshrun.s16	red1, red, #6
+	vqrshrun.s16	green1, green, #6
+	vqrshrun.s16	blue1, blue, #6
+
+	/* y2 : chrominance + luminance, then clamp (divide by 64) */
+	vmull.u8	lumi, y2, coefY
+	vqadd.s16	red, lumi, chro_r
+	vqadd.s16	green, lumi, chro_g
+	vqadd.s16	blue, lumi, chro_b
+	vqrshrun.s16	red2, red, #6
+	vqrshrun.s16	green2, green, #6
+	vqrshrun.s16	blue2, blue, #6
+
+	pld	[Y2]
+
+	vmov.u8	alpha2, #255
+	vzip.u8	red1, red2
+	vzip.u8	green1, green2
+	vzip.u8	blue1, blue2
+
+	vst4.u8		{red1,green1,blue1,alpha1}, [O2,:128]!
+	vst4.u8		{red2,green2,blue2,alpha2}, [O2,:128]!
+
+	/* next columns (x16) */
+	subs	COUNT,	COUNT,	#16
+	bgt		loop_col
+
+	/* next rows (x2) */
+	/* subs sets the flags tested at loop_row; the adds leave them alone */
+	subs	HEIGHT,	#2
+	add		O1,	O2,	OPAD
+	add		Y1,	Y2,	YPAD
+	add		U,	U,	YPAD
+	b		loop_row
--- a/ijkmediaplayer/jni/ijksdl/ffmpeg/ijksdl_image_convert.h
+++ b/ijkmediaplayer/jni/ijksdl/ffmpeg/ijksdl_image_convert.h
+/*
+ * ijksdl_image_convert.h
+ *      accelerated image format conversion
+ *
+ * Copyright (c) 2013 Zhang Rui <bbcallen@gmail.com>
+ *
+ * This file is part of ijkPlayer.
+ *
+ * ijkPlayer is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * ijkPlayer is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with ijkPlayer; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef IJKSDL__FFMPEG__IJKSDL_IMAGE_CONVERT_H
+#define IJKSDL__FFMPEG__IJKSDL_IMAGE_CONVERT_H
+
+#include <stdint.h>
+#include "ijksdl_inc_ffmpeg.h"
+
+/*
+ * Convert a picture from src_format to dst_format with an ABI-specific
+ * routine, if the build provides one for that pair of formats.
+ *
+ * width, height        picture size in pixels
+ * dst_data/dst_linesize  destination plane pointers and pitches in bytes
+ * src_data/src_linesize  source plane pointers and pitches in bytes
+ *
+ * Returns 0 when the picture was converted, -1 when nothing was done.
+ * NOTE(review): callers presumably fall back to another converter on -1 --
+ * confirm against SDL_VoutFFmpeg_ConvertPicture.
+ */
+int ijk_image_convert(int width, int height,
+    enum AVPixelFormat dst_format, uint8_t **dst_data, int *dst_linesize,
+    enum AVPixelFormat src_format, const uint8_t **src_data, int *src_linesize);
+
+#endif
--- a/ijkmediaplayer/jni/ijksdl/ffmpeg/ijksdl_inc_ffmpeg.h
+++ b/ijkmediaplayer/jni/ijksdl/ffmpeg/ijksdl_inc_ffmpeg.h
@@ -21,8 +21,8 @@
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

-#ifndef IJKPLAYER__IJKSDL_FFINC_H
-#define IJKPLAYER__IJKSDL_FFINC_H
+#ifndef IJKSDL__FFMPEG__IJKSDL_FFINC_H
+#define IJKSDL__FFMPEG__IJKSDL_FFINC_H

 #include "libavutil/imgutils.h"
 #include "libavutil/pixfmt.h"

--- a/ijkmediaplayer/jni/ijksdl/ffmpeg/ijksdl_vout_overlay_ffmpeg.c
+++ b/ijkmediaplayer/jni/ijksdl/ffmpeg/ijksdl_vout_overlay_ffmpeg.c
@@ -29,6 +29,7 @@
 #include "../ijksdl_vout_internal.h"
 #include "../ijksdl_video.h"
 #include "ijksdl_inc_ffmpeg.h"
+#include "ijksdl_image_convert.h"

 typedef struct SDL_VoutOverlay_Opaque {
    SDL_mutex *mutex;
@@ -38,6 +39,8 @@ typedef struct SDL_VoutOverlay_Opaque {

    Uint16 pitches[AV_NUM_DATA_POINTERS];
    Uint8 *pixels[AV_NUM_DATA_POINTERS];
+
+    int no_neon_warned;
 } SDL_VoutOverlay_Opaque;

 /* Always assume a linesize alignment of 1 here */
@@ -117,6 +120,8 @@ SDL_VoutOverlay *SDL_VoutFFmpeg_CreateOverlay(int width, int height, Uint32 form
        return NULL;
    }

+    width = IJKALIGN(width, 32);
+
    SDL_VoutOverlay_Opaque *opaque = overlay->opaque;
    overlay->format = format;
    overlay->pitches = opaque->pitches;
@@ -140,7 +145,7 @@ SDL_VoutOverlay *SDL_VoutFFmpeg_CreateOverlay(int width, int height, Uint32 form
        break;
    }
    case SDL_FCC_RV32: {
-        ff_format = AV_PIX_FMT_RGB32;
+        ff_format = AV_PIX_FMT_0BGR32;
        planes = 1;
        break;
    }
@@ -170,11 +175,13 @@ SDL_VoutOverlay *SDL_VoutFFmpeg_CreateOverlay(int width, int height, Uint32 form

 int SDL_VoutFFmpeg_ConvertPicture(
    const SDL_VoutOverlay *overlay,
-    int width, int height, enum AVPixelFormat src_format, uint8_t **src_data, int *src_linesize,
+    int width, int height,
+    enum AVPixelFormat src_format, const uint8_t **src_data, int *src_linesize,
    struct SwsContext **p_sws_ctx, int sws_flags)
 {
    assert(overlay);
    assert(p_sws_ctx);
+    SDL_VoutOverlay_Opaque *opaque = overlay->opaque;
    AVPicture dest_pic = { { 0 } };

    enum AVPixelFormat dst_format = AV_PIX_FMT_NONE;
@@ -201,18 +208,27 @@ int SDL_VoutFFmpeg_ConvertPicture(
        dest_pic.linesize[i] = overlay->pitches[i];
    }

-    *p_sws_ctx = sws_getCachedContext(*p_sws_ctx,
-        width, height, src_format, width, height,
-        dst_format, sws_flags, NULL, NULL, NULL);
-    if (*p_sws_ctx == NULL) {
-        ALOGE("sws_getCachedContext failed");
-        return -1;
+    if (ijk_image_convert(width, height,
+        dst_format, dest_pic.data, dest_pic.linesize,
+        src_format, src_data, src_linesize)) {
+        *p_sws_ctx = sws_getCachedContext(*p_sws_ctx,
+            width, height, src_format, width, height,
+            dst_format, sws_flags, NULL, NULL, NULL);
+        if (*p_sws_ctx == NULL) {
+            ALOGE("sws_getCachedContext failed");
+            return -1;
+        }
+
+        sws_scale(*p_sws_ctx, (const uint8_t **) src_data, src_linesize,
+            0, height, dest_pic.data, dest_pic.linesize);
+
+        if (!opaque->no_neon_warned) {
+            opaque->no_neon_warned = 1;
+            ALOGE("non-neon image convert %s -> %s", av_get_pix_fmt_name(src_format), av_get_pix_fmt_name(dst_format));
+        }
    }

-    sws_scale(*p_sws_ctx, (const uint8_t **) src_data, src_linesize,
-        0, height, dest_pic.data, dest_pic.linesize);
-
-    // FIXME:
-    // duplicate_right_border_pixels(vp->bmp);
+// FIXME:
+// duplicate_right_border_pixels(vp->bmp);
    return 0;
 }
--- a/ijkmediaplayer/jni/ijksdl/ffmpeg/ijksdl_vout_overlay_ffmpeg.h
+++ b/ijkmediaplayer/jni/ijksdl/ffmpeg/ijksdl_vout_overlay_ffmpeg.h
@@ -21,8 +21,8 @@
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

-#ifndef IJKSDL__IJKSDL_VOUT_OVERLAY_FFMPEG_H
-#define IJKSDL__IJKSDL_VOUT_OVERLAY_FFMPEG_H
+#ifndef IJKSDL__FFMPEG__IJKSDL_VOUT_OVERLAY_FFMPEG_H
+#define IJKSDL__FFMPEG__IJKSDL_VOUT_OVERLAY_FFMPEG_H

 #include "../ijksdl_stdinc.h"
 #include "../ijksdl_vout.h"
@@ -33,7 +33,8 @@ SDL_VoutOverlay *SDL_VoutFFmpeg_CreateOverlay(int width, int height, Uint32 form

 int SDL_VoutFFmpeg_ConvertPicture(
    const SDL_VoutOverlay *overlay,
-    int width, int height, enum AVPixelFormat src_format, uint8_t **src_data, int *src_linesize,
+    int width, int height,
+    enum AVPixelFormat src_format, const uint8_t **src_data, int *src_linesize,
    struct SwsContext **p_sws_ctx, int sws_flags);

 #endif