提交 7fcda743 编写于 作者: Z Zhang Rui

ijksdl: add yuv2rgb neon optimize

上级 6800079e
......@@ -30,7 +30,7 @@ APP_CFLAGS := -O3 -Wall -pipe \
-Wno-psabi -Wa,--noexecstack \
-DANDROID -DNDEBUG
# -D__ARM_ARCH_5__ -D__ARM_ARCH_5E__ -D__ARM_ARCH_5T__ -D__ARM_ARCH_5TE__
# APP_CFLAGS += -D__ARM_ARCH_5__ -D__ARM_ARCH_5E__ -D__ARM_ARCH_5T__ -D__ARM_ARCH_5TE__
# armeabi-v7a
APP_CFLAGS += -march=armv7-a -mfpu=vfpv3-d16 -mfloat-abi=softfp
......
......@@ -682,7 +682,7 @@ static int queue_picture(FFPlayer *ffp, AVFrame *src_frame, double pts, int64_t
SDL_VoutLockYUVOverlay(vp->bmp);
if (SDL_VoutFFmpeg_ConvertPicture(vp->bmp, vp->width, vp->height,
src_frame->format, src_frame->data, src_frame->linesize,
src_frame->format, (const uint8_t**)src_frame->data, src_frame->linesize,
&is->img_convert_ctx, ffp->sws_flags) < 0) {
fprintf(stderr, "Cannot initialize the conversion context\n");
exit(1);
......
......@@ -464,8 +464,8 @@ inline static void ffp_reset_internal(FFPlayer *ffp)
ffp->sar_den = 0;
// ffp->overlay_format = SDL_FCC_YV12;
ffp->overlay_format = SDL_FCC_RV16;
// ffp->overlay_format = SDL_FCC_RV32;
// ffp->overlay_format = SDL_FCC_RV16;
ffp->overlay_format = SDL_FCC_RV32;
ffp->last_error = 0;
ffp->prepared = 0;
......
......@@ -36,6 +36,15 @@ LOCAL_SRC_FILES += ijksdl_vout.c
LOCAL_SRC_FILES += ffmpeg/ijksdl_vout_overlay_ffmpeg.c
ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
LOCAL_CFLAGS += -DHAVE_NEON=1
LOCAL_SRC_FILES += ffmpeg/abi_armv7a_neon/image_convert.c
LOCAL_SRC_FILES += ffmpeg/abi_armv7a_neon/i420_rgb.S.arm.neon
LOCAL_SRC_FILES += ffmpeg/abi_armv7a_neon/i420_rv16.S.arm.neon
else
LOCAL_SRC_FILES += ffmpeg/abi_all/image_convert.c
endif
LOCAL_SRC_FILES += android/android_audiotrack.c
LOCAL_SRC_FILES += android/android_nativewindow.c
LOCAL_SRC_FILES += android/ijksdl_android_jni.c
......@@ -44,6 +53,9 @@ LOCAL_SRC_FILES += android/ijksdl_vout_android_nativewindow.c
LOCAL_SRC_FILES += android/ijksdl_vout_android_surface.c
LOCAL_SHARED_LIBRARIES := ffmpeg ijkutil
LOCAL_STATIC_LIBRARIES := cpufeatures
LOCAL_MODULE := ijksdl
include $(BUILD_SHARED_LIBRARY)
$(call import-module,android/cpufeatures)
......@@ -237,7 +237,7 @@ int sdl_native_window_display_l(ANativeWindow *native_window, SDL_VoutOverlay *o
}
if (voutDesc->hal_format != overlayDesc->hal_format) {
SDLTRACE("ANativeWindow_setBuffersGeometry: w=%d, h=%d, f=%.4s(0x%x) => w=%d, h=%d, f=%.4s(0x%x)",
ALOGD("ANativeWindow_setBuffersGeometry: w=%d, h=%d, f=%.4s(0x%x) => w=%d, h=%d, f=%.4s(0x%x)",
curr_w, curr_h, (char*) &curr_format, curr_format,
buff_w, buff_h, (char*) &overlay->format, overlay->format);
retval = ANativeWindow_setBuffersGeometry(native_window, buff_w, buff_h, overlayDesc->hal_format);
......
/*****************************************************************************
* yuv_rgb.c : ARM NEONv1 YUV to RGB32 chroma conversion for VLC
*****************************************************************************
* Copyright (C) 2011 Sébastien Toque
* Rémi Denis-Courmont
* Copyright (C) 2013 Zhang Rui <bbcallen@gmail.com>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software Foundation,
* Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
*****************************************************************************/
#include "../ijksdl_image_convert.h"
/*
 * Generic (non-accelerated) build of ijk_image_convert().
 *
 * This variant contains no converter at all: whatever formats, buffers or
 * dimensions are passed in, nothing is read or written and the call reports
 * "not handled".
 *
 * Returns: always -1.
 */
int ijk_image_convert(int width, int height,
                      enum AVPixelFormat dst_format, uint8_t **dst_data, int *dst_linesize,
                      enum AVPixelFormat src_format, const uint8_t **src_data, int *src_linesize)
{
    /* No conversion routine is compiled into this variant. */
    enum { IJK_IMAGE_CONVERT_UNSUPPORTED = -1 };

    return IJK_IMAGE_CONVERT_UNSUPPORTED;
}
/*****************************************************************************
* chroma_neon.h
*****************************************************************************
* Copyright (C) 2011 Rémi Denis-Courmont
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software Foundation,
* Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
*****************************************************************************/
/* Planes must start on a 16-bytes boundary. Pitches must be multiples of 16
* bytes even for subsampled components. */
/* Writable planar picture: one pointer per plane (y, u, v) and a single
 * pitch, in bytes, that describes the luminance plane. No chrominance pitch
 * is stored; it is derived from the colour subsampling ratio. */
struct yuv_planes
{
    void *y;
    void *u;
    void *v;
    size_t pitch;
};
/* Read-only planar picture used as conversion input: three plane pointers
 * followed by the luminance pitch in bytes.
 * The member order (y, u, v, pitch) must not change: the NEON routines fetch
 * these four words from the struct with a single load-multiple. */
struct yuv_planes_in
{
    const void *y;
    const void *u;
    const void *v;
    size_t pitch;
};
/* Packed (single-plane) picture. `pitch` is the distance between rows in
 * bytes, _not_ in pixels.
 * The member order (yuv, pitch) must not change: the NEON routines fetch both
 * words from the struct with a single load-multiple. */
struct yuv_pack
{
    void   *yuv;
    size_t  pitch;
};
/* I420 to RGBA conversion (implemented in assembly, symbol "i420_rgb_neon").
 *
 * out    - destination: out->yuv is the first output byte, out->pitch the
 *          output row stride in bytes. Four bytes are written per pixel:
 *          red, green, blue, then a fourth alpha/padding byte.
 * in     - source planes; only the luminance pitch is passed, the chroma rows
 *          are stepped by half of it.
 * width  - picture width in pixels; the routine rounds it up to the next
 *          multiple of 16, so rows must have room for the padded width.
 * height - picture height in rows; rows are converted two at a time.
 *
 * The loads and stores carry alignment hints (128-bit for Y and the output,
 * 64-bit for U and V), so the buffers must honour the alignment rules stated
 * at the top of this header. */
void i420_rgb_neon (struct yuv_pack *const out,
const struct yuv_planes_in *const in,
int width, int height) __asm__("i420_rgb_neon");
/* I420 to RV16 conversion (implemented in assembly, symbol "i420_rv16_neon").
 *
 * out    - destination: out->yuv is the first output byte, out->pitch the
 *          output row stride in bytes. Two bytes are written per pixel,
 *          packed as RGB565.
 * in     - source planes; only the luminance pitch is passed, the chroma rows
 *          are stepped by half of it.
 * width  - picture width in pixels; the routine rounds it up to the next
 *          multiple of 16, so rows must have room for the padded width.
 * height - picture height in rows; rows are converted two at a time.
 *
 * The loads and stores carry alignment hints (128-bit for Y and the output,
 * 64-bit for U and V), so the buffers must honour the alignment rules stated
 * at the top of this header. */
void i420_rv16_neon (struct yuv_pack *const out,
const struct yuv_planes_in *const in,
int width, int height) __asm__("i420_rv16_neon");
/* NV21 to RGBA conversion (implemented in assembly, symbol "nv21_rgb_neon").
 *
 * out    - destination, four bytes per pixel (red, green, blue, alpha);
 *          out->pitch is the output row stride in bytes.
 * in     - in->y is the luminance plane, in->u the interleaved chroma plane
 *          (the routine treats the first byte of each pair as V). in->v is
 *          loaded but not used by the conversion loop. in->pitch is the
 *          luminance pitch and is also used to step the chroma plane.
 * width  - picture width in pixels, rounded up to a multiple of 16.
 * height - picture height in rows; rows are converted two at a time.
 *
 * NOTE(review): the build-file hunk visible with this change only lists the
 * i420 assembly sources - confirm the NV21 source is built before calling. */
void nv21_rgb_neon (struct yuv_pack *const out,
const struct yuv_planes_in *const in,
int width, int height) __asm__("nv21_rgb_neon");
/* NV12 to RGBA conversion (implemented in assembly, symbol "nv12_rgb_neon").
 *
 * out    - destination, four bytes per pixel (red, green, blue, alpha);
 *          out->pitch is the output row stride in bytes.
 * in     - in->y is the luminance plane, in->u the interleaved chroma plane
 *          (the routine treats the first byte of each pair as U). in->v is
 *          loaded but not used by the conversion loop. in->pitch is the
 *          luminance pitch and is also used to step the chroma plane.
 * width  - picture width in pixels, rounded up to a multiple of 16.
 * height - picture height in rows; rows are converted two at a time.
 *
 * NOTE(review): the build-file hunk visible with this change only lists the
 * i420 assembly sources - confirm the NV12 source is built before calling. */
void nv12_rgb_neon (struct yuv_pack *const out,
const struct yuv_planes_in *const in,
int width, int height) __asm__("nv12_rgb_neon");
@*****************************************************************************
@ i420_rgb.S : ARM NEONv1 I420 to RGB chroma conversion
@*****************************************************************************
@ Copyright (C) 2011 Sébastien Toque
@ Rémi Denis-Courmont
@
@ This program is free software; you can redistribute it and/or modify it
@ under the terms of the GNU Lesser General Public License as published by
@ the Free Software Foundation; either version 2.1 of the License, or
@ (at your option) any later version.
@
@ This program is distributed in the hope that it will be useful,
@ but WITHOUT ANY WARRANTY; without even the implied warranty of
@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
@ GNU Lesser General Public License for more details.
@
@ You should have received a copy of the GNU Lesser General Public License
@ along with this program; if not, write to the Free Software Foundation,
@ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
@****************************************************************************/
.syntax unified
.fpu neon
.text
/* ARM core registers.
 * r0 and r1 carry the `out` and `in` struct pointers on entry. Once the
 * structs have been read they are reused as the write cursors for the upper
 * (O1) and lower (O2) row of the current row pair. */
#define O1 r0
#define O2 r1
#define WIDTH r2
#define HEIGHT r3
#define Y1 r4
#define Y2 r5
#define U r6
#define V r7
#define YPITCH r8
#define OPAD r10
#define YPAD r11
#define COUNT ip
#define OPITCH lr
/* NEON registers.
 * Several aliases below name the same physical register on purpose (every Qn
 * overlays D(2n) and D(2n+1)), which is why the instruction order inside the
 * loop must not be changed casually. */
/* Multipliers (one byte per lane) and 16-bit per-channel bias vectors. */
#define coefY D0
#define coefRV D1
#define coefGU D2
#define coefGV D3
#define coefBU D4
#define Rc Q3
#define Gc Q4
#define Bc Q5
/* Raw chroma bytes; D24/D25 are later reused for red1/green1. */
#define u D24
#define v D25
/* Raw luma bytes (even / odd pixels); D18/D19 together form Q9 = red16_1. */
#define y1 D18
#define y2 D19
/* Chroma contribution per channel, kept for both rows of the pair. */
#define chro_r Q6
#define chro_g Q7
#define chro_b Q8
/* Scaled luma. lumi1 (Q15) overlays blue2/alpha2 (D30/D31);
 * lumi2 (Q10) is the same register as green16_1. */
#define lumi1 Q15
#define lumi2 Q10
/* 16-bit channel sums before narrowing. Q13 overlays blue1/alpha1 (D26/D27),
 * Q14 overlays red2/green2 (D28/D29), Q12 overlays u/v and red1/green1. */
#define red16_1 Q9
#define green16_1 Q10
#define blue16_1 Q11
#define red16_2 Q12
#define green16_2 Q13
#define blue16_2 Q14
/* Narrowed 8-bit channels, laid out as two groups of four consecutive
 * D registers so that each group can be stored with one vst4. */
#define red1 D24
#define green1 D25
#define blue1 D26
#define alpha1 D27
#define red2 D28
#define green2 D29
#define blue2 D30
#define alpha2 D31
/* Per-channel 16-bit bias values, loaded (in this order) into Rc, Gc and Bc.
 * Numerically they equal 32 - 16*74 - 128*115, 32 - 16*74 + 128*(14+34) and
 * 32 - 16*74 - 128*135, i.e. they appear to fold the Y-16 / chroma-128
 * offsets of the conversion into a single constant per channel. */
coefficients:
	.short	-15872
	.short	4992
	.short	-18432
	.align 2
	.global	i420_rgb_neon
	.type	i420_rgb_neon, %function
/*
 * void i420_rgb_neon(struct yuv_pack *out, const struct yuv_planes_in *in,
 *                    int width, int height)
 *
 * Converts planar I420 to packed 4-byte pixels (R, G, B, 255).
 * Each inner iteration handles a 16x2 pixel tile: 8 U and 8 V samples are
 * shared by 16 pixels of the upper row and 16 pixels of the lower row.
 * The width is rounded up to a multiple of 16 and rows are consumed in pairs,
 * so the buffers must have room for the padded width and for a full row pair.
 */
i420_rgb_neon:
	push	{r4-r8,r10-r11,lr}
	vpush	{q4-q7}

	/* load arguments: {out->yuv, out->pitch} and {in->y, in->u, in->v, in->pitch} */
	ldmia	r0, {O1, OPITCH}
	ldmia	r1, {Y1, U, V, YPITCH}

	/* round the width to be a multiple of 16 */
	ands	OPAD, WIDTH, #15
	sub	WIDTH, WIDTH, OPAD
	addne	WIDTH, WIDTH, #16

	/* init constants (scale value by 64) */
	vmov.u8	coefY, #74
	vmov.u8	coefRV, #115
	vmov.u8	coefGU, #14
	vmov.u8	coefGV, #34
	vmov.u8	coefBU, #135
	/* broadcast each bias word to all lanes of Rc (Q3), Gc (Q4), Bc (Q5) */
	adr	OPAD, coefficients
	vld1.s16	{d6[], d7[]}, [OPAD]!
	vld1.s16	{d8[], d9[]}, [OPAD]!
	vld1.s16	{d10[], d11[]}, [OPAD]!
	vmov.u8	alpha1, #255

	/* init padding: bytes to skip at the end of each output / luma row.
	 * The cmp leaves the flags that the first pass through loop_row tests. */
	cmp	HEIGHT, #0
	sub	OPAD, OPITCH, WIDTH, lsl #2
	sub	YPAD, YPITCH, WIDTH

loop_row:
	/* Executes only while rows remain, and refreshes the flags from WIDTH,
	 * so the conditional pops below return when no rows are left or when
	 * the padded width is zero. */
	movsgt	COUNT, WIDTH
	add	O2, O1, OPITCH
	add	Y2, Y1, YPITCH
	/* exit if all rows have been processed */
	vpople	{q4-q7}
	pople	{r4-r8,r10-r11,pc}

loop_col:
	/* Common U & V */
	vld1.u8	{u}, [U,:64]!
	vld1.u8	{v}, [V,:64]!

	/* Y Top Row (y1 = even pixels, y2 = odd pixels) */
	vld2.u8	{y1,y2}, [Y1,:128]!

	vmull.u8	Q14, v, coefRV
	vmull.u8	Q11, u, coefGU
	vmull.u8	Q13, u, coefBU
	vmlal.u8	Q11, v, coefGV

	vmull.u8	lumi2, y2, coefY
	vmull.u8	lumi1, y1, coefY

	vadd.s16	chro_r, Rc, Q14
	vadd.s16	chro_b, Bc, Q13
	vsub.s16	chro_g, Gc, Q11

	pld	[U]
	pld	[V]

	/* chrominance + luminance
	 * (order matters: green16_1 is the same register as lumi2) */
	vqadd.s16	red16_2, lumi2, chro_r
	vqadd.s16	blue16_2, lumi2, chro_b
	vqadd.s16	green16_2, lumi2, chro_g
	vqadd.s16	red16_1, lumi1, chro_r
	vqadd.s16	green16_1, lumi1, chro_g
	vqadd.s16	blue16_1, lumi1, chro_b

	/* clamp (divide by 64)
	 * (order matters: the 8-bit results overlay the 16-bit sums) */
	vqrshrun.s16	blue2, blue16_2, #6
	vqrshrun.s16	red2, red16_2, #6
	vqrshrun.s16	green2, green16_2, #6
	vqrshrun.s16	red1, red16_1, #6
	vqrshrun.s16	green1, green16_1, #6
	vqrshrun.s16	blue1, blue16_1, #6

	pld	[Y1]

	/* Y Bottom Row */
	vld2.u8	{y1,y2}, [Y2,:128]!

	/* Both alpha registers were used as scratch above (alpha1 is the high
	 * half of Q13, alpha2 the high half of lumi1), so reload both before
	 * the top-row store. lumi1 is dead here until it is recomputed below. */
	vmov.u8	alpha1, #255
	vmov.u8	alpha2, #255
	/* re-interleave even/odd pixels */
	vzip.u8	red1, red2
	vzip.u8	green1, green2
	vzip.u8	blue1, blue2

	vmull.u8	lumi2, y2, coefY

	/* Top Row output: 2 x 8 pixels, 4 bytes each */
	vst4.u8	{red1,green1,blue1,alpha1}, [O1,:128]!
	vst4.u8	{red2,green2,blue2,alpha2}, [O1,:128]!

	/* chrominance + luminance */
	vmull.u8	lumi1, y1, coefY
	vqadd.s16	red16_2, lumi2, chro_r
	vqadd.s16	green16_2, lumi2, chro_g
	vqadd.s16	blue16_2, lumi2, chro_b
	vqadd.s16	red16_1, lumi1, chro_r
	vqadd.s16	green16_1, lumi1, chro_g
	vqadd.s16	blue16_1, lumi1, chro_b

	/* clamp (divide by 64) */
	vqrshrun.s16	blue2, blue16_2, #6
	vqrshrun.s16	red2, red16_2, #6
	vqrshrun.s16	green2, green16_2, #6
	vqrshrun.s16	red1, red16_1, #6
	vqrshrun.s16	green1, green16_1, #6
	vqrshrun.s16	blue1, blue16_1, #6

	pld	[Y2]

	/* Same as for the top row: green16_2 (Q13) and lumi1 (Q15) have just
	 * overwritten alpha1 and alpha2, and both sums are fully consumed now. */
	vmov.u8	alpha1, #255
	vmov.u8	alpha2, #255
	vzip.u8	red1, red2
	vzip.u8	green1, green2
	vzip.u8	blue1, blue2

	/* Bottom Row output */
	vst4.u8	{red1,green1,blue1,alpha1}, [O2,:128]!
	vst4.u8	{red2,green2,blue2,alpha2}, [O2,:128]!

	/* next columns (x16) */
	subs	COUNT, COUNT, #16
	bgt	loop_col

	/* next rows (x2); the flags set by subs are tested at loop_row.
	 * One chroma row was consumed, so U and V skip half the luma padding. */
	subs	HEIGHT, #2
	add	O1, O2, OPAD
	add	Y1, Y2, YPAD
	add	U, U, YPAD, lsr #1
	add	V, V, YPAD, lsr #1
	b	loop_row
@*****************************************************************************
@ i420_rv16.S : ARM NEONv1 I420 to RV16 chroma conversion
@*****************************************************************************
@ Copyright (C) 2011 Sébastien Toque
@ Rémi Denis-Courmont
@
@ This program is free software; you can redistribute it and/or modify it
@ under the terms of the GNU Lesser General Public License as published by
@ the Free Software Foundation; either version 2.1 of the License, or
@ (at your option) any later version.
@
@ This program is distributed in the hope that it will be useful,
@ but WITHOUT ANY WARRANTY; without even the implied warranty of
@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
@ GNU Lesser General Public License for more details.
@
@ You should have received a copy of the GNU Lesser General Public License
@ along with this program; if not, write to the Free Software Foundation,
@ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
@****************************************************************************/
.syntax unified
.fpu neon
.text
/* ARM core registers.
 * r0 and r1 carry the `out` and `in` struct pointers on entry. Once the
 * structs have been read they are reused as the write cursors for the upper
 * (O1) and lower (O2) row of the current row pair. */
#define O1 r0
#define O2 r1
#define WIDTH r2
#define HEIGHT r3
#define Y1 r4
#define Y2 r5
#define U r6
#define V r7
#define YPITCH r8
#define OPAD r10
#define YPAD r11
#define COUNT ip
#define OPITCH lr
/* NEON registers.
 * Several aliases below name the same physical register on purpose (every Qn
 * overlays D(2n) and D(2n+1)), which is why the instruction order inside the
 * loop must not be changed casually. */
/* Multipliers (one byte per lane) and 16-bit per-channel bias vectors. */
#define coefY D0
#define coefRV D1
#define coefGU D2
#define coefGV D3
#define coefBU D4
#define Rc Q3
#define Gc Q4
#define Bc Q5
/* Raw chroma bytes; D24/D25 are later reused for out1l and red1/out1h. */
#define u D24
#define v D25
/* Raw luma bytes (even / odd pixels); D18/D19 together form Q9 = red16_1. */
#define y1 D18
#define y2 D19
/* Chroma contribution per channel, kept for both rows of the pair. */
#define chro_r Q6
#define chro_g Q7
#define chro_b Q8
/* Scaled luma. lumi1 (Q15) overlays green2/blue2 (D30/D31);
 * lumi2 (Q10) is the same register as green16_1. */
#define lumi1 Q15
#define lumi2 Q10
/* 16-bit channel sums before narrowing. */
#define red16_1 Q9
#define green16_1 Q10
#define blue16_1 Q11
#define red16_2 Q12
#define green16_2 Q13
#define blue16_2 Q14
/* Narrowed 8-bit channels. */
#define red1 D25
#define green1 D26
#define blue1 D27
#define red2 D29
#define green2 D30
#define blue2 D31
/* Packed output bytes. The high byte is assembled in place on top of the red
 * channel (out1h is red1, out2h is red2); {out1l,out1h} and {out2l,out2h}
 * are consecutive D registers so that each pair can be stored with one vst2. */
#define out1l D24
#define out1h D25
#define out2l D28
#define out2h D29
/* Per-channel 16-bit bias values, loaded (in this order) into Rc, Gc and Bc. */
coefficients:
	.short	-15872
	.short	4992
	.short	-18432
	.align 2
	.global	i420_rv16_neon
	.type	i420_rv16_neon, %function
/*
 * void i420_rv16_neon(struct yuv_pack *out, const struct yuv_planes_in *in,
 *                     int width, int height)
 *
 * Converts planar I420 to packed 2-byte RGB565 pixels.
 * Each inner iteration handles a 16x2 pixel tile: 8 U and 8 V samples are
 * shared by 16 pixels of the upper row and 16 pixels of the lower row.
 * The width is rounded up to a multiple of 16 and rows are consumed in pairs,
 * so the buffers must have room for the padded width and for a full row pair.
 */
i420_rv16_neon:
	push	{r4-r8,r10-r11,lr}
	vpush	{q4-q7}

	/* load arguments: {out->yuv, out->pitch} and {in->y, in->u, in->v, in->pitch} */
	ldmia	r0, {O1, OPITCH}
	ldmia	r1, {Y1, U, V, YPITCH}

	/* round the width to be a multiple of 16 */
	ands	OPAD, WIDTH, #15
	sub	WIDTH, WIDTH, OPAD
	addne	WIDTH, WIDTH, #16

	/* init constants (scale value by 64) */
	vmov.u8	coefY, #74
	vmov.u8	coefRV, #115
	vmov.u8	coefGU, #14
	vmov.u8	coefGV, #34
	vmov.u8	coefBU, #135
	/* broadcast each bias word to all lanes of Rc (Q3), Gc (Q4), Bc (Q5) */
	adr	OPAD, coefficients
	vld1.s16	{d6[], d7[]}, [OPAD]!
	vld1.s16	{d8[], d9[]}, [OPAD]!
	vld1.s16	{d10[], d11[]}, [OPAD]!

	/* init padding: bytes to skip at the end of each output / luma row
	 * (two output bytes per pixel, hence lsl #1).
	 * The cmp leaves the flags that the first pass through loop_row tests. */
	cmp	HEIGHT, #0
	sub	OPAD, OPITCH, WIDTH, lsl #1
	sub	YPAD, YPITCH, WIDTH

loop_row:
	/* Executes only while rows remain, and refreshes the flags from WIDTH,
	 * so the conditional pops below return when no rows are left or when
	 * the padded width is zero. */
	movsgt	COUNT, WIDTH
	add	O2, O1, OPITCH
	add	Y2, Y1, YPITCH
	/* exit if all rows have been processed */
	vpople	{q4-q7}
	pople	{r4-r8,r10-r11,pc}

loop_col:
	/* Common U & V */
	vld1.u8	{u}, [U,:64]!
	vld1.u8	{v}, [V,:64]!

	/* Y Top Row (y1 = even pixels, y2 = odd pixels) */
	vld2.u8	{y1,y2}, [Y1,:128]!

	vmull.u8	Q14, v, coefRV
	vmull.u8	Q11, u, coefGU
	vmull.u8	Q13, u, coefBU
	vmlal.u8	Q11, v, coefGV

	vmull.u8	lumi2, y2, coefY
	vmull.u8	lumi1, y1, coefY

	vadd.s16	chro_r, Rc, Q14
	vadd.s16	chro_b, Bc, Q13
	vsub.s16	chro_g, Gc, Q11

	pld	[U]
	pld	[V]

	/* chrominance + luminance
	 * (order matters: green16_1 is the same register as lumi2) */
	vqadd.s16	red16_2, lumi2, chro_r
	vqadd.s16	green16_2, lumi2, chro_g
	vqadd.s16	blue16_2, lumi2, chro_b
	vqadd.s16	red16_1, lumi1, chro_r
	vqadd.s16	green16_1, lumi1, chro_g
	vqadd.s16	blue16_1, lumi1, chro_b

	/* clamp (divide by 64)
	 * (order matters: the 8-bit results overlay the 16-bit sums) */
	vqrshrun.s16	green2, green16_2, #6
	vqrshrun.s16	blue2, blue16_2, #6
	vqrshrun.s16	red2, red16_2, #6
	vqrshrun.s16	green1, green16_1, #6
	vqrshrun.s16	red1, red16_1, #6
	vqrshrun.s16	blue1, blue16_1, #6

	pld	[Y1]

	/* pack into RGB565:
	 *   high byte = top 5 bits of red   | top 3 bits of green
	 *   low  byte = next 3 green bits   | top 5 bits of blue */
	vshl.u8	out2l, green2, #3	// low 2a
	vsri.u8	out2h, green2, #5	// high 2
	vshl.u8	out1l, green1, #3	// low 1a
	vsri.u8	out1h, green1, #5	// high 1
	vsri.u8	out2l, blue2, #3	// low 2b
	vsri.u8	out1l, blue1, #3	// low 1b

	/* Y Bottom Row */
	vld2.u8	{y1,y2}, [Y2,:128]!

	/* Top Row output (re-interleave even/odd pixels, then store) */
	vzip.u8	out1h, out2h
	vmull.u8	lumi2, y2, coefY
	vzip.u8	out1l, out2l
	vmull.u8	lumi1, y1, coefY
	vst2.u8	{out1l, out1h}, [O1,:128]!
	vst2.u8	{out2l, out2h}, [O1,:128]!

	/* chrominance + luminance */
	vqadd.s16	green16_2, lumi2, chro_g
	vqadd.s16	red16_2, lumi2, chro_r
	vqadd.s16	blue16_2, lumi2, chro_b
	vqadd.s16	red16_1, lumi1, chro_r
	vqadd.s16	green16_1, lumi1, chro_g
	vqadd.s16	blue16_1, lumi1, chro_b

	/* clamp (divide by 64) */
	vqrshrun.s16	green2, green16_2, #6
	vqrshrun.s16	blue2, blue16_2, #6
	vqrshrun.s16	red2, red16_2, #6
	vqrshrun.s16	green1, green16_1, #6
	vqrshrun.s16	red1, red16_1, #6
	vqrshrun.s16	blue1, blue16_1, #6

	/* Prefetch the lower row: Y1 was already prefetched after the top-row
	 * clamp, the next bottom-row load comes from Y2. */
	pld	[Y2]

	/* pack into RGB565 */
	vshl.u8	out2l, green2, #3	// low 2a
	vsri.u8	out2h, green2, #5	// high 2
	vshl.u8	out1l, green1, #3	// low 1a
	vsri.u8	out1h, green1, #5	// high 1
	vsri.u8	out2l, blue2, #3	// low 2b
	vsri.u8	out1l, blue1, #3	// low 1b

	/* Bottom Row output */
	vzip.u8	out1h, out2h
	vzip.u8	out1l, out2l
	vst2.u8	{out1l, out1h}, [O2,:128]!
	vst2.u8	{out2l, out2h}, [O2,:128]!

	/* next columns (x16) */
	subs	COUNT, COUNT, #16
	bgt	loop_col

	/* next rows (x2); the flags set by subs are tested at loop_row.
	 * One chroma row was consumed, so U and V skip half the luma padding. */
	subs	HEIGHT, #2
	add	O1, O2, OPAD
	add	Y1, Y2, YPAD
	add	U, U, YPAD, lsr #1
	add	V, V, YPAD, lsr #1
	b	loop_row
/*****************************************************************************
* yuv_rgb.c : ARM NEONv1 YUV to RGB32 chroma conversion for VLC
*****************************************************************************
* Copyright (C) 2011 Sébastien Toque
* Rémi Denis-Courmont
* Copyright (C) 2013 Zhang Rui <bbcallen@gmail.com>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software Foundation,
* Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
*****************************************************************************/
#include "../ijksdl_image_convert.h"
#include <cpu-features.h>
#include "chroma_neon.h"
static void ijk_i420_rgb32_neon(int width, int height, uint8_t **dst_data, int *dst_linesize, const uint8_t **src_data, int *src_linesize)
{
struct yuv_pack out = { dst_data[0], dst_linesize[0] };
struct yuv_planes_in in = { src_data[0], src_data[1], src_data[2], src_linesize[0] };
i420_rgb_neon(&out, &in, width, height);
}
static void ijk_i420_rgb16_neon(int width, int height, uint8_t **dst_data, int *dst_linesize, const uint8_t **src_data, int *src_linesize)
{
struct yuv_pack out = { dst_data[0], dst_linesize[0] };
struct yuv_planes_in in = { src_data[0], src_data[1], src_data[2], src_linesize[0] };
i420_rv16_neon(&out, &in, width, height);
}
// FIXME: need nv12 and nv21 sample
#if 0
static void ijk_nv21_rgb32_neon(int width, int height, uint8_t **dst_data, int *dst_linesize, const uint8_t **src_data, int *src_linesize)
{
struct yuv_pack out = {dst_data[0], dst_linesize[0]};
struct yuv_planes_in in = {src_data[0], src_data[1], src_data[2], src_linesize[0]};
nv21_rgb_neon(&out, &in, width, height);
}
static void ijk_nv12_rgb32_neon(int width, int height, uint8_t **dst_data, int *dst_linesize, const uint8_t **src_data, int *src_linesize)
{
struct yuv_pack out = {dst_data[0], dst_linesize[0]};
struct yuv_planes_in in = {src_data[0], src_data[1], src_data[2], src_linesize[0]};
nv12_rgb_neon(&out, &in, width, height);
}
#endif
/*
 * Checks the buffer layout that the I420 NEON routines take for granted.
 *
 * The assembly rounds the width up to a multiple of 16 pixels, receives only
 * the luminance pitch (chroma rows are stepped by half of it) and issues its
 * loads/stores with alignment hints: 128 bits for the Y plane and the output,
 * 64 bits for the U and V planes.
 *
 * Note: rows are converted in pairs, so with an odd height the assembly
 * touches one extra row; room for that row is assumed, not checked here.
 *
 * Returns 1 when the buffers satisfy these requirements, 0 otherwise.
 */
static int ijk_i420_neon_layout_ok(int width, int dst_bytes_per_pixel,
                                   uint8_t **dst_data, int *dst_linesize,
                                   const uint8_t **src_data, int *src_linesize)
{
    const int padded_width = (width + 15) & ~15;
    const int luma_pitch   = src_linesize[0];
    const int out_pitch    = dst_linesize[0];

    if (!dst_data[0] || !src_data[0] || !src_data[1] || !src_data[2])
        return 0;

    /* Every padded row has to fit inside its pitch. */
    if (luma_pitch < padded_width || out_pitch < padded_width * dst_bytes_per_pixel)
        return 0;

    /* The chroma pitch is not passed down; it must be half the luma pitch. */
    if (src_linesize[1] * 2 != luma_pitch || src_linesize[2] * 2 != luma_pitch)
        return 0;

    /* Row starts must stay aligned from one row to the next. */
    if ((luma_pitch | out_pitch) & 15)
        return 0;
    if ((((uintptr_t)src_data[0]) | ((uintptr_t)dst_data[0])) & 15)
        return 0;
    if ((((uintptr_t)src_data[1]) | ((uintptr_t)src_data[2])) & 7)
        return 0;

    return 1;
}

/*
 * NEON-accelerated picture conversion.
 *
 * Handles AV_PIX_FMT_YUV420P sources converted to AV_PIX_FMT_RGB565 or
 * AV_PIX_FMT_0BGR32. Source and destination share the same width/height
 * (no scaling).
 *
 * Returns 0 when the picture was converted, -1 when the request is not
 * handled here: the CPU has no NEON unit, the format pair is unsupported, or
 * the buffers do not meet the layout the assembly requires. On -1 nothing has
 * been written and the caller is expected to use another converter.
 */
int ijk_image_convert(int width, int height,
                      enum AVPixelFormat dst_format, uint8_t **dst_data, int *dst_linesize,
                      enum AVPixelFormat src_format, const uint8_t **src_data, int *src_linesize)
{
    /* The NEON bit itself must be set: an ARMv7 core is not guaranteed to
     * have a NEON unit, and the routines below consist of NEON instructions. */
    if ((android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON) == 0)
        return -1;

    if (src_format != AV_PIX_FMT_YUV420P)
        return -1;

    switch (dst_format) {
    case AV_PIX_FMT_RGB565:
        if (!ijk_i420_neon_layout_ok(width, 2, dst_data, dst_linesize, src_data, src_linesize))
            return -1;
        ijk_i420_rgb16_neon(width, height, dst_data, dst_linesize, src_data, src_linesize);
        return 0;
    case AV_PIX_FMT_0BGR32:
        if (!ijk_i420_neon_layout_ok(width, 4, dst_data, dst_linesize, src_data, src_linesize))
            return -1;
        ijk_i420_rgb32_neon(width, height, dst_data, dst_linesize, src_data, src_linesize);
        return 0;
    default:
        return -1;
    }
}
@*****************************************************************************
@ nv12_rgb.S : ARM NEONv1 NV12 to RGB chroma conversion
@*****************************************************************************
@ Copyright (C) 2011 Sébastien Toque
@ Rémi Denis-Courmont
@
@ This program is free software; you can redistribute it and/or modify it
@ under the terms of the GNU Lesser General Public License as published by
@ the Free Software Foundation; either version 2.1 of the License, or
@ (at your option) any later version.
@
@ This program is distributed in the hope that it will be useful,
@ but WITHOUT ANY WARRANTY; without even the implied warranty of
@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
@ GNU Lesser General Public License for more details.
@
@ You should have received a copy of the GNU Lesser General Public License
@ along with this program; if not, write to the Free Software Foundation,
@ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
@****************************************************************************/
.syntax unified
.fpu neon
.text
/* ARM core registers.
 * r0 and r1 carry the `out` and `in` struct pointers on entry. Once the
 * structs have been read they are reused as the write cursors for the upper
 * (O1) and lower (O2) row of the current row pair.
 * U walks the interleaved chroma plane; V is loaded but not used afterwards. */
#define O1 r0
#define O2 r1
#define WIDTH r2
#define HEIGHT r3
#define Y1 r4
#define Y2 r5
#define U r6
#define V r7
#define YPITCH r8
#define OPAD r10
#define YPAD r11
#define COUNT ip
#define OPITCH lr
/* NEON registers.
 * Several aliases below name the same physical register on purpose (every Qn
 * overlays D(2n) and D(2n+1)), which is why the instruction order inside the
 * loop must not be changed casually. */
/* Multipliers (one byte per lane) and 16-bit per-channel bias vectors. */
#define coefY D0
#define coefRV D1
#define coefGU D2
#define coefGV D3
#define coefBU D4
#define Rc Q3
#define Gc Q4
#define Bc Q5
/* De-interleaved chroma bytes; D24/D25 are later reused for red1/green1. */
#define u D24
#define v D25
/* Raw luma bytes (even / odd pixels); D28/D29 are later reused for
 * red2/green2. */
#define y1 D28
#define y2 D29
/* Chroma contribution per channel, kept for both rows of the pair. */
#define chro_r Q6
#define chro_g Q7
#define chro_b Q8
/* 16-bit channel sums before narrowing. */
#define red Q9
#define green Q10
#define blue Q11
/* Scaled luma; Q15 overlays blue2/alpha2 (D30/D31). */
#define lumi Q15
/* Narrowed 8-bit channels, laid out as two groups of four consecutive
 * D registers so that each group can be stored with one vst4. */
#define red1 D24
#define green1 D25
#define blue1 D26
#define alpha1 D27
#define red2 D28
#define green2 D29
#define blue2 D30
#define alpha2 D31
/* Per-channel 16-bit bias values, loaded (in this order) into Rc, Gc and Bc. */
coefficients:
.short -15872
.short 4992
.short -18432
.align 2
.global nv12_rgb_neon
.type nv12_rgb_neon, %function
/*
 * void nv12_rgb_neon(struct yuv_pack *out, const struct yuv_planes_in *in,
 *                    int width, int height)
 *
 * Converts NV12 (Y plane + interleaved chroma plane, first byte of each
 * pair used as U) to packed 4-byte pixels (R, G, B, 255).
 * Each inner iteration handles a 16x2 pixel tile: 16 chroma bytes (8 pairs)
 * are shared by 16 pixels of the upper row and 16 pixels of the lower row.
 * The width is rounded up to a multiple of 16 and rows are consumed in pairs.
 */
nv12_rgb_neon:
push {r4-r8,r10-r11,lr}
vpush {q4-q7}
/* load arguments: {out->yuv, out->pitch} and {in->y, in->u, in->v, in->pitch} */
ldmia r0, {O1, OPITCH}
ldmia r1, {Y1, U, V, YPITCH}
/* round the width to be a multiple of 16 */
ands OPAD, WIDTH, #15
sub WIDTH, WIDTH, OPAD
addne WIDTH, WIDTH, #16
/* init constants (scale value by 64) */
vmov.u8 coefY, #74
vmov.u8 coefRV, #115
vmov.u8 coefGU, #14
vmov.u8 coefGV, #34
vmov.u8 coefBU, #135
/* broadcast each bias word to all lanes of Rc (Q3), Gc (Q4), Bc (Q5) */
adr OPAD, coefficients
vld1.s16 {d6[], d7[]}, [OPAD]!
vld1.s16 {d8[], d9[]}, [OPAD]!
vld1.s16 {d10[], d11[]}, [OPAD]!
/* alpha1 (D27) is not written again in this routine, so one load suffices */
vmov.u8 alpha1, #255
/* init padding: bytes to skip at the end of each output / luma row.
 * The cmp leaves the flags that the first pass through loop_row tests. */
cmp HEIGHT, #0
sub OPAD, OPITCH, WIDTH, lsl #2
sub YPAD, YPITCH, WIDTH
loop_row:
/* Executes only while rows remain, and refreshes the flags from WIDTH, so
 * the conditional pops below return when no rows are left or when the
 * padded width is zero. */
movsgt COUNT, WIDTH
add O2, O1, OPITCH
add Y2, Y1, YPITCH
/* exit if all rows have been processed */
vpople {q4-q7}
pople {r4-r8,r10-r11,pc}
loop_col:
/* Common U & V: de-interleave 8 chroma pairs (even bytes -> u, odd -> v) */
vld2.u8 {u,v}, [U,:128]!
vmull.u8 chro_r, v, coefRV
vmull.u8 chro_g, u, coefGU
vmlal.u8 chro_g, v, coefGV
vmull.u8 chro_b, u, coefBU
vadd.s16 chro_r, Rc, chro_r
vsub.s16 chro_g, Gc, chro_g
vadd.s16 chro_b, Bc, chro_b
pld [U]
/* Y Top Row (y1 = even pixels, y2 = odd pixels) */
vld2.u8 {y1,y2}, [Y1,:128]!
/* y1 : chrominance + luminance, then clamp (divide by 64) */
vmull.u8 lumi, y1, coefY
vqadd.s16 red, lumi, chro_r
vqadd.s16 green, lumi, chro_g
vqadd.s16 blue, lumi, chro_b
vqrshrun.s16 red1, red, #6
vqrshrun.s16 green1, green, #6
vqrshrun.s16 blue1, blue, #6
/* y2 : chrominance + luminance, then clamp (divide by 64) */
/* (y2 must be consumed before red2/green2 are written: same registers) */
vmull.u8 lumi, y2, coefY
vqadd.s16 red, lumi, chro_r
vqadd.s16 green, lumi, chro_g
vqadd.s16 blue, lumi, chro_b
vqrshrun.s16 red2, red, #6
vqrshrun.s16 green2, green, #6
vqrshrun.s16 blue2, blue, #6
pld [Y1]
/* lumi overlays alpha2, so alpha2 is reloaded before every store */
vmov.u8 alpha2, #255
/* re-interleave even/odd pixels, then store 2 x 8 pixels */
vzip.u8 red1, red2
vzip.u8 green1, green2
vzip.u8 blue1, blue2
vst4.u8 {red1,green1,blue1,alpha1}, [O1,:128]!
vst4.u8 {red2,green2,blue2,alpha2}, [O1,:128]!
/* Y Bottom Row (same chroma contribution as the top row) */
vld2.u8 {y1,y2}, [Y2,:128]!
/* y1 : chrominance + luminance, then clamp (divide by 64) */
vmull.u8 lumi, y1, coefY
vqadd.s16 red, lumi, chro_r
vqadd.s16 green, lumi, chro_g
vqadd.s16 blue, lumi, chro_b
vqrshrun.s16 red1, red, #6
vqrshrun.s16 green1, green, #6
vqrshrun.s16 blue1, blue, #6
/* y2 : chrominance + luminance, then clamp (divide by 64) */
vmull.u8 lumi, y2, coefY
vqadd.s16 red, lumi, chro_r
vqadd.s16 green, lumi, chro_g
vqadd.s16 blue, lumi, chro_b
vqrshrun.s16 red2, red, #6
vqrshrun.s16 green2, green, #6
vqrshrun.s16 blue2, blue, #6
pld [Y2]
vmov.u8 alpha2, #255
vzip.u8 red1, red2
vzip.u8 green1, green2
vzip.u8 blue1, blue2
vst4.u8 {red1,green1,blue1,alpha1}, [O2,:128]!
vst4.u8 {red2,green2,blue2,alpha2}, [O2,:128]!
/* next columns (x16) */
subs COUNT, COUNT, #16
bgt loop_col
/* next rows (x2); the flags set by subs are tested at loop_row.
 * The chroma cursor advanced by WIDTH bytes in this pass, so adding YPAD
 * moves it on by exactly one luma pitch. */
subs HEIGHT, #2
add O1, O2, OPAD
add Y1, Y2, YPAD
add U, U, YPAD
b loop_row
@*****************************************************************************
@ nv21_rgb.S : ARM NEONv1 NV21 to RGB chroma conversion
@*****************************************************************************
@ Copyright (C) 2011 Sébastien Toque
@ Rémi Denis-Courmont
@
@ This program is free software; you can redistribute it and/or modify it
@ under the terms of the GNU Lesser General Public License as published by
@ the Free Software Foundation; either version 2.1 of the License, or
@ (at your option) any later version.
@
@ This program is distributed in the hope that it will be useful,
@ but WITHOUT ANY WARRANTY; without even the implied warranty of
@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
@ GNU Lesser General Public License for more details.
@
@ You should have received a copy of the GNU Lesser General Public License
@ along with this program; if not, write to the Free Software Foundation,
@ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
@****************************************************************************/
.syntax unified
.fpu neon
.text
/* ARM core registers.
 * r0 and r1 carry the `out` and `in` struct pointers on entry. Once the
 * structs have been read they are reused as the write cursors for the upper
 * (O1) and lower (O2) row of the current row pair.
 * U walks the interleaved chroma plane; V is loaded but not used afterwards. */
#define O1 r0
#define O2 r1
#define WIDTH r2
#define HEIGHT r3
#define Y1 r4
#define Y2 r5
#define U r6
#define V r7
#define YPITCH r8
#define OPAD r10
#define YPAD r11
#define COUNT ip
#define OPITCH lr
/* NEON registers.
 * Several aliases below name the same physical register on purpose (every Qn
 * overlays D(2n) and D(2n+1)), which is why the instruction order inside the
 * loop must not be changed casually. */
/* Multipliers (one byte per lane) and 16-bit per-channel bias vectors. */
#define coefY D0
#define coefRV D1
#define coefGU D2
#define coefGV D3
#define coefBU D4
#define Rc Q3
#define Gc Q4
#define Bc Q5
/* De-interleaved chroma bytes (first / second byte of each pair). In this
 * routine the register named `u` is multiplied by the V coefficients and the
 * one named `v` by the U coefficients. D24/D25 are later reused for
 * red1/green1. */
#define u D24
#define v D25
/* Raw luma bytes (even / odd pixels); D28/D29 are later reused for
 * red2/green2. */
#define y1 D28
#define y2 D29
/* Chroma contribution per channel, kept for both rows of the pair. */
#define chro_r Q6
#define chro_g Q7
#define chro_b Q8
/* 16-bit channel sums before narrowing. */
#define red Q9
#define green Q10
#define blue Q11
/* Scaled luma; Q15 overlays blue2/alpha2 (D30/D31). */
#define lumi Q15
/* Narrowed 8-bit channels, laid out as two groups of four consecutive
 * D registers so that each group can be stored with one vst4. */
#define red1 D24
#define green1 D25
#define blue1 D26
#define alpha1 D27
#define red2 D28
#define green2 D29
#define blue2 D30
#define alpha2 D31
/* Per-channel 16-bit bias values, loaded (in this order) into Rc, Gc and Bc. */
coefficients:
.short -15872
.short 4992
.short -18432
.align 2
.global nv21_rgb_neon
.type nv21_rgb_neon, %function
/*
 * void nv21_rgb_neon(struct yuv_pack *out, const struct yuv_planes_in *in,
 *                    int width, int height)
 *
 * Converts NV21 (Y plane + interleaved chroma plane, first byte of each
 * pair used as V) to packed 4-byte pixels (R, G, B, 255).
 * Each inner iteration handles a 16x2 pixel tile: 16 chroma bytes (8 pairs)
 * are shared by 16 pixels of the upper row and 16 pixels of the lower row.
 * The width is rounded up to a multiple of 16 and rows are consumed in pairs.
 */
nv21_rgb_neon:
push {r4-r8,r10-r11,lr}
vpush {q4-q7}
/* load arguments: {out->yuv, out->pitch} and {in->y, in->u, in->v, in->pitch} */
ldmia r0, {O1, OPITCH}
ldmia r1, {Y1, U, V, YPITCH}
/* round the width to be a multiple of 16 */
ands OPAD, WIDTH, #15
sub WIDTH, WIDTH, OPAD
addne WIDTH, WIDTH, #16
/* init constants (scale value by 64) */
vmov.u8 coefY, #74
vmov.u8 coefRV, #115
vmov.u8 coefGU, #14
vmov.u8 coefGV, #34
vmov.u8 coefBU, #135
/* broadcast each bias word to all lanes of Rc (Q3), Gc (Q4), Bc (Q5) */
adr OPAD, coefficients
vld1.s16 {d6[], d7[]}, [OPAD]!
vld1.s16 {d8[], d9[]}, [OPAD]!
vld1.s16 {d10[], d11[]}, [OPAD]!
/* alpha1 (D27) is not written again in this routine, so one load suffices */
vmov.u8 alpha1, #255
/* init padding: bytes to skip at the end of each output / luma row.
 * The cmp leaves the flags that the first pass through loop_row tests. */
cmp HEIGHT, #0
sub OPAD, OPITCH, WIDTH, lsl #2
sub YPAD, YPITCH, WIDTH
loop_row:
/* Executes only while rows remain, and refreshes the flags from WIDTH, so
 * the conditional pops below return when no rows are left or when the
 * padded width is zero. */
movsgt COUNT, WIDTH
add O2, O1, OPITCH
add Y2, Y1, YPITCH
/* exit if all rows have been processed */
vpople {q4-q7}
pople {r4-r8,r10-r11,pc}
loop_col:
/* Common U & V: de-interleave 8 chroma pairs. The first byte of each pair
 * lands in `u` and is used with the V coefficients, the second byte lands
 * in `v` and is used with the U coefficients. */
vld2.u8 {u,v}, [U,:128]!
vmull.u8 chro_r, u, coefRV
vmull.u8 chro_g, v, coefGU
vmlal.u8 chro_g, u, coefGV
vmull.u8 chro_b, v, coefBU
vadd.s16 chro_r, Rc, chro_r
vsub.s16 chro_g, Gc, chro_g
vadd.s16 chro_b, Bc, chro_b
pld [U]
/* Y Top Row (y1 = even pixels, y2 = odd pixels) */
vld2.u8 {y1,y2}, [Y1,:128]!
/* y1 : chrominance + luminance, then clamp (divide by 64) */
vmull.u8 lumi, y1, coefY
vqadd.s16 red, lumi, chro_r
vqadd.s16 green, lumi, chro_g
vqadd.s16 blue, lumi, chro_b
vqrshrun.s16 red1, red, #6
vqrshrun.s16 green1, green, #6
vqrshrun.s16 blue1, blue, #6
/* y2 : chrominance + luminance, then clamp (divide by 64) */
/* (y2 must be consumed before red2/green2 are written: same registers) */
vmull.u8 lumi, y2, coefY
vqadd.s16 red, lumi, chro_r
vqadd.s16 green, lumi, chro_g
vqadd.s16 blue, lumi, chro_b
vqrshrun.s16 red2, red, #6
vqrshrun.s16 green2, green, #6
vqrshrun.s16 blue2, blue, #6
pld [Y1]
/* lumi overlays alpha2, so alpha2 is reloaded before every store */
vmov.u8 alpha2, #255
/* re-interleave even/odd pixels, then store 2 x 8 pixels */
vzip.u8 red1, red2
vzip.u8 green1, green2
vzip.u8 blue1, blue2
vst4.u8 {red1,green1,blue1,alpha1}, [O1,:128]!
vst4.u8 {red2,green2,blue2,alpha2}, [O1,:128]!
/* Y Bottom Row (same chroma contribution as the top row) */
vld2.u8 {y1,y2}, [Y2,:128]!
/* y1 : chrominance + luminance, then clamp (divide by 64) */
vmull.u8 lumi, y1, coefY
vqadd.s16 red, lumi, chro_r
vqadd.s16 green, lumi, chro_g
vqadd.s16 blue, lumi, chro_b
vqrshrun.s16 red1, red, #6
vqrshrun.s16 green1, green, #6
vqrshrun.s16 blue1, blue, #6
/* y2 : chrominance + luminance, then clamp (divide by 64) */
vmull.u8 lumi, y2, coefY
vqadd.s16 red, lumi, chro_r
vqadd.s16 green, lumi, chro_g
vqadd.s16 blue, lumi, chro_b
vqrshrun.s16 red2, red, #6
vqrshrun.s16 green2, green, #6
vqrshrun.s16 blue2, blue, #6
pld [Y2]
vmov.u8 alpha2, #255
vzip.u8 red1, red2
vzip.u8 green1, green2
vzip.u8 blue1, blue2
vst4.u8 {red1,green1,blue1,alpha1}, [O2,:128]!
vst4.u8 {red2,green2,blue2,alpha2}, [O2,:128]!
/* next columns (x16) */
subs COUNT, COUNT, #16
bgt loop_col
/* next rows (x2); the flags set by subs are tested at loop_row.
 * The chroma cursor advanced by WIDTH bytes in this pass, so adding YPAD
 * moves it on by exactly one luma pitch. */
subs HEIGHT, #2
add O1, O2, OPAD
add Y1, Y2, YPAD
add U, U, YPAD
b loop_row
/*
* ijksdl_image_convert.h
* optional accelerated picture conversion (ijk_image_convert)
*
* Copyright (c) 2013 Zhang Rui <bbcallen@gmail.com>
*
* This file is part of ijkPlayer.
*
* ijkPlayer is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* ijkPlayer is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with ijkPlayer; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef IJKSDL__FFMPEG__IJKSDL_IMAGE_CONVERT_H
#define IJKSDL__FFMPEG__IJKSDL_IMAGE_CONVERT_H
#include <stdint.h>
#include "ijksdl_inc_ffmpeg.h"
/*
 * Converts one picture from src_format to dst_format without scaling
 * (source and destination share width and height).
 *
 * width, height          - picture size in pixels / rows.
 * dst_format             - pixel format to produce.
 * dst_data, dst_linesize - destination plane pointers and row strides (bytes).
 * src_format             - pixel format of the input.
 * src_data, src_linesize - source plane pointers and row strides (bytes).
 *
 * Returns 0 when the picture was converted. Returns a non-zero value (-1)
 * when this build does not handle the requested conversion; callers are
 * expected to fall back to another converter in that case.
 */
int ijk_image_convert(int width, int height,
enum AVPixelFormat dst_format, uint8_t **dst_data, int *dst_linesize,
enum AVPixelFormat src_format, const uint8_t **src_data, int *src_linesize);
#endif
......@@ -21,8 +21,8 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef IJKPLAYER__IJKSDL_FFINC_H
#define IJKPLAYER__IJKSDL_FFINC_H
#ifndef IJKSDL__FFMPEG__IJKSDL_FFINC_H
#define IJKSDL__FFMPEG__IJKSDL_FFINC_H
#include "libavutil/imgutils.h"
#include "libavutil/pixfmt.h"
......
......@@ -29,6 +29,7 @@
#include "../ijksdl_vout_internal.h"
#include "../ijksdl_video.h"
#include "ijksdl_inc_ffmpeg.h"
#include "ijksdl_image_convert.h"
typedef struct SDL_VoutOverlay_Opaque {
SDL_mutex *mutex;
......@@ -38,6 +39,8 @@ typedef struct SDL_VoutOverlay_Opaque {
Uint16 pitches[AV_NUM_DATA_POINTERS];
Uint8 *pixels[AV_NUM_DATA_POINTERS];
int no_neon_warned;
} SDL_VoutOverlay_Opaque;
/* Always assume a linesize alignment of 1 here */
......@@ -117,6 +120,8 @@ SDL_VoutOverlay *SDL_VoutFFmpeg_CreateOverlay(int width, int height, Uint32 form
return NULL;
}
width = IJKALIGN(width, 32);
SDL_VoutOverlay_Opaque *opaque = overlay->opaque;
overlay->format = format;
overlay->pitches = opaque->pitches;
......@@ -140,7 +145,7 @@ SDL_VoutOverlay *SDL_VoutFFmpeg_CreateOverlay(int width, int height, Uint32 form
break;
}
case SDL_FCC_RV32: {
ff_format = AV_PIX_FMT_RGB32;
ff_format = AV_PIX_FMT_0BGR32;
planes = 1;
break;
}
......@@ -170,11 +175,13 @@ SDL_VoutOverlay *SDL_VoutFFmpeg_CreateOverlay(int width, int height, Uint32 form
int SDL_VoutFFmpeg_ConvertPicture(
const SDL_VoutOverlay *overlay,
int width, int height, enum AVPixelFormat src_format, uint8_t **src_data, int *src_linesize,
int width, int height,
enum AVPixelFormat src_format, const uint8_t **src_data, int *src_linesize,
struct SwsContext **p_sws_ctx, int sws_flags)
{
assert(overlay);
assert(p_sws_ctx);
SDL_VoutOverlay_Opaque *opaque = overlay->opaque;
AVPicture dest_pic = { { 0 } };
enum AVPixelFormat dst_format = AV_PIX_FMT_NONE;
......@@ -201,18 +208,27 @@ int SDL_VoutFFmpeg_ConvertPicture(
dest_pic.linesize[i] = overlay->pitches[i];
}
*p_sws_ctx = sws_getCachedContext(*p_sws_ctx,
width, height, src_format, width, height,
dst_format, sws_flags, NULL, NULL, NULL);
if (*p_sws_ctx == NULL) {
ALOGE("sws_getCachedContext failed");
return -1;
if (ijk_image_convert(width, height,
dst_format, dest_pic.data, dest_pic.linesize,
src_format, src_data, src_linesize)) {
*p_sws_ctx = sws_getCachedContext(*p_sws_ctx,
width, height, src_format, width, height,
dst_format, sws_flags, NULL, NULL, NULL);
if (*p_sws_ctx == NULL) {
ALOGE("sws_getCachedContext failed");
return -1;
}
sws_scale(*p_sws_ctx, (const uint8_t **) src_data, src_linesize,
0, height, dest_pic.data, dest_pic.linesize);
if (!opaque->no_neon_warned) {
opaque->no_neon_warned = 1;
ALOGE("non-neon image convert %s -> %s", av_get_pix_fmt_name(src_format), av_get_pix_fmt_name(dst_format));
}
}
sws_scale(*p_sws_ctx, (const uint8_t **) src_data, src_linesize,
0, height, dest_pic.data, dest_pic.linesize);
// FIXME:
// duplicate_right_border_pixels(vp->bmp);
// FIXME:
// duplicate_right_border_pixels(vp->bmp);
return 0;
}
......@@ -21,8 +21,8 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef IJKSDL__IJKSDL_VOUT_OVERLAY_FFMPEG_H
#define IJKSDL__IJKSDL_VOUT_OVERLAY_FFMPEG_H
#ifndef IJKSDL__FFMPEG__IJKSDL_VOUT_OVERLAY_FFMPEG_H
#define IJKSDL__FFMPEG__IJKSDL_VOUT_OVERLAY_FFMPEG_H
#include "../ijksdl_stdinc.h"
#include "../ijksdl_vout.h"
......@@ -33,7 +33,8 @@ SDL_VoutOverlay *SDL_VoutFFmpeg_CreateOverlay(int width, int height, Uint32 form
int SDL_VoutFFmpeg_ConvertPicture(
const SDL_VoutOverlay *overlay,
int width, int height, enum AVPixelFormat src_format, uint8_t **src_data, int *src_linesize,
int width, int height,
enum AVPixelFormat src_format, const uint8_t **src_data, int *src_linesize,
struct SwsContext **p_sws_ctx, int sws_flags);
#endif
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册