i420_rv16.S 5.2 KB
Newer Older
Z
Zhang Rui 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227
 @*****************************************************************************
 @ i420_rv16.S : ARM NEONv1 I420 to RV16 chroma conversion
 @*****************************************************************************
 @ Copyright (C) 2011 Sébastien Toque
 @                    Rémi Denis-Courmont
 @
 @ This program is free software; you can redistribute it and/or modify it
 @ under the terms of the GNU Lesser General Public License as published by
 @ the Free Software Foundation; either version 2.1 of the License, or
 @ (at your option) any later version.
 @
 @ This program is distributed in the hope that it will be useful,
 @ but WITHOUT ANY WARRANTY; without even the implied warranty of
 @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 @ GNU Lesser General Public License for more details.
 @
 @ You should have received a copy of the GNU Lesser General Public License
 @ along with this program; if not, write to the Free Software Foundation,
 @ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
 @*****************************************************************************

	/* Unified ARM/Thumb syntax, NEON instruction set enabled, code section. */
	.syntax unified
	.fpu neon
	.text

/*
 * ARM core register aliases used by i420_rv16_neon.
 *
 *   O1     (r0)  on entry: pointer to the output descriptor read by the
 *                first ldmia; afterwards the write pointer for the first
 *                row of the current row pair
 *   O2     (r1)  on entry: pointer to the input descriptor read by the
 *                second ldmia; afterwards the write pointer for the
 *                second row of the pair (O1 + OPITCH)
 *   WIDTH  (r2)  width in pixels; rounded up to a multiple of 16 by the
 *                routine
 *   HEIGHT (r3)  rows still to process; decremented by 2 per row pair
 *   Y1, Y2 (r4, r5)  luma read pointers for the two rows of the pair
 *   U, V   (r6, r7)  chroma read pointers
 *   YPITCH (r8)  luma pitch in bytes
 *   OPAD   (r10) output pitch minus 2 * WIDTH; before that value is
 *                computed the register is used as scratch (width
 *                remainder, address of the coefficient table)
 *   YPAD   (r11) YPITCH - WIDTH
 *   COUNT  (ip)  pixels left in the current row pair
 *   OPITCH (lr)  output pitch in bytes; the return address is saved on
 *                the stack by the prologue, which frees lr for this use
 *
 * r9 is not used.
 */
#define O1	r0
#define O2	r1
#define WIDTH	r2
#define HEIGHT	r3
#define Y1	r4
#define Y2	r5
#define U	r6
#define V	r7
#define YPITCH	r8
#define OPAD	r10
#define YPAD	r11
#define COUNT	ip
#define OPITCH	lr

/*
 * NEON register aliases used by i420_rv16_neon.
 *
 * Reminder: Qn overlays the pair D(2n), D(2n+1). Several aliases below name
 * the same physical register on purpose; the routine orders its
 * instructions so that a value is dead before its register is reused.
 * Do not reorder instructions or renumber aliases without re-checking
 * every overlap listed here.
 */

/* Loop-invariant constants: 8-bit multipliers (D0-D4) and the 16-bit
 * per-channel constants loaded from the `coefficients` table (Q3-Q5). */
#define coefY	D0
#define coefRV	D1
#define coefGU	D2
#define coefGV	D3
#define coefBU	D4
#define Rc	Q3
#define Gc	Q4
#define Bc	Q5

/* Raw input bytes: 8 U samples, 8 V samples, and 16 luma samples that
 * vld2 splits into even-indexed (y1) and odd-indexed (y2) pixels. */
#define u	D24
#define v	D25
#define y1	D18
#define y2	D19

/* 16-bit intermediates. chro_* hold the chroma contribution shared by both
 * rows of a pair. The _1 names belong to the even pixels (from y1), the _2
 * names to the odd pixels (from y2).
 * Overlaps: lumi2 and green16_1 are both Q10; red16_1 (Q9) covers y1/y2;
 * red16_2 (Q12) covers u/v. */
#define chro_r	Q6
#define chro_g	Q7
#define chro_b	Q8
#define lumi1	Q15
#define lumi2	Q10
#define red16_1		Q9
#define green16_1	Q10
#define blue16_1	Q11
#define red16_2		Q12
#define green16_2	Q13
#define blue16_2	Q14

/* 8-bit channel values after the narrowing shift.
 * Overlaps: red1/red2 sit in the halves of Q12/Q14; green1/blue1 are the
 * halves of Q13; green2/blue2 are the halves of Q15 (lumi1). */
#define red1	D25
#define green1	D26
#define blue1	D27
#define red2	D29
#define green2	D30
#define blue2	D31

/* Packed output bytes (l = low byte, h = high byte of each 16-bit pixel).
 * out1h is the same register as red1 and out2h the same as red2, so the
 * high byte is built in place on top of the red value. */
#define out1l	D24
#define out1h	D25
#define out2l	D28
#define out2h	D29

/*
 * Per-channel 16-bit constants, read in this order by i420_rv16_neon into
 * Rc, Gc and Bc (one halfword each, replicated to every lane).
 *
 * Arithmetic behind the values, using the 8-bit multipliers set up in the
 * routine (Y 74, RV 115, GU 14, GV 34, BU 135):
 *   -15872 = 32 - (74*16 + 115*128)
 *     4992 = 32 - 74*16 + (14 + 34)*128
 *   -18432 = 32 - (74*16 + 135*128)
 * This looks like the Y-16 / chroma-128 offsets plus a +32 term folded
 * into one constant per channel -- interpretation, confirm before changing.
 *
 * The table lives in .text so that `adr` can reach it PC-relatively.
 */
coefficients:
	.short	-15872, 4992, -18432	/* Rc, Gc, Bc */

	.align 2
	.global i420_rv16_neon
	.type	i420_rv16_neon, %function
/*
 * i420_rv16_neon -- convert a planar YUV 4:2:0 (I420) picture to packed
 * 16-bit RGB 5-6-5, processing two rows and 16 pixels per inner iteration.
 *
 * Inputs, as consumed by the code below:
 *   r0 -> two words  : { output pointer, output pitch in bytes }
 *   r1 -> four words : { Y pointer, U pointer, V pointer, Y pitch in bytes }
 *   r2  = width in pixels
 *   r3  = height in rows
 * (Presumably called from C with two descriptor structs plus width and
 *  height -- confirm against the caller's prototype.)
 *
 * Behaviour the caller has to account for:
 *   - Nothing is converted when height <= 0 or width <= 0.
 *   - The width is rounded up to the next multiple of 16, so up to 15
 *     pixels past `width` are read and written on every row; the pitches
 *     must leave room for that.
 *   - Rows are handled in pairs. With an odd height the last pass still
 *     reads and writes a second row.
 *   - The chroma pointers advance by half the luma pitch per row pair,
 *     i.e. the U/V pitch is taken to be Y pitch / 2.
 *   - Y and output accesses carry a :128 alignment qualifier and U/V
 *     accesses a :64 qualifier, so pointers and pitches must keep every
 *     row start aligned accordingly.
 *
 * Output pixel layout (stored low byte first):
 *   high byte = R[7:3] G[7:5]      low byte = G[4:2] B[7:3]
 *
 * Register use: r4-r8, r10, r11 and q4-q7 are saved and restored.
 * r0-r3, ip, the flags, d0-d4, q3 and q8-q15 are overwritten.
 * No result is placed in r0 on purpose.
 */
i420_rv16_neon:
	push		{r4-r8,r10-r11,lr}
	vpush		{q4-q7}

	/* load arguments: r0 -> {O1, OPITCH}, r1 -> {Y1, U, V, YPITCH}
	 * (ldm fills the lowest-numbered register from the lowest address) */
	ldmia		r0,	{O1, OPITCH}
	ldmia		r1,	{Y1, U, V, YPITCH}

	/* round the width up to a multiple of 16 (OPAD is scratch here) */
	ands		OPAD, WIDTH, #15
	sub			WIDTH, WIDTH, OPAD
	addne		WIDTH, WIDTH, #16

	/* init constants (scale value by 64) */
	vmov.u8		coefY, #74
	vmov.u8		coefRV, #115
	vmov.u8		coefGU, #14
	vmov.u8		coefGV, #34
	vmov.u8		coefBU, #135
	/* d6/d7 = Rc, d8/d9 = Gc, d10/d11 = Bc: each load replicates one
	 * halfword of the table to all lanes and steps to the next entry */
	adr			OPAD, coefficients
	vld1.s16	{d6[], d7[]}, [OPAD]!
	vld1.s16	{d8[], d9[]}, [OPAD]!
	vld1.s16	{d10[], d11[]}, [OPAD]!

	/* init padding: bytes to skip from the end of the converted part of a
	 * row to the start of the row after next. The flags set by cmp survive
	 * the two flag-preserving subs and are consumed at loop_row. */
	cmp			HEIGHT,	#0
	sub			OPAD,	OPITCH,	WIDTH, lsl #1
	sub			YPAD,	YPITCH,	WIDTH

loop_row:
	/* Entered with flags from "cmp HEIGHT, #0" or "subs HEIGHT, #2".
	 * If rows remain (GT), COUNT = WIDTH and the flags are re-set from
	 * WIDTH, so LE below means: no rows left, or WIDTH <= 0. */
	movsgt	COUNT,	WIDTH
	add		O2,	O1,	OPITCH
	add		Y2,	Y1,	YPITCH
	/* exit if all rows have been processed */
	vpople	{q4-q7}
	pople	{r4-r8,r10-r11,pc}

loop_col:

	/* Common U & V: 8 chroma samples cover 16 pixels on both rows */

	vld1.u8	{u}, [U,:64]!
	vld1.u8	{v}, [V,:64]!

	/* Y Top Row: y1 = even-indexed pixels, y2 = odd-indexed pixels */
	vld2.u8	{y1,y2}, [Y1,:128]!

	/* chroma products; Q14/Q11/Q13 are free at this point */
	vmull.u8	Q14, v, coefRV
	vmull.u8	Q11, u, coefGU
	vmull.u8	Q13, u, coefBU
	vmlal.u8	Q11, v, coefGV

	vmull.u8	lumi2, y2, coefY
	vmull.u8	lumi1, y1, coefY
	/* non-saturating adds: the unsigned 135*U product can exceed 32767
	 * and relies on 16-bit wrap-around when combined with Bc */
	vadd.s16	chro_r, Rc, Q14
	vadd.s16	chro_b, Bc, Q13
	vsub.s16	chro_g, Gc, Q11

	pld	[U]
	pld	[V]

	/* chrominance + luminance
	 * (the _2 results are formed first because green16_1 shares Q10 with
	 * lumi2 and overwrites it) */
	vqadd.s16	red16_2, lumi2, chro_r
	vqadd.s16	green16_2, lumi2, chro_g
	vqadd.s16	blue16_2, lumi2, chro_b
	vqadd.s16	red16_1, lumi1, chro_r
	vqadd.s16	green16_1, lumi1, chro_g
	vqadd.s16	blue16_1, lumi1, chro_b

	/* clamp (divide by 64)
	 * Order matters: each destination D register overlaps a 16-bit source
	 * that has already been narrowed by an earlier line. */
	vqrshrun.s16	green2, green16_2, #6
	vqrshrun.s16	blue2, blue16_2, #6
	vqrshrun.s16	red2, red16_2, #6
	vqrshrun.s16	green1, green16_1, #6
	vqrshrun.s16	red1, red16_1, #6
	vqrshrun.s16	blue1, blue16_1, #6

	pld	[Y1]

	/* pack into RGB565 (out*h are the red registers, edited in place) */
	vshl.u8	out2l, green2, #3 // low 2a: G[4:0] into bits 7..3
	vsri.u8	out2h, green2, #5 // high 2: keep R[7:3], insert G[7:5]
	vshl.u8	out1l, green1, #3 // low 1a
	vsri.u8	out1h, green1, #5 // high 1
	vsri.u8	out2l, blue2, #3 // low 2b: keep G[4:2], insert B[7:3]
	vsri.u8	out1l, blue1, #3 // low 1b

	/* Y Bottom Row */
	vld2.u8	{y1,y2}, [Y2,:128]!

	/* Top Row output, interleaved with the bottom-row luma multiplies.
	 * vzip puts even/odd pixels back into pixel order; vst2 then
	 * interleaves low and high bytes into 16-bit pixels. */
	vzip.u8	out1h, out2h
	vmull.u8	lumi2, y2, coefY
	vzip.u8	out1l, out2l
	vmull.u8	lumi1, y1, coefY
	vst2.u8	{out1l, out1h}, [O1,:128]!
	vst2.u8	{out2l, out2h}, [O1,:128]!

	/* chrominance + luminance (same chroma terms as the top row) */
	vqadd.s16	green16_2, lumi2, chro_g
	vqadd.s16	red16_2, lumi2, chro_r
	vqadd.s16	blue16_2, lumi2, chro_b
	vqadd.s16	red16_1, lumi1, chro_r
	vqadd.s16	green16_1, lumi1, chro_g
	vqadd.s16	blue16_1, lumi1, chro_b

	/* clamp (divide by 64) */
	vqrshrun.s16	green2, green16_2, #6
	vqrshrun.s16	blue2, blue16_2, #6
	vqrshrun.s16	red2, red16_2, #6
	vqrshrun.s16	green1, green16_1, #6
	vqrshrun.s16	red1, red16_1, #6
	vqrshrun.s16	blue1, blue16_1, #6

	/* prefetch the bottom row for the next iteration (Y1 was already
	 * prefetched above; this half reads through Y2) */
	pld	[Y2]

	/* pack into RGB565 */
	vshl.u8	out2l, green2, #3 // low 2a
	vsri.u8	out2h, green2, #5 // high 2
	vshl.u8	out1l, green1, #3 // low 1a
	vsri.u8	out1h, green1, #5 // high 1
	vsri.u8	out2l, blue2, #3 // low 2b
	vsri.u8	out1l, blue1, #3 // low 1b

	/* Bottom Row output */
	vzip.u8	out1h, out2h
	vzip.u8	out1l, out2l
	vst2.u8	{out1l, out1h}, [O2,:128]!
	vst2.u8	{out2l, out2h}, [O2,:128]!

	/* next columns (x16) */
	subs	COUNT,	COUNT,	#16
	bgt		loop_col

	/* next rows (x2): the flags from subs are tested at loop_row; the adds
	 * below do not change them. O2/Y2 sit at the end of the second row, so
	 * adding the padding lands on the first row of the next pair. U/V have
	 * advanced WIDTH/2 bytes; adding YPAD/2 makes that YPITCH/2 per pair. */
	subs	HEIGHT,	#2
	add		O1,	O2,	OPAD
	add		Y1,	Y2,	YPAD
	add		U,	U,	YPAD,	lsr #1
	add		V,	V,	YPAD,	lsr #1
	b		loop_row
	.size	i420_rv16_neon, . - i420_rv16_neon