hevcdsp: remove an unneeded variable in the loop filter

beta0 and beta1 will always be the same within a CU Signed-off-by: N Mickaël Raulet <mraulet@insa-rennes.fr> cherry picked from commit 4a23d824741a289c7d2d2f2871d1e2621b63fa1b Signed-off-by: N Michael Niedermayer <michaelni@gmx.at>

hevcdsp: remove an unneeded variable in the loop filter
beta0 and beta1 will always be the same within a CU Signed-off-by: N Mickaël Raulet <mraulet@insa-rennes.fr> cherry picked from commit 4a23d824741a289c7d2d2f2871d1e2621b63fa1b Signed-off-by: N Michael Niedermayer <michaelni@gmx.at>
d7e162d4 · Anton Khirnov · Michael Niedermayer · ae2f048f · d7e162d4 · d7e162d4
4 changed file
--- a/libavcodec/hevc_filter.c
+++ b/libavcodec/hevc_filter.c
@@ -340,7 +340,7 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
    uint8_t *src;
    int x, y;
    int chroma;
-    int c_tc[2], beta[2], tc[2];
+    int c_tc[2], tc[2], beta;
    uint8_t no_p[2] = { 0 };
    uint8_t no_q[2] = { 0 };

@@ -381,13 +381,11 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
            const int bs0 = s->vertical_bs[(x +  y      * s->bs_width) >> 2];
            const int bs1 = s->vertical_bs[(x + (y + 4) * s->bs_width) >> 2];
            if (bs0 || bs1) {
-                const int qp0 = (get_qPy(s, x - 1, y)     + get_qPy(s, x, y)     + 1) >> 1;
-                const int qp1 = (get_qPy(s, x - 1, y + 4) + get_qPy(s, x, y + 4) + 1) >> 1;
+                const int qp = (get_qPy(s, x - 1, y)     + get_qPy(s, x, y)     + 1) >> 1;

-                beta[0] = betatable[av_clip(qp0 + beta_offset, 0, MAX_QP)];
-                beta[1] = betatable[av_clip(qp1 + beta_offset, 0, MAX_QP)];
-                tc[0]   = bs0 ? TC_CALC(qp0, bs0) : 0;
-                tc[1]   = bs1 ? TC_CALC(qp1, bs1) : 0;
+                beta    = betatable[av_clip(qp + beta_offset, 0, MAX_QP)];
+                tc[0]   = bs0 ? TC_CALC(qp, bs0) : 0;
+                tc[1]   = bs1 ? TC_CALC(qp, bs1) : 0;
                src     = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->sps->pixel_shift)];
                if (pcmf) {
                    no_p[0] = get_pcm(s, x - 1, y);
@@ -447,16 +445,14 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
            const int bs0 = s->horizontal_bs[( x      + y * s->bs_width) >> 2];
            const int bs1 = s->horizontal_bs[((x + 4) + y * s->bs_width) >> 2];
            if (bs0 || bs1) {
-                const int qp0 = (get_qPy(s, x, y - 1)     + get_qPy(s, x, y)     + 1) >> 1;
-                const int qp1 = (get_qPy(s, x + 4, y - 1) + get_qPy(s, x + 4, y) + 1) >> 1;
+                const int qp = (get_qPy(s, x, y - 1)     + get_qPy(s, x, y)     + 1) >> 1;

                tc_offset   = x >= x0 ? cur_tc_offset : left_tc_offset;
                beta_offset = x >= x0 ? cur_beta_offset : left_beta_offset;

-                beta[0] = betatable[av_clip(qp0 + beta_offset, 0, MAX_QP)];
-                beta[1] = betatable[av_clip(qp1 + beta_offset, 0, MAX_QP)];
-                tc[0]   = bs0 ? TC_CALC(qp0, bs0) : 0;
-                tc[1]   = bs1 ? TC_CALC(qp1, bs1) : 0;
+                beta    = betatable[av_clip(qp + beta_offset, 0, MAX_QP)];
+                tc[0]   = bs0 ? TC_CALC(qp, bs0) : 0;
+                tc[1]   = bs1 ? TC_CALC(qp, bs1) : 0;
                src     = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->sps->pixel_shift)];
                if (pcmf) {
                    no_p[0] = get_pcm(s, x, y - 1);

--- a/libavcodec/hevcdsp.h
+++ b/libavcodec/hevcdsp.h
@@ -97,20 +97,20 @@ typedef struct HEVCDSPContext {
                                         int ox1, intptr_t mx, intptr_t my, int width);

    void (*hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
-                                    int *beta, int *tc,
+                                    int beta, int *tc,
                                    uint8_t *no_p, uint8_t *no_q);
    void (*hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
-                                    int *beta, int *tc,
+                                    int beta, int *tc,
                                    uint8_t *no_p, uint8_t *no_q);
    void (*hevc_h_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
                                      int *tc, uint8_t *no_p, uint8_t *no_q);
    void (*hevc_v_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
                                      int *tc, uint8_t *no_p, uint8_t *no_q);
    void (*hevc_h_loop_filter_luma_c)(uint8_t *pix, ptrdiff_t stride,
-                                      int *beta, int *tc,
+                                      int beta, int *tc,
                                      uint8_t *no_p, uint8_t *no_q);
    void (*hevc_v_loop_filter_luma_c)(uint8_t *pix, ptrdiff_t stride,
-                                      int *beta, int *tc,
+                                      int beta, int *tc,
                                      uint8_t *no_p, uint8_t *no_q);
    void (*hevc_h_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride,
                                        int *tc, uint8_t *no_p,

--- a/libavcodec/hevcdsp_template.c
+++ b/libavcodec/hevcdsp_template.c
@@ -1564,7 +1564,7 @@ static void FUNC(put_hevc_epel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uin

 static void FUNC(hevc_loop_filter_luma)(uint8_t *_pix,
                                        ptrdiff_t _xstride, ptrdiff_t _ystride,
-                                        int *_beta, int *_tc,
+                                        int beta, int *_tc,
                                        uint8_t *_no_p, uint8_t *_no_q)
 {
    int d, j;
@@ -1572,6 +1572,8 @@ static void FUNC(hevc_loop_filter_luma)(uint8_t *_pix,
    ptrdiff_t xstride = _xstride / sizeof(pixel);
    ptrdiff_t ystride = _ystride / sizeof(pixel);

+    beta <<= BIT_DEPTH - 8;
+
    for (j = 0; j < 2; j++) {
        const int dp0  = abs(P2  - 2 * P1  + P0);
        const int dq0  = abs(Q2  - 2 * Q1  + Q0);
@@ -1579,7 +1581,6 @@ static void FUNC(hevc_loop_filter_luma)(uint8_t *_pix,
        const int dq3  = abs(TQ2 - 2 * TQ1 + TQ0);
        const int d0   = dp0 + dq0;
        const int d3   = dp3 + dq3;
-        const int beta = _beta[j] << (BIT_DEPTH - 8);
        const int tc   = _tc[j]   << (BIT_DEPTH - 8);
        const int no_p = _no_p[j];
        const int no_q = _no_q[j];
@@ -1706,7 +1707,7 @@ static void FUNC(hevc_v_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
 }

 static void FUNC(hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
-                                          int *beta, int *tc, uint8_t *no_p,
+                                          int beta, int *tc, uint8_t *no_p,
                                          uint8_t *no_q)
 {
    FUNC(hevc_loop_filter_luma)(pix, stride, sizeof(pixel),
@@ -1714,7 +1715,7 @@ static void FUNC(hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
 }

 static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
-                                          int *beta, int *tc, uint8_t *no_p,
+                                          int beta, int *tc, uint8_t *no_p,
                                          uint8_t *no_q)
 {
    FUNC(hevc_loop_filter_luma)(pix, sizeof(pixel), stride,

--- a/libavcodec/x86/hevc_deblock.asm
+++ b/libavcodec/x86/hevc_deblock.asm
@@ -310,7 +310,7 @@ INIT_XMM sse2
 %endmacro

 ALIGN 16
-; input in m0 ... m3 and tcs in tc (r2). Output in m1 and m2
+; input in m0 ... m3 and tcs in r2. Output in m1 and m2
 %macro CHROMA_DEBLOCK_BODY 1
    psubw            m4, m2, m1; q0 - p0
    psubw            m5, m0, m3; p1 - q1
@@ -339,7 +339,7 @@ ALIGN 16
    psubw            m2, m5; q0 - delta0
 %endmacro

-; input in m0 ... m7, betas in r2 tcs in r3. Output in m1...m6
+; input in m0 ... m7, beta in r2 tcs in r3. Output in m1...m6
 %macro LUMA_DEBLOCK_BODY 2
    psllw            m9, m2, 1; *2
    psubw           m10, m1, m9
@@ -352,20 +352,11 @@ ALIGN 16
    ABS1            m11, m13 ; 0dq0, 0dq3 , 1dq0, 1dq3

    ;beta calculations
-    mov             r11, [betaq];
 %if %1 > 8
-    shl             r11, %1 - 8
-%endif
-    movd            m13, r11d; beta0
-    add           betaq, 4;
-    punpcklwd       m13, m13
-    mov             r12, [betaq];
-%if %1 > 8
-    shl             r12, %1 - 8
+    shl             betaq, %1 - 8
 %endif
-    movd            m14, r12d; beta1
-    punpcklwd       m14, m14
-    pshufd          m13, m14, 0; beta0, beta1
+    movd            m13, betaq
+    SPLATW          m13, m13, 0
    ;end beta calculations

    paddw            m9, m10, m11;   0d0, 0d3  ,  1d0, 1d3
@@ -412,31 +403,31 @@ ALIGN 16
    ; end calc for weak filter

    ; filtering mask
-    mov              r2, r13
-    shr              r2, 3
-    movd            m15, r2d
+    mov             r11, r13
+    shr             r11, 3
+    movd            m15, r11d
    and             r13, 1
    movd            m11, r13d
    shufps          m11, m15, 0
-    shl              r2, 1
-    or              r13, r2
+    shl             r11, 1
+    or              r13, r11

    pcmpeqd         m11, [pd_1]; filtering mask

    ;decide between strong and weak filtering
    ;tc25 calculations
-    mov             r2d, [tcq];
+    mov            r11d, [tcq];
 %if %1 > 8
-    shl              r2, %1 - 8
+    shl             r11, %1 - 8
 %endif
-    movd             m8, r2d; tc0
+    movd             m8, r11d; tc0
    add             tcq, 4;
    mov             r3d, [tcq];
 %if %1 > 8
    shl              r3, %1 - 8
 %endif
    movd             m9, r3d; tc1
-    add             r2d, r3d; tc0 + tc1
+    add            r11d, r3d; tc0 + tc1
    jz             .bypassluma
    punpcklwd        m8, m8
    punpcklwd        m9, m9
@@ -460,8 +451,8 @@ ALIGN 16

    psraw           m13, 3; beta >> 3
    pcmpgtw         m13, m12;
-    movmskps         r2, m13;
-    and             r14, r2; strong mask , beta_2 and beta_3 comparisons
+    movmskps        r11, m13;
+    and             r14, r11; strong mask , beta_2 and beta_3 comparisons
    ;----beta_3 comparison end-----
    ;----tc25 comparison---
    psubw           m12, m3, m4;      p0 - q0
@@ -471,24 +462,24 @@ ALIGN 16
    pshuflw         m12, m12, 0xf0 ;0b11110000;

    pcmpgtw          m8, m12; tc25 comparisons
-    movmskps         r2, m8;
-    and             r14, r2; strong mask, beta_2, beta_3 and tc25 comparisons
+    movmskps        r11, m8;
+    and             r14, r11; strong mask, beta_2, beta_3 and tc25 comparisons
    ;----tc25 comparison end---
-    mov              r2, r14;
-    shr              r2, 1;
-    and             r14, r2; strong mask, bits 2 and 0
+    mov             r11, r14;
+    shr             r11, 1;
+    and             r14, r11; strong mask, bits 2 and 0

    pmullw          m14, m9, [pw_m2]; -tc * 2
    paddw            m9, m9

    and             r14, 5; 0b101
-    mov              r2, r14; strong mask
+    mov             r11, r14; strong mask
    shr             r14, 2;
    movd            m12, r14d; store to xmm for mask generation
    shl             r14, 1
-    and              r2, 1
-    movd            m10, r2d; store to xmm for mask generation
-    or              r14, r2; final strong mask, bits 1 and 0
+    and             r11, 1
+    movd            m10, r11d; store to xmm for mask generation
+    or              r14, r11; final strong mask, bits 1 and 0
    jz      .weakfilter

    shufps          m10, m12, 0
@@ -578,23 +569,18 @@ ALIGN 16
    jz             .store

    ; weak filtering mask
-    mov              r2, r14
-    shr              r2, 1
-    movd            m12, r2d
+    mov             r11, r14
+    shr             r11, 1
+    movd            m12, r11d
    and             r14, 1
    movd            m11, r14d
    shufps          m11, m12, 0
    pcmpeqd         m11, [pd_1]; filtering mask

-    mov             r13, r11; beta0
-    shr             r13, 1;
-    add             r11, r13
-    shr             r11, 3; ((beta0+(beta0>>1))>>3))
-
-    mov             r13, r12; beta1
+    mov             r13, betaq
    shr             r13, 1;
-    add             r12, r13
-    shr             r12, 3; ((beta1+(beta1>>1))>>3))
+    add             betaq, r13
+    shr             betaq, 3; ((beta + (beta >> 1)) >> 3))

    mova            m13, [pw_8]
    psubw           m12, m4, m3 ; q0 - p0
@@ -633,11 +619,8 @@ ALIGN 16
    paddw           m15, m2; p1'

    ;beta calculations
-    movd            m10, r11d; beta0
-    punpcklwd       m10, m10
-    movd            m13, r12d; beta1
-    punpcklwd       m13, m13
-    shufps          m10, m13, 0; betax0, betax1
+    movd            m10, betaq
+    SPLATW          m10, m10, 0

    movd            m13, r7d; 1dp0 + 1dp3
    movd             m8, r8d; 0dp0 + 0dp3