swscale: NEON optimized unscaled rgba to nv12 conversion

Signed-off-by: Yu Xiaolei <dreifachstein@gmail.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
2013-12-14 15:31:42 +08:00 · 2013-12-14 15:31:42 +08:00 · 1c67ad9d93
commit 1c67ad9d93
parent ffbcb1c6f0
7 changed files with 577 additions and 0 deletions
--- a/libswscale/arm/Makefile
+++ b/libswscale/arm/Makefile
@ -0,0 +1,4 @@
+# C glue that installs the ARM unscaled converters.
+OBJS        += arm/swscale_unscaled.o
+
+# NEON RGBA -> NV12 kernels: 32-bit and 16-bit intermediate precision.
+NEON-OBJS   += arm/rgb2yuv_neon_32.o
+NEON-OBJS   += arm/rgb2yuv_neon_16.o
--- a/libswscale/arm/rgb2yuv_neon_16.S
+++ b/libswscale/arm/rgb2yuv_neon_16.S
@ -0,0 +1,80 @@
+/*
+ * Copyright (C) 2013 Xiaolei Yu <dreifachstein@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "rgb2yuv_neon_common.S"
+
+/* downsampled R16G16B16 x8 */
+alias_qw    r16x8,  q7
+alias_qw    g16x8,  q8
+alias_qw    b16x8,  q9
+
+/* one source channel widened to 16 bit: low / high 8 pixels */
+alias   n16x16_l,   q11
+alias   n16x16_h,   q12
+
+/* 16-bit luma accumulators: low / high 8 pixels */
+alias   y16x16_l,   q13
+alias   y16x16_h,   q14
+
+/* narrowed 8-bit luma for 16 pixels */
+alias_qw    y8x16,  q15
+
+/*
+ * Load the 9-entry int32 coefficient table and set up the constants.
+ * Table entries 0/3/6, 1/4/7 and 2/5/8 are de-interleaved into q13, q14 and
+ * q15, then rounded right by 7 bits and narrowed to 16 bit into CO_R, CO_G
+ * and CO_B (lanes 0..2 are the Y, U and V coefficient of that channel).
+ * The pointer argument is advanced by the first load. Lane 3 of each
+ * coefficient register is never loaded.
+ */
+.macro init     src
+    vld3.i32    {q13_l, q14_l, q15_l},          [\src]!
+    vld3.i32    {q13_h[0], q14_h[0], q15_h[0]}, [\src]
+    vrshrn.i32  CO_R,   q13, #7
+    vrshrn.i32  CO_G,   q14, #7
+    vrshrn.i32  CO_B,   q15, #7
+
+    /* per-byte offsets added after narrowing: 16 for luma, 128 for chroma */
+    vmov.u8     BIAS_Y, #16
+    vmov.u8     BIAS_U, #128
+.endm
+
+
+/*
+ * One term of the luma sum for 16 pixels.
+ *   action: multiply instruction to apply (vmul starts the sum, vmla adds)
+ *   s8x16:  8-bit source channel, 16 pixels
+ *   coeff:  16-bit scalar coefficient
+ * Widens the channel to 16 bit in n16x16_l/_h, then applies the action
+ * into y16x16_l/_h.
+ */
+.macro compute_y_16x1_step  action, s8x16, coeff
+    vmovl.u8    n16x16_l,   \s8x16\()_l
+    vmovl.u8    n16x16_h,   \s8x16\()_h
+
+    \action     y16x16_l,   n16x16_l,   \coeff
+    \action     y16x16_h,   n16x16_h,   \coeff
+.endm
+
+/*
+ * in:      r8x16, g8x16, b8x16
+ * out:     y8x16
+ * clobber: q11-q15
+ *
+ * Sums the three coefficient products in 16-bit lanes, narrows with a
+ * rounding shift by 8 and adds the luma bias.
+ */
+.macro compute_y_16x1
+    compute_y_16x1_step vmul, r8x16, CO_RY
+    compute_y_16x1_step vmla, g8x16, CO_GY
+    compute_y_16x1_step vmla, b8x16, CO_BY
+
+    vrshrn.i16  y8x16_l,    y16x16_l,   #8
+    vrshrn.i16  y8x16_h,    y16x16_h,   #8
+
+    vadd.u8     y8x16,      y8x16,      BIAS_Y
+.endm
+
+/* 16-bit chroma accumulator; shares q15 with y8x16 */
+alias   c16x8,      q15
+/* two 8x8 chroma halves; bound to u8x8/v8x8 by alias_dst_nv12/nv21 */
+alias_qw    c8x8x2, q10
+
+
+/*
+ * Compute one chroma plane for 8 downsampled pixels.
+ *   c: lower-case plane letter, selects the output register (u8x8 / v8x8)
+ *   C: upper-case plane letter, selects coefficients and bias
+ * in:      r16x8, g16x8, b16x8
+ * out:     the selected 8x8 register
+ * clobber: c16x8 (q15)
+ */
+.macro compute_chroma_8x1   c, C
+    vmul    c16x8,  r16x8,  CO_R\C
+    vmla    c16x8,  g16x8,  CO_G\C
+    vmla    c16x8,  b16x8,  CO_B\C
+
+    vrshrn.i16  \c\()8x8,   c16x8,      #8
+    vadd.u8     \c\()8x8,   \c\()8x8,   BIAS_\C
+.endm
+
+    /* emits the exported function rgbx_to_nv12_neon_16 */
+    loop_420sp  rgbx, nv12, init, kernel_420_16x2, 16
--- a/libswscale/arm/rgb2yuv_neon_32.S
+++ b/libswscale/arm/rgb2yuv_neon_32.S
@ -0,0 +1,119 @@
+/*
+ * Copyright (C) 2013 Xiaolei Yu <dreifachstein@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "rgb2yuv_neon_common.S"
+
+/* downsampled R16G16B16 x8 */
+alias_qw    r16x8,  q7
+alias_qw    g16x8,  q8
+alias_qw    b16x8,  q9
+
+/* odd-indexed pixels of one source channel, zero-extended to 16 bit */
+alias   n16x16_o,   q11
+alias   n16x16_ol,  q11_l
+alias   n16x16_oh,  q11_h
+
+/* 32-bit luma accumulators: even (e) / odd (o) pixels, low / high half */
+alias   y32x16_el,  q12
+alias   y32x16_eh,  q13
+alias   y32x16_ol,  q14
+alias   y32x16_oh,  q15
+
+/* narrowed 16-bit luma; reuses q12/q13 of the 32-bit accumulators */
+alias   y16x16_e,   q12
+alias   y16x16_el,  q12_l
+alias   y16x16_eh,  q12_h
+alias   y16x16_o,   q13
+alias   y16x16_ol,  q13_l
+alias   y16x16_oh,  q13_h
+
+
+/* final 8-bit luma for 16 pixels lives in q12 */
+alias   y8x16,  y16x16_e
+
+
+/*
+ * Load the 9-entry int32 coefficient table and set up the constants.
+ * Table entries 0/3/6, 1/4/7 and 2/5/8 end up in lanes 0..2 of CO_R, CO_G
+ * and CO_B respectively (Y, U and V coefficient of that channel).
+ * The pointer argument is advanced by the first load.
+ */
+.macro init     src
+    // load s32x3x3, narrow to s16x3x3
+    vld3.i32    {q13_l, q14_l, q15_l},          [\src]!
+    vld3.i32    {q13_h[0], q14_h[0], q15_h[0]}, [\src]
+
+    // keeps only the low 16 bits of each coefficient
+    // NOTE(review): presumably the table values fit in 16 bits - confirm
+    vmovn.i32   CO_R, q13
+    vmovn.i32   CO_G, q14
+    vmovn.i32   CO_B, q15
+
+    // per-byte offsets added after narrowing: 16 for luma, 128 for chroma
+    vmov.u8     BIAS_Y, #16
+    vmov.u8     BIAS_U, #128
+.endm
+
+
+/*
+ * One term of the luma sum for 16 pixels, with 32-bit accumulation.
+ *   action: widening multiply to apply (vmull starts the sum, vmlal adds)
+ *   s8x16:  8-bit source channel, 16 pixels (overwritten)
+ *   coeff:  16-bit scalar coefficient
+ * The byte transpose against a zeroed register leaves the even pixels
+ * zero-extended to 16 bit in the source register and the odd pixels
+ * zero-extended in n16x16_o.
+ */
+.macro compute_y_16x1_step  action, s8x16, coeff
+    vmov.u8     n16x16_o,   #0
+    vtrn.u8     \s8x16,     n16x16_o
+
+    \action     y32x16_el,  \s8x16\()_l,    \coeff
+    \action     y32x16_eh,  \s8x16\()_h,    \coeff
+    \action     y32x16_ol,  n16x16_ol,      \coeff
+    \action     y32x16_oh,  n16x16_oh,      \coeff
+.endm
+
+/*
+ * in:      r8x16, g8x16, b8x16
+ * out:     y8x16
+ * clobber: q11-q15, r8x16, g8x16, b8x16
+ *
+ * The four narrowing shifts write q12/q13 halves while reading q12-q15,
+ * so their order must be kept.
+ */
+.macro compute_y_16x1
+    compute_y_16x1_step vmull, r8x16, CO_RY
+    compute_y_16x1_step vmlal, g8x16, CO_GY
+    compute_y_16x1_step vmlal, b8x16, CO_BY
+
+    vrshrn.i32  y16x16_el,  y32x16_el,  #15
+    vrshrn.i32  y16x16_eh,  y32x16_eh,  #15
+    vrshrn.i32  y16x16_ol,  y32x16_ol,  #15
+    vrshrn.i32  y16x16_oh,  y32x16_oh,  #15
+
+    // re-interleave the even and odd results into pixel order as bytes
+    vtrn.8      y16x16_e,   y16x16_o
+    vadd.u8     y8x16,      y8x16,      BIAS_Y
+.endm
+
+/* 32-bit chroma accumulators: low / high 4 pixels */
+alias   c32x8_l,    q14
+alias   c32x8_h,    q15
+
+/* narrowed 16-bit chroma */
+alias_qw    c16x8,  q13
+/* two 8x8 chroma halves; bound to u8x8/v8x8 by alias_dst_nv12/nv21 */
+alias_qw    c8x8x2, q10
+
+/*
+ * One term of the chroma sum for 8 downsampled pixels.
+ *   action: widening multiply to apply (vmull starts the sum, vmlal adds)
+ *   s16x8:  16-bit downsampled channel, 8 pixels
+ *   coeff:  16-bit scalar coefficient
+ */
+.macro compute_chroma_8x1_step  action, s16x8, coeff
+    \action     c32x8_l,    \s16x8\()_l,    \coeff
+    \action     c32x8_h,    \s16x8\()_h,    \coeff
+.endm
+
+/*
+ * Compute one chroma plane for 8 downsampled pixels.
+ *   c: lower-case plane letter, selects the output register (u8x8 / v8x8)
+ *   C: upper-case plane letter, selects coefficients and bias
+ * in:      r16x8, g16x8, b16x8
+ * out:     c8x8
+ * clobber: q13-q15
+ */
+.macro compute_chroma_8x1   c, C
+    compute_chroma_8x1_step vmull, r16x8, CO_R\C
+    compute_chroma_8x1_step vmlal, g16x8, CO_G\C
+    compute_chroma_8x1_step vmlal, b16x8, CO_B\C
+
+    vrshrn.i32  c16x8_l,    c32x8_l,    #15
+    vrshrn.i32  c16x8_h,    c32x8_h,    #15
+    vmovn.i16   \c\()8x8,   c16x8
+    vadd.u8     \c\()8x8,   \c\()8x8,   BIAS_\C
+.endm
+
+
+    /* emits the exported function rgbx_to_nv12_neon_32 */
+    loop_420sp  rgbx, nv12, init, kernel_420_16x2, 32
--- a/libswscale/arm/rgb2yuv_neon_common.S
+++ b/libswscale/arm/rgb2yuv_neon_common.S
@ -0,0 +1,291 @@
+/*
+ * Copyright (C) 2013 Xiaolei Yu <dreifachstein@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+/*
+ * Create (set != 0, the default) or remove (set == 0) the register
+ * alias "name" for register "tgt".
+ */
+.macro alias name, tgt, set=1
+.if \set != 0
+    \name   .req    \tgt
+.else
+    .unreq  \name
+.endif
+.endm
+
+/* altmacro mode is needed for the %(expr) argument evaluation below */
+.altmacro
+
+/*
+ * Recursively define qN_l / qN_h as aliases for the low / high doubleword
+ * register of qN, starting at qw and ending at q15.
+ */
+.macro alias_dw_all qw, dw_l, dw_h
+    alias   q\qw\()_l, d\dw_l
+    alias   q\qw\()_h, d\dw_h
+    .if \qw < 15
+        alias_dw_all  %(\qw + 1), %(\dw_l + 2), %(\dw_h + 2)
+    .endif
+.endm
+
+/* q0_l = d0, q0_h = d1, ... q15_l = d30, q15_h = d31 */
+alias_dw_all    0, 0, 1
+
+.noaltmacro
+
+/*
+ * Alias (or un-alias, set == 0) a quadword register under "name" together
+ * with its halves as name_l and name_h.
+ */
+.macro alias_qw     name, qw, set=1
+    alias   \name\(), \qw, \set
+    alias   \name\()_l, \qw\()_l, \set
+    alias   \name\()_h, \qw\()_h, \set
+.endm
+
+/*
+ * Save r4-r12, lr and q4-q7 (10 * 4 + 4 * 16 bytes).
+ * load_arg hard-codes this frame size.
+ */
+.macro prologue
+    push            {r4-r12, lr}
+    vpush           {q4-q7}
+.endm
+
+/* Restore what prologue saved and return by popping the saved lr into pc. */
+.macro epilogue
+    vpop            {q4-q7}
+    pop             {r4-r12, pc}
+.endm
+
+/*
+ * Load stack-passed argument number ix (zero-based, ix >= 4) into reg.
+ * The offset skips the registers saved by prologue; sp must not have
+ * moved since prologue.
+ */
+.macro  load_arg    reg, ix
+    ldr     \reg,   [sp, #((10 * 4 + 4 * 16) + (\ix - 4) * 4)]
+.endm
+
+
+/* ()_to_()_neon(const uint8_t *src, uint8_t *y, uint8_t *chroma,
+ *                  int width, int height,
+ *                  int y_stride, int c_stride, int src_stride,
+ *                  int32_t coeff_table[9]);
+ *
+ * Bind (set != 0) or release (set == 0) the core-register names used by
+ * loop_420sp. Several names deliberately share a register.
+ */
+.macro  alias_loop_420sp set=1
+    alias   src,        r0, \set
+    alias   src0,       src, \set
+    alias   y,          r1, \set
+    alias   y0,         y, \set
+    alias   chroma,     r2, \set
+    alias   width,      r3, \set
+    /* header (width & 15) replaces width in r3 */
+    alias   header,     width, \set
+
+    alias   height,     r4, \set
+    alias   y_stride,   r5, \set
+    alias   c_stride,   r6, \set
+    /* c_padding replaces c_stride in r6 */
+    alias   c_padding,  c_stride, \set
+    alias   src_stride, r7, \set
+
+    alias   y0_end,     r8, \set
+
+    alias   src_padding,r9, \set
+    alias   y_padding,  r10, \set
+
+    alias   src1,       r11, \set
+    alias   y1,         r12, \set
+
+    /* shares r12 with y1; consumed by the init macro before y1 is set */
+    alias   coeff_table,r12, \set
+.endm
+
+
+/*
+ * Emit the exported function <s_fmt>_to_<d_fmt>_neon_<precision>.
+ *   init:   macro that loads the coefficient table (takes the table pointer)
+ *   kernel: macro converting a 16-pixel x 2-row tile
+ *
+ * Rows are processed in pairs. Within a row pair, an optional first "header"
+ * tile advances by only width & 15 pixels, after which full 16-pixel tiles
+ * run until the luma pointer reaches the end of the row.
+ *
+ * NOTE(review): an odd height makes the last iteration process one extra
+ * row; callers must guarantee that is acceptable.
+ */
+.macro  loop_420sp s_fmt, d_fmt, init, kernel, precision
+
+function \s_fmt\()_to_\d_fmt\()_neon_\precision, export=1
+    prologue
+
+    alias_loop_420sp
+
+    load_arg    height,         4
+    load_arg    y_stride,       5
+    load_arg    c_stride,       6
+    load_arg    src_stride,     7
+    load_arg    coeff_table,    8
+
+    \init       coeff_table
+
+    /* bytes from the end of one row to the start of the next */
+    sub         y_padding,      y_stride,       width
+    sub         c_padding,      c_stride,       width
+    sub         src_padding,    src_stride,     width, LSL #2
+
+    add         y0_end,         y0,             width
+    /* overwrites width (same register); width is not used past this point */
+    and         header,         width,          #15
+
+    add         y1,             y0,             y_stride
+    add         src1,           src0,           src_stride
+
+0:
+    /* partial first tile, skipped when width is a multiple of 16 */
+    cmp         header,     #0
+    beq         1f
+
+    \kernel     \s_fmt, \d_fmt, src0, src1, y0, y1, chroma, header
+
+1:
+    \kernel     \s_fmt, \d_fmt, src0, src1, y0, y1, chroma
+
+    /* NOTE(review): blt is a signed compare on addresses - confirm intent */
+    cmp         y0,         y0_end
+    blt         1b
+2:
+    /* step every pointer from the end of this row pair to the next pair */
+    add         y0,         y1,         y_padding
+    add         y0_end,     y1,         y_stride
+    add         chroma,     chroma,     c_padding
+    add         src0,       src1,       src_padding
+
+    add         y1,         y0,         y_stride
+    add         src1,       src0,       src_stride
+
+    subs        height,     height,     #2
+
+    bgt         0b
+
+    epilogue
+
+    alias_loop_420sp 0
+
+endfunc
+.endm
+
+/*
+ * Start the 2x2 chroma downsample: sum horizontally adjacent pixel pairs
+ * of each channel into 16-bit lanes (first row of the pair).
+ */
+.macro downsample
+    vpaddl.u8   r16x8,  r8x16
+    vpaddl.u8   g16x8,  g8x16
+    vpaddl.u8   b16x8,  b8x16
+.endm
+
+
+/*
+ * accumulate and right shift by 2
+ *
+ * Adds the pairwise sums of the second row to the sums left by downsample,
+ * then divides by 4 with rounding, giving the 2x2 average per channel.
+ */
+.macro downsample_ars2
+    vpadal.u8   r16x8,  r8x16
+    vpadal.u8   g16x8,  g8x16
+    vpadal.u8   b16x8,  b8x16
+
+    vrshr.u16   r16x8,  r16x8,  #2
+    vrshr.u16   g16x8,  g16x8,  #2
+    vrshr.u16   b16x8,  b16x8,  #2
+.endm
+
+/*
+ * Store the 16 luma bytes in y8x16 at dst.
+ *   count == 0: advance dst by 16
+ *   otherwise:  advance dst by count (16 bytes are still written)
+ *
+ * NOTE(review): the header pass supplies a register alias as count; the
+ * .if test then depends on how the assembler evaluates that name - confirm.
+ */
+.macro store_y8_16x1            dst, count
+.if \count == 0
+    vstmia      \dst!,  {y8x16}
+.else
+    vstmia      \dst,   {y8x16}
+    add         \dst,   \dst,           \count
+.endif
+.endm
+
+/*
+ * Store 8 interleaved chroma pairs in U, V order (16 bytes) at dst.
+ *   count == 0: advance dst past the stored data
+ *   otherwise:  advance dst by count bytes instead
+ */
+.macro store_chroma_nv12_8x1    dst, count
+.if \count == 0
+    vst2.i8     {u8x8, v8x8},   [\dst]!
+.else
+    vst2.i8     {u8x8, v8x8},   [\dst], \count
+.endif
+.endm
+
+/*
+ * Store 8 interleaved chroma pairs in V, U order (16 bytes) at dst.
+ *   count == 0: advance dst past the stored data
+ *   otherwise:  advance dst by count bytes instead
+ */
+.macro store_chroma_nv21_8x1    dst, count
+.if \count == 0
+    vst2.i8     {v8x8, u8x8},   [\dst]!
+.else
+    vst2.i8     {v8x8, u8x8},   [\dst], \count
+.endif
+.endm
+
+/*
+ * Load 16 four-byte pixels from src, de-interleaved by byte position into
+ * the 8x16 registers named by a, b, c, d (first 8 pixels in the _l halves,
+ * next 8 in the _h halves).
+ *   count == 0: src advances past all 16 pixels
+ *   otherwise:  src advances by count pixels (count * 4 bytes)
+ */
+.macro load_8888_16x1   a, b, c, d, src, count
+.if \count == 0
+    vld4.8      {\a\()8x16_l, \b\()8x16_l, \c\()8x16_l, \d\()8x16_l},  [\src]!
+    vld4.8      {\a\()8x16_h, \b\()8x16_h, \c\()8x16_h, \d\()8x16_h},  [\src]!
+.else
+    vld4.8      {\a\()8x16_l, \b\()8x16_l, \c\()8x16_l, \d\()8x16_l},  [\src]!
+    vld4.8      {\a\()8x16_h, \b\()8x16_h, \c\()8x16_h, \d\()8x16_h},  [\src]
+    /* undo the first load's write-back, then step by count pixels */
+    sub         \src,   \src,   #32
+    add         \src,   \src,   \count, LSL #2
+.endif
+.endm
+
+/* Load 16 pixels whose byte order is R, G, B, X. */
+.macro load_rgbx_16x1   src, count
+    load_8888_16x1  r, g, b, x, \src, \count
+.endm
+
+/* Load 16 pixels whose byte order is B, G, R, X. */
+.macro load_bgrx_16x1   src, count
+    load_8888_16x1  b, g, r, x, \src, \count
+.endm
+
+/* Bind or release the source channel registers for R, G, B, X byte order. */
+.macro alias_src_rgbx   set
+    alias_src_8888  r, g, b, x, \set
+.endm
+
+/* Bind or release the source channel registers for B, G, R, X byte order. */
+.macro alias_src_bgrx   set
+    alias_src_8888  b, g, r, x, \set
+.endm
+
+/* NV12: U in the low half of c8x8x2, V in the high half. */
+.macro alias_dst_nv12   set
+    alias   u8x8, c8x8x2_l, \set
+    alias   v8x8, c8x8x2_h, \set
+.endm
+
+/* NV21: V in the low half of c8x8x2, U in the high half. */
+.macro alias_dst_nv21   set
+    alias   v8x8, c8x8x2_l, \set
+    alias   u8x8, c8x8x2_h, \set
+.endm
+
+
+// common aliases
+
+// Per-channel coefficient vectors: signed 16-bit lanes 0, 1, 2 hold the
+// Y, U and V coefficient of that channel.
+alias   CO_R    d0
+CO_RY   .dn     d0.s16[0]
+CO_RU   .dn     d0.s16[1]
+CO_RV   .dn     d0.s16[2]
+
+alias   CO_G    d1
+CO_GY   .dn     d1.s16[0]
+CO_GU   .dn     d1.s16[1]
+CO_GV   .dn     d1.s16[2]
+
+alias   CO_B    d2
+CO_BY   .dn     d2.s16[0]
+CO_BU   .dn     d2.s16[1]
+CO_BV   .dn     d2.s16[2]
+
+// U and V share one bias register
+alias   BIAS_U, d3
+alias   BIAS_V, BIAS_U
+
+alias   BIAS_Y, q2
+
+
+/* q3-q6 R8G8B8X8 x16 */
+
+/*
+ * Bind or release the 8x16 source registers: the channel named by a, b, c
+ * and d maps to q3, q4, q5 and q6 respectively (byte positions 0..3 of a
+ * pixel as loaded by load_8888_16x1).
+ */
+.macro alias_src_8888   a, b, c, d, set
+    alias_qw  \a\()8x16, q3, \set
+    alias_qw  \b\()8x16, q4, \set
+    alias_qw  \c\()8x16, q5, \set
+    alias_qw  \d\()8x16, q6, \set
+.endm
+
+/*
+ * Convert one tile of 16 pixels x 2 rows to 4:2:0 semi-planar output.
+ *   rgb_fmt / yuv_fmt: select the load, alias and store macros by name
+ *   rgb0, rgb1:        source pointers for the two rows
+ *   y0, y1:            luma destination pointers for the two rows
+ *   chroma:            interleaved chroma destination pointer
+ *   count:             0 for a full tile; otherwise the number of pixels
+ *                      the pointers advance by (a full tile is still
+ *                      loaded and stored)
+ *
+ * The per-variant file supplies compute_y_16x1 and compute_chroma_8x1.
+ * Statement order matters: downsample must read the channel registers
+ * before compute_y_16x1 runs.
+ */
+.macro kernel_420_16x2  rgb_fmt, yuv_fmt, rgb0, rgb1, y0, y1, chroma, count=0
+    alias_src_\rgb_fmt
+    alias_dst_\yuv_fmt
+
+    load_\rgb_fmt\()_16x1   \rgb0, \count
+
+    downsample
+    compute_y_16x1
+    store_y8_16x1   \y0, \count
+
+
+    load_\rgb_fmt\()_16x1   \rgb1, \count
+    downsample_ars2
+    compute_y_16x1
+    store_y8_16x1   \y1, \count
+
+    compute_chroma_8x1  u, U
+    compute_chroma_8x1  v, V
+
+    store_chroma_\yuv_fmt\()_8x1 \chroma, \count
+
+    alias_dst_\yuv_fmt 0
+    alias_src_\rgb_fmt 0
+.endm
--- a/libswscale/arm/swscale_unscaled.c
+++ b/libswscale/arm/swscale_unscaled.c
@ -0,0 +1,79 @@
+/*
+ * Copyright (C) 2013 Xiaolei Yu <dreifachstein@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libswscale/swscale.h"
+#include "libswscale/swscale_internal.h"
+#include "libavutil/arm/cpu.h"
+
+/*
+ * NEON RGBX -> NV12 kernels implemented in assembly.
+ *
+ * src:        first source row, 4 bytes per pixel
+ * y, chroma:  first luma row / first interleaved chroma row to write
+ * width:      pixels per row; the kernels work on 16-pixel tiles
+ * height:     rows to convert, consumed two at a time
+ * *_stride:   row pitch in bytes of the respective plane
+ * coeff_tbl:  9 int32 coefficients, read as (R, G, B) triplets for Y, U, V
+ *
+ * The _32 variant accumulates in 32 bits, the _16 variant in 16 bits.
+ */
+extern void rgbx_to_nv12_neon_32(const uint8_t *src, uint8_t *y, uint8_t *chroma,
+                int width, int height,
+                int y_stride, int c_stride, int src_stride,
+                int32_t coeff_tbl[9]);
+
+extern void rgbx_to_nv12_neon_16(const uint8_t *src, uint8_t *y, uint8_t *chroma,
+                int width, int height,
+                int y_stride, int c_stride, int src_stride,
+                int32_t coeff_tbl[9]);
+
+/**
+ * Conversion callback: RGBA -> NV12 through the NEON kernel that uses
+ * 32-bit intermediates.
+ *
+ * The luma pointer is advanced to row srcSliceY and the chroma pointer to
+ * row srcSliceY / 2; the kernel then converts srcSliceH rows of
+ * context->srcW pixels with context->input_rgb2yuv_table.
+ *
+ * NOTE(review): src[0] is also advanced by srcSliceY rows; confirm that
+ * callers pass frame-relative rather than slice-relative source pointers.
+ *
+ * @return the number of rows converted (srcSliceH)
+ */
+static int rgbx_to_nv12_neon_32_wrapper(SwsContext *context, const uint8_t *src[],
+                        int srcStride[], int srcSliceY, int srcSliceH,
+                        uint8_t *dst[], int dstStride[]) {
+
+    rgbx_to_nv12_neon_32(src[0] + srcSliceY * srcStride[0],
+            dst[0] + srcSliceY * dstStride[0],
+            dst[1] + (srcSliceY / 2) * dstStride[1],
+            context->srcW, srcSliceH,
+            dstStride[0], dstStride[1], srcStride[0],
+            context->input_rgb2yuv_table);
+
+    /* sws_scale() hands this value back as the height of the output slice */
+    return srcSliceH;
+}
+
+/**
+ * Conversion callback: RGBA -> NV12 through the NEON kernel that uses
+ * 16-bit intermediates.
+ *
+ * The luma pointer is advanced to row srcSliceY and the chroma pointer to
+ * row srcSliceY / 2; the kernel then converts srcSliceH rows of
+ * context->srcW pixels with context->input_rgb2yuv_table.
+ *
+ * NOTE(review): src[0] is also advanced by srcSliceY rows; confirm that
+ * callers pass frame-relative rather than slice-relative source pointers.
+ *
+ * @return the number of rows converted (srcSliceH)
+ */
+static int rgbx_to_nv12_neon_16_wrapper(SwsContext *context, const uint8_t *src[],
+                        int srcStride[], int srcSliceY, int srcSliceH,
+                        uint8_t *dst[], int dstStride[]) {
+
+    rgbx_to_nv12_neon_16(src[0] + srcSliceY * srcStride[0],
+            dst[0] + srcSliceY * dstStride[0],
+            dst[1] + (srcSliceY / 2) * dstStride[1],
+            context->srcW, srcSliceH,
+            dstStride[0], dstStride[1], srcStride[0],
+            context->input_rgb2yuv_table);
+
+    /* sws_scale() hands this value back as the height of the output slice */
+    return srcSliceH;
+}
+
+/**
+ * Install the NEON RGBA -> NV12 converter on c when the conversion is one
+ * the assembly kernels can handle.
+ *
+ * The kernels work on 16-pixel x 2-row tiles. A width below 16 cannot fill
+ * one tile; an odd width advances the interleaved chroma pointer by an odd
+ * byte count and misaligns the U/V pairs; an odd height makes the last
+ * row-pair iteration touch a row past the image. Those cases are left to
+ * the generic path.
+ *
+ * SWS_ACCURATE_RND selects the variant with 32-bit intermediates, otherwise
+ * the 16-bit one is used.
+ */
+static void get_unscaled_swscale_neon(SwsContext *c) {
+    int accurate_rnd = c->flags & SWS_ACCURATE_RND;
+    if (c->srcFormat == AV_PIX_FMT_RGBA
+            && c->dstFormat == AV_PIX_FMT_NV12
+            && (c->srcW >= 16)
+            && !(c->srcW & 1)
+            && !(c->srcH & 1)) {
+        c->swscale = accurate_rnd ? rgbx_to_nv12_neon_32_wrapper
+                        : rgbx_to_nv12_neon_16_wrapper;
+    }
+}
+
+/**
+ * ARM hook for unscaled converter selection: when the CPU reports NEON,
+ * let the NEON selector choose a converter for c; otherwise leave c as is.
+ */
+void ff_get_unscaled_swscale_arm(SwsContext *c)
+{
+    const int flags = av_get_cpu_flags();
+
+    if (!have_neon(flags))
+        return;
+
+    get_unscaled_swscale_neon(c);
+}
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@ -835,6 +835,7 @@ extern const AVClass sws_context_class;
 void ff_get_unscaled_swscale(SwsContext *c);
 void ff_get_unscaled_swscale_bfin(SwsContext *c);
 void ff_get_unscaled_swscale_ppc(SwsContext *c);
+void ff_get_unscaled_swscale_arm(SwsContext *c);

 /**
 * Return function pointer to fastest main scaler path function depending
--- a/libswscale/swscale_unscaled.c
+++ b/libswscale/swscale_unscaled.c
@ -1384,6 +1384,9 @@ void ff_get_unscaled_swscale(SwsContext *c)
        ff_get_unscaled_swscale_bfin(c);
    if (ARCH_PPC)
        ff_get_unscaled_swscale_ppc(c);
+    if (ARCH_ARM)
+        ff_get_unscaled_swscale_arm(c);
+
 }

 /* Convert the palette to the same packed 32-bit format as the palette */