swscale/la: Optimize the functions of the swscale series with lsx.

./configure --disable-lasx
ffmpeg -i ~/media/1_h264_1080p_30fps_3Mbps.mp4 -f rawvideo -s 640x480 -pix_fmt bgra -y /dev/null -an
before: 91fps
after: 160fps

Reviewed-by: Shiyou Yin <yinshiyou-hf@loongson.cn>
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
2023-05-25 15:24:30 +08:00 · 2023-05-25 15:24:30 +08:00 · 4501b1dfd7
commit 4501b1dfd7
parent f6077cc666
10 changed files with 4256 additions and 7 deletions
--- a/libswscale/loongarch/Makefile
+++ b/libswscale/loongarch/Makefile
@ -4,3 +4,8 @@ LASX-OBJS-$(CONFIG_SWSCALE) += loongarch/swscale_lasx.o \
                               loongarch/yuv2rgb_lasx.o \
                               loongarch/rgb2rgb_lasx.o \
                               loongarch/output_lasx.o
+LSX-OBJS-$(CONFIG_SWSCALE)  += loongarch/swscale.o \
+                               loongarch/swscale_lsx.o \
+                               loongarch/input.o   \
+                               loongarch/output.o  \
+                               loongarch/output_lsx.o
--- a/libswscale/loongarch/input.S
+++ b/libswscale/loongarch/input.S
@ -0,0 +1,285 @@
+/*
+ * Loongson LSX optimized swscale
+ *
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/loongarch/loongson_asm.S"
+
+/* void planar_rgb_to_y_lsx(uint8_t *_dst, const uint8_t *src[4],
+ *                          int width, int32_t *rgb2yuv, void *opq)
+ * a0=_dst a1=src a2=width a3=rgb2yuv; opq is not read. Handles 8, 4, then 1 pixel per pass. */
+function planar_rgb_to_y_lsx
+    ld.d            a5,     a1,    0     // a5 = src[0]
+    ld.d            a6,     a1,    8     // a6 = src[1]
+    ld.d            a7,     a1,    16    // a7 = src[2]
+
+    ld.w            t1,     a3,    0     // ry = rgb2yuv[0], multiplied with src[2] samples
+    ld.w            t2,     a3,    4     // gy = rgb2yuv[1], multiplied with src[0] samples
+    ld.w            t3,     a3,    8     // by = rgb2yuv[2], multiplied with src[1] samples
+    li.w            t4,     9            // right-shift applied to every sum
+    li.w            t5,     524544       // 0x80100, added to every sum before the shift
+    li.w            t7,     4            // threshold for the 4-pixel path
+    li.w            t8,     8            // threshold for the 8-pixel path
+    vldi            vr7,    0            // vr7 = 0, interleaved in to zero-extend samples
+    vreplgr2vr.w    vr1,    t1           // vr1 = ry in all four words
+    vreplgr2vr.w    vr2,    t2           // vr2 = gy in all four words
+    vreplgr2vr.w    vr3,    t3           // vr3 = by in all four words
+    vreplgr2vr.w    vr4,    t4           // vr4 = shift count 9 per word
+    vreplgr2vr.w    vr5,    t5           // vr5 = additive constant per word
+    bge             a2,     t8,    .WIDTH8   // width >= 8: vector path, 8 pixels
+    bge             a2,     t7,    .WIDTH4   // width >= 4: vector path, 4 pixels
+    blt             zero,   a2,    .WIDTH    // width > 0: scalar path
+    b               .END                     // width <= 0: nothing to do
+
+.WIDTH8:
+    vld             vr8,    a5,    0     // 16 bytes of src[0]; only the low 8 are used
+    vld             vr9,    a6,    0     // 16 bytes of src[1]; only the low 8 are used
+    vld             vr10,   a7,    0     // 16 bytes of src[2]; only the low 8 are used
+    vilvl.b         vr11,   vr7,   vr8   // low 8 bytes of src[0] -> 8 zero-extended halfwords
+    vilvl.b         vr12,   vr7,   vr9   // same for src[1]
+    vilvl.b         vr13,   vr7,   vr10  // same for src[2]
+    vilvl.h         vr14,   vr7,   vr11  // src[0] pixels 0-3 as words
+    vilvl.h         vr15,   vr7,   vr12  // src[1] pixels 0-3 as words
+    vilvl.h         vr16,   vr7,   vr13  // src[2] pixels 0-3 as words
+    vilvh.h         vr17,   vr7,   vr11  // src[0] pixels 4-7 as words
+    vilvh.h         vr18,   vr7,   vr12  // src[1] pixels 4-7 as words
+    vilvh.h         vr19,   vr7,   vr13  // src[2] pixels 4-7 as words
+    vmul.w          vr20,   vr1,   vr16  // pixels 0-3: ry * src[2]
+    vmul.w          vr21,   vr1,   vr19  // pixels 4-7: ry * src[2]
+    vmadd.w         vr20,   vr2,   vr14  // += gy * src[0]
+    vmadd.w         vr20,   vr3,   vr15  // += by * src[1]
+    vmadd.w         vr21,   vr2,   vr17  // += gy * src[0]
+    vmadd.w         vr21,   vr3,   vr18  // += by * src[1]
+    vadd.w          vr20,   vr20,  vr5   // add the 0x80100 constant
+    vadd.w          vr21,   vr21,  vr5
+    vsra.w          vr20,   vr20,  vr4   // arithmetic >> 9
+    vsra.w          vr21,   vr21,  vr4
+    vpickev.h       vr20,   vr21,  vr20  // keep the low halfword of each word: 8 x 16-bit results
+    vst             vr20,   a0,    0     // store 8 results (16 bytes)
+    addi.d          a2,     a2,    -8    // 8 pixels consumed
+    addi.d          a5,     a5,    8
+    addi.d          a6,     a6,    8
+    addi.d          a7,     a7,    8
+    addi.d          a0,     a0,    16    // output advances 2 bytes per pixel
+    bge             a2,     t8,    .WIDTH8   // re-dispatch on the remaining width
+    bge             a2,     t7,    .WIDTH4
+    blt             zero,   a2,    .WIDTH
+    b               .END
+
+.WIDTH4:
+    vld             vr8,    a5,    0     // 16 bytes loaded; only the low 4 are used
+    vld             vr9,    a6,    0
+    vld             vr10,   a7,    0
+    vilvl.b         vr11,   vr7,   vr8   // bytes -> zero-extended halfwords
+    vilvl.b         vr12,   vr7,   vr9
+    vilvl.b         vr13,   vr7,   vr10
+    vilvl.h         vr14,   vr7,   vr11  // src[0] pixels 0-3 as words
+    vilvl.h         vr15,   vr7,   vr12  // src[1] pixels 0-3 as words
+    vilvl.h         vr16,   vr7,   vr13  // src[2] pixels 0-3 as words
+    vmul.w          vr17,   vr1,   vr16  // ry * src[2]
+    vmadd.w         vr17,   vr2,   vr14  // += gy * src[0]
+    vmadd.w         vr17,   vr3,   vr15  // += by * src[1]
+    vadd.w          vr17,   vr17,  vr5   // add the 0x80100 constant
+    vsra.w          vr17,   vr17,  vr4   // arithmetic >> 9
+    vpickev.h       vr17,   vr17,  vr17  // low halfword of each word: 4 results in the low 8 bytes
+    vstelm.d        vr17,   a0,    0,    0   // store doubleword element 0 (4 results)
+    addi.d          a2,     a2,    -4    // 4 pixels consumed
+    addi.d          a5,     a5,    4
+    addi.d          a6,     a6,    4
+    addi.d          a7,     a7,    4
+    addi.d          a0,     a0,    8
+    bge             a2,     t7,    .WIDTH4
+    blt             zero,   a2,    .WIDTH
+    b               .END
+
+.WIDTH:
+    ld.bu           t0,     a5,    0     // t0 = src[0] sample (t4/t7/t8 are reused as scratch from here on)
+    ld.bu           t4,     a6,    0     // t4 = src[1] sample
+    ld.bu           t6,     a7,    0     // t6 = src[2] sample
+    mul.w           t8,     t6,    t1    // ry * src[2]
+    mul.w           t7,     t0,    t2    // gy * src[0]
+    add.w           t8,     t8,    t7
+    mul.w           t7,     t4,    t3    // by * src[1]
+    add.w           t8,     t8,    t7
+    add.w           t8,     t8,    t5    // add the 0x80100 constant
+    srai.w          t8,     t8,    9     // arithmetic >> 9
+    st.h            t8,     a0,    0     // store one 16-bit result
+    addi.d          a2,     a2,    -1
+    addi.d          a5,     a5,    1
+    addi.d          a6,     a6,    1
+    addi.d          a7,     a7,    1
+    addi.d          a0,     a0,    2
+    blt             zero,   a2,    .WIDTH    // loop while pixels remain
+.END:
+endfunc
+
+/* void planar_rgb_to_uv_lsx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4],
+ *                           int width, int32_t *rgb2yuv, void *opq)
+ * a0=_dstU a1=_dstV a2=src a3=width a4=rgb2yuv; opq is not read. 8, 4, then 1 pixel per pass. */
+function planar_rgb_to_uv_lsx
+    addi.d          sp,     sp,    -32    // room for s1-s3; 32 (not 24) keeps sp 16-byte aligned
+    st.d            s1,     sp,    0      // s1-s3 are written below, so save them first
+    st.d            s2,     sp,    8
+    st.d            s3,     sp,    16
+
+    ld.d            a5,     a2,    0      // a5 = src[0]
+    ld.d            a6,     a2,    8      // a6 = src[1]
+    ld.d            a7,     a2,    16     // a7 = src[2]
+    ld.w            t1,     a4,    12     // ru = rgb2yuv[3], multiplied with src[2] samples
+    ld.w            t2,     a4,    16     // gu = rgb2yuv[4], multiplied with src[0] samples
+    ld.w            t3,     a4,    20     // bu = rgb2yuv[5], multiplied with src[1] samples
+    ld.w            s1,     a4,    24     // rv = rgb2yuv[6], multiplied with src[2] samples
+    ld.w            s2,     a4,    28     // gv = rgb2yuv[7], multiplied with src[0] samples
+    ld.w            s3,     a4,    32     // bv = rgb2yuv[8], multiplied with src[1] samples
+    li.w            t4,     9             // right-shift applied to every sum
+    li.w            t5,     4194560       // 0x400100, added to every sum before the shift
+    li.w            t7,     4             // threshold for the 4-pixel path
+    li.w            t8,     8             // threshold for the 8-pixel path
+    vldi            vr0,    0             // vr0 = 0, interleaved in to zero-extend samples
+    vreplgr2vr.w    vr1,    t1            // vr1 = ru per word
+    vreplgr2vr.w    vr2,    t2            // vr2 = gu per word
+    vreplgr2vr.w    vr3,    t3            // vr3 = bu per word
+    vreplgr2vr.w    vr4,    s1            // vr4 = rv per word
+    vreplgr2vr.w    vr5,    s2            // vr5 = gv per word
+    vreplgr2vr.w    vr6,    s3            // vr6 = bv per word
+    vreplgr2vr.w    vr7,    t4            // vr7 = shift count 9 per word
+    vreplgr2vr.w    vr8,    t5            // vr8 = additive constant per word
+    bge             a3,     t8,    .LOOP_WIDTH8   // dispatch on width (a3); a2 is the src pointer array
+    bge             a3,     t7,    .LOOP_WIDTH4   // width >= 4: vector path, 4 pixels
+    blt             zero,   a3,    .LOOP_WIDTH    // width > 0: scalar path
+    b               .LOOP_END                     // width <= 0: nothing to do
+
+.LOOP_WIDTH8:
+    vld             vr9,    a5,    0      // 16 bytes of src[0]; only the low 8 are used
+    vld             vr10,   a6,    0      // 16 bytes of src[1]; only the low 8 are used
+    vld             vr11,   a7,    0      // 16 bytes of src[2]; only the low 8 are used
+    vilvl.b         vr9,    vr0,   vr9    // low 8 bytes -> 8 zero-extended halfwords (in place)
+    vilvl.b         vr10,   vr0,   vr10
+    vilvl.b         vr11,   vr0,   vr11
+    vilvl.h         vr12,   vr0,   vr9    // src[0] pixels 0-3 as words
+    vilvl.h         vr13,   vr0,   vr10   // src[1] pixels 0-3 as words
+    vilvl.h         vr14,   vr0,   vr11   // src[2] pixels 0-3 as words
+    vilvh.h         vr15,   vr0,   vr9    // src[0] pixels 4-7 as words
+    vilvh.h         vr16,   vr0,   vr10   // src[1] pixels 4-7 as words
+    vilvh.h         vr17,   vr0,   vr11   // src[2] pixels 4-7 as words
+    vmul.w          vr18,   vr1,   vr14   // U, pixels 0-3: ru * src[2]
+    vmul.w          vr19,   vr1,   vr17   // U, pixels 4-7: ru * src[2]
+    vmul.w          vr20,   vr4,   vr14   // V, pixels 0-3: rv * src[2]
+    vmul.w          vr21,   vr4,   vr17   // V, pixels 4-7: rv * src[2]
+    vmadd.w         vr18,   vr2,   vr12   // U += gu * src[0]
+    vmadd.w         vr18,   vr3,   vr13   // U += bu * src[1]
+    vmadd.w         vr19,   vr2,   vr15
+    vmadd.w         vr19,   vr3,   vr16
+    vmadd.w         vr20,   vr5,   vr12   // V += gv * src[0]
+    vmadd.w         vr20,   vr6,   vr13   // V += bv * src[1]
+    vmadd.w         vr21,   vr5,   vr15
+    vmadd.w         vr21,   vr6,   vr16
+    vadd.w          vr18,   vr18,  vr8    // add the 0x400100 constant
+    vadd.w          vr19,   vr19,  vr8
+    vadd.w          vr20,   vr20,  vr8
+    vadd.w          vr21,   vr21,  vr8
+    vsra.w          vr18,   vr18,  vr7    // arithmetic >> 9
+    vsra.w          vr19,   vr19,  vr7
+    vsra.w          vr20,   vr20,  vr7
+    vsra.w          vr21,   vr21,  vr7
+    vpickev.h       vr18,   vr19,  vr18   // low halfword of each word: 8 x 16-bit U results
+    vpickev.h       vr20,   vr21,  vr20   // 8 x 16-bit V results
+    vst             vr18,   a0,    0      // store 8 U results (16 bytes)
+    vst             vr20,   a1,    0      // store 8 V results (16 bytes)
+    addi.d          a3,     a3,    -8     // 8 pixels consumed
+    addi.d          a5,     a5,    8
+    addi.d          a6,     a6,    8
+    addi.d          a7,     a7,    8
+    addi.d          a0,     a0,    16     // outputs advance 2 bytes per pixel
+    addi.d          a1,     a1,    16
+    bge             a3,     t8,    .LOOP_WIDTH8   // re-dispatch on the remaining width
+    bge             a3,     t7,    .LOOP_WIDTH4
+    blt             zero,   a3,    .LOOP_WIDTH
+    b               .LOOP_END
+
+.LOOP_WIDTH4:
+    vld             vr9,    a5,    0      // 16 bytes loaded; only the low 4 are used
+    vld             vr10,   a6,    0
+    vld             vr11,   a7,    0
+    vilvl.b         vr9,    vr0,   vr9    // bytes -> zero-extended halfwords
+    vilvl.b         vr10,   vr0,   vr10
+    vilvl.b         vr11,   vr0,   vr11
+    vilvl.h         vr12,   vr0,   vr9    // src[0] pixels 0-3 as words
+    vilvl.h         vr13,   vr0,   vr10   // src[1] pixels 0-3 as words
+    vilvl.h         vr14,   vr0,   vr11   // src[2] pixels 0-3 as words
+    vmul.w          vr18,   vr1,   vr14   // U = ru * src[2]
+    vmul.w          vr19,   vr4,   vr14   // V = rv * src[2]
+    vmadd.w         vr18,   vr2,   vr12   // U += gu * src[0]
+    vmadd.w         vr18,   vr3,   vr13   // U += bu * src[1]
+    vmadd.w         vr19,   vr5,   vr12   // V += gv * src[0]
+    vmadd.w         vr19,   vr6,   vr13   // V += bv * src[1]
+    vadd.w          vr18,   vr18,  vr8    // add the 0x400100 constant
+    vadd.w          vr19,   vr19,  vr8
+    vsra.w          vr18,   vr18,  vr7    // arithmetic >> 9
+    vsra.w          vr19,   vr19,  vr7
+    vpickev.h       vr18,   vr18,  vr18   // 4 U results in the low 8 bytes
+    vpickev.h       vr19,   vr19,  vr19   // 4 V results in the low 8 bytes
+    vstelm.d        vr18,   a0,    0,    0   // store doubleword element 0 (4 U results)
+    vstelm.d        vr19,   a1,    0,    0   // store doubleword element 0 (4 V results)
+    addi.d          a3,     a3,    -4     // 4 pixels consumed
+    addi.d          a5,     a5,    4
+    addi.d          a6,     a6,    4
+    addi.d          a7,     a7,    4
+    addi.d          a0,     a0,    8
+    addi.d          a1,     a1,    8
+    bge             a3,     t7,    .LOOP_WIDTH4
+    blt             zero,   a3,    .LOOP_WIDTH
+    b               .LOOP_END
+
+.LOOP_WIDTH:
+    ld.bu           t0,     a5,    0      // t0 = src[0] sample (t4/t7/t8 are reused as scratch from here on)
+    ld.bu           t4,     a6,    0      // t4 = src[1] sample
+    ld.bu           t6,     a7,    0      // t6 = src[2] sample
+    mul.w           t8,     t6,    t1     // U: ru * src[2]
+    mul.w           t7,     t0,    t2     //    gu * src[0]
+    add.w           t8,     t8,    t7
+    mul.w           t7,     t4,    t3     //    bu * src[1]
+    add.w           t8,     t8,    t7
+    add.w           t8,     t8,    t5     // add the 0x400100 constant
+    srai.w          t8,     t8,    9      // arithmetic >> 9
+    st.h            t8,     a0,    0      // store one 16-bit U result
+    mul.w           t8,     t6,    s1     // V: rv * src[2]
+    mul.w           t7,     t0,    s2     //    gv * src[0]
+    add.w           t8,     t8,    t7
+    mul.w           t7,     t4,    s3     //    bv * src[1]
+    add.w           t8,     t8,    t7
+    add.w           t8,     t8,    t5     // add the 0x400100 constant
+    srai.w          t8,     t8,    9      // arithmetic >> 9
+    st.h            t8,     a1,    0      // store one 16-bit V result
+    addi.d          a3,     a3,    -1
+    addi.d          a5,     a5,    1
+    addi.d          a6,     a6,    1
+    addi.d          a7,     a7,    1
+    addi.d          a0,     a0,    2
+    addi.d          a1,     a1,    2
+    blt             zero,   a3,    .LOOP_WIDTH    // loop while pixels remain
+
+.LOOP_END:
+    ld.d            s1,     sp,    0      // restore the saved registers
+    ld.d            s2,     sp,    8
+    ld.d            s3,     sp,    16
+    addi.d          sp,     sp,    32     // release the 32-byte spill area
+endfunc
--- a/libswscale/loongarch/output.S
+++ b/libswscale/loongarch/output.S
@ -0,0 +1,138 @@
+/*
+ * Loongson LSX optimized swscale
+ *
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/loongarch/loongson_asm.S"
+
+/* void ff_yuv2planeX_8_lsx(const int16_t *filter, int filterSize,
+ *                          const int16_t **src, uint8_t *dest, int dstW,
+ *                          const uint8_t *dither, int offset)
+ * a0=filter a1=filterSize a2=src a3=dest a4=dstW a5=dither a6=offset; 8 pixels per pass, then a tail. */
+function ff_yuv2planeX_8_lsx
+    addi.w          t1,     a6,     1     // t1..t7 = offset + 1 .. offset + 7
+    addi.w          t2,     a6,     2
+    addi.w          t3,     a6,     3
+    addi.w          t4,     a6,     4
+    addi.w          t5,     a6,     5
+    addi.w          t6,     a6,     6
+    addi.w          t7,     a6,     7
+    andi            t0,     a6,     7     // t0..t7 = (offset + i) & 7, the dither indices
+    andi            t1,     t1,     7
+    andi            t2,     t2,     7
+    andi            t3,     t3,     7
+    andi            t4,     t4,     7
+    andi            t5,     t5,     7
+    andi            t6,     t6,     7
+    andi            t7,     t7,     7
+    ldx.bu          t0,     a5,     t0    // t0..t7 = dither[(offset + i) & 7]
+    ldx.bu          t1,     a5,     t1
+    ldx.bu          t2,     a5,     t2
+    ldx.bu          t3,     a5,     t3
+    ldx.bu          t4,     a5,     t4
+    ldx.bu          t5,     a5,     t5
+    ldx.bu          t6,     a5,     t6
+    ldx.bu          t7,     a5,     t7
+    vreplgr2vr.w    vr0,    t0            // broadcast each dither value to four words
+    vreplgr2vr.w    vr1,    t1
+    vreplgr2vr.w    vr2,    t2
+    vreplgr2vr.w    vr3,    t3
+    vreplgr2vr.w    vr4,    t4
+    vreplgr2vr.w    vr5,    t5
+    vreplgr2vr.w    vr6,    t6
+    vreplgr2vr.w    vr7,    t7
+    vilvl.w         vr0,    vr2,    vr0   // {d0, d2, d0, d2}
+    vilvl.w         vr4,    vr6,    vr4   // {d4, d6, d4, d6}
+    vilvl.w         vr1,    vr3,    vr1   // {d1, d3, d1, d3}
+    vilvl.w         vr5,    vr7,    vr5   // {d5, d7, d5, d7}
+    vilvl.d         vr12,   vr4,    vr0   // vr12 = {d0, d2, d4, d6}: dither for even pixels
+    vilvl.d         vr13,   vr5,    vr1   // vr13 = {d1, d3, d5, d7}: dither for odd pixels
+    li.w            t5,     0             // t5 = byte offset into every src[j] row
+    li.w            t8,     8             // threshold for the 8-pixel path
+    bge             a4,     t8,     .WIDTH8   // dstW >= 8: full 8-pixel pass
+    blt             zero,   a4,     .WIDTH    // 0 < dstW < 8: tail only
+    b               .END                      // dstW <= 0: nothing to do
+
+.WIDTH8:
+    li.d            t1,     0             // t1 = byte offset into the src pointer array
+    li.d            t4,     0             // t4 = filter tap counter j
+    vslli.w         vr2,    vr12,   12    // even-pixel accumulators start at dither << 12
+    vslli.w         vr3,    vr13,   12    // odd-pixel accumulators start at dither << 12
+    move            t3,     a0            // t3 = cursor over filter[]; a0 is kept for later passes
+
+.FILTERSIZE8:
+    ldx.d           t2,     a2,     t1    // t2 = src[j]
+    vldx            vr4,    t2,     t5    // 8 int16 samples from src[j] at the current offset
+    vldrepl.h       vr5,    t3,     0     // broadcast filter[j] to every halfword
+    vmaddwev.w.h    vr2,    vr4,    vr5   // even samples * filter[j], widened to words, accumulated
+    vmaddwod.w.h    vr3,    vr4,    vr5   // odd samples * filter[j], widened to words, accumulated
+    addi.d          t1,     t1,     8     // next src pointer
+    addi.d          t3,     t3,     2     // next filter coefficient
+    addi.d          t4,     t4,     1
+    blt             t4,     a1,     .FILTERSIZE8  // loop while j < filterSize (body runs at least once)
+    vsrai.w         vr2,    vr2,    19    // arithmetic >> 19
+    vsrai.w         vr3,    vr3,    19
+    vclip255.w      vr2,    vr2           // presumably clamps each word to 0..255 (not defined in this file)
+    vclip255.w      vr3,    vr3
+    vpickev.h       vr2,    vr3,    vr2   // words -> halfwords: even pixels low, odd pixels high
+    vpickev.b       vr2,    vr2,    vr2   // halfwords -> bytes: {p0,p2,p4,p6,p1,p3,p5,p7,...}
+    vbsrl.v         vr3,    vr2,    4     // shift right 4 bytes so the odd pixels are at byte 0
+    vilvl.b         vr2,    vr3,    vr2   // interleave even/odd back into pixel order p0..p7
+    fst.d           f2,     a3,     0     // store the low 8 bytes of vr2 (f2 is its low 64 bits)
+    addi.d          t5,     t5,     16    // 8 int16 samples consumed per row
+    addi.d          a4,     a4,     -8    // 8 pixels written
+    addi.d          a3,     a3,     8
+    bge             a4,     t8,     .WIDTH8   // re-dispatch on the remaining width
+    blt             zero,   a4,     .WIDTH
+    b               .END
+
+.WIDTH:
+    li.d            t1,     0             // t1 = byte offset into the src pointer array
+    li.d            t4,     0             // t4 = filter tap counter j
+    vslli.w         vr2,    vr12,   12    // accumulators start at dither << 12
+    vslli.w         vr3,    vr13,   12
+.FILTERSIZE:
+    ldx.d           t2,     a2,     t1    // t2 = src[j]
+    vldx            vr4,    t2,     t5    // loads a full 16 bytes although fewer than 8 pixels remain
+    vldrepl.h       vr5,    a0,     0     // broadcast filter[j]; a0 is advanced directly (last pass)
+    vmaddwev.w.h    vr2,    vr4,    vr5   // even samples * filter[j], accumulated
+    vmaddwod.w.h    vr3,    vr4,    vr5   // odd samples * filter[j], accumulated
+    addi.d          t1,     t1,     8
+    addi.d          a0,     a0,     2
+    addi.d          t4,     t4,     1
+    blt             t4,     a1,     .FILTERSIZE   // loop while j < filterSize
+    vsrai.w         vr2,    vr2,    19    // same shift / clamp / repack sequence as the 8-pixel pass
+    vsrai.w         vr3,    vr3,    19
+    vclip255.w      vr2,    vr2
+    vclip255.w      vr3,    vr3
+    vpickev.h       vr2,    vr3,    vr2
+    vpickev.b       vr2,    vr2,    vr2
+    vbsrl.v         vr3,    vr2,    4
+    vilvl.b         vr2,    vr3,    vr2   // vr2 low bytes = pixels in order
+
+.DEST:
+    vstelm.b        vr2,    a3,     0,    0   // store byte element 0 (the next pixel)
+    vbsrl.v         vr2,    vr2,    1     // shift the following pixel down to byte 0
+    addi.d          a4,     a4,     -1
+    addi.d          a3,     a3,     1
+    blt             zero,   a4,     .DEST     // loop while pixels remain
+.END:
+endfunc
--- a/libswscale/loongarch/output_lasx.c
+++ b/libswscale/loongarch/output_lasx.c
@ -1773,11 +1773,9 @@ YUV2RGBWRAPPER(yuv2, rgb_full, bgr4_byte_full,  AV_PIX_FMT_BGR4_BYTE, 0)
 YUV2RGBWRAPPER(yuv2, rgb_full, rgb4_byte_full,  AV_PIX_FMT_RGB4_BYTE, 0)
 YUV2RGBWRAPPER(yuv2, rgb_full, bgr8_full,   AV_PIX_FMT_BGR8,  0)
 YUV2RGBWRAPPER(yuv2, rgb_full, rgb8_full,   AV_PIX_FMT_RGB8,  0)
-#undef yuvTorgb
-#undef yuvTorgb_setup


-av_cold void ff_sws_init_output_loongarch(SwsContext *c)
+av_cold void ff_sws_init_output_lasx(SwsContext *c)
 {

    if(c->flags & SWS_FULL_CHR_H_INT) {
--- a/libswscale/loongarch/output_lsx.c
+++ b/libswscale/loongarch/output_lsx.c
--- a/libswscale/loongarch/swscale.S
+++ b/libswscale/loongarch/swscale.S
--- a/libswscale/loongarch/swscale_init_loongarch.c
+++ b/libswscale/loongarch/swscale_init_loongarch.c
@ -27,8 +27,33 @@
 av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
 {
    int cpu_flags = av_get_cpu_flags();
+    if (have_lsx(cpu_flags)) {
+        ff_sws_init_output_lsx(c);
+        if (c->srcBpc == 8) {
+            if (c->dstBpc <= 14) {
+                c->hyScale = c->hcScale = ff_hscale_8_to_15_lsx;
+            } else {
+                c->hyScale = c->hcScale = ff_hscale_8_to_19_lsx;
+            }
+        } else {
+            c->hyScale = c->hcScale = c->dstBpc > 14 ? ff_hscale_16_to_19_lsx
+                                                     : ff_hscale_16_to_15_lsx;
+        }
+        switch (c->srcFormat) {
+        case AV_PIX_FMT_GBRAP:
+        case AV_PIX_FMT_GBRP:
+            {
+                c->readChrPlanar = planar_rgb_to_uv_lsx;
+                c->readLumPlanar = planar_rgb_to_y_lsx;
+            }
+            break;
+        }
+        if (c->dstBpc == 8)
+            c->yuv2planeX = ff_yuv2planeX_8_lsx;
+    }
+#if HAVE_LASX
    if (have_lasx(cpu_flags)) {
-        ff_sws_init_output_loongarch(c);
+        ff_sws_init_output_lasx(c);
        if (c->srcBpc == 8) {
            if (c->dstBpc <= 14) {
                c->hyScale = c->hcScale = ff_hscale_8_to_15_lasx;
@ -51,17 +76,21 @@ av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
        if (c->dstBpc == 8)
            c->yuv2planeX = ff_yuv2planeX_8_lasx;
    }
+#endif // #if HAVE_LASX
 }

 av_cold void rgb2rgb_init_loongarch(void)
 {
+#if HAVE_LASX
    int cpu_flags = av_get_cpu_flags();
    if (have_lasx(cpu_flags))
        interleaveBytes = ff_interleave_bytes_lasx;
+#endif // #if HAVE_LASX
 }

 av_cold SwsFunc ff_yuv2rgb_init_loongarch(SwsContext *c)
 {
+#if HAVE_LASX
    int cpu_flags = av_get_cpu_flags();
    if (have_lasx(cpu_flags)) {
        switch (c->dstFormat) {
@ -91,5 +120,6 @@ av_cold SwsFunc ff_yuv2rgb_init_loongarch(SwsContext *c)
                    return yuv420_abgr32_lasx;
        }
    }
+#endif // #if HAVE_LASX
    return NULL;
 }
--- a/libswscale/loongarch/swscale_loongarch.h
+++ b/libswscale/loongarch/swscale_loongarch.h
@ -24,7 +24,45 @@

 #include "libswscale/swscale.h"
 #include "libswscale/swscale_internal.h"
+#include "config.h"

+void ff_hscale_8_to_15_lsx(SwsContext *c, int16_t *dst, int dstW,
+                           const uint8_t *src, const int16_t *filter,
+                           const int32_t *filterPos, int filterSize);
+
+void ff_hscale_8_to_19_lsx(SwsContext *c, int16_t *_dst, int dstW,
+                           const uint8_t *src, const int16_t *filter,
+                           const int32_t *filterPos, int filterSize);
+
+void ff_hscale_16_to_15_lsx(SwsContext *c, int16_t *_dst, int dstW,
+                            const uint8_t *_src, const int16_t *filter,
+                            const int32_t *filterPos, int filterSize);
+
+void ff_hscale_16_to_15_sub_lsx(SwsContext *c, int16_t *_dst, int dstW,
+                                const uint8_t *_src, const int16_t *filter,
+                                const int32_t *filterPos, int filterSize, int sh);
+
+void ff_hscale_16_to_19_lsx(SwsContext *c, int16_t *_dst, int dstW,
+                            const uint8_t *_src, const int16_t *filter,
+                            const int32_t *filterPos, int filterSize);
+
+void ff_hscale_16_to_19_sub_lsx(SwsContext *c, int16_t *_dst, int dstW,
+                                const uint8_t *_src, const int16_t *filter,
+                                const int32_t *filterPos, int filterSize, int sh);
+
+void planar_rgb_to_uv_lsx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4],
+                          int width, int32_t *rgb2yuv, void *opq);
+
+void planar_rgb_to_y_lsx(uint8_t *_dst, const uint8_t *src[4], int width,
+                         int32_t *rgb2yuv, void *opq);
+
+void ff_yuv2planeX_8_lsx(const int16_t *filter, int filterSize,
+                         const int16_t **src, uint8_t *dest, int dstW,
+                         const uint8_t *dither, int offset);
+
+av_cold void ff_sws_init_output_lsx(SwsContext *c);
+
+#if HAVE_LASX
 void ff_hscale_8_to_15_lasx(SwsContext *c, int16_t *dst, int dstW,
                            const uint8_t *src, const int16_t *filter,
                            const int32_t *filterPos, int filterSize);
@ -69,10 +107,11 @@ void ff_interleave_bytes_lasx(const uint8_t *src1, const uint8_t *src2,
                              uint8_t *dest, int width, int height,
                              int src1Stride, int src2Stride, int dstStride);

-av_cold void ff_sws_init_output_loongarch(SwsContext *c);
-
 void ff_yuv2planeX_8_lasx(const int16_t *filter, int filterSize,
                          const int16_t **src, uint8_t *dest, int dstW,
                          const uint8_t *dither, int offset);

+av_cold void ff_sws_init_output_lasx(SwsContext *c);
+#endif // #if HAVE_LASX
+
 #endif /* SWSCALE_LOONGARCH_SWSCALE_LOONGARCH_H */
--- a/libswscale/loongarch/swscale_lsx.c
+++ b/libswscale/loongarch/swscale_lsx.c
@ -0,0 +1,57 @@
+/*
+ * Loongson LSX optimized swscale
+ *
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "swscale_loongarch.h"
+
+void ff_hscale_16_to_15_lsx(SwsContext *c, int16_t *_dst, int dstW,
+                            const uint8_t *_src, const int16_t *filter,
+                            const int32_t *filterPos, int filterSize)
+{
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->srcFormat);
+    int sh              = desc->comp[0].depth - 1;
+    /* Derive the kernel's `sh` argument from the source format, then forward everything else. */
+    if (sh < 15) {
+        sh = isAnyRGB(c->srcFormat) || c->srcFormat==AV_PIX_FMT_PAL8 ? 13 :
+                      (desc->comp[0].depth - 1);
+    } else if (desc->flags & AV_PIX_FMT_FLAG_FLOAT) { /* bitwise flag test, as in ff_hscale_16_to_19_lsx */
+        sh = 15; /* float input is processed like 16-bit unsigned input */
+    }
+    ff_hscale_16_to_15_sub_lsx(c, _dst, dstW, _src, filter, filterPos, filterSize, sh);
+}
+
+void ff_hscale_16_to_19_lsx(SwsContext *c, int16_t *_dst, int dstW,
+                            const uint8_t *_src, const int16_t *filter,
+                            const int32_t *filterPos, int filterSize)
+{
+    /* Select the kernel's `sh` argument for the source format, then forward everything else. */
+    const AVPixFmtDescriptor *fmt = av_pix_fmt_desc_get(c->srcFormat);
+    const int depth   = fmt->comp[0].depth;
+    const int rgb_pal = isAnyRGB(c->srcFormat) || c->srcFormat == AV_PIX_FMT_PAL8;
+    int shift         = depth - 1 - 4;
+
+    if (rgb_pal && depth < 16)
+        shift = 9;
+    else if (fmt->flags & AV_PIX_FMT_FLAG_FLOAT)
+        shift = 16 - 1 - 4; /* float input is processed like 16-bit unsigned input */
+    ff_hscale_16_to_19_sub_lsx(c, _dst, dstW, _src, filter, filterPos, filterSize, shift);
+}
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@ -653,7 +653,7 @@ static av_cold int initFilter(int16_t **outFilter, int32_t **filterPos,
            filterAlign = 1;
    }

-    if (have_lasx(cpu_flags)) {
+    if (have_lasx(cpu_flags) || have_lsx(cpu_flags)) {
        int reNum = minFilterSize & (0x07);

        if (minFilterSize < 5)
@ -1806,6 +1806,7 @@ static av_cold int sws_init_single_context(SwsContext *c, SwsFilter *srcFilter,
            const int filterAlign = X86_MMX(cpu_flags)     ? 4 :
                                    PPC_ALTIVEC(cpu_flags) ? 8 :
                                    have_neon(cpu_flags)   ? 4 :
+                                    have_lsx(cpu_flags)    ? 8 :
                                    have_lasx(cpu_flags)   ? 8 : 1;

            if ((ret = initFilter(&c->hLumFilter, &c->hLumFilterPos,