swscale/la: Optimize the functions of the swscale series with lsx.
./configure --disable-lasx
ffmpeg -i ~/media/1_h264_1080p_30fps_3Mbps.mp4 -f rawvideo -s 640x480 -pix_fmt bgra -y /dev/null -an
before: 91fps
after: 160fps

Reviewed-by: Shiyou Yin <yinshiyou-hf@loongson.cn>
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
This commit is contained in:
parent
f6077cc666
commit
4501b1dfd7
@ -4,3 +4,8 @@ LASX-OBJS-$(CONFIG_SWSCALE) += loongarch/swscale_lasx.o \
|
||||
loongarch/yuv2rgb_lasx.o \
|
||||
loongarch/rgb2rgb_lasx.o \
|
||||
loongarch/output_lasx.o
|
||||
LSX-OBJS-$(CONFIG_SWSCALE) += loongarch/swscale.o \
|
||||
loongarch/swscale_lsx.o \
|
||||
loongarch/input.o \
|
||||
loongarch/output.o \
|
||||
loongarch/output_lsx.o
|
||||
|
285
libswscale/loongarch/input.S
Normal file
285
libswscale/loongarch/input.S
Normal file
@ -0,0 +1,285 @@
|
||||
/*
|
||||
* Loongson LSX optimized swscale
|
||||
*
|
||||
* Copyright (c) 2023 Loongson Technology Corporation Limited
|
||||
* Contributed by Lu Wang <wanglu@loongson.cn>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavcodec/loongarch/loongson_asm.S"
|
||||
|
||||
/*
 * void planar_rgb_to_y_lsx(uint8_t *_dst, const uint8_t *src[4],
 *                          int width, int32_t *rgb2yuv, void *opq)
 *
 * Register use on entry:
 *   a0 = _dst     destination, one 16-bit value is stored per pixel
 *   a1 = src      array of plane pointers; src[0], src[1], src[2] are read
 *   a2 = width    number of pixels still to convert
 *   a3 = rgb2yuv  coefficient table; words 0, 1, 2 (ry, gy, by) are read
 *   a4 = opq      never touched by this routine
 *
 * For every pixel the routine stores
 *   (ry * src[2][i] + gy * src[0][i] + by * src[1][i] + 524544) >> 9
 * using an 8-pixel vector loop, then a 4-pixel vector loop, then a
 * scalar loop for whatever is left.
 * Note: the vector loops always load 16 bytes from each plane even though
 * only 8 (or 4) of them are consumed.
 */
function planar_rgb_to_y_lsx
    vldi            vr7,     0               // zero vector, used to widen by interleaving
    ld.d            a5,      a1,      0      // src[0] (multiplied by gy)
    ld.d            a6,      a1,      8      // src[1] (multiplied by by)
    ld.d            a7,      a1,      16     // src[2] (multiplied by ry)

    ld.w            t1,      a3,      0      // ry
    vreplgr2vr.w    vr1,     t1
    ld.w            t2,      a3,      4      // gy
    vreplgr2vr.w    vr2,     t2
    ld.w            t3,      a3,      8      // by
    vreplgr2vr.w    vr3,     t3
    li.w            t4,      9               // right shift applied to every sum
    vreplgr2vr.w    vr4,     t4
    li.w            t5,      524544          // constant added before the shift
    vreplgr2vr.w    vr5,     t5
    li.w            t7,      4               // threshold for the 4-pixel loop
    li.w            t8,      8               // threshold for the 8-pixel loop

    bge             a2,      t8,      .Lrgb2y_w8
    bge             a2,      t7,      .Lrgb2y_w4
    blt             zero,    a2,      .Lrgb2y_w1
    b               .Lrgb2y_end

    // ---- 8 pixels per iteration ----
.Lrgb2y_w8:
    vld             vr8,     a5,      0
    vld             vr9,     a6,      0
    vld             vr10,    a7,      0
    // bytes -> halfwords (low 8 bytes of each plane)
    vilvl.b         vr11,    vr7,     vr8
    vilvl.b         vr12,    vr7,     vr9
    vilvl.b         vr13,    vr7,     vr10
    // halfwords -> words, pixels 0..3
    vilvl.h         vr14,    vr7,     vr11
    vilvl.h         vr15,    vr7,     vr12
    vilvl.h         vr16,    vr7,     vr13
    // halfwords -> words, pixels 4..7
    vilvh.h         vr17,    vr7,     vr11
    vilvh.h         vr18,    vr7,     vr12
    vilvh.h         vr19,    vr7,     vr13
    // weighted sums: vr20 = pixels 0..3, vr21 = pixels 4..7
    vmul.w          vr20,    vr1,     vr16
    vmul.w          vr21,    vr1,     vr19
    vmadd.w         vr20,    vr2,     vr14
    vmadd.w         vr20,    vr3,     vr15
    vmadd.w         vr21,    vr2,     vr17
    vmadd.w         vr21,    vr3,     vr18
    vadd.w          vr20,    vr20,    vr5
    vadd.w          vr21,    vr21,    vr5
    vsra.w          vr20,    vr20,    vr4
    vsra.w          vr21,    vr21,    vr4
    // keep the low halfword of every word -> 8 x 16-bit results
    vpickev.h       vr20,    vr21,    vr20
    vst             vr20,    a0,      0
    addi.d          a5,      a5,      8
    addi.d          a6,      a6,      8
    addi.d          a7,      a7,      8
    addi.d          a0,      a0,      16
    addi.d          a2,      a2,      -8
    bge             a2,      t8,      .Lrgb2y_w8
    bge             a2,      t7,      .Lrgb2y_w4
    blt             zero,    a2,      .Lrgb2y_w1
    b               .Lrgb2y_end

    // ---- 4 pixels per iteration ----
.Lrgb2y_w4:
    vld             vr8,     a5,      0
    vld             vr9,     a6,      0
    vld             vr10,    a7,      0
    vilvl.b         vr11,    vr7,     vr8
    vilvl.b         vr12,    vr7,     vr9
    vilvl.b         vr13,    vr7,     vr10
    vilvl.h         vr14,    vr7,     vr11
    vilvl.h         vr15,    vr7,     vr12
    vilvl.h         vr16,    vr7,     vr13
    vmul.w          vr17,    vr1,     vr16
    vmadd.w         vr17,    vr2,     vr14
    vmadd.w         vr17,    vr3,     vr15
    vadd.w          vr17,    vr17,    vr5
    vsra.w          vr17,    vr17,    vr4
    vpickev.h       vr17,    vr17,    vr17
    vstelm.d        vr17,    a0,      0,    0  // low 64 bits = 4 x 16-bit results
    addi.d          a5,      a5,      4
    addi.d          a6,      a6,      4
    addi.d          a7,      a7,      4
    addi.d          a0,      a0,      8
    addi.d          a2,      a2,      -4
    bge             a2,      t7,      .Lrgb2y_w4
    blt             zero,    a2,      .Lrgb2y_w1
    b               .Lrgb2y_end

    // ---- 1 pixel per iteration ----
    // t4, t7 and t8 are reused as scratch here; that is safe because this
    // loop only ever branches back to itself.
.Lrgb2y_w1:
    ld.bu           t0,      a5,      0
    ld.bu           t4,      a6,      0
    ld.bu           t6,      a7,      0
    mul.w           t8,      t6,      t1     // src[2] * ry
    mul.w           t7,      t0,      t2     // src[0] * gy
    add.w           t8,      t8,      t7
    mul.w           t7,      t4,      t3     // src[1] * by
    add.w           t8,      t8,      t7
    add.w           t8,      t8,      t5
    srai.w          t8,      t8,      9
    st.h            t8,      a0,      0
    addi.d          a5,      a5,      1
    addi.d          a6,      a6,      1
    addi.d          a7,      a7,      1
    addi.d          a0,      a0,      2
    addi.d          a2,      a2,      -1
    blt             zero,    a2,      .Lrgb2y_w1
.Lrgb2y_end:
endfunc
|
||||
|
||||
/*
 * void planar_rgb_to_uv_lsx(uint8_t *_dstU, uint8_t *_dstV,
 *                           const uint8_t *src[4], int width,
 *                           int32_t *rgb2yuv, void *opq)
 *
 * Register use on entry:
 *   a0 = _dstU    first destination, one 16-bit value stored per pixel
 *   a1 = _dstV    second destination, one 16-bit value stored per pixel
 *   a2 = src      array of plane pointers; src[0], src[1], src[2] are read
 *   a3 = width    number of pixels still to convert
 *   a4 = rgb2yuv  coefficient table; words 3..8 (ru, gu, bu, rv, gv, bv)
 *   a5 = opq      not read (a5 is overwritten with src[0] immediately)
 *
 * For every pixel the routine stores
 *   dstU[i] = (ru * src[2][i] + gu * src[0][i] + bu * src[1][i] + 4194560) >> 9
 *   dstV[i] = (rv * src[2][i] + gv * src[0][i] + bv * src[1][i] + 4194560) >> 9
 * using an 8-pixel vector loop, a 4-pixel vector loop and a scalar tail.
 *
 * s1-s3 hold the V coefficients for the scalar tail and are saved/restored
 * on the stack.
 * Note: the vector loops always load 16 bytes from each plane even though
 * only 8 (or 4) of them are consumed.
 */
function planar_rgb_to_uv_lsx
    addi.d          sp,      sp,      -24
    st.d            s1,      sp,      0
    st.d            s2,      sp,      8
    st.d            s3,      sp,      16

    ld.d            a5,      a2,      0      // src[0] (multiplied by gu / gv)
    ld.d            a6,      a2,      8      // src[1] (multiplied by bu / bv)
    ld.d            a7,      a2,      16     // src[2] (multiplied by ru / rv)
    ld.w            t1,      a4,      12     // ru
    ld.w            t2,      a4,      16     // gu
    ld.w            t3,      a4,      20     // bu
    ld.w            s1,      a4,      24     // rv
    ld.w            s2,      a4,      28     // gv
    ld.w            s3,      a4,      32     // bv
    li.w            t4,      9               // right shift applied to every sum
    li.w            t5,      4194560         // constant added before the shift
    li.w            t7,      4               // threshold for the 4-pixel loop
    li.w            t8,      8               // threshold for the 8-pixel loop
    vldi            vr0,     0               // zero vector, used to widen by interleaving
    vreplgr2vr.w    vr1,     t1
    vreplgr2vr.w    vr2,     t2
    vreplgr2vr.w    vr3,     t3
    vreplgr2vr.w    vr4,     s1
    vreplgr2vr.w    vr5,     s2
    vreplgr2vr.w    vr6,     s3
    vreplgr2vr.w    vr7,     t4
    vreplgr2vr.w    vr8,     t5
    // Dispatch on the pixel count, which lives in a3.  (a2 is the src
    // pointer; testing it here would always select the 8-pixel path, even
    // for widths below 8 or a non-positive width.)
    bge             a3,      t8,      .LOOP_WIDTH8
    bge             a3,      t7,      .LOOP_WIDTH4
    blt             zero,    a3,      .LOOP_WIDTH
    b               .LOOP_END

    // ---- 8 pixels per iteration ----
.LOOP_WIDTH8:
    vld             vr9,     a5,      0
    vld             vr10,    a6,      0
    vld             vr11,    a7,      0
    // bytes -> halfwords (low 8 bytes of each plane)
    vilvl.b         vr9,     vr0,     vr9
    vilvl.b         vr10,    vr0,     vr10
    vilvl.b         vr11,    vr0,     vr11
    // halfwords -> words, pixels 0..3
    vilvl.h         vr12,    vr0,     vr9
    vilvl.h         vr13,    vr0,     vr10
    vilvl.h         vr14,    vr0,     vr11
    // halfwords -> words, pixels 4..7
    vilvh.h         vr15,    vr0,     vr9
    vilvh.h         vr16,    vr0,     vr10
    vilvh.h         vr17,    vr0,     vr11
    // vr18/vr19 = U sums (pixels 0..3 / 4..7), vr20/vr21 = V sums
    vmul.w          vr18,    vr1,     vr14
    vmul.w          vr19,    vr1,     vr17
    vmul.w          vr20,    vr4,     vr14
    vmul.w          vr21,    vr4,     vr17
    vmadd.w         vr18,    vr2,     vr12
    vmadd.w         vr18,    vr3,     vr13
    vmadd.w         vr19,    vr2,     vr15
    vmadd.w         vr19,    vr3,     vr16
    vmadd.w         vr20,    vr5,     vr12
    vmadd.w         vr20,    vr6,     vr13
    vmadd.w         vr21,    vr5,     vr15
    vmadd.w         vr21,    vr6,     vr16
    vadd.w          vr18,    vr18,    vr8
    vadd.w          vr19,    vr19,    vr8
    vadd.w          vr20,    vr20,    vr8
    vadd.w          vr21,    vr21,    vr8
    vsra.w          vr18,    vr18,    vr7
    vsra.w          vr19,    vr19,    vr7
    vsra.w          vr20,    vr20,    vr7
    vsra.w          vr21,    vr21,    vr7
    // keep the low halfword of every word -> 8 x 16-bit results each
    vpickev.h       vr18,    vr19,    vr18
    vpickev.h       vr20,    vr21,    vr20
    vst             vr18,    a0,      0
    vst             vr20,    a1,      0
    addi.d          a3,      a3,      -8
    addi.d          a5,      a5,      8
    addi.d          a6,      a6,      8
    addi.d          a7,      a7,      8
    addi.d          a0,      a0,      16
    addi.d          a1,      a1,      16
    bge             a3,      t8,      .LOOP_WIDTH8
    bge             a3,      t7,      .LOOP_WIDTH4
    blt             zero,    a3,      .LOOP_WIDTH
    b               .LOOP_END

    // ---- 4 pixels per iteration ----
.LOOP_WIDTH4:
    vld             vr9,     a5,      0
    vld             vr10,    a6,      0
    vld             vr11,    a7,      0
    vilvl.b         vr9,     vr0,     vr9
    vilvl.b         vr10,    vr0,     vr10
    vilvl.b         vr11,    vr0,     vr11
    vilvl.h         vr12,    vr0,     vr9
    vilvl.h         vr13,    vr0,     vr10
    vilvl.h         vr14,    vr0,     vr11
    vmul.w          vr18,    vr1,     vr14
    vmul.w          vr19,    vr4,     vr14
    vmadd.w         vr18,    vr2,     vr12
    vmadd.w         vr18,    vr3,     vr13
    vmadd.w         vr19,    vr5,     vr12
    vmadd.w         vr19,    vr6,     vr13
    vadd.w          vr18,    vr18,    vr8
    vadd.w          vr19,    vr19,    vr8
    vsra.w          vr18,    vr18,    vr7
    vsra.w          vr19,    vr19,    vr7
    vpickev.h       vr18,    vr18,    vr18
    vpickev.h       vr19,    vr19,    vr19
    vstelm.d        vr18,    a0,      0,    0  // low 64 bits = 4 x 16-bit U results
    vstelm.d        vr19,    a1,      0,    0  // low 64 bits = 4 x 16-bit V results
    addi.d          a3,      a3,      -4
    addi.d          a5,      a5,      4
    addi.d          a6,      a6,      4
    addi.d          a7,      a7,      4
    addi.d          a0,      a0,      8
    addi.d          a1,      a1,      8
    bge             a3,      t7,      .LOOP_WIDTH4
    blt             zero,    a3,      .LOOP_WIDTH
    b               .LOOP_END

    // ---- 1 pixel per iteration ----
    // t4, t7 and t8 are reused as scratch here; that is safe because this
    // loop only ever branches back to itself.
.LOOP_WIDTH:
    ld.bu           t0,      a5,      0
    ld.bu           t4,      a6,      0
    ld.bu           t6,      a7,      0
    mul.w           t8,      t6,      t1     // src[2] * ru
    mul.w           t7,      t0,      t2     // src[0] * gu
    add.w           t8,      t8,      t7
    mul.w           t7,      t4,      t3     // src[1] * bu
    add.w           t8,      t8,      t7
    add.w           t8,      t8,      t5
    srai.w          t8,      t8,      9
    st.h            t8,      a0,      0
    mul.w           t8,      t6,      s1     // src[2] * rv
    mul.w           t7,      t0,      s2     // src[0] * gv
    add.w           t8,      t8,      t7
    mul.w           t7,      t4,      s3     // src[1] * bv
    add.w           t8,      t8,      t7
    add.w           t8,      t8,      t5
    srai.w          t8,      t8,      9
    st.h            t8,      a1,      0
    addi.d          a3,      a3,      -1
    addi.d          a5,      a5,      1
    addi.d          a6,      a6,      1
    addi.d          a7,      a7,      1
    addi.d          a0,      a0,      2
    addi.d          a1,      a1,      2
    blt             zero,    a3,      .LOOP_WIDTH

.LOOP_END:
    ld.d            s1,      sp,      0
    ld.d            s2,      sp,      8
    ld.d            s3,      sp,      16
    addi.d          sp,      sp,      24
endfunc
|
138
libswscale/loongarch/output.S
Normal file
138
libswscale/loongarch/output.S
Normal file
@ -0,0 +1,138 @@
|
||||
/*
|
||||
* Loongson LSX optimized swscale
|
||||
*
|
||||
* Copyright (c) 2023 Loongson Technology Corporation Limited
|
||||
* Contributed by Lu Wang <wanglu@loongson.cn>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavcodec/loongarch/loongson_asm.S"
|
||||
|
||||
/*
 * void ff_yuv2planeX_8_lsx(const int16_t *filter, int filterSize,
 *                          const int16_t **src, uint8_t *dest, int dstW,
 *                          const uint8_t *dither, int offset)
 *
 * Register use on entry:
 *   a0 = filter      16-bit coefficients, one per source line
 *   a1 = filterSize  number of coefficients / source lines
 *   a2 = src         array of pointers to 16-bit source lines
 *   a3 = dest        8-bit output
 *   a4 = dstW        number of output pixels
 *   a5 = dither      byte table, indexed with (offset + k) & 7
 *   a6 = offset
 *
 * For every output pixel i the routine computes
 *   (dither[(i + offset) & 7] << 12) + sum_j(src[j][i] * filter[j])
 * shifts the sum right by 19, clips it to 0..255 and stores one byte.
 * Eight pixels are produced per vector pass; a final partial group is
 * computed the same way and then written out one byte at a time.
 * The tap loops are do/while style, so at least one tap is always applied.
 */
function ff_yuv2planeX_8_lsx
    // Fetch dither[(offset + k) & 7] for k = 0..7 and broadcast each one.
    andi            t0,      a6,      7
    ldx.bu          t0,      a5,      t0
    vreplgr2vr.w    vr0,     t0
    addi.w          t1,      a6,      1
    andi            t1,      t1,      7
    ldx.bu          t1,      a5,      t1
    vreplgr2vr.w    vr1,     t1
    addi.w          t2,      a6,      2
    andi            t2,      t2,      7
    ldx.bu          t2,      a5,      t2
    vreplgr2vr.w    vr2,     t2
    addi.w          t3,      a6,      3
    andi            t3,      t3,      7
    ldx.bu          t3,      a5,      t3
    vreplgr2vr.w    vr3,     t3
    addi.w          t4,      a6,      4
    andi            t4,      t4,      7
    ldx.bu          t4,      a5,      t4
    vreplgr2vr.w    vr4,     t4
    addi.w          t5,      a6,      5
    andi            t5,      t5,      7
    ldx.bu          t5,      a5,      t5
    vreplgr2vr.w    vr5,     t5
    addi.w          t6,      a6,      6
    andi            t6,      t6,      7
    ldx.bu          t6,      a5,      t6
    vreplgr2vr.w    vr6,     t6
    addi.w          t7,      a6,      7
    andi            t7,      t7,      7
    ldx.bu          t7,      a5,      t7
    vreplgr2vr.w    vr7,     t7

    // vr12 = dither words for pixels 0,2,4,6; vr13 = for pixels 1,3,5,7
    vilvl.w         vr0,     vr2,     vr0
    vilvl.w         vr4,     vr6,     vr4
    vilvl.w         vr1,     vr3,     vr1
    vilvl.w         vr5,     vr7,     vr5
    vilvl.d         vr12,    vr4,     vr0
    vilvl.d         vr13,    vr5,     vr1

    li.w            t8,      8               // pixels per vector pass
    li.w            t5,      0               // byte offset into every source line
    bge             a4,      t8,      .Lyuv2planeX8_w8
    blt             zero,    a4,      .Lyuv2planeX8_tail
    b               .Lyuv2planeX8_end

    // ---- full groups of 8 pixels ----
.Lyuv2planeX8_w8:
    move            t3,      a0              // coefficient cursor
    li.d            t4,      0               // tap counter
    li.d            t1,      0               // byte offset into src[]
    vslli.w         vr2,     vr12,    12     // accumulators start at dither << 12
    vslli.w         vr3,     vr13,    12

.Lyuv2planeX8_w8_taps:
    ldx.d           t2,      a2,      t1     // src[j]
    vldx            vr4,     t2,      t5     // 8 samples of line j
    vldrepl.h       vr5,     t3,      0      // filter[j] in every lane
    vmaddwev.w.h    vr2,     vr4,     vr5    // even pixels
    vmaddwod.w.h    vr3,     vr4,     vr5    // odd pixels
    addi.d          t1,      t1,      8
    addi.d          t3,      t3,      2
    addi.d          t4,      t4,      1
    blt             t4,      a1,      .Lyuv2planeX8_w8_taps

    vsrai.w         vr2,     vr2,     19
    vsrai.w         vr3,     vr3,     19
    vclip255.w      vr2,     vr2
    vclip255.w      vr3,     vr3
    // narrow to bytes, then re-interleave even/odd pixels into order
    vpickev.h       vr2,     vr3,     vr2
    vpickev.b       vr2,     vr2,     vr2
    vbsrl.v         vr3,     vr2,     4
    vilvl.b         vr2,     vr3,     vr2
    fst.d           f2,      a3,      0      // low 8 bytes of vr2
    addi.d          a3,      a3,      8
    addi.d          t5,      t5,      16
    addi.d          a4,      a4,      -8
    bge             a4,      t8,      .Lyuv2planeX8_w8
    blt             zero,    a4,      .Lyuv2planeX8_tail
    b               .Lyuv2planeX8_end

    // ---- final group of fewer than 8 pixels ----
    // a0 itself is advanced here; the coefficient pointer is not needed
    // again afterwards.
.Lyuv2planeX8_tail:
    li.d            t4,      0               // tap counter
    li.d            t1,      0               // byte offset into src[]
    vslli.w         vr2,     vr12,    12
    vslli.w         vr3,     vr13,    12
.Lyuv2planeX8_tail_taps:
    ldx.d           t2,      a2,      t1
    vldx            vr4,     t2,      t5
    vldrepl.h       vr5,     a0,      0
    vmaddwev.w.h    vr2,     vr4,     vr5
    vmaddwod.w.h    vr3,     vr4,     vr5
    addi.d          t1,      t1,      8
    addi.d          a0,      a0,      2
    addi.d          t4,      t4,      1
    blt             t4,      a1,      .Lyuv2planeX8_tail_taps

    vsrai.w         vr2,     vr2,     19
    vsrai.w         vr3,     vr3,     19
    vclip255.w      vr2,     vr2
    vclip255.w      vr3,     vr3
    vpickev.h       vr2,     vr3,     vr2
    vpickev.b       vr2,     vr2,     vr2
    vbsrl.v         vr3,     vr2,     4
    vilvl.b         vr2,     vr3,     vr2

    // store the remaining pixels one byte at a time
.Lyuv2planeX8_store:
    vstelm.b        vr2,     a3,      0,    0
    vbsrl.v         vr2,     vr2,     1      // next pixel moves into byte 0
    addi.d          a3,      a3,      1
    addi.d          a4,      a4,      -1
    blt             zero,    a4,      .Lyuv2planeX8_store
.Lyuv2planeX8_end:
endfunc
|
@ -1773,11 +1773,9 @@ YUV2RGBWRAPPER(yuv2, rgb_full, bgr4_byte_full, AV_PIX_FMT_BGR4_BYTE, 0)
|
||||
YUV2RGBWRAPPER(yuv2, rgb_full, rgb4_byte_full, AV_PIX_FMT_RGB4_BYTE, 0)
|
||||
YUV2RGBWRAPPER(yuv2, rgb_full, bgr8_full, AV_PIX_FMT_BGR8, 0)
|
||||
YUV2RGBWRAPPER(yuv2, rgb_full, rgb8_full, AV_PIX_FMT_RGB8, 0)
|
||||
#undef yuvTorgb
|
||||
#undef yuvTorgb_setup
|
||||
|
||||
|
||||
av_cold void ff_sws_init_output_loongarch(SwsContext *c)
|
||||
av_cold void ff_sws_init_output_lasx(SwsContext *c)
|
||||
{
|
||||
|
||||
if(c->flags & SWS_FULL_CHR_H_INT) {
|
||||
|
1828
libswscale/loongarch/output_lsx.c
Normal file
1828
libswscale/loongarch/output_lsx.c
Normal file
File diff suppressed because it is too large
Load Diff
1868
libswscale/loongarch/swscale.S
Normal file
1868
libswscale/loongarch/swscale.S
Normal file
File diff suppressed because it is too large
Load Diff
@ -27,8 +27,33 @@
|
||||
av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
|
||||
{
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
if (have_lsx(cpu_flags)) {
|
||||
ff_sws_init_output_lsx(c);
|
||||
if (c->srcBpc == 8) {
|
||||
if (c->dstBpc <= 14) {
|
||||
c->hyScale = c->hcScale = ff_hscale_8_to_15_lsx;
|
||||
} else {
|
||||
c->hyScale = c->hcScale = ff_hscale_8_to_19_lsx;
|
||||
}
|
||||
} else {
|
||||
c->hyScale = c->hcScale = c->dstBpc > 14 ? ff_hscale_16_to_19_lsx
|
||||
: ff_hscale_16_to_15_lsx;
|
||||
}
|
||||
switch (c->srcFormat) {
|
||||
case AV_PIX_FMT_GBRAP:
|
||||
case AV_PIX_FMT_GBRP:
|
||||
{
|
||||
c->readChrPlanar = planar_rgb_to_uv_lsx;
|
||||
c->readLumPlanar = planar_rgb_to_y_lsx;
|
||||
}
|
||||
break;
|
||||
}
|
||||
if (c->dstBpc == 8)
|
||||
c->yuv2planeX = ff_yuv2planeX_8_lsx;
|
||||
}
|
||||
#if HAVE_LASX
|
||||
if (have_lasx(cpu_flags)) {
|
||||
ff_sws_init_output_loongarch(c);
|
||||
ff_sws_init_output_lasx(c);
|
||||
if (c->srcBpc == 8) {
|
||||
if (c->dstBpc <= 14) {
|
||||
c->hyScale = c->hcScale = ff_hscale_8_to_15_lasx;
|
||||
@ -51,17 +76,21 @@ av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
|
||||
if (c->dstBpc == 8)
|
||||
c->yuv2planeX = ff_yuv2planeX_8_lasx;
|
||||
}
|
||||
#endif // #if HAVE_LASX
|
||||
}
|
||||
|
||||
/**
 * Select the LASX implementation of interleaveBytes when the build has
 * LASX support (HAVE_LASX) and the running CPU reports it.
 * Does nothing otherwise.
 */
av_cold void rgb2rgb_init_loongarch(void)
{
#if HAVE_LASX
    if (have_lasx(av_get_cpu_flags()))
        interleaveBytes = ff_interleave_bytes_lasx;
#endif /* HAVE_LASX */
}
|
||||
|
||||
av_cold SwsFunc ff_yuv2rgb_init_loongarch(SwsContext *c)
|
||||
{
|
||||
#if HAVE_LASX
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
if (have_lasx(cpu_flags)) {
|
||||
switch (c->dstFormat) {
|
||||
@ -91,5 +120,6 @@ av_cold SwsFunc ff_yuv2rgb_init_loongarch(SwsContext *c)
|
||||
return yuv420_abgr32_lasx;
|
||||
}
|
||||
}
|
||||
#endif // #if HAVE_LASX
|
||||
return NULL;
|
||||
}
|
||||
|
@ -24,7 +24,45 @@
|
||||
|
||||
#include "libswscale/swscale.h"
|
||||
#include "libswscale/swscale_internal.h"
|
||||
#include "config.h"
|
||||
|
||||
void ff_hscale_8_to_15_lsx(SwsContext *c, int16_t *dst, int dstW,
|
||||
const uint8_t *src, const int16_t *filter,
|
||||
const int32_t *filterPos, int filterSize);
|
||||
|
||||
void ff_hscale_8_to_19_lsx(SwsContext *c, int16_t *_dst, int dstW,
|
||||
const uint8_t *src, const int16_t *filter,
|
||||
const int32_t *filterPos, int filterSize);
|
||||
|
||||
void ff_hscale_16_to_15_lsx(SwsContext *c, int16_t *_dst, int dstW,
|
||||
const uint8_t *_src, const int16_t *filter,
|
||||
const int32_t *filterPos, int filterSize);
|
||||
|
||||
void ff_hscale_16_to_15_sub_lsx(SwsContext *c, int16_t *_dst, int dstW,
|
||||
const uint8_t *_src, const int16_t *filter,
|
||||
const int32_t *filterPos, int filterSize, int sh);
|
||||
|
||||
void ff_hscale_16_to_19_lsx(SwsContext *c, int16_t *_dst, int dstW,
|
||||
const uint8_t *_src, const int16_t *filter,
|
||||
const int32_t *filterPos, int filterSize);
|
||||
|
||||
void ff_hscale_16_to_19_sub_lsx(SwsContext *c, int16_t *_dst, int dstW,
|
||||
const uint8_t *_src, const int16_t *filter,
|
||||
const int32_t *filterPos, int filterSize, int sh);
|
||||
|
||||
void planar_rgb_to_uv_lsx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4],
|
||||
int width, int32_t *rgb2yuv, void *opq);
|
||||
|
||||
void planar_rgb_to_y_lsx(uint8_t *_dst, const uint8_t *src[4], int width,
|
||||
int32_t *rgb2yuv, void *opq);
|
||||
|
||||
void ff_yuv2planeX_8_lsx(const int16_t *filter, int filterSize,
|
||||
const int16_t **src, uint8_t *dest, int dstW,
|
||||
const uint8_t *dither, int offset);
|
||||
|
||||
av_cold void ff_sws_init_output_lsx(SwsContext *c);
|
||||
|
||||
#if HAVE_LASX
|
||||
void ff_hscale_8_to_15_lasx(SwsContext *c, int16_t *dst, int dstW,
|
||||
const uint8_t *src, const int16_t *filter,
|
||||
const int32_t *filterPos, int filterSize);
|
||||
@ -69,10 +107,11 @@ void ff_interleave_bytes_lasx(const uint8_t *src1, const uint8_t *src2,
|
||||
uint8_t *dest, int width, int height,
|
||||
int src1Stride, int src2Stride, int dstStride);
|
||||
|
||||
av_cold void ff_sws_init_output_loongarch(SwsContext *c);
|
||||
|
||||
void ff_yuv2planeX_8_lasx(const int16_t *filter, int filterSize,
|
||||
const int16_t **src, uint8_t *dest, int dstW,
|
||||
const uint8_t *dither, int offset);
|
||||
|
||||
av_cold void ff_sws_init_output_lasx(SwsContext *c);
|
||||
#endif // #if HAVE_LASX
|
||||
|
||||
#endif /* SWSCALE_LOONGARCH_SWSCALE_LOONGARCH_H */
|
||||
|
57
libswscale/loongarch/swscale_lsx.c
Normal file
57
libswscale/loongarch/swscale_lsx.c
Normal file
@ -0,0 +1,57 @@
|
||||
/*
|
||||
* Loongson LSX optimized swscale
|
||||
*
|
||||
* Copyright (c) 2023 Loongson Technology Corporation Limited
|
||||
* Contributed by Lu Wang <wanglu@loongson.cn>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "swscale_loongarch.h"
|
||||
|
||||
/**
 * Entry point for horizontal scaling of 16-bit-per-component input.
 *
 * Derives the shift amount from the source pixel format and forwards
 * everything to ff_hscale_16_to_15_sub_lsx(), which is defined elsewhere.
 *
 * Shift selection:
 *  - component depth below 16: 13 for RGB-like or PAL8 sources,
 *    otherwise depth - 1;
 *  - depth of 16 or more on a float format: 15;
 *  - anything else: depth - 1.
 *
 * @param c          scaler context; only c->srcFormat is read here
 * @param _dst       destination buffer, passed through unchanged
 * @param dstW       passed through unchanged
 * @param _src       source buffer, passed through unchanged
 * @param filter     passed through unchanged
 * @param filterPos  passed through unchanged
 * @param filterSize passed through unchanged
 */
void ff_hscale_16_to_15_lsx(SwsContext *c, int16_t *_dst, int dstW,
                            const uint8_t *_src, const int16_t *filter,
                            const int32_t *filterPos, int filterSize)
{
    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->srcFormat);
    int sh = desc->comp[0].depth - 1;

    if (sh < 15) {
        sh = isAnyRGB(c->srcFormat) || c->srcFormat == AV_PIX_FMT_PAL8 ? 13 :
                      (desc->comp[0].depth - 1);
    } else if (desc->flags & AV_PIX_FMT_FLAG_FLOAT) {
        /* Bit test on the flag mask: a logical AND here would be true for
         * every format that has any flag set, not just float formats. */
        sh = 15;
    }
    ff_hscale_16_to_15_sub_lsx(c, _dst, dstW, _src, filter, filterPos, filterSize, sh);
}
|
||||
|
||||
/**
 * Entry point for horizontal scaling of 16-bit-per-component input.
 *
 * Derives the shift amount from the source pixel format and forwards
 * everything to ff_hscale_16_to_19_sub_lsx(), which is defined elsewhere.
 *
 * Shift selection:
 *  - RGB-like or PAL8 source with component depth below 16: 9;
 *  - otherwise, float format: 16 - 1 - 4 (handled like 16-bit unsigned);
 *  - anything else: depth - 1 - 4.
 *
 * @param c          scaler context; only c->srcFormat is read here
 * @param _dst       destination buffer, passed through unchanged
 * @param dstW       passed through unchanged
 * @param _src       source buffer, passed through unchanged
 * @param filter     passed through unchanged
 * @param filterPos  passed through unchanged
 * @param filterSize passed through unchanged
 */
void ff_hscale_16_to_19_lsx(SwsContext *c, int16_t *_dst, int dstW,
                            const uint8_t *_src, const int16_t *filter,
                            const int32_t *filterPos, int filterSize)
{
    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->srcFormat);
    const int depth    = desc->comp[0].depth;
    const int rgb_like = isAnyRGB(c->srcFormat) || c->srcFormat == AV_PIX_FMT_PAL8;
    int sh;

    if (rgb_like && depth < 16)
        sh = 9;
    else if (desc->flags & AV_PIX_FMT_FLAG_FLOAT)
        sh = 16 - 1 - 4; /* float input is processed like 16-bit unsigned */
    else
        sh = depth - 1 - 4;

    ff_hscale_16_to_19_sub_lsx(c, _dst, dstW, _src, filter, filterPos, filterSize, sh);
}
|
@ -653,7 +653,7 @@ static av_cold int initFilter(int16_t **outFilter, int32_t **filterPos,
|
||||
filterAlign = 1;
|
||||
}
|
||||
|
||||
if (have_lasx(cpu_flags)) {
|
||||
if (have_lasx(cpu_flags) || have_lsx(cpu_flags)) {
|
||||
int reNum = minFilterSize & (0x07);
|
||||
|
||||
if (minFilterSize < 5)
|
||||
@ -1806,6 +1806,7 @@ static av_cold int sws_init_single_context(SwsContext *c, SwsFilter *srcFilter,
|
||||
const int filterAlign = X86_MMX(cpu_flags) ? 4 :
|
||||
PPC_ALTIVEC(cpu_flags) ? 8 :
|
||||
have_neon(cpu_flags) ? 4 :
|
||||
have_lsx(cpu_flags) ? 8 :
|
||||
have_lasx(cpu_flags) ? 8 : 1;
|
||||
|
||||
if ((ret = initFilter(&c->hLumFilter, &c->hLumFilterPos,
|
||||
|
Loading…
x
Reference in New Issue
Block a user