avcodec/la: Add LSX optimization for h264 chroma and intrapred.
./configure --disable-lasx
ffmpeg -i 1_h264_1080p_30fps_3Mbps.mp4 -f rawvideo -y /dev/null -an
before: 199fps
after:  214fps

Reviewed-by: Shiyou Yin <yinshiyou-hf@loongson.cn>
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
commit 8815a7719e (parent 7845b5ecd6), committed by Michael Niedermayer
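For context: the new h264chroma.S routines in this patch vectorize FFmpeg's standard bilinear chroma interpolation. A minimal scalar sketch of the put variant follows; the function and variable names are illustrative only, and A/B/C/D are the weights the assembly derives from the x/y sub-pel offsets (A + B + C + D == 64):

/* illustrative only; assumes <stdint.h> and <stddef.h> */
static void put_chroma_mc8_ref(uint8_t *dst, const uint8_t *src,
                               ptrdiff_t stride, int h, int x, int y)
{
    /* bilinear weights derived from the sub-pel position */
    const int A = (8 - x) * (8 - y);
    const int B = x * (8 - y);
    const int C = (8 - x) * y;
    const int D = x * y;

    for (int i = 0; i < h; i++) {
        for (int j = 0; j < 8; j++)
            dst[j] = (A * src[j]          + B * src[j + 1] +
                      C * src[j + stride] + D * src[j + stride + 1] + 32) >> 6;
        dst += stride;
        src += stride;
    }
}

Roughly speaking, the assembly splits this into three paths: a full 2-D filter when D != 0, a single-direction filter driven by E = B + C when only one of the offsets is non-zero, and a plain rounded copy when both x and y are zero.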
@@ -9,11 +9,9 @@ OBJS-$(CONFIG_HPELDSP) += loongarch/hpeldsp_init_loongarch.o
OBJS-$(CONFIG_IDCTDSP) += loongarch/idctdsp_init_loongarch.o
OBJS-$(CONFIG_VIDEODSP) += loongarch/videodsp_init.o
OBJS-$(CONFIG_HEVC_DECODER) += loongarch/hevcdsp_init_loongarch.o
LASX-OBJS-$(CONFIG_H264CHROMA) += loongarch/h264chroma_lasx.o
LASX-OBJS-$(CONFIG_H264QPEL) += loongarch/h264qpel_lasx.o
LASX-OBJS-$(CONFIG_H264DSP) += loongarch/h264dsp_lasx.o \
                               loongarch/h264_deblock_lasx.o
LASX-OBJS-$(CONFIG_H264PRED) += loongarch/h264_intrapred_lasx.o
LASX-OBJS-$(CONFIG_VC1_DECODER) += loongarch/vc1dsp_lasx.o
LASX-OBJS-$(CONFIG_HPELDSP) += loongarch/hpeldsp_lasx.o
LASX-OBJS-$(CONFIG_IDCTDSP) += loongarch/simple_idct_lasx.o \
@@ -33,3 +31,5 @@ LSX-OBJS-$(CONFIG_HEVC_DECODER) += loongarch/hevcdsp_lsx.o \
LSX-OBJS-$(CONFIG_H264DSP) += loongarch/h264idct.o \
                              loongarch/h264idct_loongarch.o \
                              loongarch/h264dsp.o
LSX-OBJS-$(CONFIG_H264CHROMA) += loongarch/h264chroma.o
LSX-OBJS-$(CONFIG_H264PRED) += loongarch/h264intrapred.o
@@ -21,7 +21,7 @@

#include "libavutil/loongarch/cpu.h"
#include "libavcodec/h264pred.h"
#include "h264_intrapred_lasx.h"
#include "h264_intrapred_loongarch.h"

av_cold void ff_h264_pred_init_loongarch(H264PredContext *h, int codec_id,
                                         const int bit_depth,
@@ -30,6 +30,22 @@ av_cold void ff_h264_pred_init_loongarch(H264PredContext *h, int codec_id,
    int cpu_flags = av_get_cpu_flags();

    if (bit_depth == 8) {
        if (have_lsx(cpu_flags)) {
            if (chroma_format_idc <= 1) {
            }
            if (codec_id == AV_CODEC_ID_VP7 || codec_id == AV_CODEC_ID_VP8) {
            } else {
                if (chroma_format_idc <= 1) {
                }
                if (codec_id == AV_CODEC_ID_SVQ3) {
                    h->pred16x16[PLANE_PRED8x8] = ff_h264_pred16x16_plane_svq3_8_lsx;
                } else if (codec_id == AV_CODEC_ID_RV40) {
                    h->pred16x16[PLANE_PRED8x8] = ff_h264_pred16x16_plane_rv40_8_lsx;
                } else {
                    h->pred16x16[PLANE_PRED8x8] = ff_h264_pred16x16_plane_h264_8_lsx;
                }
            }
        }
        if (have_lasx(cpu_flags)) {
            if (chroma_format_idc <= 1) {
            }
@@ -1,121 +0,0 @@
/*
 * Copyright (c) 2021 Loongson Technology Corporation Limited
 * Contributed by Hao Chen <chenhao@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/loongarch/loongson_intrinsics.h"
#include "h264_intrapred_lasx.h"

#define PRED16X16_PLANE \
    ptrdiff_t stride_1, stride_2, stride_3, stride_4, stride_5, stride_6; \
    ptrdiff_t stride_8, stride_15; \
    int32_t res0, res1, res2, res3, cnt; \
    uint8_t *src0, *src1; \
    __m256i reg0, reg1, reg2, reg3, reg4; \
    __m256i tmp0, tmp1, tmp2, tmp3; \
    __m256i shuff = {0x0B040A0509060807, 0x0F000E010D020C03, 0, 0}; \
    __m256i mult = {0x0004000300020001, 0x0008000700060005, 0, 0}; \
    __m256i int_mult1 = {0x0000000100000000, 0x0000000300000002, \
                         0x0000000500000004, 0x0000000700000006}; \
    \
    stride_1 = -stride; \
    stride_2 = stride << 1; \
    stride_3 = stride_2 + stride; \
    stride_4 = stride_2 << 1; \
    stride_5 = stride_4 + stride; \
    stride_6 = stride_3 << 1; \
    stride_8 = stride_4 << 1; \
    stride_15 = (stride_8 << 1) - stride; \
    src0 = src - 1; \
    src1 = src0 + stride_8; \
    \
    reg0 = __lasx_xvldx(src0, -stride); \
    reg1 = __lasx_xvldx(src, (8 - stride)); \
    reg0 = __lasx_xvilvl_d(reg1, reg0); \
    reg0 = __lasx_xvshuf_b(reg0, reg0, shuff); \
    reg0 = __lasx_xvhsubw_hu_bu(reg0, reg0); \
    reg0 = __lasx_xvmul_h(reg0, mult); \
    res1 = (src1[0] - src0[stride_6]) + \
           2 * (src1[stride] - src0[stride_5]) + \
           3 * (src1[stride_2] - src0[stride_4]) + \
           4 * (src1[stride_3] - src0[stride_3]) + \
           5 * (src1[stride_4] - src0[stride_2]) + \
           6 * (src1[stride_5] - src0[stride]) + \
           7 * (src1[stride_6] - src0[0]) + \
           8 * (src0[stride_15] - src0[stride_1]); \
    reg0 = __lasx_xvhaddw_w_h(reg0, reg0); \
    reg0 = __lasx_xvhaddw_d_w(reg0, reg0); \
    reg0 = __lasx_xvhaddw_q_d(reg0, reg0); \
    res0 = __lasx_xvpickve2gr_w(reg0, 0);

#define PRED16X16_PLANE_END \
    res2 = (src0[stride_15] + src[15 - stride] + 1) << 4; \
    res3 = 7 * (res0 + res1); \
    res2 -= res3; \
    reg0 = __lasx_xvreplgr2vr_w(res0); \
    reg1 = __lasx_xvreplgr2vr_w(res1); \
    reg2 = __lasx_xvreplgr2vr_w(res2); \
    reg3 = __lasx_xvmul_w(reg0, int_mult1); \
    reg4 = __lasx_xvslli_w(reg0, 3); \
    reg4 = __lasx_xvadd_w(reg4, reg3); \
    for (cnt = 8; cnt--;) { \
        tmp0 = __lasx_xvadd_w(reg2, reg3); \
        tmp1 = __lasx_xvadd_w(reg2, reg4); \
        tmp0 = __lasx_xvssrani_hu_w(tmp1, tmp0, 5); \
        tmp0 = __lasx_xvpermi_d(tmp0, 0xD8); \
        reg2 = __lasx_xvadd_w(reg2, reg1); \
        tmp2 = __lasx_xvadd_w(reg2, reg3); \
        tmp3 = __lasx_xvadd_w(reg2, reg4); \
        tmp1 = __lasx_xvssrani_hu_w(tmp3, tmp2, 5); \
        tmp1 = __lasx_xvpermi_d(tmp1, 0xD8); \
        tmp0 = __lasx_xvssrani_bu_h(tmp1, tmp0, 0); \
        reg2 = __lasx_xvadd_w(reg2, reg1); \
        __lasx_xvstelm_d(tmp0, src, 0, 0); \
        __lasx_xvstelm_d(tmp0, src, 8, 2); \
        src += stride; \
        __lasx_xvstelm_d(tmp0, src, 0, 1); \
        __lasx_xvstelm_d(tmp0, src, 8, 3); \
        src += stride; \
    }

void ff_h264_pred16x16_plane_h264_8_lasx(uint8_t *src, ptrdiff_t stride)
{
    PRED16X16_PLANE
    res0 = (5 * res0 + 32) >> 6;
    res1 = (5 * res1 + 32) >> 6;
    PRED16X16_PLANE_END
}

void ff_h264_pred16x16_plane_rv40_8_lasx(uint8_t *src, ptrdiff_t stride)
{
    PRED16X16_PLANE
    res0 = (res0 + (res0 >> 2)) >> 4;
    res1 = (res1 + (res1 >> 2)) >> 4;
    PRED16X16_PLANE_END
}

void ff_h264_pred16x16_plane_svq3_8_lasx(uint8_t *src, ptrdiff_t stride)
{
    PRED16X16_PLANE
    cnt  = (5 * (res0 / 4)) / 16;
    res0 = (5 * (res1 / 4)) / 16;
    res1 = cnt;
    PRED16X16_PLANE_END
}
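Both the LASX intrinsics removed above and the new h264intrapred.S code later in this patch implement H.264 16x16 plane prediction. A scalar sketch of the H.264 flavour is shown below; the names are illustrative, av_clip_uint8() comes from libavutil/common.h, and res0/res1 in the macro above play the role of H/V here. The RV40 and SVQ3 variants differ only in how H and V are scaled.

/* illustrative only; assumes <stdint.h>, <stddef.h> and "libavutil/common.h" */
static void pred16x16_plane_ref(uint8_t *src, ptrdiff_t stride)
{
    const uint8_t *top  = src + 7 - stride;      /* row above the block, centred on x = 7 */
    const uint8_t *left = src - 1 + 7 * stride;  /* column left of the block, centred on y = 7 */
    int H = 0, V = 0;

    /* weighted gradients along the top row and left column */
    for (int k = 1; k <= 8; k++) {
        H += k * (top[k] - top[-k]);
        V += k * (left[k * stride] - left[-k * stride]);
    }
    /* H.264 scaling (RV40: (X + (X >> 2)) >> 4, SVQ3: (5 * (X / 4)) / 16 with H/V swapped) */
    H = (5 * H + 32) >> 6;
    V = (5 * V + 32) >> 6;

    /* DC anchor from the two opposite corner neighbours */
    int a = 16 * (src[15 * stride - 1] + src[15 - stride] + 1) - 7 * (H + V);
    for (int j = 0; j < 16; j++, src += stride, a += V) {
        int b = a;
        for (int i = 0; i < 16; i++, b += H)
            src[i] = av_clip_uint8(b >> 5);
    }
}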
@@ -1,5 +1,5 @@
/*
 * Copyright (c) 2021 Loongson Technology Corporation Limited
 * Copyright (c) 2023 Loongson Technology Corporation Limited
 * Contributed by Hao Chen <chenhao@loongson.cn>
 *
 * This file is part of FFmpeg.
@@ -19,13 +19,17 @@
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_LOONGARCH_H264_INTRAPRED_LASX_H
#define AVCODEC_LOONGARCH_H264_INTRAPRED_LASX_H
#ifndef AVCODEC_LOONGARCH_H264_INTRAPRED_LOONGARCH_H
#define AVCODEC_LOONGARCH_H264_INTRAPRED_LOONGARCH_H

#include "libavcodec/avcodec.h"

void ff_h264_pred16x16_plane_h264_8_lsx(uint8_t *src, ptrdiff_t stride);
void ff_h264_pred16x16_plane_rv40_8_lsx(uint8_t *src, ptrdiff_t stride);
void ff_h264_pred16x16_plane_svq3_8_lsx(uint8_t *src, ptrdiff_t stride);

void ff_h264_pred16x16_plane_h264_8_lasx(uint8_t *src, ptrdiff_t stride);
void ff_h264_pred16x16_plane_rv40_8_lasx(uint8_t *src, ptrdiff_t stride);
void ff_h264_pred16x16_plane_svq3_8_lasx(uint8_t *src, ptrdiff_t stride);

#endif // #ifndef AVCODEC_LOONGARCH_H264_INTRAPRED_LASX_H
#endif // #ifndef AVCODEC_LOONGARCH_H264_INTRAPRED_LOONGARCH_H
libavcodec/loongarch/h264chroma.S (new file, 966 lines)
@@ -0,0 +1,966 @@
|
||||
/*
|
||||
* Loongson LSX/LASX optimized h264chroma
|
||||
*
|
||||
* Copyright (c) 2023 Loongson Technology Corporation Limited
|
||||
* Contributed by Lu Wang <wanglu@loongson.cn>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "loongson_asm.S"
|
||||
|
||||
/* void ff_put_h264_chroma_mc8_lsx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
|
||||
int h, int x, int y) */
|
||||
function ff_put_h264_chroma_mc8_lsx
|
||||
li.d t8, 8
|
||||
sub.d t1, t8, a4 // 8-x
|
||||
sub.d t2, t8, a5 // 8-y
|
||||
mul.d t3, t1, t2 // A
|
||||
mul.d t4, a4, t2 // B
|
||||
mul.d t5, t1, a5 // C
|
||||
mul.d t6, a4, a5 // D
|
||||
add.d t0, t4, t5 // E
|
||||
vreplgr2vr.b vr0, t3
|
||||
vreplgr2vr.b vr1, t4
|
||||
vreplgr2vr.b vr2, t5
|
||||
vreplgr2vr.b vr3, t6
|
||||
vreplgr2vr.b vr4, t0
|
||||
slli.d t2, a2, 1
|
||||
add.d t3, t2, a2
|
||||
slli.d t4, a2, 2
|
||||
|
||||
bge zero, t6, .ENDLOOP_D
|
||||
move t1, a3
|
||||
vilvl.b vr9, vr1, vr0
|
||||
vilvl.b vr10, vr3, vr2
|
||||
.LOOP_D:
|
||||
vld vr5, a1, 0
|
||||
vld vr6, a1, 1
|
||||
add.d a1, a1, a2
|
||||
vld vr7, a1, 0
|
||||
vld vr8, a1, 1
|
||||
vilvl.b vr11, vr6, vr5
|
||||
vilvl.b vr12, vr8, vr7
|
||||
vmulwev.h.bu vr13, vr9, vr11
|
||||
vmaddwod.h.bu vr13, vr9, vr11
|
||||
vmulwev.h.bu vr14, vr10, vr12
|
||||
vmaddwod.h.bu vr14, vr10, vr12
|
||||
vadd.h vr13, vr13, vr14
|
||||
vsrarni.b.h vr13, vr13, 6
|
||||
vstelm.d vr13, a0, 0, 0
|
||||
add.d a0, a0, a2
|
||||
add.d a1, a1, a2
|
||||
vld vr5, a1, 0
|
||||
vld vr6, a1, 1
|
||||
vilvl.b vr11, vr8, vr7
|
||||
vilvl.b vr12, vr6, vr5
|
||||
vmulwev.h.bu vr13, vr9, vr11
|
||||
vmaddwod.h.bu vr13, vr9, vr11
|
||||
vmulwev.h.bu vr14, vr10, vr12
|
||||
vmaddwod.h.bu vr14, vr10, vr12
|
||||
vadd.h vr13, vr13, vr14
|
||||
vsrarni.b.h vr13, vr13, 6
|
||||
vstelm.d vr13, a0, 0, 0
|
||||
add.d a0, a0, a2
|
||||
add.d a1, a1, a2
|
||||
vld vr7, a1, 0
|
||||
vld vr8, a1, 1
|
||||
vilvl.b vr11, vr6, vr5
|
||||
vilvl.b vr12, vr8, vr7
|
||||
vmulwev.h.bu vr13, vr9, vr11
|
||||
vmaddwod.h.bu vr13, vr9, vr11
|
||||
vmulwev.h.bu vr14, vr10, vr12
|
||||
vmaddwod.h.bu vr14, vr10, vr12
|
||||
vadd.h vr13, vr13, vr14
|
||||
vsrarni.b.h vr13, vr13, 6
|
||||
vstelm.d vr13, a0, 0, 0
|
||||
add.d a0, a0, a2
|
||||
add.d a1, a1, a2
|
||||
vld vr5, a1, 0
|
||||
vld vr6, a1, 1
|
||||
vilvl.b vr11, vr8, vr7
|
||||
vilvl.b vr12, vr6, vr5
|
||||
vmulwev.h.bu vr13, vr9, vr11
|
||||
vmaddwod.h.bu vr13, vr9, vr11
|
||||
vmulwev.h.bu vr14, vr10, vr12
|
||||
vmaddwod.h.bu vr14, vr10, vr12
|
||||
vadd.h vr13, vr13, vr14
|
||||
vsrarni.b.h vr13, vr13, 6
|
||||
vstelm.d vr13, a0, 0, 0
|
||||
add.d a0, a0, a2
|
||||
|
||||
addi.d t1, t1, -4
|
||||
blt zero, t1, .LOOP_D
|
||||
b .ENDLOOP
|
||||
.ENDLOOP_D:
|
||||
|
||||
bge zero, t0, .ENDLOOP_E
|
||||
move t1, a3
|
||||
li.d t7, 1
|
||||
slt t8, zero, t5
|
||||
maskeqz t5, a2, t8
|
||||
masknez t7, t7, t8
|
||||
or t7, t7, t5
|
||||
vilvl.b vr7, vr4, vr0
|
||||
.LOOP_E:
|
||||
vld vr5, a1, 0
|
||||
vldx vr6, a1, t7
|
||||
vilvl.b vr5, vr6, vr5
|
||||
vmulwev.h.bu vr6, vr7, vr5
|
||||
vmaddwod.h.bu vr6, vr7, vr5
|
||||
vsrarni.b.h vr6, vr6, 6
|
||||
vstelm.d vr6, a0, 0, 0
|
||||
add.d a0, a0, a2
|
||||
add.d a1, a1, a2
|
||||
vld vr5, a1, 0
|
||||
vldx vr6, a1, t7
|
||||
vilvl.b vr5, vr6, vr5
|
||||
vmulwev.h.bu vr6, vr7, vr5
|
||||
vmaddwod.h.bu vr6, vr7, vr5
|
||||
vsrarni.b.h vr6, vr6, 6
|
||||
vstelm.d vr6, a0, 0, 0
|
||||
add.d a0, a0, a2
|
||||
add.d a1, a1, a2
|
||||
vld vr5, a1, 0
|
||||
vldx vr6, a1, t7
|
||||
vilvl.b vr5, vr6, vr5
|
||||
vmulwev.h.bu vr6, vr7, vr5
|
||||
vmaddwod.h.bu vr6, vr7, vr5
|
||||
vsrarni.b.h vr6, vr6, 6
|
||||
vstelm.d vr6, a0, 0, 0
|
||||
add.d a0, a0, a2
|
||||
add.d a1, a1, a2
|
||||
vld vr5, a1, 0
|
||||
vldx vr6, a1, t7
|
||||
vilvl.b vr5, vr6, vr5
|
||||
vmulwev.h.bu vr6, vr7, vr5
|
||||
vmaddwod.h.bu vr6, vr7, vr5
|
||||
vsrarni.b.h vr6, vr6, 6
|
||||
vstelm.d vr6, a0, 0, 0
|
||||
add.d a0, a0, a2
|
||||
add.d a1, a1, a2
|
||||
|
||||
addi.d t1, t1, -4
|
||||
blt zero, t1, .LOOP_E
|
||||
b .ENDLOOP
|
||||
.ENDLOOP_E:
|
||||
|
||||
move t1, a3
|
||||
.LOOP:
|
||||
vld vr5, a1, 0
|
||||
vmulwev.h.bu vr6, vr0, vr5
|
||||
vmulwod.h.bu vr7, vr0, vr5
|
||||
vsrarni.b.h vr6, vr6, 6
|
||||
vsrarni.b.h vr7, vr7, 6
|
||||
vilvl.b vr6, vr7, vr6
|
||||
vstelm.d vr6, a0, 0, 0
|
||||
add.d a0, a0, a2
|
||||
vldx vr5, a1, a2
|
||||
vmulwev.h.bu vr6, vr0, vr5
|
||||
vmulwod.h.bu vr7, vr0, vr5
|
||||
vsrarni.b.h vr6, vr6, 6
|
||||
vsrarni.b.h vr7, vr7, 6
|
||||
vilvl.b vr6, vr7, vr6
|
||||
vstelm.d vr6, a0, 0, 0
|
||||
add.d a0, a0, a2
|
||||
vldx vr5, a1, t2
|
||||
vmulwev.h.bu vr6, vr0, vr5
|
||||
vmulwod.h.bu vr7, vr0, vr5
|
||||
vsrarni.b.h vr6, vr6, 6
|
||||
vsrarni.b.h vr7, vr7, 6
|
||||
vilvl.b vr6, vr7, vr6
|
||||
vstelm.d vr6, a0, 0, 0
|
||||
add.d a0, a0, a2
|
||||
vldx vr5, a1, t3
|
||||
vmulwev.h.bu vr6, vr0, vr5
|
||||
vmulwod.h.bu vr7, vr0, vr5
|
||||
vsrarni.b.h vr6, vr6, 6
|
||||
vsrarni.b.h vr7, vr7, 6
|
||||
vilvl.b vr6, vr7, vr6
|
||||
vstelm.d vr6, a0, 0, 0
|
||||
add.d a0, a0, a2
|
||||
add.d a1, a1, t4
|
||||
|
||||
addi.d t1, t1, -4
|
||||
blt zero, t1, .LOOP
|
||||
.ENDLOOP:
|
||||
endfunc
|
||||
|
||||
/* void ff_avg_h264_chroma_mc8_lsx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
|
||||
int h, int x, int y) */
|
||||
function ff_avg_h264_chroma_mc8_lsx
|
||||
li.d t8, 8
|
||||
sub.d t1, t8, a4 // 8-x
|
||||
sub.d t2, t8, a5 // 8-y
|
||||
mul.d t3, t1, t2 // A
|
||||
mul.d t4, a4, t2 // B
|
||||
mul.d t5, t1, a5 // C
|
||||
mul.d t6, a4, a5 // D
|
||||
add.d t0, t4, t5 // E
|
||||
vreplgr2vr.b vr0, t3
|
||||
vreplgr2vr.b vr1, t4
|
||||
vreplgr2vr.b vr2, t5
|
||||
vreplgr2vr.b vr3, t6
|
||||
vreplgr2vr.b vr4, t0
|
||||
slli.d t2, a2, 1
|
||||
add.d t3, t2, a2
|
||||
slli.d t4, a2, 2
|
||||
|
||||
bge zero, t6, .ENDLOOPD
|
||||
move t1, a3
|
||||
vilvl.b vr9, vr1, vr0
|
||||
vilvl.b vr10, vr3, vr2
|
||||
.LOOPD:
|
||||
vld vr5, a1, 0
|
||||
vld vr6, a1, 1
|
||||
add.d a1, a1, a2
|
||||
vld vr7, a1, 0
|
||||
vld vr8, a1, 1
|
||||
vld vr11, a0, 0
|
||||
vilvl.b vr12, vr6, vr5
|
||||
vilvl.b vr13, vr8, vr7
|
||||
vmulwev.h.bu vr14, vr9, vr12
|
||||
vmaddwod.h.bu vr14, vr9, vr12
|
||||
vmulwev.h.bu vr15, vr10, vr13
|
||||
vmaddwod.h.bu vr15, vr10, vr13
|
||||
vadd.h vr14, vr14, vr15
|
||||
vsrari.h vr14, vr14, 6
|
||||
vsllwil.hu.bu vr11, vr11, 0
|
||||
vadd.h vr11, vr14, vr11
|
||||
vsrarni.b.h vr11, vr11, 1
|
||||
vstelm.d vr11, a0, 0, 0
|
||||
add.d a0, a0, a2
|
||||
add.d a1, a1, a2
|
||||
vld vr5, a1, 0
|
||||
vld vr6, a1, 1
|
||||
vld vr11, a0, 0
|
||||
vilvl.b vr12, vr8, vr7
|
||||
vilvl.b vr13, vr6, vr5
|
||||
vmulwev.h.bu vr14, vr9, vr12
|
||||
vmaddwod.h.bu vr14, vr9, vr12
|
||||
vmulwev.h.bu vr15, vr10, vr13
|
||||
vmaddwod.h.bu vr15, vr10, vr13
|
||||
vadd.h vr14, vr14, vr15
|
||||
vsrari.h vr14, vr14, 6
|
||||
vsllwil.hu.bu vr11, vr11, 0
|
||||
vadd.h vr11, vr14, vr11
|
||||
vsrarni.b.h vr11, vr11, 1
|
||||
vstelm.d vr11, a0, 0, 0
|
||||
add.d a0, a0, a2
|
||||
add.d a1, a1, a2
|
||||
vld vr7, a1, 0
|
||||
vld vr8, a1, 1
|
||||
vld vr11, a0, 0
|
||||
vilvl.b vr12, vr6, vr5
|
||||
vilvl.b vr13, vr8, vr7
|
||||
vmulwev.h.bu vr14, vr9, vr12
|
||||
vmaddwod.h.bu vr14, vr9, vr12
|
||||
vmulwev.h.bu vr15, vr10, vr13
|
||||
vmaddwod.h.bu vr15, vr10, vr13
|
||||
vadd.h vr14, vr14, vr15
|
||||
vsrari.h vr14, vr14, 6
|
||||
vsllwil.hu.bu vr11, vr11, 0
|
||||
vadd.h vr11, vr14, vr11
|
||||
vsrarni.b.h vr11, vr11, 1
|
||||
vstelm.d vr11, a0, 0, 0
|
||||
add.d a0, a0, a2
|
||||
add.d a1, a1, a2
|
||||
vld vr5, a1, 0
|
||||
vld vr6, a1, 1
|
||||
vld vr11, a0, 0
|
||||
vilvl.b vr12, vr8, vr7
|
||||
vilvl.b vr13, vr6, vr5
|
||||
vmulwev.h.bu vr14, vr9, vr12
|
||||
vmaddwod.h.bu vr14, vr9, vr12
|
||||
vmulwev.h.bu vr15, vr10, vr13
|
||||
vmaddwod.h.bu vr15, vr10, vr13
|
||||
vadd.h vr14, vr14, vr15
|
||||
vsrari.h vr14, vr14, 6
|
||||
vsllwil.hu.bu vr11, vr11, 0
|
||||
vadd.h vr11, vr14, vr11
|
||||
vsrarni.b.h vr11, vr11, 1
|
||||
vstelm.d vr11, a0, 0, 0
|
||||
add.d a0, a0, a2
|
||||
|
||||
addi.d t1, t1, -4
|
||||
blt zero, t1, .LOOPD
|
||||
b .ENDLOOPELSE
|
||||
.ENDLOOPD:
|
||||
|
||||
bge zero, t0, .ENDLOOPE
|
||||
move t1, a3
|
||||
li.d t7, 1
|
||||
slt t8, zero, t5
|
||||
maskeqz t5, a2, t8
|
||||
masknez t7, t7, t8
|
||||
or t7, t7, t5
|
||||
vilvl.b vr7, vr4, vr0
|
||||
.LOOPE:
|
||||
vld vr5, a1, 0
|
||||
vldx vr6, a1, t7
|
||||
vld vr8, a0, 0
|
||||
vilvl.b vr5, vr6, vr5
|
||||
vmulwev.h.bu vr6, vr7, vr5
|
||||
vmaddwod.h.bu vr6, vr7, vr5
|
||||
vsrari.h vr6, vr6, 6
|
||||
vsllwil.hu.bu vr8, vr8, 0
|
||||
vadd.h vr8, vr6, vr8
|
||||
vsrarni.b.h vr8, vr8, 1
|
||||
vstelm.d vr8, a0, 0, 0
|
||||
add.d a0, a0, a2
|
||||
add.d a1, a1, a2
|
||||
vld vr5, a1, 0
|
||||
vldx vr6, a1, t7
|
||||
vld vr8, a0, 0
|
||||
vilvl.b vr5, vr6, vr5
|
||||
vmulwev.h.bu vr6, vr7, vr5
|
||||
vmaddwod.h.bu vr6, vr7, vr5
|
||||
vsrari.h vr6, vr6, 6
|
||||
vsllwil.hu.bu vr8, vr8, 0
|
||||
vadd.h vr8, vr6, vr8
|
||||
vsrarni.b.h vr8, vr8, 1
|
||||
vstelm.d vr8, a0, 0, 0
|
||||
add.d a0, a0, a2
|
||||
add.d a1, a1, a2
|
||||
vld vr5, a1, 0
|
||||
vldx vr6, a1, t7
|
||||
vld vr8, a0, 0
|
||||
vilvl.b vr5, vr6, vr5
|
||||
vmulwev.h.bu vr6, vr7, vr5
|
||||
vmaddwod.h.bu vr6, vr7, vr5
|
||||
vsrari.h vr6, vr6, 6
|
||||
vsllwil.hu.bu vr8, vr8, 0
|
||||
vadd.h vr8, vr6, vr8
|
||||
vsrarni.b.h vr8, vr8, 1
|
||||
vstelm.d vr8, a0, 0, 0
|
||||
add.d a0, a0, a2
|
||||
add.d a1, a1, a2
|
||||
vld vr5, a1, 0
|
||||
vldx vr6, a1, t7
|
||||
vld vr8, a0, 0
|
||||
vilvl.b vr5, vr6, vr5
|
||||
vmulwev.h.bu vr6, vr7, vr5
|
||||
vmaddwod.h.bu vr6, vr7, vr5
|
||||
vsrari.h vr6, vr6, 6
|
||||
vsllwil.hu.bu vr8, vr8, 0
|
||||
vadd.h vr8, vr6, vr8
|
||||
vsrarni.b.h vr8, vr8, 1
|
||||
vstelm.d vr8, a0, 0, 0
|
||||
add.d a0, a0, a2
|
||||
add.d a1, a1, a2
|
||||
|
||||
addi.d t1, t1, -4
|
||||
blt zero, t1, .LOOPE
|
||||
b .ENDLOOPELSE
|
||||
.ENDLOOPE:
|
||||
|
||||
move t1, a3
|
||||
.LOOPELSE:
|
||||
vld vr5, a1, 0
|
||||
vld vr8, a0, 0
|
||||
vmulwev.h.bu vr6, vr0, vr5
|
||||
vmulwod.h.bu vr7, vr0, vr5
|
||||
vilvl.h vr6, vr7, vr6
|
||||
vsrari.h vr6, vr6, 6
|
||||
vsllwil.hu.bu vr8, vr8, 0
|
||||
vadd.h vr8, vr6, vr8
|
||||
vsrarni.b.h vr8, vr8, 1
|
||||
vstelm.d vr8, a0, 0, 0
|
||||
add.d a0, a0, a2
|
||||
vldx vr5, a1, a2
|
||||
vld vr8, a0, 0
|
||||
vmulwev.h.bu vr6, vr0, vr5
|
||||
vmulwod.h.bu vr7, vr0, vr5
|
||||
vilvl.h vr6, vr7, vr6
|
||||
vsrari.h vr6, vr6, 6
|
||||
vsllwil.hu.bu vr8, vr8, 0
|
||||
vadd.h vr8, vr6, vr8
|
||||
vsrarni.b.h vr8, vr8, 1
|
||||
vstelm.d vr8, a0, 0, 0
|
||||
add.d a0, a0, a2
|
||||
vldx vr5, a1, t2
|
||||
vld vr8, a0, 0
|
||||
vmulwev.h.bu vr6, vr0, vr5
|
||||
vmulwod.h.bu vr7, vr0, vr5
|
||||
vilvl.h vr6, vr7, vr6
|
||||
vsrari.h vr6, vr6, 6
|
||||
vsllwil.hu.bu vr8, vr8, 0
|
||||
vadd.h vr8, vr6, vr8
|
||||
vsrarni.b.h vr8, vr8, 1
|
||||
vstelm.d vr8, a0, 0, 0
|
||||
add.d a0, a0, a2
|
||||
vldx vr5, a1, t3
|
||||
vld vr8, a0, 0
|
||||
vmulwev.h.bu vr6, vr0, vr5
|
||||
vmulwod.h.bu vr7, vr0, vr5
|
||||
vilvl.h vr6, vr7, vr6
|
||||
vsrari.h vr6, vr6, 6
|
||||
vsllwil.hu.bu vr8, vr8, 0
|
||||
vadd.h vr8, vr6, vr8
|
||||
vsrarni.b.h vr8, vr8, 1
|
||||
vstelm.d vr8, a0, 0, 0
|
||||
add.d a0, a0, a2
|
||||
add.d a1, a1, t4
|
||||
|
||||
addi.d t1, t1, -4
|
||||
blt zero, t1, .LOOPELSE
|
||||
.ENDLOOPELSE:
|
||||
endfunc
|
||||
|
||||
/* void ff_put_h264_chroma_mc4_lsx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
|
||||
int h, int x, int y) */
|
||||
function ff_put_h264_chroma_mc4_lsx
|
||||
li.d t8, 8
|
||||
sub.d t1, t8, a4 // 8-x
|
||||
sub.d t2, t8, a5 // 8-y
|
||||
mul.d t3, t1, t2 // A
|
||||
mul.d t4, a4, t2 // B
|
||||
mul.d t5, t1, a5 // C
|
||||
mul.d t6, a4, a5 // D
|
||||
add.d t0, t4, t5 // E
|
||||
slli.d t8, a2, 1
|
||||
vreplgr2vr.b vr0, t3
|
||||
vreplgr2vr.b vr1, t4
|
||||
vreplgr2vr.b vr2, t5
|
||||
vreplgr2vr.b vr3, t6
|
||||
vreplgr2vr.b vr4, t0
|
||||
|
||||
bge zero, t6, .ENDPUT_D
|
||||
move t1, a3
|
||||
vilvl.b vr9, vr1, vr0
|
||||
vilvl.b vr10, vr3, vr2
|
||||
.PUT_D:
|
||||
vld vr5, a1, 0
|
||||
vld vr6, a1, 1
|
||||
add.d a1, a1, a2
|
||||
vld vr7, a1, 0
|
||||
vld vr8, a1, 1
|
||||
add.d a1, a1, a2
|
||||
vld vr11, a1, 0
|
||||
vld vr12, a1, 1
|
||||
vilvl.b vr5, vr6, vr5
|
||||
vilvl.b vr7, vr8, vr7
|
||||
vilvl.b vr13, vr12, vr11
|
||||
vilvl.d vr5, vr7, vr5
|
||||
vilvl.d vr13, vr13, vr7
|
||||
vmulwev.h.bu vr14, vr9, vr5
|
||||
vmaddwod.h.bu vr14, vr9, vr5
|
||||
vmulwev.h.bu vr15, vr10, vr13
|
||||
vmaddwod.h.bu vr15, vr10, vr13
|
||||
vadd.h vr14, vr14, vr15
|
||||
vsrarni.b.h vr14, vr14, 6
|
||||
vstelm.w vr14, a0, 0, 0
|
||||
add.d a0, a0, a2
|
||||
vstelm.w vr14, a0, 0, 1
|
||||
add.d a0, a0, a2
|
||||
addi.d t1, t1, -2
|
||||
blt zero, t1, .PUT_D
|
||||
b .ENDPUT
|
||||
.ENDPUT_D:
|
||||
|
||||
bge zero, t0, .ENDPUT_E
|
||||
move t1, a3
|
||||
li.d t7, 1
|
||||
slt t8, zero, t5
|
||||
maskeqz t5, a2, t8
|
||||
masknez t7, t7, t8
|
||||
or t7, t7, t5
|
||||
vilvl.b vr7, vr4, vr0
|
||||
.PUT_E:
|
||||
vld vr5, a1, 0
|
||||
vldx vr6, a1, t7
|
||||
vilvl.b vr5, vr6, vr5
|
||||
add.d a1, a1, a2
|
||||
vld vr8, a1, 0
|
||||
vldx vr9, a1, t7
|
||||
vilvl.b vr8, vr9, vr8
|
||||
vilvl.d vr5, vr8, vr5
|
||||
vmulwev.h.bu vr6, vr7, vr5
|
||||
vmaddwod.h.bu vr6, vr7, vr5
|
||||
vsrarni.b.h vr6, vr6, 6
|
||||
vstelm.w vr6, a0, 0, 0
|
||||
add.d a0, a0, a2
|
||||
vstelm.w vr6, a0, 0, 1
|
||||
add.d a0, a0, a2
|
||||
add.d a1, a1, a2
|
||||
addi.d t1, t1, -2
|
||||
blt zero, t1, .PUT_E
|
||||
b .ENDPUT
|
||||
.ENDPUT_E:
|
||||
|
||||
move t1, a3
|
||||
.PUT:
|
||||
vld vr5, a1, 0
|
||||
vldx vr8, a1, a2
|
||||
vilvl.w vr5, vr8, vr5
|
||||
vmulwev.h.bu vr6, vr0, vr5
|
||||
vmulwod.h.bu vr7, vr0, vr5
|
||||
vsrarni.b.h vr6, vr6, 6
|
||||
vsrarni.b.h vr7, vr7, 6
|
||||
vilvl.b vr6, vr7, vr6
|
||||
vstelm.w vr6, a0, 0, 0
|
||||
add.d a0, a0, a2
|
||||
vstelm.w vr6, a0, 0, 1
|
||||
add.d a0, a0, a2
|
||||
add.d a1, a1, t8
|
||||
addi.d t1, t1, -2
|
||||
blt zero, t1, .PUT
|
||||
.ENDPUT:
|
||||
endfunc
|
||||
|
||||
/* void ff_put_h264_chroma_mc8_lasx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
|
||||
int h, int x, int y) */
|
||||
function ff_put_h264_chroma_mc8_lasx
|
||||
li.d t8, 8
|
||||
sub.d t1, t8, a4 // 8-x
|
||||
sub.d t2, t8, a5 // 8-y
|
||||
mul.d t3, t1, t2 // A
|
||||
mul.d t4, a4, t2 // B
|
||||
mul.d t5, t1, a5 // C
|
||||
mul.d t6, a4, a5 // D
|
||||
add.d t0, t4, t5 // E
|
||||
xvreplgr2vr.b xr0, t3
|
||||
xvreplgr2vr.b xr1, t4
|
||||
xvreplgr2vr.b xr2, t5
|
||||
xvreplgr2vr.b xr3, t6
|
||||
xvreplgr2vr.b xr4, t0
|
||||
slli.d t2, a2, 1
|
||||
add.d t3, t2, a2
|
||||
slli.d t4, a2, 2
|
||||
|
||||
bge zero, t6, .ENDLOOP_DA
|
||||
move t1, a3
|
||||
xvilvl.b xr9, xr1, xr0
|
||||
xvilvl.b xr10, xr3, xr2
|
||||
.LOOP_DA:
|
||||
fld.d f5, a1, 0
|
||||
fld.d f6, a1, 1
|
||||
add.d a1, a1, a2
|
||||
fld.d f7, a1, 0
|
||||
fld.d f8, a1, 1
|
||||
add.d a1, a1, a2
|
||||
fld.d f13, a1, 0
|
||||
fld.d f14, a1, 1
|
||||
add.d a1, a1, a2
|
||||
fld.d f15, a1, 0
|
||||
fld.d f16, a1, 1
|
||||
add.d a1, a1, a2
|
||||
fld.d f17, a1, 0
|
||||
fld.d f18, a1, 1
|
||||
vilvl.b vr11, vr6, vr5
|
||||
vilvl.b vr12, vr8, vr7
|
||||
vilvl.b vr14, vr14, vr13
|
||||
vilvl.b vr15, vr16, vr15
|
||||
vilvl.b vr16, vr18, vr17
|
||||
xvpermi.q xr11, xr12, 0x02
|
||||
xvpermi.q xr12, xr14, 0x02
|
||||
xvpermi.q xr14, xr15, 0x02
|
||||
xvpermi.q xr15, xr16, 0x02
|
||||
|
||||
xvmulwev.h.bu xr19, xr9, xr11
|
||||
xvmaddwod.h.bu xr19, xr9, xr11
|
||||
xvmulwev.h.bu xr20, xr10, xr12
|
||||
xvmaddwod.h.bu xr20, xr10, xr12
|
||||
xvadd.h xr21, xr19, xr20
|
||||
xvsrarni.b.h xr21, xr21, 6
|
||||
vstelm.d vr21, a0, 0, 0
|
||||
add.d a0, a0, a2
|
||||
xvstelm.d xr21, a0, 0, 2
|
||||
add.d a0, a0, a2
|
||||
xvmulwev.h.bu xr13, xr9, xr14
|
||||
xvmaddwod.h.bu xr13, xr9, xr14
|
||||
xvmulwev.h.bu xr14, xr10, xr15
|
||||
xvmaddwod.h.bu xr14, xr10, xr15
|
||||
xvadd.h xr13, xr13, xr14
|
||||
xvsrarni.b.h xr13, xr13, 6
|
||||
vstelm.d vr13, a0, 0, 0
|
||||
add.d a0, a0, a2
|
||||
xvstelm.d xr13, a0, 0, 2
|
||||
add.d a0, a0, a2
|
||||
|
||||
addi.d t1, t1, -4
|
||||
blt zero, t1, .LOOP_DA
|
||||
b .ENDLOOPA
|
||||
.ENDLOOP_DA:
|
||||
|
||||
bge zero, t0, .ENDLOOP_EA
|
||||
move t1, a3
|
||||
li.d t7, 1
|
||||
slt t8, zero, t5
|
||||
maskeqz t5, a2, t8
|
||||
masknez t7, t7, t8
|
||||
or t7, t7, t5
|
||||
xvilvl.b xr7, xr4, xr0
|
||||
.LOOP_EA:
|
||||
fld.d f5, a1, 0
|
||||
fldx.d f6, a1, t7
|
||||
add.d a1, a1, a2
|
||||
fld.d f9, a1, 0
|
||||
fldx.d f10, a1, t7
|
||||
add.d a1, a1, a2
|
||||
fld.d f11, a1, 0
|
||||
fldx.d f12, a1, t7
|
||||
add.d a1, a1, a2
|
||||
fld.d f13, a1, 0
|
||||
fldx.d f14, a1, t7
|
||||
vilvl.b vr5, vr6, vr5
|
||||
vilvl.b vr9, vr10, vr9
|
||||
vilvl.b vr11, vr12, vr11
|
||||
vilvl.b vr13, vr14, vr13
|
||||
xvpermi.q xr5, xr9, 0x02
|
||||
xvpermi.q xr11, xr13, 0x02
|
||||
|
||||
xvmulwev.h.bu xr8, xr7, xr5
|
||||
xvmaddwod.h.bu xr8, xr7, xr5
|
||||
xvmulwev.h.bu xr6, xr7, xr11
|
||||
xvmaddwod.h.bu xr6, xr7, xr11
|
||||
xvsrarni.b.h xr8, xr8, 6
|
||||
vstelm.d vr8, a0, 0, 0
|
||||
add.d a0, a0, a2
|
||||
xvstelm.d xr8, a0, 0, 2
|
||||
add.d a0, a0, a2
|
||||
xvsrarni.b.h xr6, xr6, 6
|
||||
vstelm.d vr6, a0, 0, 0
|
||||
add.d a0, a0, a2
|
||||
xvstelm.d xr6, a0, 0, 2
|
||||
add.d a0, a0, a2
|
||||
add.d a1, a1, a2
|
||||
|
||||
addi.d t1, t1, -4
|
||||
blt zero, t1, .LOOP_EA
|
||||
b .ENDLOOPA
|
||||
.ENDLOOP_EA:
|
||||
|
||||
move t1, a3
|
||||
.LOOPA:
|
||||
fld.d f5, a1, 0
|
||||
fldx.d f6, a1, a2
|
||||
fldx.d f7, a1, t2
|
||||
fldx.d f8, a1, t3
|
||||
vilvl.d vr5, vr6, vr5
|
||||
vilvl.d vr7, vr8, vr7
|
||||
xvpermi.q xr5, xr7, 0x02
|
||||
xvmulwev.h.bu xr6, xr0, xr5
|
||||
xvmulwod.h.bu xr7, xr0, xr5
|
||||
xvilvl.h xr8, xr7, xr6
|
||||
xvilvh.h xr9, xr7, xr6
|
||||
xvsrarni.b.h xr9, xr8, 6
|
||||
vstelm.d vr9, a0, 0, 0
|
||||
add.d a0, a0, a2
|
||||
vstelm.d vr9, a0, 0, 1
|
||||
add.d a0, a0, a2
|
||||
xvstelm.d xr9, a0, 0, 2
|
||||
add.d a0, a0, a2
|
||||
xvstelm.d xr9, a0, 0, 3
|
||||
add.d a0, a0, a2
|
||||
add.d a1, a1, t4
|
||||
|
||||
addi.d t1, t1, -4
|
||||
blt zero, t1, .LOOPA
|
||||
.ENDLOOPA:
|
||||
endfunc
|
||||
|
||||
/* void ff_avg_h264_chroma_mc8_lasx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
|
||||
int h, int x, int y) */
|
||||
function ff_avg_h264_chroma_mc8_lasx
|
||||
li.d t8, 8
|
||||
sub.d t1, t8, a4 // 8-x
|
||||
sub.d t2, t8, a5 // 8-y
|
||||
mul.d t3, t1, t2 // A
|
||||
mul.d t4, a4, t2 // B
|
||||
mul.d t5, t1, a5 // C
|
||||
mul.d t6, a4, a5 // D
|
||||
add.d t0, t4, t5 // E
|
||||
xvreplgr2vr.b xr0, t3
|
||||
xvreplgr2vr.b xr1, t4
|
||||
xvreplgr2vr.b xr2, t5
|
||||
xvreplgr2vr.b xr3, t6
|
||||
xvreplgr2vr.b xr4, t0
|
||||
slli.d t2, a2, 1
|
||||
add.d t3, t2, a2
|
||||
slli.d t4, a2, 2
|
||||
|
||||
bge zero, t6, .ENDLOOPDA
|
||||
move t1, a3
|
||||
xvilvl.b xr9, xr1, xr0
|
||||
xvilvl.b xr10, xr3, xr2
|
||||
.LOOPDA:
|
||||
fld.d f5, a1, 0
|
||||
fld.d f6, a1, 1
|
||||
add.d a1, a1, a2
|
||||
fld.d f7, a1, 0
|
||||
fld.d f8, a1, 1
|
||||
add.d a1, a1, a2
|
||||
fld.d f11, a1, 0
|
||||
fld.d f12, a1, 1
|
||||
add.d a1, a1, a2
|
||||
fld.d f13, a1, 0
|
||||
fld.d f14, a1, 1
|
||||
add.d a1, a1, a2
|
||||
fld.d f15, a1, 0
|
||||
fld.d f16, a1, 1
|
||||
fld.d f17, a0, 0
|
||||
fldx.d f18, a0, a2
|
||||
fldx.d f19, a0, t2
|
||||
fldx.d f20, a0, t3
|
||||
vilvl.b vr5, vr6, vr5
|
||||
vilvl.b vr7, vr8, vr7
|
||||
vilvl.b vr11, vr12, vr11
|
||||
vilvl.b vr13, vr14, vr13
|
||||
vilvl.b vr16, vr16, vr15
|
||||
xvpermi.q xr5, xr7, 0x02
|
||||
xvpermi.q xr7, xr11, 0x02
|
||||
xvpermi.q xr11, xr13, 0x02
|
||||
xvpermi.q xr13, xr16, 0x02
|
||||
xvpermi.q xr17, xr18, 0x02
|
||||
xvpermi.q xr19, xr20, 0x02
|
||||
|
||||
xvmulwev.h.bu xr14, xr9, xr5
|
||||
xvmaddwod.h.bu xr14, xr9, xr5
|
||||
xvmulwev.h.bu xr15, xr10, xr7
|
||||
xvmaddwod.h.bu xr15, xr10, xr7
|
||||
xvadd.h xr14, xr14, xr15
|
||||
xvsrari.h xr14, xr14, 6
|
||||
xvsllwil.hu.bu xr17, xr17, 0
|
||||
xvadd.h xr20, xr14, xr17
|
||||
xvsrarni.b.h xr20, xr20, 1
|
||||
xvstelm.d xr20, a0, 0, 0
|
||||
add.d a0, a0, a2
|
||||
xvstelm.d xr20, a0, 0, 2
|
||||
add.d a0, a0, a2
|
||||
xvmulwev.h.bu xr14, xr9, xr11
|
||||
xvmaddwod.h.bu xr14, xr9, xr11
|
||||
xvmulwev.h.bu xr15, xr10, xr13
|
||||
xvmaddwod.h.bu xr15, xr10, xr13
|
||||
xvadd.h xr14, xr14, xr15
|
||||
xvsrari.h xr14, xr14, 6
|
||||
xvsllwil.hu.bu xr19, xr19, 0
|
||||
xvadd.h xr21, xr14, xr19
|
||||
xvsrarni.b.h xr21, xr21, 1
|
||||
xvstelm.d xr21, a0, 0, 0
|
||||
add.d a0, a0, a2
|
||||
xvstelm.d xr21, a0, 0, 2
|
||||
add.d a0, a0, a2
|
||||
|
||||
addi.d t1, t1, -4
|
||||
blt zero, t1, .LOOPDA
|
||||
b .ENDLOOPELSEA
|
||||
.ENDLOOPDA:
|
||||
|
||||
bge zero, t0, .ENDLOOPEA
|
||||
move t1, a3
|
||||
li.d t7, 1
|
||||
slt t8, zero, t5
|
||||
maskeqz t5, a2, t8
|
||||
masknez t7, t7, t8
|
||||
or t7, t7, t5
|
||||
xvilvl.b xr7, xr4, xr0
|
||||
.LOOPEA:
|
||||
fld.d f5, a1, 0
|
||||
fldx.d f6, a1, t7
|
||||
add.d a1, a1, a2
|
||||
fld.d f8, a1, 0
|
||||
fldx.d f9, a1, t7
|
||||
add.d a1, a1, a2
|
||||
fld.d f10, a1, 0
|
||||
fldx.d f11, a1, t7
|
||||
add.d a1, a1, a2
|
||||
fld.d f12, a1, 0
|
||||
fldx.d f13, a1, t7
|
||||
add.d a1, a1, a2
|
||||
fld.d f14, a0, 0
|
||||
fldx.d f15, a0, a2
|
||||
fldx.d f16, a0, t2
|
||||
fldx.d f17, a0, t3
|
||||
vilvl.b vr5, vr6, vr5
|
||||
vilvl.b vr8, vr9, vr8
|
||||
vilvl.b vr10, vr11, vr10
|
||||
vilvl.b vr12, vr13, vr12
|
||||
xvpermi.q xr5, xr8, 0x02
|
||||
xvpermi.q xr10, xr12, 0x02
|
||||
xvpermi.q xr14, xr15, 0x02
|
||||
xvpermi.q xr16, xr17, 0x02
|
||||
|
||||
xvmulwev.h.bu xr6, xr7, xr5
|
||||
xvmaddwod.h.bu xr6, xr7, xr5
|
||||
xvsrari.h xr6, xr6, 6
|
||||
xvsllwil.hu.bu xr14, xr14, 0
|
||||
xvadd.h xr8, xr6, xr14
|
||||
xvsrarni.b.h xr8, xr8, 1
|
||||
xvstelm.d xr8, a0, 0, 0
|
||||
add.d a0, a0, a2
|
||||
xvstelm.d xr8, a0, 0, 2
|
||||
add.d a0, a0, a2
|
||||
xvmulwev.h.bu xr6, xr7, xr10
|
||||
xvmaddwod.h.bu xr6, xr7, xr10
|
||||
xvsrari.h xr6, xr6, 6
|
||||
xvsllwil.hu.bu xr16, xr16, 0
|
||||
xvadd.h xr8, xr6, xr16
|
||||
xvsrarni.b.h xr8, xr8, 1
|
||||
xvstelm.d xr8, a0, 0, 0
|
||||
add.d a0, a0, a2
|
||||
xvstelm.d xr8, a0, 0, 2
|
||||
add.d a0, a0, a2
|
||||
|
||||
addi.d t1, t1, -4
|
||||
blt zero, t1, .LOOPEA
|
||||
b .ENDLOOPELSEA
|
||||
.ENDLOOPEA:
|
||||
|
||||
move t1, a3
|
||||
.LOOPELSEA:
|
||||
fld.d f5, a1, 0
|
||||
fldx.d f6, a1, a2
|
||||
fldx.d f7, a1, t2
|
||||
fldx.d f8, a1, t3
|
||||
fld.d f9, a0, 0
|
||||
fldx.d f10, a0, a2
|
||||
fldx.d f11, a0, t2
|
||||
fldx.d f12, a0, t3
|
||||
xvpermi.q xr5, xr6, 0x02
|
||||
xvpermi.q xr7, xr8, 0x02
|
||||
xvpermi.q xr9, xr10, 0x02
|
||||
xvpermi.q xr11, xr12, 0x02
|
||||
|
||||
xvmulwev.h.bu xr12, xr0, xr5
|
||||
xvmulwod.h.bu xr13, xr0, xr5
|
||||
xvilvl.h xr12, xr13, xr12
|
||||
xvsrari.h xr12, xr12, 6
|
||||
xvsllwil.hu.bu xr9, xr9, 0
|
||||
xvadd.h xr9, xr12, xr9
|
||||
xvsrarni.b.h xr9, xr9, 1
|
||||
xvstelm.d xr9, a0, 0, 0
|
||||
add.d a0, a0, a2
|
||||
xvstelm.d xr9, a0, 0, 2
|
||||
add.d a0, a0, a2
|
||||
xvmulwev.h.bu xr12, xr0, xr7
|
||||
xvmulwod.h.bu xr13, xr0, xr7
|
||||
xvilvl.h xr12, xr13, xr12
|
||||
xvsrari.h xr12, xr12, 6
|
||||
xvsllwil.hu.bu xr11, xr11, 0
|
||||
xvadd.h xr13, xr12, xr11
|
||||
xvsrarni.b.h xr13, xr13, 1
|
||||
xvstelm.d xr13, a0, 0, 0
|
||||
add.d a0, a0, a2
|
||||
xvstelm.d xr13, a0, 0, 2
|
||||
add.d a0, a0, a2
|
||||
add.d a1, a1, t4
|
||||
|
||||
addi.d t1, t1, -4
|
||||
blt zero, t1, .LOOPELSEA
|
||||
.ENDLOOPELSEA:
|
||||
endfunc
|
||||
|
||||
/* void ff_put_h264_chroma_mc4_lasx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
|
||||
int h, int x, int y) */
|
||||
function ff_put_h264_chroma_mc4_lasx
|
||||
li.d t8, 8
|
||||
sub.d t1, t8, a4 // 8-x
|
||||
sub.d t2, t8, a5 // 8-y
|
||||
mul.d t3, t1, t2 // A
|
||||
mul.d t4, a4, t2 // B
|
||||
mul.d t5, t1, a5 // C
|
||||
mul.d t6, a4, a5 // D
|
||||
add.d t0, t4, t5 // E
|
||||
slli.d t8, a2, 1
|
||||
vreplgr2vr.b vr0, t3
|
||||
vreplgr2vr.b vr1, t4
|
||||
vreplgr2vr.b vr2, t5
|
||||
vreplgr2vr.b vr3, t6
|
||||
vreplgr2vr.b vr4, t0
|
||||
|
||||
bge zero, t6, .ENDPUT_DA
|
||||
move t1, a3
|
||||
vilvl.b vr9, vr1, vr0
|
||||
vilvl.b vr10, vr3, vr2
|
||||
.PUT_DA:
|
||||
fld.d f5, a1, 0
|
||||
fld.d f6, a1, 1
|
||||
add.d a1, a1, a2
|
||||
fld.d f7, a1, 0
|
||||
fld.d f8, a1, 1
|
||||
add.d a1, a1, a2
|
||||
fld.d f11, a1, 0
|
||||
fld.d f12, a1, 1
|
||||
vilvl.b vr5, vr6, vr5
|
||||
vilvl.b vr7, vr8, vr7
|
||||
vilvl.b vr13, vr12, vr11
|
||||
vilvl.d vr5, vr7, vr5
|
||||
vilvl.d vr13, vr13, vr7
|
||||
vmulwev.h.bu vr14, vr9, vr5
|
||||
vmaddwod.h.bu vr14, vr9, vr5
|
||||
vmulwev.h.bu vr15, vr10, vr13
|
||||
vmaddwod.h.bu vr15, vr10, vr13
|
||||
xvadd.h xr14, xr14, xr15
|
||||
vsrarni.b.h vr16, vr14, 6
|
||||
vstelm.w vr16, a0, 0, 0
|
||||
add.d a0, a0, a2
|
||||
vstelm.w vr16, a0, 0, 1
|
||||
add.d a0, a0, a2
|
||||
addi.d t1, t1, -2
|
||||
blt zero, t1, .PUT_DA
|
||||
b .ENDPUTA
|
||||
.ENDPUT_DA:
|
||||
|
||||
bge zero, t0, .ENDPUT_EA
|
||||
move t1, a3
|
||||
li.d t7, 1
|
||||
slt t8, zero, t5
|
||||
maskeqz t5, a2, t8
|
||||
masknez t7, t7, t8
|
||||
or t7, t7, t5
|
||||
vilvl.b vr7, vr4, vr0
|
||||
.PUT_EA:
|
||||
fld.d f5, a1, 0
|
||||
fldx.d f6, a1, t7
|
||||
vilvl.b vr5, vr6, vr5
|
||||
add.d a1, a1, a2
|
||||
fld.d f8, a1, 0
|
||||
fldx.d f9, a1, t7
|
||||
vilvl.b vr8, vr9, vr8
|
||||
vilvl.d vr5, vr8, vr5
|
||||
vmulwev.h.bu vr6, vr7, vr5
|
||||
vmaddwod.h.bu vr6, vr7, vr5
|
||||
vsrarni.b.h vr6, vr6, 6
|
||||
vstelm.w vr6, a0, 0, 0
|
||||
add.d a0, a0, a2
|
||||
vstelm.w vr6, a0, 0, 1
|
||||
add.d a0, a0, a2
|
||||
add.d a1, a1, a2
|
||||
addi.d t1, t1, -2
|
||||
blt zero, t1, .PUT_EA
|
||||
b .ENDPUTA
|
||||
.ENDPUT_EA:
|
||||
|
||||
move t1, a3
|
||||
.PUTA:
|
||||
fld.d f5, a1, 0
|
||||
fldx.d f8, a1, a2
|
||||
vilvl.w vr5, vr8, vr5
|
||||
vmulwev.h.bu vr6, vr0, vr5
|
||||
vmulwod.h.bu vr7, vr0, vr5
|
||||
vilvl.h vr6, vr7, vr6
|
||||
vsrarni.b.h vr6, vr6, 6
|
||||
vstelm.w vr6, a0, 0, 0
|
||||
add.d a0, a0, a2
|
||||
vstelm.w vr6, a0, 0, 1
|
||||
add.d a0, a0, a2
|
||||
add.d a1, a1, t8
|
||||
addi.d t1, t1, -2
|
||||
blt zero, t1, .PUTA
|
||||
.ENDPUTA:
|
||||
endfunc
|
||||
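The ff_avg_* functions above share the interpolation with the put variants; the only difference is the write-back, a rounded average with the bytes already in dst (this is what the extra destination loads, the widening vsllwil/vadd and the final rounding shift by 1 implement). A scalar sketch under the same assumptions as the earlier example, with illustrative names:

/* illustrative only; assumes <stdint.h> and <stddef.h> */
static void avg_chroma_mc8_ref(uint8_t *dst, const uint8_t *src,
                               ptrdiff_t stride, int h, int x, int y)
{
    const int A = (8 - x) * (8 - y), B = x * (8 - y);
    const int C = (8 - x) * y,       D = x * y;

    for (int i = 0; i < h; i++) {
        for (int j = 0; j < 8; j++) {
            int p = (A * src[j]          + B * src[j + 1] +
                     C * src[j + stride] + D * src[j + stride + 1] + 32) >> 6;
            dst[j] = (dst[j] + p + 1) >> 1;   /* rounded average with existing dst */
        }
        dst += stride;
        src += stride;
    }
}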
@@ -19,7 +19,7 @@
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "h264chroma_lasx.h"
#include "h264chroma_loongarch.h"
#include "libavutil/attributes.h"
#include "libavutil/loongarch/cpu.h"
#include "libavcodec/h264chroma.h"
@@ -27,6 +27,14 @@
av_cold void ff_h264chroma_init_loongarch(H264ChromaContext *c, int bit_depth)
{
    int cpu_flags = av_get_cpu_flags();
    if (have_lsx(cpu_flags)) {
        if (bit_depth <= 8) {
            c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_lsx;
            c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_lsx;
            c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_lsx;
        }
    }

    if (have_lasx(cpu_flags)) {
        if (bit_depth <= 8) {
            c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_lasx;
File diff suppressed because it is too large.
@@ -1,36 +0,0 @@
/*
 * Copyright (c) 2020 Loongson Technology Corporation Limited
 * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_LOONGARCH_H264CHROMA_LASX_H
#define AVCODEC_LOONGARCH_H264CHROMA_LASX_H

#include <stdint.h>
#include <stddef.h>
#include "libavcodec/h264.h"

void ff_put_h264_chroma_mc4_lasx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride,
                                 int h, int x, int y);
void ff_put_h264_chroma_mc8_lasx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride,
                                 int h, int x, int y);
void ff_avg_h264_chroma_mc8_lasx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride,
                                 int h, int x, int y);

#endif /* AVCODEC_LOONGARCH_H264CHROMA_LASX_H */
libavcodec/loongarch/h264chroma_loongarch.h (new file, 41 lines)
@@ -0,0 +1,41 @@
/*
 * Copyright (c) 2023 Loongson Technology Corporation Limited
 * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_LOONGARCH_H264CHROMA_LOONGARCH_H
#define AVCODEC_LOONGARCH_H264CHROMA_LOONGARCH_H

#include "libavcodec/h264.h"

void ff_put_h264_chroma_mc8_lsx(unsigned char *dst, const unsigned char *src,
                                long int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_lsx(unsigned char *dst, const unsigned char *src,
                                long int stride, int h, int x, int y);
void ff_put_h264_chroma_mc4_lsx(unsigned char *dst, const unsigned char *src,
                                long int stride, int h, int x, int y);

void ff_put_h264_chroma_mc4_lasx(unsigned char *dst, const unsigned char *src,
                                 long int stride, int h, int x, int y);
void ff_put_h264_chroma_mc8_lasx(unsigned char *dst, const unsigned char *src,
                                 long int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_lasx(unsigned char *dst, const unsigned char *src,
                                 long int stride, int h, int x, int y);

#endif /* AVCODEC_LOONGARCH_H264CHROMA_LOONGARCH_H */
libavcodec/loongarch/h264intrapred.S (new file, 299 lines)
@@ -0,0 +1,299 @@
|
||||
/*
|
||||
* Loongson LSX optimized h264intrapred
|
||||
*
|
||||
* Copyright (c) 2023 Loongson Technology Corporation Limited
|
||||
* Contributed by Lu Wang <wanglu@loongson.cn>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "loongson_asm.S"
|
||||
|
||||
const shufa
|
||||
.byte 6, 5, 4, 3, 2, 1, 0
|
||||
endconst
|
||||
|
||||
const mulk
|
||||
.byte 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0, 8, 0
|
||||
endconst
|
||||
|
||||
const mulh
|
||||
.byte 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0
|
||||
.byte 8, 0, 9, 0, 10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15, 0
|
||||
endconst
|
||||
|
||||
.macro PRED16X16_PLANE
|
||||
slli.d t6, a1, 1
|
||||
slli.d t4, a1, 3
|
||||
addi.d t0, a0, 7
|
||||
sub.d t0, t0, a1
|
||||
add.d t1, a0, t4
|
||||
addi.d t1, t1, -1
|
||||
sub.d t2, t1, t6
|
||||
|
||||
ld.bu t3, t0, 1
|
||||
ld.bu t4, t0, -1
|
||||
ld.bu t5, t1, 0
|
||||
ld.bu t7, t2, 0
|
||||
sub.d t3, t3, t4
|
||||
sub.d t4, t5, t7
|
||||
|
||||
la.local t5, mulk
|
||||
vld vr0, t5, 0
|
||||
fld.d f1, t0, 2
|
||||
fld.d f2, t0, -8
|
||||
la.local t5, shufa
|
||||
fld.d f3, t5, 0
|
||||
vshuf.b vr2, vr2, vr2, vr3
|
||||
vilvl.b vr1, vr1, vr2
|
||||
vhsubw.hu.bu vr1, vr1, vr1
|
||||
vmul.h vr0, vr0, vr1
|
||||
vhaddw.w.h vr1, vr0, vr0
|
||||
vhaddw.d.w vr0, vr1, vr1
|
||||
vhaddw.q.d vr1, vr0, vr0
|
||||
vpickve2gr.w t5, vr1, 0
|
||||
add.d t3, t3, t5
|
||||
//2
|
||||
sub.d t2, t2, a1
|
||||
ld.bu t8, t2, 0
|
||||
ldx.bu t7, t1, a1
|
||||
sub.d t5, t7, t8
|
||||
slli.d t5, t5, 1
|
||||
|
||||
//3&4
|
||||
add.d t1, t1, t6
|
||||
sub.d t2, t2, a1
|
||||
ld.bu t8, t2, 0
|
||||
ld.bu t7, t1, 0
|
||||
sub.d t7, t7, t8
|
||||
slli.d t8, t7, 1
|
||||
add.d t7, t7, t8
|
||||
add.d t5, t5, t7
|
||||
sub.d t2, t2, a1
|
||||
ld.bu t8, t2, 0
|
||||
ldx.bu t7, t1, a1
|
||||
sub.d t7, t7, t8
|
||||
slli.d t7, t7, 2
|
||||
add.d t5, t5, t7
|
||||
|
||||
//5&6
|
||||
add.d t1, t1, t6
|
||||
sub.d t2, t2, a1
|
||||
ld.bu t8, t2, 0
|
||||
ld.bu t7, t1, 0
|
||||
sub.d t7, t7, t8
|
||||
slli.d t8, t7, 2
|
||||
add.d t7, t7, t8
|
||||
add.d t5, t5, t7
|
||||
sub.d t2, t2, a1
|
||||
ld.bu t8, t2, 0
|
||||
ldx.bu t7, t1, a1
|
||||
sub.d t7, t7, t8
|
||||
slli.d t8, t7, 1
|
||||
slli.d t7, t7, 2
|
||||
add.d t7, t7, t8
|
||||
add.d t5, t5, t7
|
||||
|
||||
//7&8
|
||||
add.d t1, t1, t6
|
||||
sub.d t2, t2, a1
|
||||
ld.bu t8, t2, 0
|
||||
ld.bu t7, t1, 0
|
||||
sub.d t7, t7, t8
|
||||
slli.d t8, t7, 3
|
||||
sub.d t7, t8, t7
|
||||
add.d t5, t5, t7
|
||||
sub.d t2, t2, a1
|
||||
ld.bu t8, t2, 0
|
||||
ldx.bu t7, t1, a1
|
||||
sub.d t7, t7, t8
|
||||
slli.d t7, t7, 3
|
||||
add.d t5, t5, t7
|
||||
add.d t4, t4, t5
|
||||
add.d t1, t1, a1
|
||||
.endm
|
||||
|
||||
.macro PRED16X16_PLANE_END
|
||||
ld.bu t7, t1, 0
|
||||
ld.bu t8, t2, 16
|
||||
add.d t5, t7, t8
|
||||
addi.d t5, t5, 1
|
||||
slli.d t5, t5, 4
|
||||
add.d t7, t3, t4
|
||||
slli.d t8, t7, 3
|
||||
sub.d t7, t8, t7
|
||||
sub.d t5, t5, t7
|
||||
|
||||
la.local t8, mulh
|
||||
vld vr3, t8, 0
|
||||
slli.d t8, t3, 3
|
||||
vreplgr2vr.h vr4, t3
|
||||
vreplgr2vr.h vr9, t8
|
||||
vmul.h vr5, vr3, vr4
|
||||
|
||||
.rept 16
|
||||
move t7, t5
|
||||
add.d t5, t5, t4
|
||||
vreplgr2vr.h vr6, t7
|
||||
vadd.h vr7, vr6, vr5
|
||||
vadd.h vr8, vr9, vr7
|
||||
vssrani.bu.h vr8, vr7, 5
|
||||
vst vr8, a0, 0
|
||||
add.d a0, a0, a1
|
||||
.endr
|
||||
.endm
|
||||
|
||||
.macro PRED16X16_PLANE_END_LASX
|
||||
ld.bu t7, t1, 0
|
||||
ld.bu t8, t2, 16
|
||||
add.d t5, t7, t8
|
||||
addi.d t5, t5, 1
|
||||
slli.d t5, t5, 4
|
||||
add.d t7, t3, t4
|
||||
slli.d t8, t7, 3
|
||||
sub.d t7, t8, t7
|
||||
sub.d t5, t5, t7
|
||||
|
||||
la.local t8, mulh
|
||||
xvld xr3, t8, 0
|
||||
xvreplgr2vr.h xr4, t3
|
||||
xvmul.h xr5, xr3, xr4
|
||||
|
||||
.rept 8
|
||||
move t7, t5
|
||||
add.d t5, t5, t4
|
||||
xvreplgr2vr.h xr6, t7
|
||||
xvreplgr2vr.h xr8, t5
|
||||
add.d t5, t5, t4
|
||||
xvadd.h xr7, xr6, xr5
|
||||
xvadd.h xr9, xr8, xr5
|
||||
|
||||
xvssrani.bu.h xr9, xr7, 5
|
||||
vstelm.d vr9, a0, 0, 0
|
||||
xvstelm.d xr9, a0, 8, 2
|
||||
add.d a0, a0, a1
|
||||
vstelm.d vr9, a0, 0, 1
|
||||
xvstelm.d xr9, a0, 8, 3
|
||||
add.d a0, a0, a1
|
||||
.endr
|
||||
.endm
|
||||
|
||||
/* void ff_h264_pred16x16_plane_h264_8_lsx(uint8_t *src, ptrdiff_t stride)
|
||||
*/
|
||||
function ff_h264_pred16x16_plane_h264_8_lsx
|
||||
PRED16X16_PLANE
|
||||
|
||||
slli.d t7, t3, 2
|
||||
add.d t3, t3, t7
|
||||
addi.d t3, t3, 32
|
||||
srai.d t3, t3, 6
|
||||
slli.d t7, t4, 2
|
||||
add.d t4, t4, t7
|
||||
addi.d t4, t4, 32
|
||||
srai.d t4, t4, 6
|
||||
|
||||
PRED16X16_PLANE_END
|
||||
endfunc
|
||||
|
||||
/* void ff_h264_pred16x16_plane_rv40_8_lsx(uint8_t *src, ptrdiff_t stride)
|
||||
*/
|
||||
function ff_h264_pred16x16_plane_rv40_8_lsx
|
||||
PRED16X16_PLANE
|
||||
|
||||
srai.d t7, t3, 2
|
||||
add.d t3, t3, t7
|
||||
srai.d t3, t3, 4
|
||||
srai.d t7, t4, 2
|
||||
add.d t4, t4, t7
|
||||
srai.d t4, t4, 4
|
||||
|
||||
PRED16X16_PLANE_END
|
||||
endfunc
|
||||
|
||||
/* void ff_h264_pred16x16_plane_svq3_8_lsx(uint8_t *src, ptrdiff_t stride)
|
||||
*/
|
||||
function ff_h264_pred16x16_plane_svq3_8_lsx
|
||||
PRED16X16_PLANE
|
||||
|
||||
li.d t6, 4
|
||||
li.d t7, 5
|
||||
li.d t8, 16
|
||||
div.d t3, t3, t6
|
||||
mul.d t3, t3, t7
|
||||
div.d t3, t3, t8
|
||||
div.d t4, t4, t6
|
||||
mul.d t4, t4, t7
|
||||
div.d t4, t4, t8
|
||||
move t7, t3
|
||||
move t3, t4
|
||||
move t4, t7
|
||||
|
||||
PRED16X16_PLANE_END
|
||||
endfunc
|
||||
|
||||
/* void ff_h264_pred16x16_plane_h264_8_lasx(uint8_t *src, ptrdiff_t stride)
|
||||
*/
|
||||
function ff_h264_pred16x16_plane_h264_8_lasx
|
||||
PRED16X16_PLANE
|
||||
|
||||
slli.d t7, t3, 2
|
||||
add.d t3, t3, t7
|
||||
addi.d t3, t3, 32
|
||||
srai.d t3, t3, 6
|
||||
slli.d t7, t4, 2
|
||||
add.d t4, t4, t7
|
||||
addi.d t4, t4, 32
|
||||
srai.d t4, t4, 6
|
||||
|
||||
PRED16X16_PLANE_END_LASX
|
||||
endfunc
|
||||
|
||||
/* void ff_h264_pred16x16_plane_rv40_8_lasx(uint8_t *src, ptrdiff_t stride)
|
||||
*/
|
||||
function ff_h264_pred16x16_plane_rv40_8_lasx
|
||||
PRED16X16_PLANE
|
||||
|
||||
srai.d t7, t3, 2
|
||||
add.d t3, t3, t7
|
||||
srai.d t3, t3, 4
|
||||
srai.d t7, t4, 2
|
||||
add.d t4, t4, t7
|
||||
srai.d t4, t4, 4
|
||||
|
||||
PRED16X16_PLANE_END_LASX
|
||||
endfunc
|
||||
|
||||
/* void ff_h264_pred16x16_plane_svq3_8_lasx(uint8_t *src, ptrdiff_t stride)
|
||||
*/
|
||||
function ff_h264_pred16x16_plane_svq3_8_lasx
|
||||
PRED16X16_PLANE
|
||||
|
||||
li.d t5, 4
|
||||
li.d t7, 5
|
||||
li.d t8, 16
|
||||
div.d t3, t3, t5
|
||||
mul.d t3, t3, t7
|
||||
div.d t3, t3, t8
|
||||
div.d t4, t4, t5
|
||||
mul.d t4, t4, t7
|
||||
div.d t4, t4, t8
|
||||
move t7, t3
|
||||
move t3, t4
|
||||
move t4, t7
|
||||
|
||||
PRED16X16_PLANE_END_LASX
|
||||
endfunc
|
||||
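For checking the new pred16x16 plane functions against the C reference, FFmpeg's checkasm harness includes an h264pred test; a typical invocation from the build tree (exact paths depend on the configure setup) is:

make checkasm
./tests/checkasm/checkasm --test=h264pred --bench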