lavc/aarch64: Provide optimized implementation of vsse8 for arm64.

Provide an optimized implementation of vsse8 for arm64. Performance comparison results are shown below. - vsse_1_c: 141.5 - vsse_1_neon: 32.5 Benchmarks and tests were run with the checkasm tool on AWS Graviton 3. Signed-off-by: Grzegorz Bernacki <gjb@semihalf.com> Signed-off-by: Martin Storsjö <martin@martin.st>
2022-10-03 16:10:18 +02:00
parent faea56c9c7
commit bad67cb9fd
2 changed files with 75 additions and 0 deletions
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -71,6 +71,9 @@ int nsse8_neon(int multiplier, const uint8_t *s, const uint8_t *s2,
 /* Declared here, defined outside this view.
  * NOTE(review): judging by the name, this adapts nsse8_neon to the
  * (context, s1, s2, stride, h) callback signature -- confirm against its
  * definition. */
 int nsse8_neon_wrapper(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
                       ptrdiff_t stride, int h);
 /* NEON routine implemented in me_cmp_neon.S and installed as c->vsse[1] by
  * ff_me_cmp_init_aarch64(). The assembly does not read its first argument;
  * s1/s2 are the two pixel blocks, stride the distance between rows, h the
  * number of rows. */
 int vsse8_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
               ptrdiff_t stride, int h);
 av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
 {
    int cpu_flags = av_get_cpu_flags();
@@ -96,6 +99,8 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
        c->vsad[5] = vsad_intra8_neon;
        c->vsse[0] = vsse16_neon;
        c->vsse[1] = vsse8_neon;
        c->vsse[4] = vsse_intra16_neon;
        c->nsse[0] = nsse16_neon_wrapper;
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -838,6 +838,76 @@ function vsad16_neon, export=1
        ret
 endfunc
 function vsse8_neon, export=1
        // int vsse8_neon(MpegEncContext *c, const uint8_t *pix1,
        //                const uint8_t *pix2, ptrdiff_t stride, int h)
        //
        // With d[y][x] = pix1[y * stride + x] - pix2[y * stride + x], returns
        // the sum over y = 0 .. h-2 and x = 0 .. 7 of (d[y][x] - d[y+1][x])^2.
        //
        // x0           unused
        // x1           uint8_t *pix1
        // x2           uint8_t *pix2
        // x3           ptrdiff_t stride
        // w4           int h
        // w0           result
        //
        // Requires h >= 2: with h - 1 == 0 the code still enters the
        // single-row loop at 2:, whose counter then steps to -1 instead of
        // stopping at zero.
        //
        // Vector registers written: v0-v5, v16, v17, v25, v27, v29, v31.
        // v31 always carries the row difference d[y] of the most recently
        // consumed row; v16/v17 accumulate squares of the low/high 4 columns.
        ld1             {v0.8b}, [x1], x3              // pix1 row 0
        ld1             {v1.8b}, [x2], x3              // pix2 row 0
        sub             w4, w4, #1                      // h rows give h-1 row pairs
        movi            v16.4s, #0                      // accumulator, columns 0-3
        movi            v17.4s, #0                      // accumulator, columns 4-7
        cmp             w4, #3                          // at least 3 row pairs left?
        usubl           v31.8h, v0.8b, v1.8b            // d[0] = pix1 row 0 - pix2 row 0 (16-bit lanes)
        b.lt            2f
 // three row pairs per pass
 1:
        // x = abs(d[y] - d[y + 1])
        // res += x * x
        ld1             {v0.8b}, [x1], x3              // pix1 row y+1
        ld1             {v1.8b}, [x2], x3              // pix2 row y+1
        ld1             {v2.8b}, [x1], x3              // pix1 row y+2
        ld1             {v3.8b}, [x2], x3              // pix2 row y+2
        usubl           v29.8h, v0.8b, v1.8b            // d[y+1]
        ld1             {v4.8b}, [x1], x3              // pix1 row y+3
        ld1             {v5.8b}, [x2], x3              // pix2 row y+3
        sabd            v31.8h, v31.8h, v29.8h          // |d[y]   - d[y+1]|
        usubl           v27.8h, v2.8b, v3.8b            // d[y+2]
        usubl           v25.8h, v4.8b, v5.8b            // d[y+3]
        sabd            v29.8h, v29.8h, v27.8h          // |d[y+1] - d[y+2]|
        sabd            v27.8h, v27.8h, v25.8h          // |d[y+2] - d[y+3]|
        umlal           v16.4s, v31.4h, v31.4h          // accumulate squares, first pair
        umlal2          v17.4s, v31.8h, v31.8h
        mov             v31.16b, v25.16b                // d[y+3] becomes d[y] for the next pass
        umlal           v16.4s, v29.4h, v29.4h          // accumulate squares, second pair
        umlal2          v17.4s, v29.8h, v29.8h
        sub             w4, w4, #3
        umlal           v16.4s, v27.4h, v27.4h          // accumulate squares, third pair
        umlal2          v17.4s, v27.8h, v27.8h
        cmp             w4, #3
        b.ge            1b
        cbz             w4, 3f                          // nothing left for the tail loop
 // one row pair per pass
 2:
        ld1             {v0.8b}, [x1], x3              // pix1 row y+1
        ld1             {v1.8b}, [x2], x3              // pix2 row y+1
        subs            w4, w4, #1                      // sets Z for the b.ne below
        usubl           v29.8h, v0.8b, v1.8b            // d[y+1]
        sabd            v31.8h, v31.8h, v29.8h          // |d[y] - d[y+1]|
        umlal           v16.4s, v31.4h, v31.4h
        umlal2          v17.4s, v31.8h, v31.8h
        mov             v31.16b, v29.16b                // d[y+1] becomes d[y] for the next pass
        b.ne            2b
 3:
        add             v16.4s, v16.4s, v17.4s          // merge the two accumulators
        uaddlv          d17, v16.4s                     // sum the four 32-bit lanes
        fmov            w0, s17                         // low 32 bits are the return value
        ret
 endfunc
 function vsse16_neon, export=1
        // x0           unused
        // x1           uint8_t *pix1