x86inc: AVX-512 support
AVX-512 consists of a plethora of different extensions, but in order to keep things a bit more manageable we group the following extensions together under a single baseline cpu flag which should cover SKL-X and future CPUs:
 * AVX-512 Foundation (F)
 * AVX-512 Conflict Detection Instructions (CD)
 * AVX-512 Byte and Word Instructions (BW)
 * AVX-512 Doubleword and Quadword Instructions (DQ)
 * AVX-512 Vector Length Extensions (VL)

On x86-64, AVX-512 provides 16 additional vector registers. Prefer using those over the existing ones, since doing so allows us to avoid `vzeroupper` unless more than 16 vector registers are required. They also happen to be volatile on Windows, which means that we don't need to save and restore existing xmm register contents unless more than 22 vector registers are required.

Big thanks to Intel for their support.
committed by James Darnley
parent 8f86e66238
commit f7197f68dc
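For illustration, here is a minimal sketch (hypothetical, not part of the patch) of what a kernel written against the new scheme could look like. With INIT_ZMM avx512, mmsize is 64, the combined F/CD/BW/DQ/VL feature set is selected, and on x86-64 the m# registers are permuted into zmm16-zmm31 so that RET does not need to emit vzeroupper. The function name, arguments, and body are made up for this example.

%include "x86inc.asm"

SECTION .text

INIT_ZMM avx512
; Hypothetical kernel: dst and src each point to 16 packed floats (64 bytes, aligned).
cglobal add_floats, 2, 2, 1, dst, src
    mova  m0, [srcq]    ; m0 maps to zmm16 on x86-64, a volatile register on Windows
    addps m0, [dstq]    ; rewritten to the 3-operand vaddps form by AVX_INSTR
    mova  [dstq], m0    ; assembled as vmovdqa32, since a zmm register requires EVEX
    RET                 ; no vzeroupper needed on x86-64: only regs 16-31 were touched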
@@ -337,6 +337,8 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
 %endmacro
 
 %define required_stack_alignment ((mmsize + 15) & ~15)
+%define vzeroupper_required (mmsize > 16 && (ARCH_X86_64 == 0 || xmm_regs_used > 16 || notcpuflag(avx512)))
+%define high_mm_regs (16*cpuflag(avx512))
 
 %macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only)
     %ifnum %1
@@ -450,15 +452,16 @@ DECLARE_REG 14, R13, 120
 
 %macro WIN64_PUSH_XMM 0
     ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
-    %if xmm_regs_used > 6
+    %if xmm_regs_used > 6 + high_mm_regs
         movaps [rstk + stack_offset + 8], xmm6
     %endif
-    %if xmm_regs_used > 7
+    %if xmm_regs_used > 7 + high_mm_regs
         movaps [rstk + stack_offset + 24], xmm7
     %endif
-    %if xmm_regs_used > 8
+    %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
+    %if %%xmm_regs_on_stack > 0
         %assign %%i 8
-        %rep xmm_regs_used-8
+        %rep %%xmm_regs_on_stack
             movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
             %assign %%i %%i+1
         %endrep
@@ -467,10 +470,11 @@ DECLARE_REG 14, R13, 120
 
 %macro WIN64_SPILL_XMM 1
     %assign xmm_regs_used %1
-    ASSERT xmm_regs_used <= 16
-    %if xmm_regs_used > 8
+    ASSERT xmm_regs_used <= 16 + high_mm_regs
+    %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
+    %if %%xmm_regs_on_stack > 0
         ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack.
-        %assign %%pad (xmm_regs_used-8)*16 + 32
+        %assign %%pad %%xmm_regs_on_stack*16 + 32
         %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
         SUB rsp, stack_size_padded
     %endif
@@ -479,9 +483,10 @@ DECLARE_REG 14, R13, 120
 
 %macro WIN64_RESTORE_XMM_INTERNAL 0
     %assign %%pad_size 0
-    %if xmm_regs_used > 8
-        %assign %%i xmm_regs_used
-        %rep xmm_regs_used-8
+    %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
+    %if %%xmm_regs_on_stack > 0
+        %assign %%i xmm_regs_used - high_mm_regs
+        %rep %%xmm_regs_on_stack
             %assign %%i %%i-1
             movaps xmm %+ %%i, [rsp + (%%i-8)*16 + stack_size + 32]
         %endrep
@@ -494,10 +499,10 @@ DECLARE_REG 14, R13, 120
             %assign %%pad_size stack_size_padded
         %endif
     %endif
-    %if xmm_regs_used > 7
+    %if xmm_regs_used > 7 + high_mm_regs
         movaps xmm7, [rsp + stack_offset - %%pad_size + 24]
     %endif
-    %if xmm_regs_used > 6
+    %if xmm_regs_used > 6 + high_mm_regs
         movaps xmm6, [rsp + stack_offset - %%pad_size + 8]
     %endif
 %endmacro
@@ -509,12 +514,12 @@ DECLARE_REG 14, R13, 120
     %assign xmm_regs_used 0
 %endmacro
 
-%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0
+%define has_epilogue regs_used > 7 || stack_size > 0 || vzeroupper_required || xmm_regs_used > 6+high_mm_regs
 
 %macro RET 0
     WIN64_RESTORE_XMM_INTERNAL
     POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
-    %if mmsize == 32
+    %if vzeroupper_required
         vzeroupper
     %endif
     AUTO_REP_RET
@@ -538,9 +543,10 @@ DECLARE_REG 12, R15, 56
 DECLARE_REG 13, R12, 64
 DECLARE_REG 14, R13, 72
 
-%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
+%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
     %assign num_args %1
     %assign regs_used %2
+    %assign xmm_regs_used %3
     ASSERT regs_used >= num_args
     SETUP_STACK_POINTER %4
     ASSERT regs_used <= 15
@@ -550,7 +556,7 @@ DECLARE_REG 14, R13, 72
     DEFINE_ARGS_INTERNAL %0, %4, %5
 %endmacro
 
-%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0
+%define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required
 
 %macro RET 0
     %if stack_size_padded > 0
@@ -561,7 +567,7 @@ DECLARE_REG 14, R13, 72
         %endif
     %endif
     POP_IF_USED 14, 13, 12, 11, 10, 9
-    %if mmsize == 32
+    %if vzeroupper_required
         vzeroupper
     %endif
     AUTO_REP_RET
@@ -606,7 +612,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
     DEFINE_ARGS_INTERNAL %0, %4, %5
 %endmacro
 
-%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0
+%define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required
 
 %macro RET 0
     %if stack_size_padded > 0
@@ -617,7 +623,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
         %endif
     %endif
     POP_IF_USED 6, 5, 4, 3
-    %if mmsize == 32
+    %if vzeroupper_required
         vzeroupper
     %endif
     AUTO_REP_RET
@@ -727,7 +733,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
     %assign stack_offset 0 ; stack pointer offset relative to the return address
     %assign stack_size 0 ; amount of stack space that can be freely used inside a function
     %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
-    %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64
+    %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 and vzeroupper
     %ifnidn %3, ""
         PROLOGUE %3
     %endif
@@ -803,12 +809,13 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
 %assign cpuflags_bmi1 (1<<17)| cpuflags_avx|cpuflags_lzcnt
 %assign cpuflags_bmi2 (1<<18)| cpuflags_bmi1
 %assign cpuflags_avx2 (1<<19)| cpuflags_fma3|cpuflags_bmi2
+%assign cpuflags_avx512 (1<<20)| cpuflags_avx2 ; F, CD, BW, DQ, VL
 
-%assign cpuflags_cache32 (1<<20)
-%assign cpuflags_cache64 (1<<21)
-%assign cpuflags_slowctz (1<<22)
-%assign cpuflags_aligned (1<<23) ; not a cpu feature, but a function variant
-%assign cpuflags_atom (1<<24)
+%assign cpuflags_cache32 (1<<21)
+%assign cpuflags_cache64 (1<<22)
+%assign cpuflags_slowctz (1<<23)
+%assign cpuflags_aligned (1<<24) ; not a cpu feature, but a function variant
+%assign cpuflags_atom (1<<25)
 
 ; Returns a boolean value expressing whether or not the specified cpuflag is enabled.
 %define cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1)
@@ -856,11 +863,12 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
     %endif
 %endmacro
 
-; Merge mmx and sse*
+; Merge mmx, sse*, and avx*
 ; m# is a simd register of the currently selected size
 ; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m#
 ; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m#
-; (All 3 remain in sync through SWAP.)
+; zm# is the corresponding zmm register if mmsize >= 64, otherwise the same as m#
+; (All 4 remain in sync through SWAP.)
 
 %macro CAT_XDEFINE 3
     %xdefine %1%2 %3
@@ -870,6 +878,18 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
     %undef %1%2
 %endmacro
 
+; Prefer registers 16-31 over 0-15 to avoid having to use vzeroupper
+%macro AVX512_MM_PERMUTATION 0-1 0 ; start_reg
+    %if ARCH_X86_64 && cpuflag(avx512)
+        %assign %%i %1
+        %rep 16-%1
+            %assign %%i_high %%i+16
+            SWAP %%i, %%i_high
+            %assign %%i %%i+1
+        %endrep
+    %endif
+%endmacro
+
 %macro INIT_MMX 0-1+
     %assign avx_enabled 0
     %define RESET_MM_PERMUTATION INIT_MMX %1
@@ -885,7 +905,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
         CAT_XDEFINE nnmm, %%i, %%i
         %assign %%i %%i+1
     %endrep
-    %rep 8
+    %rep 24
         CAT_UNDEF m, %%i
         CAT_UNDEF nnmm, %%i
         %assign %%i %%i+1
@@ -899,7 +919,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
     %define mmsize 16
     %define num_mmregs 8
     %if ARCH_X86_64
-        %define num_mmregs 16
+        %define num_mmregs 32
     %endif
     %define mova movdqa
     %define movu movdqu
@@ -912,6 +932,10 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
         %assign %%i %%i+1
     %endrep
     INIT_CPUFLAGS %1
+    %if WIN64
+        ; Swap callee-saved registers with volatile registers
+        AVX512_MM_PERMUTATION 6
+    %endif
 %endmacro
 
 %macro INIT_YMM 0-1+
@@ -920,7 +944,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
     %define mmsize 32
     %define num_mmregs 8
     %if ARCH_X86_64
-        %define num_mmregs 16
+        %define num_mmregs 32
     %endif
     %define mova movdqa
     %define movu movdqu
@@ -933,6 +957,29 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
         %assign %%i %%i+1
     %endrep
     INIT_CPUFLAGS %1
+    AVX512_MM_PERMUTATION
+%endmacro
+
+%macro INIT_ZMM 0-1+
+    %assign avx_enabled 1
+    %define RESET_MM_PERMUTATION INIT_ZMM %1
+    %define mmsize 64
+    %define num_mmregs 8
+    %if ARCH_X86_64
+        %define num_mmregs 32
+    %endif
+    %define mova movdqa
+    %define movu movdqu
+    %undef movh
+    %define movnta movntdq
+    %assign %%i 0
+    %rep num_mmregs
+        CAT_XDEFINE m, %%i, zmm %+ %%i
+        CAT_XDEFINE nnzmm, %%i, %%i
+        %assign %%i %%i+1
+    %endrep
+    INIT_CPUFLAGS %1
+    AVX512_MM_PERMUTATION
 %endmacro
 
 INIT_XMM
@@ -941,18 +988,26 @@ INIT_XMM
     %define mmmm%1 mm%1
     %define mmxmm%1 mm%1
     %define mmymm%1 mm%1
+    %define mmzmm%1 mm%1
     %define xmmmm%1 mm%1
     %define xmmxmm%1 xmm%1
     %define xmmymm%1 xmm%1
+    %define xmmzmm%1 xmm%1
     %define ymmmm%1 mm%1
     %define ymmxmm%1 xmm%1
     %define ymmymm%1 ymm%1
+    %define ymmzmm%1 ymm%1
+    %define zmmmm%1 mm%1
+    %define zmmxmm%1 xmm%1
+    %define zmmymm%1 ymm%1
+    %define zmmzmm%1 zmm%1
     %define xm%1 xmm %+ m%1
     %define ym%1 ymm %+ m%1
+    %define zm%1 zmm %+ m%1
 %endmacro
 
 %assign i 0
-%rep 16
+%rep 32
     DECLARE_MMCAST i
     %assign i i+1
 %endrep
@@ -1087,12 +1142,17 @@ INIT_XMM
 ;=============================================================================
 
 %assign i 0
-%rep 16
+%rep 32
     %if i < 8
         CAT_XDEFINE sizeofmm, i, 8
+        CAT_XDEFINE regnumofmm, i, i
     %endif
     CAT_XDEFINE sizeofxmm, i, 16
     CAT_XDEFINE sizeofymm, i, 32
+    CAT_XDEFINE sizeofzmm, i, 64
+    CAT_XDEFINE regnumofxmm, i, i
+    CAT_XDEFINE regnumofymm, i, i
+    CAT_XDEFINE regnumofzmm, i, i
     %assign i i+1
 %endrep
 %undef i
@@ -1209,7 +1269,7 @@ INIT_XMM
     %endmacro
 %endmacro
 
-; Instructions with both VEX and non-VEX encodings
+; Instructions with both VEX/EVEX and legacy encodings
 ; Non-destructive instructions are written without parameters
 AVX_INSTR addpd, sse2, 1, 0, 1
 AVX_INSTR addps, sse, 1, 0, 1
@@ -1545,6 +1605,52 @@ FMA4_INSTR fmsubadd, pd, ps
 FMA4_INSTR fnmadd, pd, ps, sd, ss
 FMA4_INSTR fnmsub, pd, ps, sd, ss
 
+; Macros for converting VEX instructions to equivalent EVEX ones.
+%macro EVEX_INSTR 2-3 0 ; vex, evex, prefer_evex
+    %macro %1 2-7 fnord, fnord, %1, %2, %3
+        %ifidn %3, fnord
+            %define %%args %1, %2
+        %elifidn %4, fnord
+            %define %%args %1, %2, %3
+        %else
+            %define %%args %1, %2, %3, %4
+        %endif
+        %assign %%evex_required cpuflag(avx512) & %7
+        %ifnum regnumof%1
+            %if regnumof%1 >= 16 || sizeof%1 > 32
+                %assign %%evex_required 1
+            %endif
+        %endif
+        %ifnum regnumof%2
+            %if regnumof%2 >= 16 || sizeof%2 > 32
+                %assign %%evex_required 1
+            %endif
+        %endif
+        %if %%evex_required
+            %6 %%args
+        %else
+            %5 %%args ; Prefer VEX over EVEX due to shorter instruction length
+        %endif
+    %endmacro
+%endmacro
+
+EVEX_INSTR vbroadcastf128, vbroadcastf32x4
+EVEX_INSTR vbroadcasti128, vbroadcasti32x4
+EVEX_INSTR vextractf128, vextractf32x4
+EVEX_INSTR vextracti128, vextracti32x4
+EVEX_INSTR vinsertf128, vinsertf32x4
+EVEX_INSTR vinserti128, vinserti32x4
+EVEX_INSTR vmovdqa, vmovdqa32
+EVEX_INSTR vmovdqu, vmovdqu32
+EVEX_INSTR vpand, vpandd
+EVEX_INSTR vpandn, vpandnd
+EVEX_INSTR vpor, vpord
+EVEX_INSTR vpxor, vpxord
+EVEX_INSTR vrcpps, vrcp14ps, 1 ; EVEX versions have higher precision
+EVEX_INSTR vrcpss, vrcp14ss, 1
+EVEX_INSTR vrsqrtps, vrsqrt14ps, 1
+EVEX_INSTR vrsqrtss, vrsqrt14ss, 1
+
 ; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug (fixed in 1.3.0)
 %ifdef __YASM_VER__
     %if __YASM_VERSION_ID__ < 0x01030000 && ARCH_X86_64 == 0
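To make the effect of the EVEX_INSTR mappings concrete, here is a short hypothetical fragment (not part of the patch) showing when the overloaded mnemonics silently switch to their EVEX forms once the avx512 flag is active:

INIT_ZMM avx512
    vpxor  xmm2, xmm2, xmm2    ; regs 0-15, <= 32 bytes: the shorter VEX encoding is kept
    vpxor  xmm20, xmm20, xmm20 ; register number >= 16: EVEX-only, assembled as vpxord
    vpxor  m0, m0, m0          ; zmm operand (> 32 bytes): assembled as vpxord
    vrcpps ymm2, ymm3          ; prefer_evex=1: assembled as vrcp14ps under avx512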