From f6c66cf6b81f7a063a930cdfc0a62c68e6e2d0fc Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Sun, 13 Oct 2024 15:27:05 +0900 Subject: [PATCH] vpdpb[su,uu,ss]d[,s] support avx10.2 --- doc/usage.md | 30 ++++++++++++------- gen/gen_avx512.cpp | 8 ++++-- gen/gen_code.cpp | 10 +++---- test/avx10/misc.txt | 65 ++++++++++++++++++++++++++++++++++++++++-- xbyak/xbyak_mnemonic.h | 10 +++---- 5 files changed, 97 insertions(+), 26 deletions(-) diff --git a/doc/usage.md b/doc/usage.md index 53c0bb9..9398755 100644 --- a/doc/usage.md +++ b/doc/usage.md @@ -106,29 +106,37 @@ vcvtpd2dq xmm19, [eax+32]{1to4} --> vcvtpd2dq(xmm19, yword_b [eax+32]); vfpclassps k5{k3}, zword [rax+64], 5 --> vfpclassps(k5|k3, zword [rax+64], 5); // specify m512 vfpclasspd k5{k3}, [rax+64]{1to2}, 5 --> vfpclasspd(k5|k3, xword_b [rax+64], 5); // broadcast 64-bit to 128-bit vfpclassps k5{k3}, [rax+64]{1to4}, 5 --> vfpclassps(k5|k3, yword_b [rax+64], 5); // broadcast 64-bit to 256-bit +``` -vpdpbusd(xm0, xm1, xm2); // default encoding is EVEX +## Selecting AVX512-VNNI, AVX-VNNI, AVX-VNNI-INT8 etc. +Some mnemonics have two types of encodings: VEX and EVEX. +The functions for these mnemonics include an optional parameter as the last argument to specify the encoding. +The default behavior depends on the order in which the instruction was introduced (whether VEX or EVEX came first), +and can be specified using setDefaultEncoding. 
+ +``` +vpdpbusd(xm0, xm1, xm2); // default encoding: EVEX (AVX512-VNNI) vpdpbusd(xm0, xm1, xm2, EvexEncoding); // same as the above -vpdpbusd(xm0, xm1, xm2, VexEncoding); // VEX encoding +vpdpbusd(xm0, xm1, xm2, VexEncoding); // VEX (AVX-VNNI) setDefaultEncoding(VexEncoding); // default encoding is VEX -vpdpbusd(xm0, xm1, xm2); // VEX encoding +vpdpbusd(xm0, xm1, xm2); // VEX -vmpsadbw(xm1, xm3, xm15, 3); // default encoding -vmpsadbw(xm1, xm3, xm15, 3, VexEncoding); // vex(avx) -vmpsadbw(xm1, xm3, xm15, 3, EvexEncoding); // evex(avx10.2) +vmpsadbw(xm1, xm3, xm15, 3); // default encoding: VEX (AVX) +vmpsadbw(xm1, xm3, xm15, 3, VexEncoding); // same as the above +vmpsadbw(xm1, xm3, xm15, 3, EvexEncoding); // EVEX (AVX10.2) setDefaultEncoding(VexEncoding, EvexEncoding); // use 2nd argument. -vmpsadbw(xm1, xm3, xm15, 3); // evex(avx10.2) - +vmpsadbw(xm1, xm3, xm15, 3); // EVEX ``` - `setDefaultEncoding(PreferredEncoding vnniEnc = EvexEncoding, PreferredEncoding avx10Enc = VexEncoding)` +Controls the default encoding of each mnemonic group via the two `Xbyak::PreferredEncoding` parameters. param|vnniEnc|avx10Enc -|-|- -EvexEncoding|AVX512_VNNI|AVX10.2 -VexEncoding|AVX/AVX2|AVX-VNNI-INT8 +EvexEncoding|AVX512-VNNI|AVX10.2 +VexEncoding|AVX-VNNI|AVX-VNNI-INT8 default|EvexEncoding|VexEncoding -mnemonic|vpdpbusd, vpdpbusds, vpdpwssd, vpdpwssds|vmpsadbw, vpdpbssd +mnemonic|vpdpbusd, vpdpbusds, vpdpwssd, vpdpwssds|vmpsadbw, vpdpbssd, vpdpbssds, vpdpbsud, vpdpbsuds, vpdpbuud, vpdpbuuds ### Remark * `k1`, ..., `k7` are opmask registers. 
diff --git a/gen/gen_avx512.cpp b/gen/gen_avx512.cpp index 9159a64..ed7440c 100644 --- a/gen/gen_avx512.cpp +++ b/gen/gen_avx512.cpp @@ -468,10 +468,12 @@ void putX_X_XM_IMM_AVX10() bool hasIMM; } tbl[] = { { 0x50, "vpdpbssd", T_F2|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, + { 0x51, "vpdpbssds", T_F2|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, + { 0x50, "vpdpbsud", T_F3|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, + { 0x51, "vpdpbsuds", T_F3|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, + { 0x50, "vpdpbuud", T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, + { 0x51, "vpdpbuuds", T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, #if 0 - { 0x51, "vpdpbssds", T_MUST_EVEX | T_YMM | T_F2 | T_0F38 | T_EW0 | T_B32, false }, - { 0x50, "vpdpbsud", T_MUST_EVEX | T_YMM | T_F3 | T_0F38 | T_EW0 | T_B32, false }, - { 0x51, "vpdpbsuds", T_MUST_EVEX | T_YMM | T_F3 | T_0F38 | T_EW0 | T_B32, false }, { 0x50, "vpdpbuud", T_MUST_EVEX | T_YMM | T_0F38 | T_EW0 | T_B32, false }, { 0x51, "vpdpbuuds", T_MUST_EVEX | T_YMM | T_0F38 | T_EW0 | T_B32, false }, #endif diff --git a/gen/gen_code.cpp b/gen/gen_code.cpp index caa9e79..a71d416 100644 --- a/gen/gen_code.cpp +++ b/gen/gen_code.cpp @@ -1908,11 +1908,11 @@ void put() uint64_t type; } tbl[] = { // { 0x50, "vpdpbssd", T_F2 | T_0F38 | T_W0 | T_YMM }, - { 0x51, "vpdpbssds", T_F2 | T_0F38 | T_W0 | T_YMM }, - { 0x50, "vpdpbsud", T_F3 | T_0F38 | T_W0 | T_YMM }, - { 0x51, "vpdpbsuds", T_F3 | T_0F38 | T_W0 | T_YMM }, - { 0x50, "vpdpbuud", T_0F38 | T_W0 | T_YMM }, - { 0x51, "vpdpbuuds", T_0F38 | T_W0 | T_YMM }, +// { 0x51, "vpdpbssds", T_F2 | T_0F38 | T_W0 | T_YMM }, +// { 0x50, "vpdpbsud", T_F3 | T_0F38 | T_W0 | T_YMM }, +// { 0x51, "vpdpbsuds", T_F3 | T_0F38 | T_W0 | T_YMM }, +// { 0x50, "vpdpbuud", T_0F38 | T_W0 | T_YMM }, +// { 0x51, "vpdpbuuds", T_0F38 | T_W0 | T_YMM }, { 0xD2, "vpdpwsud", T_F3 | T_0F38 | T_W0 | T_YMM }, { 0xD3, "vpdpwsuds", T_F3 | T_0F38 | T_W0 | T_YMM }, diff --git a/test/avx10/misc.txt b/test/avx10/misc.txt index 
8993107..380e9a9 100644 --- a/test/avx10/misc.txt +++ b/test/avx10/misc.txt @@ -9,7 +9,7 @@ vdpphps(ym1, ym2, ptr_b[rax+128]); vdpphps(zm1, zm2, zm3); vdpphps(zm1, zm2, ptr[rax+128]); vdpphps(zm1, zm2, ptr_b[rax+128]); - +// vmpsadbw(xm1, xm3, xm15, 3); vmpsadbw(xm1|T_z, xm4, ptr[rax+128], 5); @@ -18,7 +18,7 @@ vmpsadbw(ym1, ym4, ptr[rax+128], 5); vmpsadbw(zm1|k4, zm3, zm15, 3); vmpsadbw(zm1, zm4, ptr[rax+128], 5); - +// vpdpbssd(xm1, xm2, xm3); vpdpbssd(xm1, xm2, ptr[rax+128]); vpdpbssd(xm1, xm2, ptr_b[rax+128]); @@ -30,3 +30,64 @@ vpdpbssd(ym1, ym2, ptr_b[rax+128]); vpdpbssd(zm1, zm2, zm3); vpdpbssd(zm1, zm2, ptr[rax+128]); vpdpbssd(zm1, zm2, ptr_b[rax+128]); +// +vpdpbssds(xm1, xm2, xm3); +vpdpbssds(xm1, xm2, ptr[rax+128]); +vpdpbssds(xm1, xm2, ptr_b[rax+128]); + +vpdpbssds(ym1, ym2, ym3); +vpdpbssds(ym1, ym2, ptr[rax+128]); +vpdpbssds(ym1, ym2, ptr_b[rax+128]); + +vpdpbssds(zm1, zm2, zm3); +vpdpbssds(zm1, zm2, ptr[rax+128]); +vpdpbssds(zm1, zm2, ptr_b[rax+128]); +// +vpdpbsud(xm1, xm2, xm3); +vpdpbsud(xm1, xm2, ptr[rax+128]); +vpdpbsud(xm1, xm2, ptr_b[rax+128]); + +vpdpbsud(ym1, ym2, ym3); +vpdpbsud(ym1, ym2, ptr[rax+128]); +vpdpbsud(ym1, ym2, ptr_b[rax+128]); + +vpdpbsud(zm1, zm2, zm3); +vpdpbsud(zm1, zm2, ptr[rax+128]); +vpdpbsud(zm1, zm2, ptr_b[rax+128]); +// +vpdpbsuds(xm1, xm2, xm3); +vpdpbsuds(xm1, xm2, ptr[rax+128]); +vpdpbsuds(xm1, xm2, ptr_b[rax+128]); + +vpdpbsuds(ym1, ym2, ym3); +vpdpbsuds(ym1, ym2, ptr[rax+128]); +vpdpbsuds(ym1, ym2, ptr_b[rax+128]); + +vpdpbsuds(zm1, zm2, zm3); +vpdpbsuds(zm1, zm2, ptr[rax+128]); +vpdpbsuds(zm1, zm2, ptr_b[rax+128]); + +// +vpdpbuud(xm1, xm2, xm3); +vpdpbuud(xm1, xm2, ptr[rax+128]); +vpdpbuud(xm1, xm2, ptr_b[rax+128]); + +vpdpbuud(ym1, ym2, ym3); +vpdpbuud(ym1, ym2, ptr[rax+128]); +vpdpbuud(ym1, ym2, ptr_b[rax+128]); + +vpdpbuud(zm1, zm2, zm3); +vpdpbuud(zm1, zm2, ptr[rax+128]); +vpdpbuud(zm1, zm2, ptr_b[rax+128]); +// +vpdpbuuds(xm1, xm2, xm3); +vpdpbuuds(xm1, xm2, ptr[rax+128]); +vpdpbuuds(xm1, xm2, 
ptr_b[rax+128]); + +vpdpbuuds(ym1, ym2, ym3); +vpdpbuuds(ym1, ym2, ptr[rax+128]); +vpdpbuuds(ym1, ym2, ptr_b[rax+128]); + +vpdpbuuds(zm1, zm2, zm3); +vpdpbuuds(zm1, zm2, ptr[rax+128]); +vpdpbuuds(zm1, zm2, ptr_b[rax+128]); diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h index dbe52e9..c3c6c8b 100644 --- a/xbyak/xbyak_mnemonic.h +++ b/xbyak/xbyak_mnemonic.h @@ -1419,13 +1419,8 @@ void vpcmpgtq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1 void vpcmpgtw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F|T_YMM, 0x65); } void vpcmpistri(const Xmm& xm, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(xm, op, T_66|T_0F3A, 0x63, imm); } void vpcmpistrm(const Xmm& xm, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(xm, op, T_66|T_0F3A, 0x62, imm); } -void vpdpbssds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2|T_0F38|T_W0|T_YMM, 0x51); } -void vpdpbsud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_0F38|T_W0|T_YMM, 0x50); } -void vpdpbsuds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_0F38|T_W0|T_YMM, 0x51); } void vpdpbusd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_B32, 0x50, encoding); } void vpdpbusds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_B32, 0x51, encoding); } -void vpdpbuud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38|T_W0|T_YMM, 0x50); } -void vpdpbuuds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38|T_W0|T_YMM, 0x51); } void vpdpwssd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_B32, 0x52, 
encoding); } void vpdpwssds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_B32, 0x53, encoding); } void vpdpwsud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_0F38|T_W0|T_YMM, 0xD2); } @@ -2451,6 +2446,11 @@ void vpcompressw(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N2|T void vpconflictd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B32, 0xC4); } void vpconflictq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX|T_B64, 0xC4); } void vpdpbssd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_F2|T_0F38|T_YMM, 0x50, encoding, NONE, T_W0, T_EW0|T_B32, 1); } +void vpdpbssds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_F2|T_0F38|T_YMM, 0x51, encoding, NONE, T_W0, T_EW0|T_B32, 1); } +void vpdpbsud(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_F3|T_0F38|T_YMM, 0x50, encoding, NONE, T_W0, T_EW0|T_B32, 1); } +void vpdpbsuds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_F3|T_0F38|T_YMM, 0x51, encoding, NONE, T_W0, T_EW0|T_B32, 1); } +void vpdpbuud(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F38|T_YMM, 0x50, encoding, NONE, T_W0, T_EW0|T_B32, 1); } +void vpdpbuuds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F38|T_YMM, 0x51, encoding, NONE, T_W0, T_EW0|T_B32, 1); } void vpermb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, 
T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX, 0x8D); } void vpermi2b(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX, 0x75); } void vpermi2d(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B32, 0x76); }