vpdpb[su,uu,ss]d[,s] support avx10.2

This commit is contained in:
MITSUNARI Shigeo 2024-10-13 15:27:05 +09:00
parent f3f2dd2d74
commit f6c66cf6b8
5 changed files with 97 additions and 26 deletions

View file

@ -106,29 +106,37 @@ vcvtpd2dq xmm19, [eax+32]{1to4} --> vcvtpd2dq(xmm19, yword_b [eax+32]);
vfpclassps k5{k3}, zword [rax+64], 5 --> vfpclassps(k5|k3, zword [rax+64], 5); // specify m512 vfpclassps k5{k3}, zword [rax+64], 5 --> vfpclassps(k5|k3, zword [rax+64], 5); // specify m512
vfpclasspd k5{k3}, [rax+64]{1to2}, 5 --> vfpclasspd(k5|k3, xword_b [rax+64], 5); // broadcast 64-bit to 128-bit vfpclasspd k5{k3}, [rax+64]{1to2}, 5 --> vfpclasspd(k5|k3, xword_b [rax+64], 5); // broadcast 64-bit to 128-bit
vfpclassps k5{k3}, [rax+64]{1to4}, 5 --> vfpclassps(k5|k3, yword_b [rax+64], 5); // broadcast 64-bit to 256-bit vfpclassps k5{k3}, [rax+64]{1to4}, 5 --> vfpclassps(k5|k3, yword_b [rax+64], 5); // broadcast 64-bit to 256-bit
```
vpdpbusd(xm0, xm1, xm2); // default encoding is EVEX ## Selecting AVX512-VNNI, AVX-VNNI, AVX-VNNI-INT8 etc.
Some mnemonics have two types of encodings: VEX and EVEX.
The functions for these mnemonics include an optional parameter as the last argument to specify the encoding.
The default behavior depends on the order in which the instruction was introduced (whether VEX or EVEX came first),
and can be specified using setDefaultEncoding.
```
vpdpbusd(xm0, xm1, xm2); // default encoding: EVEX (AVX512-VNNI)
vpdpbusd(xm0, xm1, xm2, EvexEncoding); // same as the above vpdpbusd(xm0, xm1, xm2, EvexEncoding); // same as the above
vpdpbusd(xm0, xm1, xm2, VexEncoding); // VEX encoding vpdpbusd(xm0, xm1, xm2, VexEncoding); // VEX (AVX-VNNI)
setDefaultEncoding(VexEncoding); // default encoding is VEX setDefaultEncoding(VexEncoding); // default encoding is VEX
vpdpbusd(xm0, xm1, xm2); // VEX encoding vpdpbusd(xm0, xm1, xm2); // VEX
vmpsadbw(xm1, xm3, xm15, 3); // default encoding vmpsadbw(xm1, xm3, xm15, 3); // default encoding: VEX (AVX)
vmpsadbw(xm1, xm3, xm15, 3, VexEncoding); // vex(avx) vmpsadbw(xm1, xm3, xm15, 3, VexEncoding); // same as the above
vmpsadbw(xm1, xm3, xm15, 3, EvexEncoding); // evex(avx10.2) vmpsadbw(xm1, xm3, xm15, 3, EvexEncoding); // EVEX (AVX10.2)
setDefaultEncoding(VexEncoding, EvexEncoding); // use 2nd argument. setDefaultEncoding(VexEncoding, EvexEncoding); // use 2nd argument.
vmpsadbw(xm1, xm3, xm15, 3); // evex(avx10.2) vmpsadbw(xm1, xm3, xm15, 3); // EVEX
``` ```
- `setDefaultEncoding(PreferredEncoding vnniEnc = EvexEncoding, PreferredEncoding avx10Enc = VexEncoding)` - `setDefaultEncoding(PreferredEncoding vnniEnc = EvexEncoding, PreferredEncoding avx10Enc = VexEncoding)`
Controls the default encoding of the mnemonics that take an `Xbyak::PreferredEncoding` parameter.
param|vnniEnc|avx10Enc param|vnniEnc|avx10Enc
-|-|- -|-|-
EvexEncoding|AVX512_VNNI|AVX10.2 EvexEncoding|AVX512-VNNI|AVX10.2
VexEncoding|AVX/AVX2|AVX-VNNI-INT8 VexEncoding|AVX-VNNI|AVX-VNNI-INT8
default|EvexEncoding|VexEncoding default|EvexEncoding|VexEncoding
mnemonic|vpdpbusd, vpdpbusds, vpdpwssd, vpdpwssds|vmpsadbw, vpdpbssd mnemonic|vpdpbusd, vpdpbusds, vpdpwssd, vpdpwssds|vmpsadbw, vpdpbssd, vpdpbssds, vpdpbsud, vpdpbsuds, vpdpbuud, vpdpbuuds
### Remark ### Remark
* `k1`, ..., `k7` are opmask registers. * `k1`, ..., `k7` are opmask registers.

View file

@ -468,10 +468,12 @@ void putX_X_XM_IMM_AVX10()
bool hasIMM; bool hasIMM;
} tbl[] = { } tbl[] = {
{ 0x50, "vpdpbssd", T_F2|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, { 0x50, "vpdpbssd", T_F2|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false },
{ 0x51, "vpdpbssds", T_F2|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false },
{ 0x50, "vpdpbsud", T_F3|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false },
{ 0x51, "vpdpbsuds", T_F3|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false },
{ 0x50, "vpdpbuud", T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false },
{ 0x51, "vpdpbuuds", T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false },
#if 0 #if 0
{ 0x51, "vpdpbssds", T_MUST_EVEX | T_YMM | T_F2 | T_0F38 | T_EW0 | T_B32, false },
{ 0x50, "vpdpbsud", T_MUST_EVEX | T_YMM | T_F3 | T_0F38 | T_EW0 | T_B32, false },
{ 0x51, "vpdpbsuds", T_MUST_EVEX | T_YMM | T_F3 | T_0F38 | T_EW0 | T_B32, false },
{ 0x50, "vpdpbuud", T_MUST_EVEX | T_YMM | T_0F38 | T_EW0 | T_B32, false }, { 0x50, "vpdpbuud", T_MUST_EVEX | T_YMM | T_0F38 | T_EW0 | T_B32, false },
{ 0x51, "vpdpbuuds", T_MUST_EVEX | T_YMM | T_0F38 | T_EW0 | T_B32, false }, { 0x51, "vpdpbuuds", T_MUST_EVEX | T_YMM | T_0F38 | T_EW0 | T_B32, false },
#endif #endif

View file

@ -1908,11 +1908,11 @@ void put()
uint64_t type; uint64_t type;
} tbl[] = { } tbl[] = {
// { 0x50, "vpdpbssd", T_F2 | T_0F38 | T_W0 | T_YMM }, // { 0x50, "vpdpbssd", T_F2 | T_0F38 | T_W0 | T_YMM },
{ 0x51, "vpdpbssds", T_F2 | T_0F38 | T_W0 | T_YMM }, // { 0x51, "vpdpbssds", T_F2 | T_0F38 | T_W0 | T_YMM },
{ 0x50, "vpdpbsud", T_F3 | T_0F38 | T_W0 | T_YMM }, // { 0x50, "vpdpbsud", T_F3 | T_0F38 | T_W0 | T_YMM },
{ 0x51, "vpdpbsuds", T_F3 | T_0F38 | T_W0 | T_YMM }, // { 0x51, "vpdpbsuds", T_F3 | T_0F38 | T_W0 | T_YMM },
{ 0x50, "vpdpbuud", T_0F38 | T_W0 | T_YMM }, // { 0x50, "vpdpbuud", T_0F38 | T_W0 | T_YMM },
{ 0x51, "vpdpbuuds", T_0F38 | T_W0 | T_YMM }, // { 0x51, "vpdpbuuds", T_0F38 | T_W0 | T_YMM },
{ 0xD2, "vpdpwsud", T_F3 | T_0F38 | T_W0 | T_YMM }, { 0xD2, "vpdpwsud", T_F3 | T_0F38 | T_W0 | T_YMM },
{ 0xD3, "vpdpwsuds", T_F3 | T_0F38 | T_W0 | T_YMM }, { 0xD3, "vpdpwsuds", T_F3 | T_0F38 | T_W0 | T_YMM },

View file

@ -9,7 +9,7 @@ vdpphps(ym1, ym2, ptr_b[rax+128]);
vdpphps(zm1, zm2, zm3); vdpphps(zm1, zm2, zm3);
vdpphps(zm1, zm2, ptr[rax+128]); vdpphps(zm1, zm2, ptr[rax+128]);
vdpphps(zm1, zm2, ptr_b[rax+128]); vdpphps(zm1, zm2, ptr_b[rax+128]);
//
vmpsadbw(xm1, xm3, xm15, 3); vmpsadbw(xm1, xm3, xm15, 3);
vmpsadbw(xm1|T_z, xm4, ptr[rax+128], 5); vmpsadbw(xm1|T_z, xm4, ptr[rax+128], 5);
@ -18,7 +18,7 @@ vmpsadbw(ym1, ym4, ptr[rax+128], 5);
vmpsadbw(zm1|k4, zm3, zm15, 3); vmpsadbw(zm1|k4, zm3, zm15, 3);
vmpsadbw(zm1, zm4, ptr[rax+128], 5); vmpsadbw(zm1, zm4, ptr[rax+128], 5);
//
vpdpbssd(xm1, xm2, xm3); vpdpbssd(xm1, xm2, xm3);
vpdpbssd(xm1, xm2, ptr[rax+128]); vpdpbssd(xm1, xm2, ptr[rax+128]);
vpdpbssd(xm1, xm2, ptr_b[rax+128]); vpdpbssd(xm1, xm2, ptr_b[rax+128]);
@ -30,3 +30,64 @@ vpdpbssd(ym1, ym2, ptr_b[rax+128]);
vpdpbssd(zm1, zm2, zm3); vpdpbssd(zm1, zm2, zm3);
vpdpbssd(zm1, zm2, ptr[rax+128]); vpdpbssd(zm1, zm2, ptr[rax+128]);
vpdpbssd(zm1, zm2, ptr_b[rax+128]); vpdpbssd(zm1, zm2, ptr_b[rax+128]);
//
vpdpbssds(xm1, xm2, xm3);
vpdpbssds(xm1, xm2, ptr[rax+128]);
vpdpbssds(xm1, xm2, ptr_b[rax+128]);
vpdpbssds(ym1, ym2, ym3);
vpdpbssds(ym1, ym2, ptr[rax+128]);
vpdpbssds(ym1, ym2, ptr_b[rax+128]);
vpdpbssds(zm1, zm2, zm3);
vpdpbssds(zm1, zm2, ptr[rax+128]);
vpdpbssds(zm1, zm2, ptr_b[rax+128]);
//
vpdpbsud(xm1, xm2, xm3);
vpdpbsud(xm1, xm2, ptr[rax+128]);
vpdpbsud(xm1, xm2, ptr_b[rax+128]);
vpdpbsud(ym1, ym2, ym3);
vpdpbsud(ym1, ym2, ptr[rax+128]);
vpdpbsud(ym1, ym2, ptr_b[rax+128]);
vpdpbsud(zm1, zm2, zm3);
vpdpbsud(zm1, zm2, ptr[rax+128]);
vpdpbsud(zm1, zm2, ptr_b[rax+128]);
//
vpdpbsuds(xm1, xm2, xm3);
vpdpbsuds(xm1, xm2, ptr[rax+128]);
vpdpbsuds(xm1, xm2, ptr_b[rax+128]);
vpdpbsuds(ym1, ym2, ym3);
vpdpbsuds(ym1, ym2, ptr[rax+128]);
vpdpbsuds(ym1, ym2, ptr_b[rax+128]);
vpdpbsuds(zm1, zm2, zm3);
vpdpbsuds(zm1, zm2, ptr[rax+128]);
vpdpbsuds(zm1, zm2, ptr_b[rax+128]);
//
vpdpbuud(xm1, xm2, xm3);
vpdpbuud(xm1, xm2, ptr[rax+128]);
vpdpbuud(xm1, xm2, ptr_b[rax+128]);
vpdpbuud(ym1, ym2, ym3);
vpdpbuud(ym1, ym2, ptr[rax+128]);
vpdpbuud(ym1, ym2, ptr_b[rax+128]);
vpdpbuud(zm1, zm2, zm3);
vpdpbuud(zm1, zm2, ptr[rax+128]);
vpdpbuud(zm1, zm2, ptr_b[rax+128]);
//
vpdpbuuds(xm1, xm2, xm3);
vpdpbuuds(xm1, xm2, ptr[rax+128]);
vpdpbuuds(xm1, xm2, ptr_b[rax+128]);
vpdpbuuds(ym1, ym2, ym3);
vpdpbuuds(ym1, ym2, ptr[rax+128]);
vpdpbuuds(ym1, ym2, ptr_b[rax+128]);
vpdpbuuds(zm1, zm2, zm3);
vpdpbuuds(zm1, zm2, ptr[rax+128]);
vpdpbuuds(zm1, zm2, ptr_b[rax+128]);

View file

@ -1419,13 +1419,8 @@ void vpcmpgtq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1
void vpcmpgtw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F|T_YMM, 0x65); } void vpcmpgtw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F|T_YMM, 0x65); }
void vpcmpistri(const Xmm& xm, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(xm, op, T_66|T_0F3A, 0x63, imm); } void vpcmpistri(const Xmm& xm, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(xm, op, T_66|T_0F3A, 0x63, imm); }
void vpcmpistrm(const Xmm& xm, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(xm, op, T_66|T_0F3A, 0x62, imm); } void vpcmpistrm(const Xmm& xm, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(xm, op, T_66|T_0F3A, 0x62, imm); }
void vpdpbssds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2|T_0F38|T_W0|T_YMM, 0x51); }
void vpdpbsud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_0F38|T_W0|T_YMM, 0x50); }
void vpdpbsuds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_0F38|T_W0|T_YMM, 0x51); }
void vpdpbusd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_B32, 0x50, encoding); } void vpdpbusd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_B32, 0x50, encoding); }
void vpdpbusds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_B32, 0x51, encoding); } void vpdpbusds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_B32, 0x51, encoding); }
void vpdpbuud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38|T_W0|T_YMM, 0x50); }
void vpdpbuuds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38|T_W0|T_YMM, 0x51); }
void vpdpwssd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_B32, 0x52, encoding); } void vpdpwssd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_B32, 0x52, encoding); }
void vpdpwssds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_B32, 0x53, encoding); } void vpdpwssds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_B32, 0x53, encoding); }
void vpdpwsud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_0F38|T_W0|T_YMM, 0xD2); } void vpdpwsud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_0F38|T_W0|T_YMM, 0xD2); }
@ -2451,6 +2446,11 @@ void vpcompressw(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N2|T
void vpconflictd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B32, 0xC4); } void vpconflictd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B32, 0xC4); }
void vpconflictq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX|T_B64, 0xC4); } void vpconflictq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX|T_B64, 0xC4); }
void vpdpbssd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_F2|T_0F38|T_YMM, 0x50, encoding, NONE, T_W0, T_EW0|T_B32, 1); } void vpdpbssd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_F2|T_0F38|T_YMM, 0x50, encoding, NONE, T_W0, T_EW0|T_B32, 1); }
// vpdpb{ss,su,uu}d[s] wrappers with a selectable encoding.
// Each one forwards its three operands to opEncoding together with:
//   - a shared type: an optional prefix flag (T_F2 for "ss", T_F3 for "su", none for "uu") | T_0F38 | T_YMM,
//   - the opcode: 0x50 for the plain name, 0x51 for the name with the trailing "s",
//   - the caller's `encoding` (DefaultEncoding when omitted; VexEncoding / EvexEncoding may be passed explicitly),
//   - the fixed trailing arguments NONE, T_W0, T_EW0|T_B32, 1.
// NOTE(review): T_W0 and T_EW0|T_B32 look like the extra type bits applied for the VEX and
// EVEX forms respectively, and the final 1 looks like a selector for the second
// setDefaultEncoding argument (avx10Enc) -- opEncoding is not visible here, confirm there.
void vpdpbssds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_F2|T_0F38|T_YMM, 0x51, encoding, NONE, T_W0, T_EW0|T_B32, 1); }
void vpdpbsud(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_F3|T_0F38|T_YMM, 0x50, encoding, NONE, T_W0, T_EW0|T_B32, 1); }
void vpdpbsuds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_F3|T_0F38|T_YMM, 0x51, encoding, NONE, T_W0, T_EW0|T_B32, 1); }
void vpdpbuud(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F38|T_YMM, 0x50, encoding, NONE, T_W0, T_EW0|T_B32, 1); }
void vpdpbuuds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F38|T_YMM, 0x51, encoding, NONE, T_W0, T_EW0|T_B32, 1); }
void vpermb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX, 0x8D); } void vpermb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX, 0x8D); }
void vpermi2b(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX, 0x75); } void vpermi2b(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX, 0x75); }
void vpermi2d(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B32, 0x76); } void vpermi2d(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B32, 0x76); }