mirror of
https://github.com/herumi/xbyak
synced 2024-11-20 16:06:14 -07:00
vpdpb[su,uu,ss]d[,s] support avx10.2
This commit is contained in:
parent
f3f2dd2d74
commit
f6c66cf6b8
5 changed files with 97 additions and 26 deletions
30
doc/usage.md
30
doc/usage.md
|
@ -106,29 +106,37 @@ vcvtpd2dq xmm19, [eax+32]{1to4} --> vcvtpd2dq(xmm19, yword_b [eax+32]);
|
|||
vfpclassps k5{k3}, zword [rax+64], 5 --> vfpclassps(k5|k3, zword [rax+64], 5); // specify m512
|
||||
vfpclasspd k5{k3}, [rax+64]{1to2}, 5 --> vfpclasspd(k5|k3, xword_b [rax+64], 5); // broadcast 64-bit to 128-bit
|
||||
vfpclassps k5{k3}, [rax+64]{1to4}, 5 --> vfpclassps(k5|k3, yword_b [rax+64], 5); // broadcast 64-bit to 256-bit
|
||||
```
|
||||
|
||||
vpdpbusd(xm0, xm1, xm2); // default encoding is EVEX
|
||||
## Selecting AVX512-VNNI, AVX-VNNI, AVX-VNNI-INT8 etc.
|
||||
Some mnemonics have two types of encodings: VEX and EVEX.
|
||||
The functions for these mnemonics include an optional parameter as the last argument to specify the encoding.
|
||||
The default encoding depends on which form of the instruction was introduced first (VEX or EVEX),
|
||||
and can be changed with `setDefaultEncoding`.
|
||||
|
||||
```
|
||||
vpdpbusd(xm0, xm1, xm2); // default encoding: EVEX (AVX512-VNNI)
|
||||
vpdpbusd(xm0, xm1, xm2, EvexEncoding); // same as the above
|
||||
vpdpbusd(xm0, xm1, xm2, VexEncoding); // VEX encoding
|
||||
vpdpbusd(xm0, xm1, xm2, VexEncoding); // VEX (AVX-VNNI)
|
||||
setDefaultEncoding(VexEncoding); // default encoding is VEX
|
||||
vpdpbusd(xm0, xm1, xm2); // VEX encoding
|
||||
vpdpbusd(xm0, xm1, xm2); // VEX
|
||||
|
||||
vmpsadbw(xm1, xm3, xm15, 3); // default encoding
|
||||
vmpsadbw(xm1, xm3, xm15, 3, VexEncoding); // vex(avx)
|
||||
vmpsadbw(xm1, xm3, xm15, 3, EvexEncoding); // evex(avx10.2)
|
||||
vmpsadbw(xm1, xm3, xm15, 3); // default encoding: VEX (AVX)
|
||||
vmpsadbw(xm1, xm3, xm15, 3, VexEncoding); // same as the above
|
||||
vmpsadbw(xm1, xm3, xm15, 3, EvexEncoding); // EVEX (AVX10.2)
|
||||
setDefaultEncoding(VexEncoding, EvexEncoding); // the 2nd argument (avx10Enc) selects the default for vmpsadbw
|
||||
vmpsadbw(xm1, xm3, xm15, 3); // evex(avx10.2)
|
||||
|
||||
vmpsadbw(xm1, xm3, xm15, 3); // EVEX
|
||||
```
|
||||
|
||||
- `setDefaultEncoding(PreferredEncoding vnniEnc = EvexEncoding, PreferredEncoding avx10Enc = VexEncoding)`
|
||||
Sets the default encoding used by mnemonics that accept an optional `Xbyak::PreferredEncoding` parameter.
|
||||
|
||||
param|vnniEnc|avx10Enc
|
||||
-|-|-
|
||||
EvexEncoding|AVX512_VNNI|AVX10.2
|
||||
VexEncoding|AVX/AVX2|AVX-VNNI-INT8
|
||||
EvexEncoding|AVX512-VNNI|AVX10.2
|
||||
VexEncoding|AVX-VNNI|AVX-VNNI-INT8
|
||||
default|EvexEncoding|VexEncoding
|
||||
mnemonic|vpdpbusd, vpdpbusds, vpdpwssd, vpdpwssds|vmpsadbw, vpdpbssd
|
||||
mnemonic|vpdpbusd, vpdpbusds, vpdpwssd, vpdpwssds|vmpsadbw, vpdpbssd, vpdpbssds, vpdpbsud, vpdpbsuds, vpdpbuud, vpdpbuuds
|
||||
|
||||
### Remark
|
||||
* `k1`, ..., `k7` are opmask registers.
|
||||
|
|
|
@ -468,10 +468,12 @@ void putX_X_XM_IMM_AVX10()
|
|||
bool hasIMM;
|
||||
} tbl[] = {
|
||||
{ 0x50, "vpdpbssd", T_F2|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false },
|
||||
{ 0x51, "vpdpbssds", T_F2|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false },
|
||||
{ 0x50, "vpdpbsud", T_F3|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false },
|
||||
{ 0x51, "vpdpbsuds", T_F3|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false },
|
||||
{ 0x50, "vpdpbuud", T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false },
|
||||
{ 0x51, "vpdpbuuds", T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false },
|
||||
#if 0
|
||||
{ 0x51, "vpdpbssds", T_MUST_EVEX | T_YMM | T_F2 | T_0F38 | T_EW0 | T_B32, false },
|
||||
{ 0x50, "vpdpbsud", T_MUST_EVEX | T_YMM | T_F3 | T_0F38 | T_EW0 | T_B32, false },
|
||||
{ 0x51, "vpdpbsuds", T_MUST_EVEX | T_YMM | T_F3 | T_0F38 | T_EW0 | T_B32, false },
|
||||
{ 0x50, "vpdpbuud", T_MUST_EVEX | T_YMM | T_0F38 | T_EW0 | T_B32, false },
|
||||
{ 0x51, "vpdpbuuds", T_MUST_EVEX | T_YMM | T_0F38 | T_EW0 | T_B32, false },
|
||||
#endif
|
||||
|
|
|
@ -1908,11 +1908,11 @@ void put()
|
|||
uint64_t type;
|
||||
} tbl[] = {
|
||||
// { 0x50, "vpdpbssd", T_F2 | T_0F38 | T_W0 | T_YMM },
|
||||
{ 0x51, "vpdpbssds", T_F2 | T_0F38 | T_W0 | T_YMM },
|
||||
{ 0x50, "vpdpbsud", T_F3 | T_0F38 | T_W0 | T_YMM },
|
||||
{ 0x51, "vpdpbsuds", T_F3 | T_0F38 | T_W0 | T_YMM },
|
||||
{ 0x50, "vpdpbuud", T_0F38 | T_W0 | T_YMM },
|
||||
{ 0x51, "vpdpbuuds", T_0F38 | T_W0 | T_YMM },
|
||||
// { 0x51, "vpdpbssds", T_F2 | T_0F38 | T_W0 | T_YMM },
|
||||
// { 0x50, "vpdpbsud", T_F3 | T_0F38 | T_W0 | T_YMM },
|
||||
// { 0x51, "vpdpbsuds", T_F3 | T_0F38 | T_W0 | T_YMM },
|
||||
// { 0x50, "vpdpbuud", T_0F38 | T_W0 | T_YMM },
|
||||
// { 0x51, "vpdpbuuds", T_0F38 | T_W0 | T_YMM },
|
||||
|
||||
{ 0xD2, "vpdpwsud", T_F3 | T_0F38 | T_W0 | T_YMM },
|
||||
{ 0xD3, "vpdpwsuds", T_F3 | T_0F38 | T_W0 | T_YMM },
|
||||
|
|
|
@ -9,7 +9,7 @@ vdpphps(ym1, ym2, ptr_b[rax+128]);
|
|||
vdpphps(zm1, zm2, zm3);
|
||||
vdpphps(zm1, zm2, ptr[rax+128]);
|
||||
vdpphps(zm1, zm2, ptr_b[rax+128]);
|
||||
|
||||
//
|
||||
vmpsadbw(xm1, xm3, xm15, 3);
|
||||
vmpsadbw(xm1|T_z, xm4, ptr[rax+128], 5);
|
||||
|
||||
|
@ -18,7 +18,7 @@ vmpsadbw(ym1, ym4, ptr[rax+128], 5);
|
|||
|
||||
vmpsadbw(zm1|k4, zm3, zm15, 3);
|
||||
vmpsadbw(zm1, zm4, ptr[rax+128], 5);
|
||||
|
||||
//
|
||||
vpdpbssd(xm1, xm2, xm3);
|
||||
vpdpbssd(xm1, xm2, ptr[rax+128]);
|
||||
vpdpbssd(xm1, xm2, ptr_b[rax+128]);
|
||||
|
@ -30,3 +30,64 @@ vpdpbssd(ym1, ym2, ptr_b[rax+128]);
|
|||
vpdpbssd(zm1, zm2, zm3);
|
||||
vpdpbssd(zm1, zm2, ptr[rax+128]);
|
||||
vpdpbssd(zm1, zm2, ptr_b[rax+128]);
|
||||
//
|
||||
vpdpbssds(xm1, xm2, xm3);
|
||||
vpdpbssds(xm1, xm2, ptr[rax+128]);
|
||||
vpdpbssds(xm1, xm2, ptr_b[rax+128]);
|
||||
|
||||
vpdpbssds(ym1, ym2, ym3);
|
||||
vpdpbssds(ym1, ym2, ptr[rax+128]);
|
||||
vpdpbssds(ym1, ym2, ptr_b[rax+128]);
|
||||
|
||||
vpdpbssds(zm1, zm2, zm3);
|
||||
vpdpbssds(zm1, zm2, ptr[rax+128]);
|
||||
vpdpbssds(zm1, zm2, ptr_b[rax+128]);
|
||||
//
|
||||
vpdpbsud(xm1, xm2, xm3);
|
||||
vpdpbsud(xm1, xm2, ptr[rax+128]);
|
||||
vpdpbsud(xm1, xm2, ptr_b[rax+128]);
|
||||
|
||||
vpdpbsud(ym1, ym2, ym3);
|
||||
vpdpbsud(ym1, ym2, ptr[rax+128]);
|
||||
vpdpbsud(ym1, ym2, ptr_b[rax+128]);
|
||||
|
||||
vpdpbsud(zm1, zm2, zm3);
|
||||
vpdpbsud(zm1, zm2, ptr[rax+128]);
|
||||
vpdpbsud(zm1, zm2, ptr_b[rax+128]);
|
||||
//
|
||||
vpdpbsuds(xm1, xm2, xm3);
|
||||
vpdpbsuds(xm1, xm2, ptr[rax+128]);
|
||||
vpdpbsuds(xm1, xm2, ptr_b[rax+128]);
|
||||
|
||||
vpdpbsuds(ym1, ym2, ym3);
|
||||
vpdpbsuds(ym1, ym2, ptr[rax+128]);
|
||||
vpdpbsuds(ym1, ym2, ptr_b[rax+128]);
|
||||
|
||||
vpdpbsuds(zm1, zm2, zm3);
|
||||
vpdpbsuds(zm1, zm2, ptr[rax+128]);
|
||||
vpdpbsuds(zm1, zm2, ptr_b[rax+128]);
|
||||
|
||||
//
|
||||
vpdpbuud(xm1, xm2, xm3);
|
||||
vpdpbuud(xm1, xm2, ptr[rax+128]);
|
||||
vpdpbuud(xm1, xm2, ptr_b[rax+128]);
|
||||
|
||||
vpdpbuud(ym1, ym2, ym3);
|
||||
vpdpbuud(ym1, ym2, ptr[rax+128]);
|
||||
vpdpbuud(ym1, ym2, ptr_b[rax+128]);
|
||||
|
||||
vpdpbuud(zm1, zm2, zm3);
|
||||
vpdpbuud(zm1, zm2, ptr[rax+128]);
|
||||
vpdpbuud(zm1, zm2, ptr_b[rax+128]);
|
||||
//
|
||||
vpdpbuuds(xm1, xm2, xm3);
|
||||
vpdpbuuds(xm1, xm2, ptr[rax+128]);
|
||||
vpdpbuuds(xm1, xm2, ptr_b[rax+128]);
|
||||
|
||||
vpdpbuuds(ym1, ym2, ym3);
|
||||
vpdpbuuds(ym1, ym2, ptr[rax+128]);
|
||||
vpdpbuuds(ym1, ym2, ptr_b[rax+128]);
|
||||
|
||||
vpdpbuuds(zm1, zm2, zm3);
|
||||
vpdpbuuds(zm1, zm2, ptr[rax+128]);
|
||||
vpdpbuuds(zm1, zm2, ptr_b[rax+128]);
|
||||
|
|
|
@ -1419,13 +1419,8 @@ void vpcmpgtq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1
|
|||
// vpcmpgtw: forwards (x1, x2, op) to opAVX_X_X_XM with
// type flags T_66|T_0F|T_YMM and code 0x65.
void vpcmpgtw(const Xmm& x1, const Xmm& x2, const Operand& op)
{
	opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM, 0x65);
}
|
||||
// vpcmpistri: forwards (xm, op) and the 8-bit immediate to opAVX_X_XM_IMM
// with type flags T_66|T_0F3A and code 0x63.
void vpcmpistri(const Xmm& xm, const Operand& op, uint8_t imm)
{
	opAVX_X_XM_IMM(xm, op, T_66 | T_0F3A, 0x63, imm);
}
|
||||
// vpcmpistrm: forwards (xm, op) and the 8-bit immediate to opAVX_X_XM_IMM
// with type flags T_66|T_0F3A and code 0x62.
void vpcmpistrm(const Xmm& xm, const Operand& op, uint8_t imm)
{
	opAVX_X_XM_IMM(xm, op, T_66 | T_0F3A, 0x62, imm);
}
|
||||
// vpdpbssds (three-operand form without an encoding selector):
// forwards to opAVX_X_X_XM with type flags T_F2|T_0F38|T_W0|T_YMM and code 0x51.
void vpdpbssds(const Xmm& x1, const Xmm& x2, const Operand& op)
{
	opAVX_X_X_XM(x1, x2, op, T_F2 | T_0F38 | T_W0 | T_YMM, 0x51);
}
|
||||
// vpdpbsud (three-operand form without an encoding selector):
// forwards to opAVX_X_X_XM with type flags T_F3|T_0F38|T_W0|T_YMM and code 0x50.
void vpdpbsud(const Xmm& x1, const Xmm& x2, const Operand& op)
{
	opAVX_X_X_XM(x1, x2, op, T_F3 | T_0F38 | T_W0 | T_YMM, 0x50);
}
|
||||
// vpdpbsuds (three-operand form without an encoding selector):
// forwards to opAVX_X_X_XM with type flags T_F3|T_0F38|T_W0|T_YMM and code 0x51.
void vpdpbsuds(const Xmm& x1, const Xmm& x2, const Operand& op)
{
	opAVX_X_X_XM(x1, x2, op, T_F3 | T_0F38 | T_W0 | T_YMM, 0x51);
}
|
||||
// vpdpbusd: emits through opEncoding so the caller can pick the encoding.
// `encoding` defaults to DefaultEncoding; the type flags and code 0x50 are
// passed through unchanged.
void vpdpbusd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding)
{
	opEncoding(
		x1, x2, op,
		T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32,
		0x50,
		encoding);
}
|
||||
// vpdpbusds: emits through opEncoding so the caller can pick the encoding.
// `encoding` defaults to DefaultEncoding; the type flags and code 0x51 are
// passed through unchanged.
void vpdpbusds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding)
{
	opEncoding(
		x1, x2, op,
		T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32,
		0x51,
		encoding);
}
|
||||
// vpdpbuud (three-operand form without an encoding selector):
// forwards to opAVX_X_X_XM with type flags T_0F38|T_W0|T_YMM and code 0x50.
void vpdpbuud(const Xmm& x1, const Xmm& x2, const Operand& op)
{
	opAVX_X_X_XM(x1, x2, op, T_0F38 | T_W0 | T_YMM, 0x50);
}
|
||||
// vpdpbuuds (three-operand form without an encoding selector):
// forwards to opAVX_X_X_XM with type flags T_0F38|T_W0|T_YMM and code 0x51.
void vpdpbuuds(const Xmm& x1, const Xmm& x2, const Operand& op)
{
	opAVX_X_X_XM(x1, x2, op, T_0F38 | T_W0 | T_YMM, 0x51);
}
|
||||
// vpdpwssd: emits through opEncoding so the caller can pick the encoding.
// `encoding` defaults to DefaultEncoding; the type flags and code 0x52 are
// passed through unchanged.
void vpdpwssd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding)
{
	opEncoding(
		x1, x2, op,
		T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32,
		0x52,
		encoding);
}
|
||||
// vpdpwssds: emits through opEncoding so the caller can pick the encoding.
// `encoding` defaults to DefaultEncoding; the type flags and code 0x53 are
// passed through unchanged.
void vpdpwssds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding)
{
	opEncoding(
		x1, x2, op,
		T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32,
		0x53,
		encoding);
}
|
||||
// vpdpwsud: forwards (x1, x2, op) to opAVX_X_X_XM with
// type flags T_F3|T_0F38|T_W0|T_YMM and code 0xD2.
void vpdpwsud(const Xmm& x1, const Xmm& x2, const Operand& op)
{
	opAVX_X_X_XM(x1, x2, op, T_F3 | T_0F38 | T_W0 | T_YMM, 0xD2);
}
|
||||
|
@ -2451,6 +2446,11 @@ void vpcompressw(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N2|T
|
|||
// vpconflictd: forwards (x, op) to opAVX_X_XM_IMM with code 0xC4.
// The type flags include T_MUST_EVEX together with T_EW0 and T_B32.
void vpconflictd(const Xmm& x, const Operand& op)
{
	opAVX_X_XM_IMM(
		x, op,
		T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_B32,
		0xC4);
}
|
||||
// vpconflictq: forwards (x, op) to opAVX_X_XM_IMM with code 0xC4.
// Same shape as vpconflictd but with T_EW1 and T_B64 in the type flags.
void vpconflictq(const Xmm& x, const Operand& op)
{
	opAVX_X_XM_IMM(
		x, op,
		T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64,
		0xC4);
}
|
||||
// vpdpbssd: emits through opEncoding with a caller-selectable encoding
// (`encoding` defaults to DefaultEncoding).
// NOTE(review): the trailing arguments look like (immediate, VEX-only flags,
// EVEX-only flags, default-encoding selector) — confirm against opEncoding.
void vpdpbssd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding)
{
	opEncoding(
		x1, x2, op,
		T_F2 | T_0F38 | T_YMM,
		0x50,
		encoding,
		NONE,
		T_W0,
		T_EW0 | T_B32,
		1);
}
|
||||
// vpdpbssds: emits through opEncoding with a caller-selectable encoding
// (`encoding` defaults to DefaultEncoding).
// NOTE(review): the trailing arguments look like (immediate, VEX-only flags,
// EVEX-only flags, default-encoding selector) — confirm against opEncoding.
void vpdpbssds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding)
{
	opEncoding(
		x1, x2, op,
		T_F2 | T_0F38 | T_YMM,
		0x51,
		encoding,
		NONE,
		T_W0,
		T_EW0 | T_B32,
		1);
}
|
||||
// vpdpbsud: emits through opEncoding with a caller-selectable encoding
// (`encoding` defaults to DefaultEncoding).
// NOTE(review): the trailing arguments look like (immediate, VEX-only flags,
// EVEX-only flags, default-encoding selector) — confirm against opEncoding.
void vpdpbsud(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding)
{
	opEncoding(
		x1, x2, op,
		T_F3 | T_0F38 | T_YMM,
		0x50,
		encoding,
		NONE,
		T_W0,
		T_EW0 | T_B32,
		1);
}
|
||||
// vpdpbsuds: emits through opEncoding with a caller-selectable encoding
// (`encoding` defaults to DefaultEncoding).
// NOTE(review): the trailing arguments look like (immediate, VEX-only flags,
// EVEX-only flags, default-encoding selector) — confirm against opEncoding.
void vpdpbsuds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding)
{
	opEncoding(
		x1, x2, op,
		T_F3 | T_0F38 | T_YMM,
		0x51,
		encoding,
		NONE,
		T_W0,
		T_EW0 | T_B32,
		1);
}
|
||||
// vpdpbuud: emits through opEncoding with a caller-selectable encoding
// (`encoding` defaults to DefaultEncoding).
// NOTE(review): the trailing arguments look like (immediate, VEX-only flags,
// EVEX-only flags, default-encoding selector) — confirm against opEncoding.
void vpdpbuud(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding)
{
	opEncoding(
		x1, x2, op,
		T_0F38 | T_YMM,
		0x50,
		encoding,
		NONE,
		T_W0,
		T_EW0 | T_B32,
		1);
}
|
||||
// vpdpbuuds: emits through opEncoding with a caller-selectable encoding
// (`encoding` defaults to DefaultEncoding).
// NOTE(review): the trailing arguments look like (immediate, VEX-only flags,
// EVEX-only flags, default-encoding selector) — confirm against opEncoding.
void vpdpbuuds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding)
{
	opEncoding(
		x1, x2, op,
		T_0F38 | T_YMM,
		0x51,
		encoding,
		NONE,
		T_W0,
		T_EW0 | T_B32,
		1);
}
|
||||
// vpermb: forwards (x1, x2, op) to opAVX_X_X_XM with code 0x8D.
// The type flags include T_MUST_EVEX.
void vpermb(const Xmm& x1, const Xmm& x2, const Operand& op)
{
	opAVX_X_X_XM(
		x1, x2, op,
		T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX,
		0x8D);
}
|
||||
// vpermi2b: forwards (x1, x2, op) to opAVX_X_X_XM with code 0x75.
// The type flags include T_MUST_EVEX.
void vpermi2b(const Xmm& x1, const Xmm& x2, const Operand& op)
{
	opAVX_X_X_XM(
		x1, x2, op,
		T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX,
		0x75);
}
|
||||
// vpermi2d: forwards (x1, x2, op) to opAVX_X_X_XM with code 0x76.
// The type flags include T_MUST_EVEX and T_B32.
void vpermi2d(const Xmm& x1, const Xmm& x2, const Operand& op)
{
	opAVX_X_X_XM(
		x1, x2, op,
		T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_B32,
		0x76);
}
|
||||
|
|
Loading…
Reference in a new issue