vpdpw[su,us,uu]d[,s] support avx10.2
Some checks are pending
test / test (push) Waiting to run

This commit is contained in:
MITSUNARI Shigeo 2024-10-13 15:45:43 +09:00
parent f6c66cf6b8
commit 08f71cee95
5 changed files with 99 additions and 17 deletions

View file

@ -136,7 +136,7 @@ param|vnniEnc|avx10Enc
EvexEncoding|AVX512-VNNI|AVX10.2 EvexEncoding|AVX512-VNNI|AVX10.2
VexEncoding|AVX-VNNI|AVX-VNNI-INT8 VexEncoding|AVX-VNNI|AVX-VNNI-INT8
default|EvexEncoding|VexEncoding default|EvexEncoding|VexEncoding
mnemonic|vpdpbusd, vpdpbusds, vpdpwssd, vpdpwssds|vmpsadbw, vpdpbssd, vpdpbssds, vpdpbsud, vpdpbsuds, vpdpbuud, vpdpbuuds mnemonic|vpdpbusd, vpdpbusds, vpdpwssd, vpdpwssds|vmpsadbw, vpdpbssd, vpdpbssds, vpdpbsud, vpdpbsuds, vpdpbuud, vpdpbuuds, vpdpwsud vpdpwsuds vpdpwusd vpdpwusds vpdpwuud, vpdpwuuds
### Remark ### Remark
* `k1`, ..., `k7` are opmask registers. * `k1`, ..., `k7` are opmask registers.

View file

@ -467,16 +467,22 @@ void putX_X_XM_IMM_AVX10()
int sel; int sel;
bool hasIMM; bool hasIMM;
} tbl[] = { } tbl[] = {
// vpdpb[su,uu,ss]d[,s]
{ 0x50, "vpdpbssd", T_F2|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, { 0x50, "vpdpbssd", T_F2|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false },
{ 0x51, "vpdpbssds", T_F2|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, { 0x51, "vpdpbssds", T_F2|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false },
{ 0x50, "vpdpbsud", T_F3|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, { 0x50, "vpdpbsud", T_F3|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false },
{ 0x51, "vpdpbsuds", T_F3|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, { 0x51, "vpdpbsuds", T_F3|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false },
{ 0x50, "vpdpbuud", T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, { 0x50, "vpdpbuud", T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false },
{ 0x51, "vpdpbuuds", T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, { 0x51, "vpdpbuuds", T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false },
#if 0
{ 0x50, "vpdpbuud", T_MUST_EVEX | T_YMM | T_0F38 | T_EW0 | T_B32, false }, // vpdpw[su,us,uu]d[,s]
{ 0x51, "vpdpbuuds", T_MUST_EVEX | T_YMM | T_0F38 | T_EW0 | T_B32, false }, { 0xD2, "vpdpwsud", T_F3|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false },
#endif { 0xD3, "vpdpwsuds", T_F3|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false },
{ 0xD2, "vpdpwusd", T_66|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false },
{ 0xD3, "vpdpwusds", T_66|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false },
{ 0xD2, "vpdpwuud", T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false },
{ 0xD3, "vpdpwuuds", T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false },
{ 0x42, "vmpsadbw", T_0F3A|T_YMM, T_66|T_W0|T_YMM, T_F3|T_0F3A|T_EW0|T_B32, 1, true }, { 0x42, "vmpsadbw", T_0F3A|T_YMM, T_66|T_W0|T_YMM, T_F3|T_0F3A|T_EW0|T_B32, 1, true },
}; };
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {

View file

@ -1901,6 +1901,7 @@ void put()
} }
// avx-vnni-int8 // avx-vnni-int8
// avx-vnni-int16 // avx-vnni-int16
#if 0
{ {
const struct Tbl { const struct Tbl {
uint8_t code; uint8_t code;
@ -1914,12 +1915,12 @@ void put()
// { 0x50, "vpdpbuud", T_0F38 | T_W0 | T_YMM }, // { 0x50, "vpdpbuud", T_0F38 | T_W0 | T_YMM },
// { 0x51, "vpdpbuuds", T_0F38 | T_W0 | T_YMM }, // { 0x51, "vpdpbuuds", T_0F38 | T_W0 | T_YMM },
{ 0xD2, "vpdpwsud", T_F3 | T_0F38 | T_W0 | T_YMM }, // { 0xD2, "vpdpwsud", T_F3 | T_0F38 | T_W0 | T_YMM },
{ 0xD3, "vpdpwsuds", T_F3 | T_0F38 | T_W0 | T_YMM }, // { 0xD3, "vpdpwsuds", T_F3 | T_0F38 | T_W0 | T_YMM },
{ 0xD2, "vpdpwusd", T_66 | T_0F38 | T_W0 | T_YMM }, // { 0xD2, "vpdpwusd", T_66 | T_0F38 | T_W0 | T_YMM },
{ 0xD3, "vpdpwusds", T_66 | T_0F38 | T_W0 | T_YMM }, // { 0xD3, "vpdpwusds", T_66 | T_0F38 | T_W0 | T_YMM },
{ 0xD2, "vpdpwuud", T_0F38 | T_W0 | T_YMM }, // { 0xD2, "vpdpwuud", T_0F38 | T_W0 | T_YMM },
{ 0xD3, "vpdpwuuds", T_0F38 | T_W0 | T_YMM }, // { 0xD3, "vpdpwuuds", T_0F38 | T_W0 | T_YMM },
}; };
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl *p = &tbl[i]; const Tbl *p = &tbl[i];
@ -1927,6 +1928,7 @@ void put()
printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, %s, 0x%02X); }\n", p->name, s.c_str(), p->code); printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, %s, 0x%02X); }\n", p->name, s.c_str(), p->code);
} }
} }
#endif
} }
void put32() void put32()

View file

@ -91,3 +91,77 @@ vpdpbuuds(ym1, ym2, ptr_b[rax+128]);
vpdpbuuds(zm1, zm2, zm3); vpdpbuuds(zm1, zm2, zm3);
vpdpbuuds(zm1, zm2, ptr[rax+128]); vpdpbuuds(zm1, zm2, ptr[rax+128]);
vpdpbuuds(zm1, zm2, ptr_b[rax+128]); vpdpbuuds(zm1, zm2, ptr_b[rax+128]);
//
vpdpwsud(xm1, xm2, xm3);
vpdpwsud(xm1, xm2, ptr[rax+128]);
vpdpwsud(xm1, xm2, ptr_b[rax+128]);
vpdpwsud(ym1, ym2, ym3);
vpdpwsud(ym1, ym2, ptr[rax+128]);
vpdpwsud(ym1, ym2, ptr_b[rax+128]);
vpdpwsud(zm1, zm2, zm3);
vpdpwsud(zm1, zm2, ptr[rax+128]);
vpdpwsud(zm1, zm2, ptr_b[rax+128]);
//
vpdpwsuds(xm1, xm2, xm3);
vpdpwsuds(xm1, xm2, ptr[rax+128]);
vpdpwsuds(xm1, xm2, ptr_b[rax+128]);
vpdpwsuds(ym1, ym2, ym3);
vpdpwsuds(ym1, ym2, ptr[rax+128]);
vpdpwsuds(ym1, ym2, ptr_b[rax+128]);
vpdpwsuds(zm1, zm2, zm3);
vpdpwsuds(zm1, zm2, ptr[rax+128]);
vpdpwsuds(zm1, zm2, ptr_b[rax+128]);
//
vpdpwsud(xm1, xm2, xm3);
vpdpwsud(xm1, xm2, ptr[rax+128]);
vpdpwsud(xm1, xm2, ptr_b[rax+128]);
vpdpwsud(ym1, ym2, ym3);
vpdpwsud(ym1, ym2, ptr[rax+128]);
vpdpwsud(ym1, ym2, ptr_b[rax+128]);
vpdpwsud(zm1, zm2, zm3);
vpdpwsud(zm1, zm2, ptr[rax+128]);
vpdpwsud(zm1, zm2, ptr_b[rax+128]);
//
vpdpwsuds(xm1, xm2, xm3);
vpdpwsuds(xm1, xm2, ptr[rax+128]);
vpdpwsuds(xm1, xm2, ptr_b[rax+128]);
vpdpwsuds(ym1, ym2, ym3);
vpdpwsuds(ym1, ym2, ptr[rax+128]);
vpdpwsuds(ym1, ym2, ptr_b[rax+128]);
vpdpwsuds(zm1, zm2, zm3);
vpdpwsuds(zm1, zm2, ptr[rax+128]);
vpdpwsuds(zm1, zm2, ptr_b[rax+128]);
//
vpdpwuud(xm1, xm2, xm3);
vpdpwuud(xm1, xm2, ptr[rax+128]);
vpdpwuud(xm1, xm2, ptr_b[rax+128]);
vpdpwuud(ym1, ym2, ym3);
vpdpwuud(ym1, ym2, ptr[rax+128]);
vpdpwuud(ym1, ym2, ptr_b[rax+128]);
vpdpwuud(zm1, zm2, zm3);
vpdpwuud(zm1, zm2, ptr[rax+128]);
vpdpwuud(zm1, zm2, ptr_b[rax+128]);
//
vpdpwuuds(xm1, xm2, xm3);
vpdpwuuds(xm1, xm2, ptr[rax+128]);
vpdpwuuds(xm1, xm2, ptr_b[rax+128]);
vpdpwuuds(ym1, ym2, ym3);
vpdpwuuds(ym1, ym2, ptr[rax+128]);
vpdpwuuds(ym1, ym2, ptr_b[rax+128]);
vpdpwuuds(zm1, zm2, zm3);
vpdpwuuds(zm1, zm2, ptr[rax+128]);
vpdpwuuds(zm1, zm2, ptr_b[rax+128]);

View file

@ -1423,12 +1423,6 @@ void vpdpbusd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding
void vpdpbusds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_B32, 0x51, encoding); } void vpdpbusds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_B32, 0x51, encoding); }
void vpdpwssd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_B32, 0x52, encoding); } void vpdpwssd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_B32, 0x52, encoding); }
void vpdpwssds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_B32, 0x53, encoding); } void vpdpwssds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_B32, 0x53, encoding); }
void vpdpwsud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_0F38|T_W0|T_YMM, 0xD2); }
void vpdpwsuds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_0F38|T_W0|T_YMM, 0xD3); }
void vpdpwusd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_YMM, 0xD2); }
void vpdpwusds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_YMM, 0xD3); }
void vpdpwuud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38|T_W0|T_YMM, 0xD2); }
void vpdpwuuds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38|T_W0|T_YMM, 0xD3); }
void vperm2f128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { if (!(y1.isYMM() && y2.isYMM() && op.isYMEM())) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(y1, &y2, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x06, imm); } void vperm2f128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { if (!(y1.isYMM() && y2.isYMM() && op.isYMEM())) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(y1, &y2, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x06, imm); }
void vperm2i128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { if (!(y1.isYMM() && y2.isYMM() && op.isYMEM())) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(y1, &y2, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x46, imm); } void vperm2i128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { if (!(y1.isYMM() && y2.isYMM() && op.isYMEM())) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(y1, &y2, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x46, imm); }
void vpermd(const Ymm& y1, const Ymm& y2, const Operand& op) { opAVX_X_X_XM(y1, y2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0x36); } void vpermd(const Ymm& y1, const Ymm& y2, const Operand& op) { opAVX_X_X_XM(y1, y2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0x36); }
@ -2451,6 +2445,12 @@ void vpdpbsud(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding
void vpdpbsuds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_F3|T_0F38|T_YMM, 0x51, encoding, NONE, T_W0, T_EW0|T_B32, 1); } void vpdpbsuds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_F3|T_0F38|T_YMM, 0x51, encoding, NONE, T_W0, T_EW0|T_B32, 1); }
void vpdpbuud(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F38|T_YMM, 0x50, encoding, NONE, T_W0, T_EW0|T_B32, 1); } void vpdpbuud(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F38|T_YMM, 0x50, encoding, NONE, T_W0, T_EW0|T_B32, 1); }
void vpdpbuuds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F38|T_YMM, 0x51, encoding, NONE, T_W0, T_EW0|T_B32, 1); } void vpdpbuuds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F38|T_YMM, 0x51, encoding, NONE, T_W0, T_EW0|T_B32, 1); }
void vpdpwsud(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_F3|T_0F38|T_YMM, 0xD2, encoding, NONE, T_W0, T_EW0|T_B32, 1); }
void vpdpwsuds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_F3|T_0F38|T_YMM, 0xD3, encoding, NONE, T_W0, T_EW0|T_B32, 1); }
void vpdpwusd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_YMM, 0xD2, encoding, NONE, T_W0, T_EW0|T_B32, 1); }
void vpdpwusds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_YMM, 0xD3, encoding, NONE, T_W0, T_EW0|T_B32, 1); }
void vpdpwuud(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F38|T_YMM, 0xD2, encoding, NONE, T_W0, T_EW0|T_B32, 1); }
void vpdpwuuds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F38|T_YMM, 0xD3, encoding, NONE, T_W0, T_EW0|T_B32, 1); }
void vpermb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX, 0x8D); } void vpermb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX, 0x8D); }
void vpermi2b(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX, 0x75); } void vpermi2b(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX, 0x75); }
void vpermi2d(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B32, 0x76); } void vpermi2d(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B32, 0x76); }