add vf[,n]m[add,sub][132,213,231]nebf16

This commit is contained in:
MITSUNARI Shigeo 2024-10-11 12:21:48 +09:00
parent 3ca7e64c63
commit a84866bcbc
4 changed files with 93 additions and 4 deletions

View file

@ -959,6 +959,22 @@ void putAVX10_BF16()
{ "vmulnepbf16", T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, 0x59 },
{ "vscalefpbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0x2C },
{ "vsubnepbf16", T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, 0x5C },
{ "vfmadd132nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0x98 },
{ "vfmadd213nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0xA8 },
{ "vfmadd231nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0xB8 },
{ "vfnmadd132nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0x9C },
{ "vfnmadd213nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0xAC },
{ "vfnmadd231nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0xBC },
{ "vfmsub132nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0x9A },
{ "vfmsub213nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0xAA },
{ "vfmsub231nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0xBA },
{ "vfnmsub132nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0x9E },
{ "vfnmsub213nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0xAE },
{ "vfnmsub231nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0xBE },
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const xxopTbl& p = tbl[i];

View file

@ -32,3 +32,63 @@ vsubnepbf16(xm1, xm2, xm3);
vsubnepbf16(ym1|k1, ym2, ptr[rax+128]);
vsubnepbf16(ym1|k1, ym2, ptr_b[rax+128]);
vsubnepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
// madd
vfmadd132nepbf16(xm1, xm2, xm3);
vfmadd132nepbf16(ym1|k1, ym2, ptr[rax+128]);
vfmadd132nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
vfmadd132nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
vfmadd213nepbf16(xm1, xm2, xm3);
vfmadd213nepbf16(ym1|k1, ym2, ptr[rax+128]);
vfmadd213nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
vfmadd213nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
vfmadd231nepbf16(xm1, xm2, xm3);
vfmadd231nepbf16(ym1|k1, ym2, ptr[rax+128]);
vfmadd231nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
vfmadd231nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
// nmadd
vfnmadd132nepbf16(xm1, xm2, xm3);
vfnmadd132nepbf16(ym1|k1, ym2, ptr[rax+128]);
vfnmadd132nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
vfnmadd132nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
vfnmadd213nepbf16(xm1, xm2, xm3);
vfnmadd213nepbf16(ym1|k1, ym2, ptr[rax+128]);
vfnmadd213nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
vfnmadd213nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
vfnmadd231nepbf16(xm1, xm2, xm3);
vfnmadd231nepbf16(ym1|k1, ym2, ptr[rax+128]);
vfnmadd231nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
vfnmadd231nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
// msub
vfmsub132nepbf16(xm1, xm2, xm3);
vfmsub132nepbf16(ym1|k1, ym2, ptr[rax+128]);
vfmsub132nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
vfmsub132nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
vfmsub213nepbf16(xm1, xm2, xm3);
vfmsub213nepbf16(ym1|k1, ym2, ptr[rax+128]);
vfmsub213nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
vfmsub213nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
vfmsub231nepbf16(xm1, xm2, xm3);
vfmsub231nepbf16(ym1|k1, ym2, ptr[rax+128]);
vfmsub231nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
vfmsub231nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
// nmsub
vfnmsub132nepbf16(xm1, xm2, xm3);
vfnmsub132nepbf16(ym1|k1, ym2, ptr[rax+128]);
vfnmsub132nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
vfnmsub132nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
vfnmsub213nepbf16(xm1, xm2, xm3);
vfnmsub213nepbf16(ym1|k1, ym2, ptr[rax+128]);
vfnmsub213nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
vfnmsub213nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
vfnmsub231nepbf16(xm1, xm2, xm3);
vfnmsub231nepbf16(ym1|k1, ym2, ptr[rax+128]);
vfnmsub231nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
vfnmsub231nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);

View file

@ -287,17 +287,18 @@ def removeExtraInfo(s):
def run(cppText, xedText):
cpp = loadFile(cppText)
xed = loadFile(xedText)
if len(cpp) != len(xed):
raise Exception(f'different line {len(cpp)} {len(xed)}')
n = len(cpp)
if n != len(xed):
raise Exception(f'different line {n} {len(xed)}')
for i in range(len(cpp)):
for i in range(n):
line1 = cpp[i]
line2 = removeExtraInfo(xed[i])
m1 = parseNmemonic(line1)
m2 = parseNmemonic(line2)
assertEqual(m1, m2, f'{i+1}')
print('run ok')
print('run ok', n)
def assertEqualStr(a, b, msg=None):
if str(a) != str(b):

View file

@ -2281,36 +2281,48 @@ void vfixupimmpd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
void vfixupimmps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F3A|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x54, imm); }
void vfixupimmsd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F3A|T_EW1|T_SAE_Z|T_MUST_EVEX, 0x55, imm); }
void vfixupimmss(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F3A|T_EW0|T_SAE_Z|T_MUST_EVEX, 0x55, imm); }
void vfmadd132nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x98); }
void vfmadd132ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x98); }
void vfmadd132sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0x99); }
void vfmadd213nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xA8); }
void vfmadd213ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xA8); }
void vfmadd213sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xA9); }
void vfmadd231nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xB8); }
void vfmadd231ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xB8); }
void vfmadd231sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xB9); }
void vfmaddcph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B32, 0x56); }
void vfmaddsub132ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x96); }
void vfmaddsub213ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xA6); }
void vfmaddsub231ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xB6); }
void vfmsub132nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x9A); }
void vfmsub132ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x9A); }
void vfmsub132sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0x9B); }
void vfmsub213nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xAA); }
void vfmsub213ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xAA); }
void vfmsub213sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xAB); }
void vfmsub231nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xBA); }
void vfmsub231ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xBA); }
void vfmsub231sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xBB); }
void vfmsubadd132ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x97); }
void vfmsubadd213ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xA7); }
void vfmsubadd231ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xB7); }
void vfmulcph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B32, 0xD6); }
void vfnmadd132nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x9C); }
void vfnmadd132ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x9C); }
void vfnmadd132sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0x9D); }
void vfnmadd213nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xAC); }
void vfnmadd213ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xAC); }
void vfnmadd213sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xAD); }
void vfnmadd231nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xBC); }
void vfnmadd231ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xBC); }
void vfnmadd231sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xBD); }
void vfnmsub132nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x9E); }
void vfnmsub132ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x9E); }
void vfnmsub132sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0x9F); }
void vfnmsub213nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xAE); }
void vfnmsub213ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xAE); }
void vfnmsub213sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xAF); }
void vfnmsub231nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xBE); }
void vfnmsub231ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xBE); }
void vfnmsub231sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xBF); }
void vfpclasspd(const Opmask& k, const Operand& op, uint8_t imm) { if (!op.isBit(128|256|512)) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(k.changeBit(op.getBit()), 0, op, T_66 | T_0F3A | T_MUST_EVEX | T_YMM | T_EW1 | T_B64, 0x66, imm); }