mirror of
https://github.com/herumi/xbyak
synced 2024-11-20 16:06:14 -07:00
add vf[,n]m[add,sub][132,213,231]nepbf16
This commit is contained in:
parent
3ca7e64c63
commit
a84866bcbc
4 changed files with 93 additions and 4 deletions
|
@ -959,6 +959,22 @@ void putAVX10_BF16()
|
|||
{ "vmulnepbf16", T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, 0x59 },
|
||||
{ "vscalefpbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0x2C },
|
||||
{ "vsubnepbf16", T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, 0x5C },
|
||||
|
||||
{ "vfmadd132nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0x98 },
|
||||
{ "vfmadd213nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0xA8 },
|
||||
{ "vfmadd231nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0xB8 },
|
||||
|
||||
{ "vfnmadd132nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0x9C },
|
||||
{ "vfnmadd213nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0xAC },
|
||||
{ "vfnmadd231nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0xBC },
|
||||
|
||||
{ "vfmsub132nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0x9A },
|
||||
{ "vfmsub213nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0xAA },
|
||||
{ "vfmsub231nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0xBA },
|
||||
|
||||
{ "vfnmsub132nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0x9E },
|
||||
{ "vfnmsub213nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0xAE },
|
||||
{ "vfnmsub231nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0xBE },
|
||||
};
|
||||
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
|
||||
const xxopTbl& p = tbl[i];
|
||||
|
|
|
@ -32,3 +32,63 @@ vsubnepbf16(xm1, xm2, xm3);
|
|||
vsubnepbf16(ym1|k1, ym2, ptr[rax+128]);
|
||||
vsubnepbf16(ym1|k1, ym2, ptr_b[rax+128]);
|
||||
vsubnepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
|
||||
// madd
|
||||
vfmadd132nepbf16(xm1, xm2, xm3);
|
||||
vfmadd132nepbf16(ym1|k1, ym2, ptr[rax+128]);
|
||||
vfmadd132nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
|
||||
vfmadd132nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
|
||||
|
||||
vfmadd213nepbf16(xm1, xm2, xm3);
|
||||
vfmadd213nepbf16(ym1|k1, ym2, ptr[rax+128]);
|
||||
vfmadd213nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
|
||||
vfmadd213nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
|
||||
|
||||
vfmadd231nepbf16(xm1, xm2, xm3);
|
||||
vfmadd231nepbf16(ym1|k1, ym2, ptr[rax+128]);
|
||||
vfmadd231nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
|
||||
vfmadd231nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
|
||||
// nmadd
|
||||
vfnmadd132nepbf16(xm1, xm2, xm3);
|
||||
vfnmadd132nepbf16(ym1|k1, ym2, ptr[rax+128]);
|
||||
vfnmadd132nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
|
||||
vfnmadd132nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
|
||||
|
||||
vfnmadd213nepbf16(xm1, xm2, xm3);
|
||||
vfnmadd213nepbf16(ym1|k1, ym2, ptr[rax+128]);
|
||||
vfnmadd213nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
|
||||
vfnmadd213nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
|
||||
|
||||
vfnmadd231nepbf16(xm1, xm2, xm3);
|
||||
vfnmadd231nepbf16(ym1|k1, ym2, ptr[rax+128]);
|
||||
vfnmadd231nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
|
||||
vfnmadd231nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
|
||||
// msub
|
||||
vfmsub132nepbf16(xm1, xm2, xm3);
|
||||
vfmsub132nepbf16(ym1|k1, ym2, ptr[rax+128]);
|
||||
vfmsub132nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
|
||||
vfmsub132nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
|
||||
|
||||
vfmsub213nepbf16(xm1, xm2, xm3);
|
||||
vfmsub213nepbf16(ym1|k1, ym2, ptr[rax+128]);
|
||||
vfmsub213nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
|
||||
vfmsub213nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
|
||||
|
||||
vfmsub231nepbf16(xm1, xm2, xm3);
|
||||
vfmsub231nepbf16(ym1|k1, ym2, ptr[rax+128]);
|
||||
vfmsub231nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
|
||||
vfmsub231nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
|
||||
// nmsub
|
||||
vfnmsub132nepbf16(xm1, xm2, xm3);
|
||||
vfnmsub132nepbf16(ym1|k1, ym2, ptr[rax+128]);
|
||||
vfnmsub132nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
|
||||
vfnmsub132nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
|
||||
|
||||
vfnmsub213nepbf16(xm1, xm2, xm3);
|
||||
vfnmsub213nepbf16(ym1|k1, ym2, ptr[rax+128]);
|
||||
vfnmsub213nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
|
||||
vfnmsub213nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
|
||||
|
||||
vfnmsub231nepbf16(xm1, xm2, xm3);
|
||||
vfnmsub231nepbf16(ym1|k1, ym2, ptr[rax+128]);
|
||||
vfnmsub231nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
|
||||
vfnmsub231nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
|
||||
|
|
|
@ -287,17 +287,18 @@ def removeExtraInfo(s):
|
|||
# run(cppText, xedText): loads both inputs with loadFile, raises Exception when
# their lengths differ, then for each index i compares parseNmemonic(cpp[i])
# against parseNmemonic(removeExtraInfo(xed[i])) through assertEqual, passing the
# 1-based index as the message, and finally prints 'run ok'.
# NOTE(review): this span is a rendered diff hunk with indentation stripped; the
# length check, the loop header and the final print each appear twice (the
# len(cpp)-based variant and the n-based variant) - confirm which lines are live
# in the actual file before relying on this text.
def run(cppText, xedText):
|
||||
cpp = loadFile(cppText)
|
||||
xed = loadFile(xedText)
|
||||
if len(cpp) != len(xed):
|
||||
raise Exception(f'different line {len(cpp)} {len(xed)}')
|
||||
n = len(cpp)
|
||||
if n != len(xed):
|
||||
raise Exception(f'different line {n} {len(xed)}')
|
||||
|
||||
for i in range(len(cpp)):
|
||||
for i in range(n):
|
||||
line1 = cpp[i]
|
||||
line2 = removeExtraInfo(xed[i])
|
||||
m1 = parseNmemonic(line1)
|
||||
m2 = parseNmemonic(line2)
|
||||
|
||||
assertEqual(m1, m2, f'{i+1}')
|
||||
print('run ok')
|
||||
print('run ok', n)
|
||||
|
||||
def assertEqualStr(a, b, msg=None):
|
||||
if str(a) != str(b):
|
||||
|
|
|
@ -2281,36 +2281,48 @@ void vfixupimmpd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
|
|||
// Forwards (x1, x2, op) to opAVX_X_X_XM with code 0x54, the immediate imm and
// the 66/0F3A/EW0/YMM/SAE_Z/MUST_EVEX/B32 type flags.
void vfixupimmps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm)
{
	opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x54, imm);
}
|
||||
// Forwards (x1, x2, op) to opAVX_X_X_XM with code 0x55, the immediate imm and
// the N8/66/0F3A/EW1/SAE_Z/MUST_EVEX type flags.
void vfixupimmsd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm)
{
	opAVX_X_X_XM(x1, x2, op, T_N8 | T_66 | T_0F3A | T_EW1 | T_SAE_Z | T_MUST_EVEX, 0x55, imm);
}
|
||||
// Forwards (x1, x2, op) to opAVX_X_X_XM with code 0x55, the immediate imm and
// the N4/66/0F3A/EW0/SAE_Z/MUST_EVEX type flags.
void vfixupimmss(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm)
{
	opAVX_X_X_XM(x1, x2, op, T_N4 | T_66 | T_0F3A | T_EW0 | T_SAE_Z | T_MUST_EVEX, 0x55, imm);
}
|
||||
// Forwards (x1, x2, op) to opAVX_X_X_XM with code 0x98 and the
// MAP6/EW0/YMM/MUST_EVEX/B16 type flags.
void vfmadd132nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op)
{
	opAVX_X_X_XM(x1, x2, op, T_MAP6 | T_EW0 | T_YMM | T_MUST_EVEX | T_B16, 0x98);
}
|
||||
// Forwards (x1, x2, op) to opAVX_X_X_XM with code 0x98 and the
// 66/MAP6/EW0/YMM/ER_Z/MUST_EVEX/B16 type flags.
void vfmadd132ph(const Xmm& x1, const Xmm& x2, const Operand& op)
{
	opAVX_X_X_XM(x1, x2, op, T_66 | T_MAP6 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B16, 0x98);
}
|
||||
// Forwards (x1, x2, op) to opAVX_X_X_XM with code 0x99 and the
// N2/66/MAP6/EW0/ER_X/MUST_EVEX type flags.
void vfmadd132sh(const Xmm& x1, const Xmm& x2, const Operand& op)
{
	opAVX_X_X_XM(x1, x2, op, T_N2 | T_66 | T_MAP6 | T_EW0 | T_ER_X | T_MUST_EVEX, 0x99);
}
|
||||
// Forwards (x1, x2, op) to opAVX_X_X_XM with code 0xA8 and the
// MAP6/EW0/YMM/MUST_EVEX/B16 type flags.
void vfmadd213nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op)
{
	opAVX_X_X_XM(x1, x2, op, T_MAP6 | T_EW0 | T_YMM | T_MUST_EVEX | T_B16, 0xA8);
}
|
||||
// Forwards (x1, x2, op) to opAVX_X_X_XM with code 0xA8 and the
// 66/MAP6/EW0/YMM/ER_Z/MUST_EVEX/B16 type flags.
void vfmadd213ph(const Xmm& x1, const Xmm& x2, const Operand& op)
{
	opAVX_X_X_XM(x1, x2, op, T_66 | T_MAP6 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B16, 0xA8);
}
|
||||
// Forwards (x1, x2, op) to opAVX_X_X_XM with code 0xA9 and the
// N2/66/MAP6/EW0/ER_X/MUST_EVEX type flags.
void vfmadd213sh(const Xmm& x1, const Xmm& x2, const Operand& op)
{
	opAVX_X_X_XM(x1, x2, op, T_N2 | T_66 | T_MAP6 | T_EW0 | T_ER_X | T_MUST_EVEX, 0xA9);
}
|
||||
// Forwards (x1, x2, op) to opAVX_X_X_XM with code 0xB8 and the
// MAP6/EW0/YMM/MUST_EVEX/B16 type flags.
void vfmadd231nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op)
{
	opAVX_X_X_XM(x1, x2, op, T_MAP6 | T_EW0 | T_YMM | T_MUST_EVEX | T_B16, 0xB8);
}
|
||||
// Forwards (x1, x2, op) to opAVX_X_X_XM with code 0xB8 and the
// 66/MAP6/EW0/YMM/ER_Z/MUST_EVEX/B16 type flags.
void vfmadd231ph(const Xmm& x1, const Xmm& x2, const Operand& op)
{
	opAVX_X_X_XM(x1, x2, op, T_66 | T_MAP6 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B16, 0xB8);
}
|
||||
// Forwards (x1, x2, op) to opAVX_X_X_XM with code 0xB9 and the
// N2/66/MAP6/EW0/ER_X/MUST_EVEX type flags.
void vfmadd231sh(const Xmm& x1, const Xmm& x2, const Operand& op)
{
	opAVX_X_X_XM(x1, x2, op, T_N2 | T_66 | T_MAP6 | T_EW0 | T_ER_X | T_MUST_EVEX, 0xB9);
}
|
||||
// Forwards (x1, x2, op) to opAVX_X_X_XM with code 0x56 and the
// F3/MAP6/EW0/YMM/ER_Z/MUST_EVEX/B32 type flags.
void vfmaddcph(const Xmm& x1, const Xmm& x2, const Operand& op)
{
	opAVX_X_X_XM(x1, x2, op, T_F3 | T_MAP6 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B32, 0x56);
}
|
||||
// Forwards (x1, x2, op) to opAVX_X_X_XM with code 0x96 and the
// 66/MAP6/EW0/YMM/ER_Z/MUST_EVEX/B16 type flags.
void vfmaddsub132ph(const Xmm& x1, const Xmm& x2, const Operand& op)
{
	opAVX_X_X_XM(x1, x2, op, T_66 | T_MAP6 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B16, 0x96);
}
|
||||
// Forwards (x1, x2, op) to opAVX_X_X_XM with code 0xA6 and the
// 66/MAP6/EW0/YMM/ER_Z/MUST_EVEX/B16 type flags.
void vfmaddsub213ph(const Xmm& x1, const Xmm& x2, const Operand& op)
{
	opAVX_X_X_XM(x1, x2, op, T_66 | T_MAP6 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B16, 0xA6);
}
|
||||
// Forwards (x1, x2, op) to opAVX_X_X_XM with code 0xB6 and the
// 66/MAP6/EW0/YMM/ER_Z/MUST_EVEX/B16 type flags.
void vfmaddsub231ph(const Xmm& x1, const Xmm& x2, const Operand& op)
{
	opAVX_X_X_XM(x1, x2, op, T_66 | T_MAP6 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B16, 0xB6);
}
|
||||
// Forwards (x1, x2, op) to opAVX_X_X_XM with code 0x9A and the
// MAP6/EW0/YMM/MUST_EVEX/B16 type flags.
void vfmsub132nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op)
{
	opAVX_X_X_XM(x1, x2, op, T_MAP6 | T_EW0 | T_YMM | T_MUST_EVEX | T_B16, 0x9A);
}
|
||||
// Forwards (x1, x2, op) to opAVX_X_X_XM with code 0x9A and the
// 66/MAP6/EW0/YMM/ER_Z/MUST_EVEX/B16 type flags.
void vfmsub132ph(const Xmm& x1, const Xmm& x2, const Operand& op)
{
	opAVX_X_X_XM(x1, x2, op, T_66 | T_MAP6 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B16, 0x9A);
}
|
||||
// Forwards (x1, x2, op) to opAVX_X_X_XM with code 0x9B and the
// N2/66/MAP6/EW0/ER_X/MUST_EVEX type flags.
void vfmsub132sh(const Xmm& x1, const Xmm& x2, const Operand& op)
{
	opAVX_X_X_XM(x1, x2, op, T_N2 | T_66 | T_MAP6 | T_EW0 | T_ER_X | T_MUST_EVEX, 0x9B);
}
|
||||
// Forwards (x1, x2, op) to opAVX_X_X_XM with code 0xAA and the
// MAP6/EW0/YMM/MUST_EVEX/B16 type flags.
void vfmsub213nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op)
{
	opAVX_X_X_XM(x1, x2, op, T_MAP6 | T_EW0 | T_YMM | T_MUST_EVEX | T_B16, 0xAA);
}
|
||||
// Forwards (x1, x2, op) to opAVX_X_X_XM with code 0xAA and the
// 66/MAP6/EW0/YMM/ER_Z/MUST_EVEX/B16 type flags.
void vfmsub213ph(const Xmm& x1, const Xmm& x2, const Operand& op)
{
	opAVX_X_X_XM(x1, x2, op, T_66 | T_MAP6 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B16, 0xAA);
}
|
||||
// Forwards (x1, x2, op) to opAVX_X_X_XM with code 0xAB and the
// N2/66/MAP6/EW0/ER_X/MUST_EVEX type flags.
void vfmsub213sh(const Xmm& x1, const Xmm& x2, const Operand& op)
{
	opAVX_X_X_XM(x1, x2, op, T_N2 | T_66 | T_MAP6 | T_EW0 | T_ER_X | T_MUST_EVEX, 0xAB);
}
|
||||
// Forwards (x1, x2, op) to opAVX_X_X_XM with code 0xBA and the
// MAP6/EW0/YMM/MUST_EVEX/B16 type flags.
void vfmsub231nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op)
{
	opAVX_X_X_XM(x1, x2, op, T_MAP6 | T_EW0 | T_YMM | T_MUST_EVEX | T_B16, 0xBA);
}
|
||||
// Forwards (x1, x2, op) to opAVX_X_X_XM with code 0xBA and the
// 66/MAP6/EW0/YMM/ER_Z/MUST_EVEX/B16 type flags.
void vfmsub231ph(const Xmm& x1, const Xmm& x2, const Operand& op)
{
	opAVX_X_X_XM(x1, x2, op, T_66 | T_MAP6 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B16, 0xBA);
}
|
||||
// Forwards (x1, x2, op) to opAVX_X_X_XM with code 0xBB and the
// N2/66/MAP6/EW0/ER_X/MUST_EVEX type flags.
void vfmsub231sh(const Xmm& x1, const Xmm& x2, const Operand& op)
{
	opAVX_X_X_XM(x1, x2, op, T_N2 | T_66 | T_MAP6 | T_EW0 | T_ER_X | T_MUST_EVEX, 0xBB);
}
|
||||
// Forwards (x1, x2, op) to opAVX_X_X_XM with code 0x97 and the
// 66/MAP6/EW0/YMM/ER_Z/MUST_EVEX/B16 type flags.
void vfmsubadd132ph(const Xmm& x1, const Xmm& x2, const Operand& op)
{
	opAVX_X_X_XM(x1, x2, op, T_66 | T_MAP6 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B16, 0x97);
}
|
||||
// Forwards (x1, x2, op) to opAVX_X_X_XM with code 0xA7 and the
// 66/MAP6/EW0/YMM/ER_Z/MUST_EVEX/B16 type flags.
void vfmsubadd213ph(const Xmm& x1, const Xmm& x2, const Operand& op)
{
	opAVX_X_X_XM(x1, x2, op, T_66 | T_MAP6 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B16, 0xA7);
}
|
||||
// Forwards (x1, x2, op) to opAVX_X_X_XM with code 0xB7 and the
// 66/MAP6/EW0/YMM/ER_Z/MUST_EVEX/B16 type flags.
void vfmsubadd231ph(const Xmm& x1, const Xmm& x2, const Operand& op)
{
	opAVX_X_X_XM(x1, x2, op, T_66 | T_MAP6 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B16, 0xB7);
}
|
||||
// Forwards (x1, x2, op) to opAVX_X_X_XM with code 0xD6 and the
// F3/MAP6/EW0/YMM/ER_Z/MUST_EVEX/B32 type flags.
void vfmulcph(const Xmm& x1, const Xmm& x2, const Operand& op)
{
	opAVX_X_X_XM(x1, x2, op, T_F3 | T_MAP6 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B32, 0xD6);
}
|
||||
// Forwards (x1, x2, op) to opAVX_X_X_XM with code 0x9C and the
// MAP6/EW0/YMM/MUST_EVEX/B16 type flags.
void vfnmadd132nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op)
{
	opAVX_X_X_XM(x1, x2, op, T_MAP6 | T_EW0 | T_YMM | T_MUST_EVEX | T_B16, 0x9C);
}
|
||||
// Forwards (x1, x2, op) to opAVX_X_X_XM with code 0x9C and the
// 66/MAP6/EW0/YMM/ER_Z/MUST_EVEX/B16 type flags.
void vfnmadd132ph(const Xmm& x1, const Xmm& x2, const Operand& op)
{
	opAVX_X_X_XM(x1, x2, op, T_66 | T_MAP6 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B16, 0x9C);
}
|
||||
// Forwards (x1, x2, op) to opAVX_X_X_XM with code 0x9D and the
// N2/66/MAP6/EW0/ER_X/MUST_EVEX type flags.
void vfnmadd132sh(const Xmm& x1, const Xmm& x2, const Operand& op)
{
	opAVX_X_X_XM(x1, x2, op, T_N2 | T_66 | T_MAP6 | T_EW0 | T_ER_X | T_MUST_EVEX, 0x9D);
}
|
||||
// Forwards (x1, x2, op) to opAVX_X_X_XM with code 0xAC and the
// MAP6/EW0/YMM/MUST_EVEX/B16 type flags.
void vfnmadd213nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op)
{
	opAVX_X_X_XM(x1, x2, op, T_MAP6 | T_EW0 | T_YMM | T_MUST_EVEX | T_B16, 0xAC);
}
|
||||
// Forwards (x1, x2, op) to opAVX_X_X_XM with code 0xAC and the
// 66/MAP6/EW0/YMM/ER_Z/MUST_EVEX/B16 type flags.
void vfnmadd213ph(const Xmm& x1, const Xmm& x2, const Operand& op)
{
	opAVX_X_X_XM(x1, x2, op, T_66 | T_MAP6 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B16, 0xAC);
}
|
||||
// Forwards (x1, x2, op) to opAVX_X_X_XM with code 0xAD and the
// N2/66/MAP6/EW0/ER_X/MUST_EVEX type flags.
void vfnmadd213sh(const Xmm& x1, const Xmm& x2, const Operand& op)
{
	opAVX_X_X_XM(x1, x2, op, T_N2 | T_66 | T_MAP6 | T_EW0 | T_ER_X | T_MUST_EVEX, 0xAD);
}
|
||||
// Forwards (x1, x2, op) to opAVX_X_X_XM with code 0xBC and the
// MAP6/EW0/YMM/MUST_EVEX/B16 type flags.
void vfnmadd231nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op)
{
	opAVX_X_X_XM(x1, x2, op, T_MAP6 | T_EW0 | T_YMM | T_MUST_EVEX | T_B16, 0xBC);
}
|
||||
// Forwards (x1, x2, op) to opAVX_X_X_XM with code 0xBC and the
// 66/MAP6/EW0/YMM/ER_Z/MUST_EVEX/B16 type flags.
void vfnmadd231ph(const Xmm& x1, const Xmm& x2, const Operand& op)
{
	opAVX_X_X_XM(x1, x2, op, T_66 | T_MAP6 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B16, 0xBC);
}
|
||||
// Forwards (x1, x2, op) to opAVX_X_X_XM with code 0xBD and the
// N2/66/MAP6/EW0/ER_X/MUST_EVEX type flags.
void vfnmadd231sh(const Xmm& x1, const Xmm& x2, const Operand& op)
{
	opAVX_X_X_XM(x1, x2, op, T_N2 | T_66 | T_MAP6 | T_EW0 | T_ER_X | T_MUST_EVEX, 0xBD);
}
|
||||
// Forwards (x1, x2, op) to opAVX_X_X_XM with code 0x9E and the
// MAP6/EW0/YMM/MUST_EVEX/B16 type flags.
void vfnmsub132nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op)
{
	opAVX_X_X_XM(x1, x2, op, T_MAP6 | T_EW0 | T_YMM | T_MUST_EVEX | T_B16, 0x9E);
}
|
||||
// Forwards (x1, x2, op) to opAVX_X_X_XM with code 0x9E and the
// 66/MAP6/EW0/YMM/ER_Z/MUST_EVEX/B16 type flags.
void vfnmsub132ph(const Xmm& x1, const Xmm& x2, const Operand& op)
{
	opAVX_X_X_XM(x1, x2, op, T_66 | T_MAP6 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B16, 0x9E);
}
|
||||
// Forwards (x1, x2, op) to opAVX_X_X_XM with code 0x9F and the
// N2/66/MAP6/EW0/ER_X/MUST_EVEX type flags.
void vfnmsub132sh(const Xmm& x1, const Xmm& x2, const Operand& op)
{
	opAVX_X_X_XM(x1, x2, op, T_N2 | T_66 | T_MAP6 | T_EW0 | T_ER_X | T_MUST_EVEX, 0x9F);
}
|
||||
// Forwards (x1, x2, op) to opAVX_X_X_XM with code 0xAE and the
// MAP6/EW0/YMM/MUST_EVEX/B16 type flags.
void vfnmsub213nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op)
{
	opAVX_X_X_XM(x1, x2, op, T_MAP6 | T_EW0 | T_YMM | T_MUST_EVEX | T_B16, 0xAE);
}
|
||||
// Forwards (x1, x2, op) to opAVX_X_X_XM with code 0xAE and the
// 66/MAP6/EW0/YMM/ER_Z/MUST_EVEX/B16 type flags.
void vfnmsub213ph(const Xmm& x1, const Xmm& x2, const Operand& op)
{
	opAVX_X_X_XM(x1, x2, op, T_66 | T_MAP6 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B16, 0xAE);
}
|
||||
// Forwards (x1, x2, op) to opAVX_X_X_XM with code 0xAF and the
// N2/66/MAP6/EW0/ER_X/MUST_EVEX type flags.
void vfnmsub213sh(const Xmm& x1, const Xmm& x2, const Operand& op)
{
	opAVX_X_X_XM(x1, x2, op, T_N2 | T_66 | T_MAP6 | T_EW0 | T_ER_X | T_MUST_EVEX, 0xAF);
}
|
||||
// Forwards (x1, x2, op) to opAVX_X_X_XM with code 0xBE and the
// MAP6/EW0/YMM/MUST_EVEX/B16 type flags.
void vfnmsub231nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op)
{
	opAVX_X_X_XM(x1, x2, op, T_MAP6 | T_EW0 | T_YMM | T_MUST_EVEX | T_B16, 0xBE);
}
|
||||
// Forwards (x1, x2, op) to opAVX_X_X_XM with code 0xBE and the
// 66/MAP6/EW0/YMM/ER_Z/MUST_EVEX/B16 type flags.
void vfnmsub231ph(const Xmm& x1, const Xmm& x2, const Operand& op)
{
	opAVX_X_X_XM(x1, x2, op, T_66 | T_MAP6 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B16, 0xBE);
}
|
||||
// Forwards (x1, x2, op) to opAVX_X_X_XM with code 0xBF and the
// N2/66/MAP6/EW0/ER_X/MUST_EVEX type flags.
void vfnmsub231sh(const Xmm& x1, const Xmm& x2, const Operand& op)
{
	opAVX_X_X_XM(x1, x2, op, T_N2 | T_66 | T_MAP6 | T_EW0 | T_ER_X | T_MUST_EVEX, 0xBF);
}
|
||||
// Invokes XBYAK_THROW(ERR_BAD_MEM_SIZE) unless op.isBit(128|256|512) holds, then
// calls opVex with k.changeBit(op.getBit()), 0, op, the
// 66/0F3A/MUST_EVEX/YMM/EW1/B64 type flags, code 0x66 and the immediate imm.
// NOTE(review): XBYAK_THROW is written without a trailing semicolon, so the macro
// presumably expands to a complete statement that leaves the function - confirm
// against its definition before reformatting this line.
void vfpclasspd(const Opmask& k, const Operand& op, uint8_t imm) { if (!op.isBit(128|256|512)) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(k.changeBit(op.getBit()), 0, op, T_66 | T_0F3A | T_MUST_EVEX | T_YMM | T_EW1 | T_B64, 0x66, imm); }
|
||||
|
|
Loading…
Reference in a new issue