mirror of
https://github.com/herumi/xbyak
synced 2024-11-21 16:09:11 -07:00
add avx512_bf16
This commit is contained in:
parent
4033564c6f
commit
4cfd520878
8 changed files with 59 additions and 4 deletions
|
@ -368,6 +368,9 @@ void putX_X_XM_IMM()
|
|||
|
||||
{ 0x52, "vpdpwssd", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, false },
|
||||
{ 0x53, "vpdpwssds", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, false },
|
||||
|
||||
{ 0x72, "vcvtne2ps2bf16", T_F2 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, false },
|
||||
{ 0x52, "vdpbf16ps", T_F3 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, false },
|
||||
};
|
||||
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
|
||||
const Tbl *p = &tbl[i];
|
||||
|
@ -711,6 +714,8 @@ void putMisc()
|
|||
puts("void vfpclassss(const Opmask& k, const Operand& op, uint8 imm) { if (!op.isXMEM()) throw Error(ERR_BAD_MEM_SIZE); opVex(k, 0, op, T_66 | T_0F3A | T_MUST_EVEX | T_EW0 | T_N4, 0x67, imm); }");
|
||||
|
||||
puts("void vpshufbitqmb(const Opmask& k, const Xmm& x, const Operand& op) { opVex(k, &x, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x8F); }");
|
||||
puts("void vcvtneps2bf16(const Xmm& x, const Operand& op) { int xBit = x.getBit(); int opBit = op.getBit(); if (xBit == 256 && opBit == 0) opBit = 512; if (!(xBit == 128 && (opBit == 128 || opBit == 256)) && !(xBit == 256 && opBit == 512)) throw Error(ERR_BAD_COMBINATION); Xmm t = x; t.setBit(opBit); opAVX_X_XM_IMM(t, op, T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x72); }");
|
||||
|
||||
}
|
||||
|
||||
void putV4FMA()
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
|
||||
# Xbyak 5.79 ; JIT assembler for x86(IA32), x64(AMD64, x86-64) by C++
|
||||
# Xbyak 5.80 ; JIT assembler for x86(IA32), x64(AMD64, x86-64) by C++
|
||||
|
||||
## Abstract
|
||||
|
||||
|
@ -392,6 +392,7 @@ modified new BSD License
|
|||
http://opensource.org/licenses/BSD-3-Clause
|
||||
|
||||
## History
|
||||
* 2019/May/26 ver 5.80 support vcvtne2ps2bf16, vcvtneps2bf16, vdpbf16ps
|
||||
* 2019/Apr/27 ver 5.79 vcmppd/vcmpps support ptr_b (thanks to jkopinsky)
|
||||
* 2019/Apr/15 ver 5.78 rewrite Reg::changeBit() (thanks to MerryMage)
|
||||
* 2019/Mar/06 ver 5.77 fix the number of cores that share the LLC cache (by densamoilov)
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
|
||||
C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 5.79
|
||||
C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 5.80
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
◎概要
|
||||
|
@ -373,6 +373,7 @@ sample/{echo,hello}.bfは http://www.kmonos.net/alang/etc/brainfuck.php から
|
|||
-----------------------------------------------------------------------------
|
||||
◎履歴
|
||||
|
||||
2019/05/26 ver 5.80 support vcvtne2ps2bf16, vcvtneps2bf16, vdpbf16ps
|
||||
2019/04/27 ver 5.79 vcmppd/vcmppsのptr_b対応忘れ(thanks to jkopinsky)
|
||||
2019/04/15 ver 5.78 Reg::changeBit()のリファクタリング(thanks to MerryMage)
|
||||
2019/03/06 ver 5.77 LLCキャッシュを共有するCPU数の修正(by densamoilov)
|
||||
|
|
|
@ -78,6 +78,8 @@ void putCPUinfo()
|
|||
{ Cpu::tAVX512_VNNI, "avx512_vnni" },
|
||||
{ Cpu::tAVX512_BITALG, "avx512_bitalg" },
|
||||
{ Cpu::tAVX512_VPOPCNTDQ, "avx512_vpopcntdq" },
|
||||
{ Cpu::tAVX512_BF16, "avx512_bf16" },
|
||||
{ Cpu::tAVX512_VP2INTERSECT, "avx512_vp2intersect" },
|
||||
};
|
||||
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
|
||||
if (cpu.has(tbl[i].type)) printf(" %s", tbl[i].str);
|
||||
|
|
|
@ -683,4 +683,42 @@ CYBOZU_TEST_AUTO(gf2)
|
|||
CYBOZU_TEST_EQUAL(c.getSize(), n);
|
||||
CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
|
||||
}
|
||||
|
||||
// Encoding test for vcvtne2ps2bf16, vcvtneps2bf16 and vdpbf16ps:
// the generated code must match the byte table below exactly, in both size and content.
CYBOZU_TEST_AUTO(bf16)
|
||||
{
|
||||
struct Code : Xbyak::CodeGenerator {
|
||||
Code()
|
||||
{
|
||||
// three-operand convert at xmm / ymm / zmm width, all masked by k1; the ymm case also sets T_z
vcvtne2ps2bf16(xmm0 | k1, xmm1, ptr [rax + 64]);
|
||||
vcvtne2ps2bf16(ymm0 | k1 | T_z, ymm0, ptr [rax + 64]);
|
||||
vcvtne2ps2bf16(zmm0 | k1, zmm1, ptr [rax + 64]);
|
||||
|
||||
// two-operand convert: memory size is spelled out (xword / yword / zword); the first call is unmasked
vcvtneps2bf16(xmm0, xword [rax + 64]);
|
||||
vcvtneps2bf16(xmm0 | k1, yword [rax + 64]);
|
||||
vcvtneps2bf16(ymm0 | k1, zword [rax + 64]);
|
||||
// unsized ptr with a ymm destination: the expected row is identical to the zword row
vcvtneps2bf16(ymm0 | k1, ptr [rax + 64]);
|
||||
|
||||
// dot-product mnemonic at xmm / ymm / zmm width, all masked by k1
vdpbf16ps(xmm0 | k1, xmm1, ptr [rax + 64]);
|
||||
vdpbf16ps(ymm0 | k1, ymm1, ptr [rax + 64]);
|
||||
vdpbf16ps(zmm0 | k1, zmm1, ptr [rax + 64]);
|
||||
}
|
||||
} c;
|
||||
// Expected machine code: one 7-byte row per call above, in the same order.
// Each row ends with 0x40 followed by 0x04 / 0x02 / 0x01 for [rax + 64] —
// presumably ModRM + EVEX compressed disp8 (64/16, 64/32, 64/64); confirm against the Intel SDM.
const uint8_t tbl[] = {
|
||||
0x62, 0xf2, 0x77, 0x09, 0x72, 0x40, 0x04,
|
||||
0x62, 0xf2, 0x7f, 0xa9, 0x72, 0x40, 0x02,
|
||||
0x62, 0xf2, 0x77, 0x49, 0x72, 0x40, 0x01,
|
||||
|
||||
0x62, 0xf2, 0x7e, 0x08, 0x72, 0x40, 0x04,
|
||||
0x62, 0xf2, 0x7e, 0x29, 0x72, 0x40, 0x02,
|
||||
0x62, 0xf2, 0x7e, 0x49, 0x72, 0x40, 0x01,
|
||||
0x62, 0xf2, 0x7e, 0x49, 0x72, 0x40, 0x01,
|
||||
|
||||
0x62, 0xf2, 0x76, 0x09, 0x52, 0x40, 0x04,
|
||||
0x62, 0xf2, 0x76, 0x29, 0x52, 0x40, 0x02,
|
||||
0x62, 0xf2, 0x76, 0x49, 0x52, 0x40, 0x01,
|
||||
};
|
||||
// total number of expected bytes
const size_t n = sizeof(tbl) / sizeof(tbl[0]);
|
||||
// the emitted size must match first, then every byte is compared
CYBOZU_TEST_EQUAL(c.getSize(), n);
|
||||
CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -113,7 +113,7 @@ namespace Xbyak {
|
|||
|
||||
enum {
|
||||
DEFAULT_MAX_CODE_SIZE = 4096,
|
||||
VERSION = 0x5790 /* 0xABCD = A.BC(D) */
|
||||
VERSION = 0x5800 /* 0xABCD = A.BC(D) */
|
||||
};
|
||||
|
||||
#ifndef MIE_INTEGER_TYPE_DEFINED
|
||||
|
@ -551,6 +551,7 @@ inline void Operand::setBit(int bit)
|
|||
idx_ = idx;
|
||||
kind_ = kind;
|
||||
bit_ = bit;
|
||||
if (bit >= 128) return; // keep mask_ and rounding_
|
||||
mask_ = 0;
|
||||
rounding_ = 0;
|
||||
return;
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
const char *getVersionString() const { return "5.79"; }
|
||||
const char *getVersionString() const { return "5.80"; }
|
||||
void adc(const Operand& op, uint32 imm) { opRM_I(op, imm, 0x10, 2); }
|
||||
void adc(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x10); }
|
||||
void adcx(const Reg32e& reg, const Operand& op) { opGen(reg, op, 0xF6, 0x66, isREG32_REG32orMEM, NONE, 0x38); }
|
||||
|
@ -1684,6 +1684,8 @@ void vcompressb(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N1 |
|
|||
void vcompresspd(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x8A); }
|
||||
void vcompressps(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x8A); }
|
||||
void vcompressw(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N2 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x63); }
|
||||
// vcvtne2ps2bf16(x1, x2, op): forwards to opAVX_X_X_XM with opcode byte 0x72 and the T_F2 | T_0F38 prefix/map flags.
// T_MUST_EVEX is set, so presumably only the EVEX form is ever emitted; T_B32 presumably allows 32-bit broadcast (confirm against the T_* definitions).
// NOTE(review): looks like generator output (matching table row in putX_X_XM_IMM()); regenerate rather than hand-edit.
void vcvtne2ps2bf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x72); }
|
||||
// vcvtneps2bf16(x, op): accepted size pairs are x = 128-bit with op = 128- or 256-bit, or x = 256-bit with op = 512-bit.
// An op whose getBit() is 0 is treated as 512-bit when x is 256-bit; every other pair throws Error(ERR_BAD_COMBINATION).
// The encoder receives a copy of x resized to the source width (t.setBit(opBit)), not x itself.
// NOTE(review): putMisc() prints this exact line through puts(); keep both copies identical.
void vcvtneps2bf16(const Xmm& x, const Operand& op) { int xBit = x.getBit(); int opBit = op.getBit(); if (xBit == 256 && opBit == 0) opBit = 512; if (!(xBit == 128 && (opBit == 128 || opBit == 256)) && !(xBit == 256 && opBit == 512)) throw Error(ERR_BAD_COMBINATION); Xmm t = x; t.setBit(opBit); opAVX_X_XM_IMM(t, op, T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x72); }
|
||||
void vcvtpd2qq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F | T_EW1 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B64, 0x7B); }
|
||||
void vcvtpd2udq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F | T_YMM | T_MUST_EVEX | T_EW1 | T_B64 | T_ER_Z, 0x79); }
|
||||
void vcvtpd2uqq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F | T_EW1 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B64, 0x79); }
|
||||
|
@ -1709,6 +1711,7 @@ void vcvtuqq2ps(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F2 | T_0F | T
|
|||
void vcvtusi2sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opCvt3(x1, x2, op, T_F2 | T_0F | T_MUST_EVEX, T_W1 | T_EW1 | T_ER_X | T_N8, T_W0 | T_EW0 | T_N4, 0x7B); }
|
||||
void vcvtusi2ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opCvt3(x1, x2, op, T_F3 | T_0F | T_MUST_EVEX | T_ER_X, T_W1 | T_EW1 | T_N8, T_W0 | T_EW0 | T_N4, 0x7B); }
|
||||
void vdbpsadbw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_EW0 | T_YMM | T_MUST_EVEX, 0x42, imm); }
|
||||
// vdpbf16ps(x1, x2, op): forwards to opAVX_X_X_XM with opcode byte 0x52 and the T_F3 | T_0F38 prefix/map flags.
// T_MUST_EVEX is set, so presumably only the EVEX form is ever emitted; T_B32 presumably allows 32-bit broadcast (confirm against the T_* definitions).
// NOTE(review): looks like generator output (matching table row in putX_X_XM_IMM()); regenerate rather than hand-edit.
void vdpbf16ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x52); }
|
||||
void vexp2pd(const Zmm& z, const Operand& op) { opAVX_X_XM_IMM(z, op, T_66 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1 | T_B64 | T_SAE_Z, 0xC8); }
|
||||
void vexp2ps(const Zmm& z, const Operand& op) { opAVX_X_XM_IMM(z, op, T_66 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_B32 | T_SAE_Z, 0xC8); }
|
||||
void vexpandpd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x88); }
|
||||
|
|
|
@ -331,6 +331,8 @@ public:
|
|||
static const Type tAVX512_VNNI = uint64(1) << 54;
|
||||
static const Type tAVX512_BITALG = uint64(1) << 55;
|
||||
static const Type tAVX512_VPOPCNTDQ = uint64(1) << 56;
|
||||
static const Type tAVX512_BF16 = uint64(1) << 57;
|
||||
static const Type tAVX512_VP2INTERSECT = uint64(1) << 58;
|
||||
|
||||
Cpu()
|
||||
: type_(NONE)
|
||||
|
@ -410,6 +412,8 @@ public:
|
|||
if (ECX & (1U << 14)) type_ |= tAVX512_VPOPCNTDQ;
|
||||
if (EDX & (1U << 2)) type_ |= tAVX512_4VNNIW;
|
||||
if (EDX & (1U << 3)) type_ |= tAVX512_4FMAPS;
|
||||
if (EAX & (1U << 5)) type_ |= tAVX512_BF16;
|
||||
if (EDX & (1U << 8)) type_ |= tAVX512_VP2INTERSECT;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue