add avx512_bf16

This commit is contained in:
MITSUNARI Shigeo 2019-05-24 15:08:19 +09:00
parent 4033564c6f
commit 4cfd520878
8 changed files with 59 additions and 4 deletions

View file

@ -368,6 +368,9 @@ void putX_X_XM_IMM()
{ 0x52, "vpdpwssd", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, false },
{ 0x53, "vpdpwssds", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, false },
{ 0x72, "vcvtne2ps2bf16", T_F2 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, false },
{ 0x52, "vdpbf16ps", T_F3 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_B32, false },
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl *p = &tbl[i];
@ -711,6 +714,8 @@ void putMisc()
puts("void vfpclassss(const Opmask& k, const Operand& op, uint8 imm) { if (!op.isXMEM()) throw Error(ERR_BAD_MEM_SIZE); opVex(k, 0, op, T_66 | T_0F3A | T_MUST_EVEX | T_EW0 | T_N4, 0x67, imm); }");
puts("void vpshufbitqmb(const Opmask& k, const Xmm& x, const Operand& op) { opVex(k, &x, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x8F); }");
puts("void vcvtneps2bf16(const Xmm& x, const Operand& op) { int xBit = x.getBit(); int opBit = op.getBit(); if (xBit == 256 && opBit == 0) opBit = 512; if (!(xBit == 128 && (opBit == 128 || opBit == 256)) && !(xBit == 256 && opBit == 512)) throw Error(ERR_BAD_COMBINATION); Xmm t = x; t.setBit(opBit); opAVX_X_XM_IMM(t, op, T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x72); }");
}
void putV4FMA()

View file

@ -1,5 +1,5 @@
# Xbyak 5.79 ; JIT assembler for x86(IA32), x64(AMD64, x86-64) by C++
# Xbyak 5.80 ; JIT assembler for x86(IA32), x64(AMD64, x86-64) by C++
## Abstract
@ -392,6 +392,7 @@ modified new BSD License
http://opensource.org/licenses/BSD-3-Clause
## History
* 2019/May/26 ver 5.80 support vcvtne2ps2bf16, vcvtneps2bf16, vdpbf16ps
* 2019/Apr/27 ver 5.79 vcmppd/vcmpps supports ptr_b(thanks to jkopinsky)
* 2019/Apr/15 ver 5.78 rewrite Reg::changeBit() (thanks to MerryMage)
* 2019/Mar/06 ver 5.77 fix number of cores that share LLC cache by densamoilov

View file

@ -1,5 +1,5 @@
C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 5.79
C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 5.80
-----------------------------------------------------------------------------
◎概要
@ -373,6 +373,7 @@ sample/{echo,hello}.bfは http://www.kmonos.net/alang/etc/brainfuck.php から
-----------------------------------------------------------------------------
◎履歴
2019/05/26 ver 5.80 support vcvtne2ps2bf16, vcvtneps2bf16, vdpbf16ps
2019/04/27 ver 5.79 vcmppd/vcmppsのptr_b対応忘れ(thanks to jkopinsky)
2019/04/15 ver 5.78 Reg::changeBit()のリファクタリング(thanks to MerryMage)
2019/03/06 ver 5.77 LLCキャッシュを共有数CPU数の修整(by densamoilov)

View file

@ -78,6 +78,8 @@ void putCPUinfo()
{ Cpu::tAVX512_VNNI, "avx512_vnni" },
{ Cpu::tAVX512_BITALG, "avx512_bitalg" },
{ Cpu::tAVX512_VPOPCNTDQ, "avx512_vpopcntdq" },
{ Cpu::tAVX512_BF16, "avx512_bf16" },
{ Cpu::tAVX512_VP2INTERSECT, "avx512_vp2intersect" },
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
if (cpu.has(tbl[i].type)) printf(" %s", tbl[i].str);

View file

@ -683,4 +683,42 @@ CYBOZU_TEST_AUTO(gf2)
CYBOZU_TEST_EQUAL(c.getSize(), n);
CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
}
CYBOZU_TEST_AUTO(bf16)
{
struct Code : Xbyak::CodeGenerator {
Code()
{
vcvtne2ps2bf16(xmm0 | k1, xmm1, ptr [rax + 64]);
vcvtne2ps2bf16(ymm0 | k1 | T_z, ymm0, ptr [rax + 64]);
vcvtne2ps2bf16(zmm0 | k1, zmm1, ptr [rax + 64]);
vcvtneps2bf16(xmm0, xword [rax + 64]);
vcvtneps2bf16(xmm0 | k1, yword [rax + 64]);
vcvtneps2bf16(ymm0 | k1, zword [rax + 64]);
vcvtneps2bf16(ymm0 | k1, ptr [rax + 64]);
vdpbf16ps(xmm0 | k1, xmm1, ptr [rax + 64]);
vdpbf16ps(ymm0 | k1, ymm1, ptr [rax + 64]);
vdpbf16ps(zmm0 | k1, zmm1, ptr [rax + 64]);
}
} c;
const uint8_t tbl[] = {
0x62, 0xf2, 0x77, 0x09, 0x72, 0x40, 0x04,
0x62, 0xf2, 0x7f, 0xa9, 0x72, 0x40, 0x02,
0x62, 0xf2, 0x77, 0x49, 0x72, 0x40, 0x01,
0x62, 0xf2, 0x7e, 0x08, 0x72, 0x40, 0x04,
0x62, 0xf2, 0x7e, 0x29, 0x72, 0x40, 0x02,
0x62, 0xf2, 0x7e, 0x49, 0x72, 0x40, 0x01,
0x62, 0xf2, 0x7e, 0x49, 0x72, 0x40, 0x01,
0x62, 0xf2, 0x76, 0x09, 0x52, 0x40, 0x04,
0x62, 0xf2, 0x76, 0x29, 0x52, 0x40, 0x02,
0x62, 0xf2, 0x76, 0x49, 0x52, 0x40, 0x01,
};
const size_t n = sizeof(tbl) / sizeof(tbl[0]);
CYBOZU_TEST_EQUAL(c.getSize(), n);
CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
}
#endif

View file

@ -113,7 +113,7 @@ namespace Xbyak {
enum {
DEFAULT_MAX_CODE_SIZE = 4096,
VERSION = 0x5790 /* 0xABCD = A.BC(D) */
VERSION = 0x5800 /* 0xABCD = A.BC(D) */
};
#ifndef MIE_INTEGER_TYPE_DEFINED
@ -551,6 +551,7 @@ inline void Operand::setBit(int bit)
idx_ = idx;
kind_ = kind;
bit_ = bit;
if (bit >= 128) return; // keep mask_ and rounding_
mask_ = 0;
rounding_ = 0;
return;

View file

@ -1,4 +1,4 @@
const char *getVersionString() const { return "5.79"; }
const char *getVersionString() const { return "5.80"; }
void adc(const Operand& op, uint32 imm) { opRM_I(op, imm, 0x10, 2); }
void adc(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x10); }
void adcx(const Reg32e& reg, const Operand& op) { opGen(reg, op, 0xF6, 0x66, isREG32_REG32orMEM, NONE, 0x38); }
@ -1684,6 +1684,8 @@ void vcompressb(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N1 |
void vcompresspd(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x8A); }
void vcompressps(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x8A); }
void vcompressw(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N2 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x63); }
void vcvtne2ps2bf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x72); }
void vcvtneps2bf16(const Xmm& x, const Operand& op) { int xBit = x.getBit(); int opBit = op.getBit(); if (xBit == 256 && opBit == 0) opBit = 512; if (!(xBit == 128 && (opBit == 128 || opBit == 256)) && !(xBit == 256 && opBit == 512)) throw Error(ERR_BAD_COMBINATION); Xmm t = x; t.setBit(opBit); opAVX_X_XM_IMM(t, op, T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x72); }
void vcvtpd2qq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F | T_EW1 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B64, 0x7B); }
void vcvtpd2udq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F | T_YMM | T_MUST_EVEX | T_EW1 | T_B64 | T_ER_Z, 0x79); }
void vcvtpd2uqq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F | T_EW1 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B64, 0x79); }
@ -1709,6 +1711,7 @@ void vcvtuqq2ps(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F2 | T_0F | T
void vcvtusi2sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opCvt3(x1, x2, op, T_F2 | T_0F | T_MUST_EVEX, T_W1 | T_EW1 | T_ER_X | T_N8, T_W0 | T_EW0 | T_N4, 0x7B); }
void vcvtusi2ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opCvt3(x1, x2, op, T_F3 | T_0F | T_MUST_EVEX | T_ER_X, T_W1 | T_EW1 | T_N8, T_W0 | T_EW0 | T_N4, 0x7B); }
void vdbpsadbw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_EW0 | T_YMM | T_MUST_EVEX, 0x42, imm); }
void vdpbf16ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x52); }
void vexp2pd(const Zmm& z, const Operand& op) { opAVX_X_XM_IMM(z, op, T_66 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1 | T_B64 | T_SAE_Z, 0xC8); }
void vexp2ps(const Zmm& z, const Operand& op) { opAVX_X_XM_IMM(z, op, T_66 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_B32 | T_SAE_Z, 0xC8); }
void vexpandpd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x88); }

View file

@ -331,6 +331,8 @@ public:
static const Type tAVX512_VNNI = uint64(1) << 54;
static const Type tAVX512_BITALG = uint64(1) << 55;
static const Type tAVX512_VPOPCNTDQ = uint64(1) << 56;
static const Type tAVX512_BF16 = uint64(1) << 57;
static const Type tAVX512_VP2INTERSECT = uint64(1) << 58;
Cpu()
: type_(NONE)
@ -410,6 +412,8 @@ public:
if (ECX & (1U << 14)) type_ |= tAVX512_VPOPCNTDQ;
if (EDX & (1U << 2)) type_ |= tAVX512_4VNNIW;
if (EDX & (1U << 3)) type_ |= tAVX512_4FMAPS;
if (EAX & (1U << 5)) type_ |= tAVX512_BF16;
if (EDX & (1U << 8)) type_ |= tAVX512_VP2INTERSECT;
}
}
}