xbyak/test/misc.cpp
2024-10-10 14:03:02 +09:00

2287 lines
70 KiB
C++

#include <stdio.h>
#include <string.h>
#include <string>
#include <xbyak/xbyak.h>
#include <xbyak/xbyak_util.h>
#include <cybozu/inttype.hpp>
#include <cybozu/test.hpp>
#include <algorithm>
using namespace Xbyak;
// Exercises setSize() on a generator constructed with 4096: moving the write
// position to 4095 and then to 4096 is accepted, while emitting one more byte
// at position 4096 is expected to throw Xbyak::Error.
CYBOZU_TEST_AUTO(setSize)
{
struct Code : Xbyak::CodeGenerator {
Code() : Xbyak::CodeGenerator(4096)
{
// Move the write position to 4095, then emit a single byte.
setSize(4095);
db(1);
size_t size = getSize();
CYBOZU_TEST_EQUAL(size, 4096u);
// Setting the size to the value just read (4096) must not throw ...
CYBOZU_TEST_NO_EXCEPTION(setSize(size));
// ... but emitting another byte from there must.
CYBOZU_TEST_EXCEPTION(db(1), Xbyak::Error);
}
} code;
}
#ifdef XBYAK64
// Non-VEX SSE mnemonics must refuse xm16: every call below passes xm16 as an
// operand and is expected to throw Xbyak::Error.
CYBOZU_TEST_AUTO(badSSE)
{
struct Code : Xbyak::CodeGenerator {
Code()
{
// Covers reg/reg, reg/imm, reg/mem and gpr/xmm/imm operand shapes.
CYBOZU_TEST_EXCEPTION(paddd(xm16, xm1), Xbyak::Error);
CYBOZU_TEST_EXCEPTION(pslld(xm16, 1), Xbyak::Error);
CYBOZU_TEST_EXCEPTION(movapd(xm16, xm1), Xbyak::Error);
CYBOZU_TEST_EXCEPTION(movhpd(xm16, ptr[eax]), Xbyak::Error);
CYBOZU_TEST_EXCEPTION(pextrb(eax, xm16, 1), Xbyak::Error);
}
} code;
}
#endif
// Checks == and != between operands taken from Xbyak::util: registers compare
// by identity, and address expressions differ when either the size frame
// (dword vs ptr) or the displacement differs.
CYBOZU_TEST_AUTO(compOperand)
{
using namespace Xbyak::util;
// Register vs register.
CYBOZU_TEST_ASSERT(eax == eax);
CYBOZU_TEST_ASSERT(ecx != xmm0);
// Address vs address.
CYBOZU_TEST_ASSERT(ptr[eax] == ptr[eax]);
CYBOZU_TEST_ASSERT(dword[eax] != ptr[eax]);
CYBOZU_TEST_ASSERT(ptr[eax] != ptr[eax+3]);
}
// Checks which immediates `mov(<size>[eax], imm)` accepts for each memory
// operand width, and (64-bit only) which absolute addresses are accepted as a
// displacement.
CYBOZU_TEST_AUTO(mov_const)
{
	struct Code : Xbyak::CodeGenerator {
		Code()
		{
			// One case: immediate value, destination width in bits, and
			// whether the mov is expected to be rejected.
			struct Case {
				uint64_t v;
				int bit;
				bool error;
			};
			const Case cases[] = {
				{ uint64_t(-1), 8, false },
				{ 0x12, 8, false },
				{ 0x80, 8, false },
				{ 0xff, 8, false },
				{ 0x100, 8, true },
				{ 1, 16, false },
				{ uint64_t(-1), 16, false },
				{ 0x7fff, 16, false },
				{ 0xffff, 16, false },
				{ 0x10000, 16, true },
				{ uint64_t(-1), 32, false },
				{ 0x7fffffff, 32, false },
				{ uint64_t(-0x7fffffff), 32, false },
				{ 0xffffffff, 32, false },
				{ 0x100000000ull, 32, true },
#ifdef XBYAK64
				{ uint64_t(-1), 64, false },
				{ 0x7fffffff, 64, false },
				{ 0xffffffffffffffffull, 64, false },
				{ 0x80000000, 64, true },
				{ 0xffffffff, 64, true },
#endif
			};
			const size_t caseNum = CYBOZU_NUM_OF_ARRAY(cases);
			for (size_t idx = 0; idx != caseNum; ++idx) {
				const Case& entry = cases[idx];
				const uint64_t v = entry.v;
				// Pick the address frame matching the requested width;
				// anything other than 8/16/32 falls back to qword.
				const Xbyak::AddressFrame *frame = 0;
				switch (entry.bit) {
				case 8:
					frame = &byte;
					break;
				case 16:
					frame = &word;
					break;
				case 32:
					frame = &dword;
					break;
				default:
					frame = &qword;
					break;
				}
				const Xbyak::AddressFrame& af = *frame;
				if (!entry.error) {
					CYBOZU_TEST_NO_EXCEPTION(mov(af[eax], v));
				} else {
					CYBOZU_TEST_EXCEPTION(mov(af[eax], v), Xbyak::Error);
				}
			}
#ifdef XBYAK64
			// Absolute addresses used as displacement.
			CYBOZU_TEST_NO_EXCEPTION(mov(rax, ptr[(void*)0x7fffffff]));
			if (sizeof(void*) != 4) { // sizeof(void*) == 4 on x32
				CYBOZU_TEST_EXCEPTION(mov(rax, ptr[(void*)0x17fffffff]), Xbyak::Error);
			}
#ifdef XBYAK_OLD_DISP_CHECK
			CYBOZU_TEST_NO_EXCEPTION(mov(rax, ptr[(void*)0x80000000]));
			CYBOZU_TEST_NO_EXCEPTION(mov(rax, ptr[(void*)0xffffffff]));
#else
			if (sizeof(void*) != 4) { // sizeof(void*) == 4 on x32
				CYBOZU_TEST_EXCEPTION(mov(rax, ptr[(void*)0x80000000ull]), Xbyak::Error);
				CYBOZU_TEST_EXCEPTION(mov(rax, ptr[(void*)0xffffffffull]), Xbyak::Error);
			}
#endif
#endif
		}
	} code;
}
// Checks that align(16) always leaves the current write address on a 16-byte
// boundary, and that it emits nothing when the address is already aligned.
CYBOZU_TEST_AUTO(align)
{
	struct Code : Xbyak::CodeGenerator {
		Code()
		{
			const size_t alignSize = 16;
			for (int padding = 0; padding < 20; padding++) {
				// Emit `padding` filler bytes (0..19), then realign.
				int remaining = padding;
				while (remaining-- > 0) {
					db(1);
				}
				align(alignSize);
				CYBOZU_TEST_EQUAL(size_t(getCurr()) % alignSize, 0u);
			}
			align(alignSize);
			const uint8_t *p = getCurr();
			// do nothing if aligned
			align(alignSize);
			CYBOZU_TEST_EQUAL(p, getCurr());
		}
	} c;
}
// Checks operand validation of the kmov* instructions and that T_z without an
// opmask register leaves the generated bytes unchanged.
CYBOZU_TEST_AUTO(kmask)
{
struct Code : Xbyak::CodeGenerator {
Code()
{
// 16-bit ax is rejected by kmovb/kmovw/kmovd, and 32-bit eax by kmovq.
CYBOZU_TEST_EXCEPTION(kmovb(k1, ax), std::exception);
CYBOZU_TEST_EXCEPTION(kmovw(k1, ax), std::exception);
CYBOZU_TEST_EXCEPTION(kmovd(k1, ax), std::exception);
CYBOZU_TEST_EXCEPTION(kmovq(k1, eax), std::exception);
#ifdef XBYAK64
// With 64-bit rax, only kmovq is accepted.
CYBOZU_TEST_EXCEPTION(kmovb(k1, rax), std::exception);
CYBOZU_TEST_EXCEPTION(kmovw(k1, rax), std::exception);
CYBOZU_TEST_EXCEPTION(kmovd(k1, rax), std::exception);
CYBOZU_TEST_NO_EXCEPTION(kmovq(k1, rax));
#endif
// Attaching k0 to the destination is accepted.
CYBOZU_TEST_NO_EXCEPTION(vmovaps(xm0|k0, ptr[eax]));
checkT_z();
}
// Emits vmovaps(zm0, ptr[eax]) once without and once with T_z, then checks
// that both encodings have the same length and identical bytes.
void checkT_z()
{
const uint8_t *p1 = getCurr();
vmovaps(zm0, ptr[eax]);
const uint8_t *p2 = getCurr();
vmovaps(zm0|T_z, ptr[eax]);
const uint8_t *end = getCurr();
CYBOZU_TEST_EQUAL(p2 - p1, end - p2);
CYBOZU_TEST_EQUAL_ARRAY(p1, p2, end - p2);
}
} c;
}
// Checks register-overlap and opmask validation for gather/scatter.
CYBOZU_TEST_AUTO(gather)
{
struct Code : Xbyak::CodeGenerator {
Code()
{
// Three-register form: accepted when destination, index and the third
// register are all different; rejected when any two of them coincide.
CYBOZU_TEST_NO_EXCEPTION(vgatherdpd(xmm1, ptr[eax+xmm2], xmm3));
CYBOZU_TEST_EXCEPTION(vgatherdpd(xmm1, ptr[eax+xmm1], xmm2), std::exception);
CYBOZU_TEST_EXCEPTION(vgatherdpd(xmm1, ptr[eax+xmm2], xmm1), std::exception);
CYBOZU_TEST_EXCEPTION(vgatherdpd(xmm2, ptr[eax+xmm1], xmm1), std::exception);
// Two-operand form: needs an opmask (k2 here) on the destination, and the
// index register must still differ from the destination.
CYBOZU_TEST_NO_EXCEPTION(vgatherdpd(xmm1|k2, ptr[eax+xmm2]));
CYBOZU_TEST_EXCEPTION(vgatherdpd(xmm1, ptr[eax+xmm2]), std::exception);
CYBOZU_TEST_EXCEPTION(vgatherdpd(xmm1|k2, ptr[eax+xmm1]), std::exception);
// Scatter: the opmask may be attached to either the address or the register
// operand, index and source may be the same register, and omitting the
// opmask is rejected.
CYBOZU_TEST_NO_EXCEPTION(vpscatterdd(ptr[eax+xmm2]|k2, xmm1));
CYBOZU_TEST_NO_EXCEPTION(vpscatterdd(ptr[eax+xmm2], xmm1|k2));
CYBOZU_TEST_NO_EXCEPTION(vpscatterdd(ptr[eax+xmm2]|k3, xmm2));
CYBOZU_TEST_EXCEPTION(vpscatterdd(ptr[eax+xmm2], xmm1), std::exception);
}
} c;
}
#ifdef XBYAK64
// Encoding test for v4fmaddps/v4fmaddss/v4fnmaddps/v4fnmaddss and
// vp4dpwssd/vp4dpwssds: the generated code must match tbl byte-for-byte.
// tbl holds one row per instruction, in emission order.
CYBOZU_TEST_AUTO(vfmaddps)
{
struct Code : Xbyak::CodeGenerator {
Code()
{
v4fmaddps(zmm1, zmm8, ptr [rdx + 64]);
v4fmaddss(xmm15, xmm8, ptr [rax + 64]);
v4fnmaddps(zmm5 | k5, zmm2, ptr [rcx + 0x80]);
v4fnmaddss(xmm31, xmm2, ptr [rsp + 0x80]);
vp4dpwssd(zmm23 | k7 | T_z, zmm1, ptr [rax + 64]);
vp4dpwssds(zmm10 | k4, zmm3, ptr [rsp + rax * 4 + 64]);
}
} c;
// In the expected bytes the displacement is stored divided by 16
// (64 -> 0x04, 0x80 -> 0x08).
const uint8_t tbl[] = {
0x62, 0xf2, 0x3f, 0x48, 0x9a, 0x4a, 0x04,
0x62, 0x72, 0x3f, 0x08, 0x9b, 0x78, 0x04,
0x62, 0xf2, 0x6f, 0x4d, 0xaa, 0x69, 0x08,
0x62, 0x62, 0x6f, 0x08, 0xab, 0x7c, 0x24, 0x08,
0x62, 0xe2, 0x77, 0xcf, 0x52, 0x78, 0x04,
0x62, 0x72, 0x67, 0x4c, 0x53, 0x54, 0x84, 0x04,
};
const size_t n = sizeof(tbl) / sizeof(tbl[0]);
CYBOZU_TEST_EQUAL(c.getSize(), n);
CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
}
// Encoding test for vaesdec/vaesdeclast/vaesenc/vaesenclast at xmm, ymm and
// zmm width. tbl holds one row per instruction, in emission order.
CYBOZU_TEST_AUTO(vaes)
{
struct Code : Xbyak::CodeGenerator {
Code()
{
vaesdec(xmm20, xmm30, ptr [rcx + 64]);
vaesdec(ymm1, ymm2, ptr [rcx + 64]);
vaesdec(zmm1, zmm2, ptr [rcx + 64]);
vaesdeclast(xmm20, xmm30, ptr [rax + 64]);
vaesdeclast(ymm20, ymm30, ptr [rax + 64]);
vaesdeclast(zmm20, zmm30, ptr [rax + 64]);
vaesenc(xmm20, xmm30, ptr [rcx + 64]);
vaesenc(ymm1, ymm2, ptr [rcx + 64]);
vaesenc(zmm1, zmm2, ptr [rcx + 64]);
vaesenclast(xmm20, xmm30, ptr [rax + 64]);
vaesenclast(ymm20, ymm30, ptr [rax + 64]);
vaesenclast(zmm20, zmm30, ptr [rax + 64]);
}
} c;
// The ymm1/ymm2 forms are expected in the shorter 0xC4-prefixed (VEX)
// encoding with a plain 0x40 displacement. All other rows start with 0x62
// (EVEX) and store the displacement 64 scaled by the vector width
// (0x04 / 0x02 / 0x01 for xmm / ymm / zmm).
const uint8_t tbl[] = {
0x62, 0xE2, 0x0D, 0x00, 0xDE, 0x61, 0x04,
0xC4, 0xE2, 0x6D, 0xDE, 0x49, 0x40,
0x62, 0xF2, 0x6D, 0x48, 0xDE, 0x49, 0x01,
0x62, 0xE2, 0x0D, 0x00, 0xDF, 0x60, 0x04,
0x62, 0xE2, 0x0D, 0x20, 0xDF, 0x60, 0x02,
0x62, 0xE2, 0x0D, 0x40, 0xDF, 0x60, 0x01,
0x62, 0xE2, 0x0D, 0x00, 0xDC, 0x61, 0x04,
0xC4, 0xE2, 0x6D, 0xDC, 0x49, 0x40,
0x62, 0xF2, 0x6D, 0x48, 0xDC, 0x49, 0x01,
0x62, 0xE2, 0x0D, 0x00, 0xDD, 0x60, 0x04,
0x62, 0xE2, 0x0D, 0x20, 0xDD, 0x60, 0x02,
0x62, 0xE2, 0x0D, 0x40, 0xDD, 0x60, 0x01,
};
const size_t n = sizeof(tbl) / sizeof(tbl[0]);
CYBOZU_TEST_EQUAL(c.getSize(), n);
CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
}
// Encoding test for vpclmulqdq with immediate 3 at xmm/ymm/zmm width, using
// both low (xmm2..) and high (xmm20..) destination registers.
// tbl holds one row per instruction, in emission order.
CYBOZU_TEST_AUTO(vpclmulqdq)
{
struct Code : Xbyak::CodeGenerator {
Code()
{
vpclmulqdq(xmm2, xmm3, ptr [rax + 64], 3);
vpclmulqdq(ymm2, ymm3, ptr [rax + 64], 3);
vpclmulqdq(zmm2, zmm3, ptr [rax + 64], 3);
vpclmulqdq(xmm20, xmm3, ptr [rax + 64], 3);
vpclmulqdq(ymm20, ymm3, ptr [rax + 64], 3);
vpclmulqdq(zmm20, zmm3, ptr [rax + 64], 3);
}
} c;
// xmm2/ymm2 destinations are expected in the 0xc4-prefixed (VEX) form; zmm
// and the register-20 destinations in the 0x62-prefixed (EVEX) form with the
// displacement scaled by vector width. The final 0x03 of each row is the
// immediate.
const uint8_t tbl[] = {
0xc4, 0xe3, 0x61, 0x44, 0x50, 0x40, 0x03,
0xc4, 0xe3, 0x65, 0x44, 0x50, 0x40, 0x03,
0x62, 0xf3, 0x65, 0x48, 0x44, 0x50, 0x01, 0x03,
0x62, 0xe3, 0x65, 0x08, 0x44, 0x60, 0x04, 0x03,
0x62, 0xe3, 0x65, 0x28, 0x44, 0x60, 0x02, 0x03,
0x62, 0xe3, 0x65, 0x48, 0x44, 0x60, 0x01, 0x03,
};
const size_t n = sizeof(tbl) / sizeof(tbl[0]);
CYBOZU_TEST_EQUAL(c.getSize(), n);
CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
}
// Encoding test for vpcompressb / vpcompressw with memory and register
// destinations at xmm/ymm/zmm width. tbl holds one row per instruction, in
// emission order.
CYBOZU_TEST_AUTO(vpcompressb_w)
{
struct Code : Xbyak::CodeGenerator {
Code()
{
vpcompressb(ptr[rax + 64], xmm1);
vpcompressb(xmm30 | k5, xmm1);
vpcompressb(ptr[rax + 64], ymm1);
vpcompressb(ymm30 | k3 |T_z, ymm1);
vpcompressb(ptr[rax + 64], zmm1);
vpcompressb(zmm30 | k2 |T_z, zmm1);
vpcompressw(ptr[rax + 64], xmm1);
vpcompressw(xmm30 | k5, xmm1);
vpcompressw(ptr[rax + 64], ymm1);
vpcompressw(ymm30 | k3 |T_z, ymm1);
vpcompressw(ptr[rax + 64], zmm1);
vpcompressw(zmm30 | k2 |T_z, zmm1);
}
} c;
// For the memory forms the displacement 64 is stored scaled by element size,
// independent of vector width: 0x40 for the byte variant, 0x20 for the word
// variant.
const uint8_t tbl[] = {
0x62, 0xf2, 0x7d, 0x08, 0x63, 0x48, 0x40,
0x62, 0x92, 0x7d, 0x0d, 0x63, 0xce,
0x62, 0xf2, 0x7d, 0x28, 0x63, 0x48, 0x40,
0x62, 0x92, 0x7d, 0xab, 0x63, 0xce,
0x62, 0xf2, 0x7d, 0x48, 0x63, 0x48, 0x40,
0x62, 0x92, 0x7d, 0xca, 0x63, 0xce,
0x62, 0xf2, 0xfd, 0x08, 0x63, 0x48, 0x20,
0x62, 0x92, 0xfd, 0x0d, 0x63, 0xce,
0x62, 0xf2, 0xfd, 0x28, 0x63, 0x48, 0x20,
0x62, 0x92, 0xfd, 0xab, 0x63, 0xce,
0x62, 0xf2, 0xfd, 0x48, 0x63, 0x48, 0x20,
0x62, 0x92, 0xfd, 0xca, 0x63, 0xce,
};
const size_t n = sizeof(tbl) / sizeof(tbl[0]);
CYBOZU_TEST_EQUAL(c.getSize(), n);
CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
}
// Encoding test for vpshld{w,d,q} (immediate count) and vpshldv{w,d,q}
// (variable count) at xmm/ymm/zmm width with opmask k3 and zeroing.
// tbl holds one row per instruction, in emission order.
CYBOZU_TEST_AUTO(shld)
{
struct Code : Xbyak::CodeGenerator {
Code()
{
// Immediate-count forms (last expected byte of each row is the 5).
vpshldw(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5);
vpshldw(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5);
vpshldw(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5);
vpshldd(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5);
vpshldd(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5);
vpshldd(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5);
vpshldq(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5);
vpshldq(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5);
vpshldq(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5);
// Variable-count forms.
vpshldvw(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]);
vpshldvw(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]);
vpshldvw(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]);
vpshldvd(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]);
vpshldvd(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]);
vpshldvd(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]);
vpshldvq(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]);
vpshldvq(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]);
vpshldvq(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]);
}
} c;
// Displacement 0x40 is stored scaled by vector width
// (0x04 / 0x02 / 0x01 for xmm / ymm / zmm).
const uint8_t tbl[] = {
0x62, 0xf3, 0xed, 0x8b, 0x70, 0x68, 0x04, 0x05,
0x62, 0xf3, 0xed, 0xab, 0x70, 0x68, 0x02, 0x05,
0x62, 0xf3, 0xed, 0xcb, 0x70, 0x68, 0x01, 0x05,
0x62, 0xf3, 0x6d, 0x8b, 0x71, 0x68, 0x04, 0x05,
0x62, 0xf3, 0x6d, 0xab, 0x71, 0x68, 0x02, 0x05,
0x62, 0xf3, 0x6d, 0xcb, 0x71, 0x68, 0x01, 0x05,
0x62, 0xf3, 0xed, 0x8b, 0x71, 0x68, 0x04, 0x05,
0x62, 0xf3, 0xed, 0xab, 0x71, 0x68, 0x02, 0x05,
0x62, 0xf3, 0xed, 0xcb, 0x71, 0x68, 0x01, 0x05,
0x62, 0xf2, 0xed, 0x8b, 0x70, 0x68, 0x04,
0x62, 0xf2, 0xed, 0xab, 0x70, 0x68, 0x02,
0x62, 0xf2, 0xed, 0xcb, 0x70, 0x68, 0x01,
0x62, 0xf2, 0x6d, 0x8b, 0x71, 0x68, 0x04,
0x62, 0xf2, 0x6d, 0xab, 0x71, 0x68, 0x02,
0x62, 0xf2, 0x6d, 0xcb, 0x71, 0x68, 0x01,
0x62, 0xf2, 0xed, 0x8b, 0x71, 0x68, 0x04,
0x62, 0xf2, 0xed, 0xab, 0x71, 0x68, 0x02,
0x62, 0xf2, 0xed, 0xcb, 0x71, 0x68, 0x01,
};
const size_t n = sizeof(tbl) / sizeof(tbl[0]);
CYBOZU_TEST_EQUAL(c.getSize(), n);
CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
}
// Encoding test for vpshrd{w,d,q} (immediate count) and vpshrdv{w,d,q}
// (variable count) at xmm/ymm/zmm width with opmask k3 and zeroing, followed
// by the ptr_b (broadcast) memory forms of the d/q variants.
// tbl holds one row per instruction, in emission order.
CYBOZU_TEST_AUTO(shrd)
{
struct Code : Xbyak::CodeGenerator {
Code()
{
// Immediate-count forms, full-width memory operand.
vpshrdw(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5);
vpshrdw(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5);
vpshrdw(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5);
vpshrdd(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5);
vpshrdd(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5);
vpshrdd(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5);
vpshrdq(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5);
vpshrdq(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5);
vpshrdq(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5);
// Variable-count forms, full-width memory operand.
vpshrdvw(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]);
vpshrdvw(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]);
vpshrdvw(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]);
vpshrdvd(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]);
vpshrdvd(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]);
vpshrdvd(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]);
vpshrdvq(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]);
vpshrdvq(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]);
vpshrdvq(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]);
// Broadcast (ptr_b) memory operand, immediate-count forms.
vpshrdd(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40], 5);
vpshrdd(ymm5|k3|T_z, ymm2, ptr_b [rax + 0x40], 5);
vpshrdd(zmm5|k3|T_z, zmm2, ptr_b [rax + 0x40], 5);
vpshrdq(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40], 5);
vpshrdq(ymm5|k3|T_z, ymm2, ptr_b [rax + 0x40], 5);
vpshrdq(zmm5|k3|T_z, zmm2, ptr_b [rax + 0x40], 5);
// Broadcast (ptr_b) memory operand, variable-count forms.
vpshrdvd(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40]);
vpshrdvd(ymm5|k3|T_z, ymm2, ptr_b [rax + 0x40]);
vpshrdvd(zmm5|k3|T_z, zmm2, ptr_b [rax + 0x40]);
vpshrdvq(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40]);
vpshrdvq(ymm5|k3|T_z, ymm2, ptr_b [rax + 0x40]);
vpshrdvq(zmm5|k3|T_z, zmm2, ptr_b [rax + 0x40]);
}
} c;
// Full-width rows store displacement 0x40 scaled by vector width
// (0x04 / 0x02 / 0x01). Broadcast rows store it scaled by element size
// instead (0x10 for d, 0x08 for q), regardless of vector width.
const uint8_t tbl[] = {
0x62, 0xf3, 0xed, 0x8b, 0x72, 0x68, 0x04, 0x05,
0x62, 0xf3, 0xed, 0xab, 0x72, 0x68, 0x02, 0x05,
0x62, 0xf3, 0xed, 0xcb, 0x72, 0x68, 0x01, 0x05,
0x62, 0xf3, 0x6d, 0x8b, 0x73, 0x68, 0x04, 0x05,
0x62, 0xf3, 0x6d, 0xab, 0x73, 0x68, 0x02, 0x05,
0x62, 0xf3, 0x6d, 0xcb, 0x73, 0x68, 0x01, 0x05,
0x62, 0xf3, 0xed, 0x8b, 0x73, 0x68, 0x04, 0x05,
0x62, 0xf3, 0xed, 0xab, 0x73, 0x68, 0x02, 0x05,
0x62, 0xf3, 0xed, 0xcb, 0x73, 0x68, 0x01, 0x05,
0x62, 0xf2, 0xed, 0x8b, 0x72, 0x68, 0x04,
0x62, 0xf2, 0xed, 0xab, 0x72, 0x68, 0x02,
0x62, 0xf2, 0xed, 0xcb, 0x72, 0x68, 0x01,
0x62, 0xf2, 0x6d, 0x8b, 0x73, 0x68, 0x04,
0x62, 0xf2, 0x6d, 0xab, 0x73, 0x68, 0x02,
0x62, 0xf2, 0x6d, 0xcb, 0x73, 0x68, 0x01,
0x62, 0xf2, 0xed, 0x8b, 0x73, 0x68, 0x04,
0x62, 0xf2, 0xed, 0xab, 0x73, 0x68, 0x02,
0x62, 0xf2, 0xed, 0xcb, 0x73, 0x68, 0x01,
0x62, 0xf3, 0x6d, 0x9b, 0x73, 0x68, 0x10, 0x05,
0x62, 0xf3, 0x6d, 0xbb, 0x73, 0x68, 0x10, 0x05,
0x62, 0xf3, 0x6d, 0xdb, 0x73, 0x68, 0x10, 0x05,
0x62, 0xf3, 0xed, 0x9b, 0x73, 0x68, 0x08, 0x05,
0x62, 0xf3, 0xed, 0xbb, 0x73, 0x68, 0x08, 0x05,
0x62, 0xf3, 0xed, 0xdb, 0x73, 0x68, 0x08, 0x05,
0x62, 0xf2, 0x6d, 0x9b, 0x73, 0x68, 0x10,
0x62, 0xf2, 0x6d, 0xbb, 0x73, 0x68, 0x10,
0x62, 0xf2, 0x6d, 0xdb, 0x73, 0x68, 0x10,
0x62, 0xf2, 0xed, 0x9b, 0x73, 0x68, 0x08,
0x62, 0xf2, 0xed, 0xbb, 0x73, 0x68, 0x08,
0x62, 0xf2, 0xed, 0xdb, 0x73, 0x68, 0x08,
};
const size_t n = sizeof(tbl) / sizeof(tbl[0]);
CYBOZU_TEST_EQUAL(c.getSize(), n);
CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
}
// Encoding test for vpopcnt{b,w,d,q} at xmm/ymm/zmm width with opmask k3 and
// zeroing; the d and q variants are also emitted with a broadcast (ptr_b)
// memory operand. tbl holds one row per instruction, in emission order.
CYBOZU_TEST_AUTO(vpopcnt)
{
struct Code : Xbyak::CodeGenerator {
Code()
{
vpopcntb(xmm5|k3|T_z, ptr [rax + 0x40]);
vpopcntb(ymm5|k3|T_z, ptr [rax + 0x40]);
vpopcntb(zmm5|k3|T_z, ptr [rax + 0x40]);
vpopcntw(xmm5|k3|T_z, ptr [rax + 0x40]);
vpopcntw(ymm5|k3|T_z, ptr [rax + 0x40]);
vpopcntw(zmm5|k3|T_z, ptr [rax + 0x40]);
vpopcntd(xmm5|k3|T_z, ptr [rax + 0x40]);
vpopcntd(ymm5|k3|T_z, ptr [rax + 0x40]);
vpopcntd(zmm5|k3|T_z, ptr [rax + 0x40]);
vpopcntd(xmm5|k3|T_z, ptr_b [rax + 0x40]);
vpopcntd(ymm5|k3|T_z, ptr_b [rax + 0x40]);
vpopcntd(zmm5|k3|T_z, ptr_b [rax + 0x40]);
vpopcntq(xmm5|k3|T_z, ptr [rax + 0x40]);
vpopcntq(ymm5|k3|T_z, ptr [rax + 0x40]);
vpopcntq(zmm5|k3|T_z, ptr [rax + 0x40]);
vpopcntq(xmm5|k3|T_z, ptr_b [rax + 0x40]);
vpopcntq(ymm5|k3|T_z, ptr_b [rax + 0x40]);
vpopcntq(zmm5|k3|T_z, ptr_b [rax + 0x40]);
}
} c;
// Full-width rows store displacement 0x40 scaled by vector width
// (0x04 / 0x02 / 0x01); broadcast rows store it scaled by element size
// (0x10 for d, 0x08 for q).
const uint8_t tbl[] = {
0x62, 0xf2, 0x7d, 0x8b, 0x54, 0x68, 0x04,
0x62, 0xf2, 0x7d, 0xab, 0x54, 0x68, 0x02,
0x62, 0xf2, 0x7d, 0xcb, 0x54, 0x68, 0x01,
0x62, 0xf2, 0xfd, 0x8b, 0x54, 0x68, 0x04,
0x62, 0xf2, 0xfd, 0xab, 0x54, 0x68, 0x02,
0x62, 0xf2, 0xfd, 0xcb, 0x54, 0x68, 0x01,
0x62, 0xf2, 0x7d, 0x8b, 0x55, 0x68, 0x04,
0x62, 0xf2, 0x7d, 0xab, 0x55, 0x68, 0x02,
0x62, 0xf2, 0x7d, 0xcb, 0x55, 0x68, 0x01,
0x62, 0xf2, 0x7d, 0x9b, 0x55, 0x68, 0x10,
0x62, 0xf2, 0x7d, 0xbb, 0x55, 0x68, 0x10,
0x62, 0xf2, 0x7d, 0xdb, 0x55, 0x68, 0x10,
0x62, 0xf2, 0xfd, 0x8b, 0x55, 0x68, 0x04,
0x62, 0xf2, 0xfd, 0xab, 0x55, 0x68, 0x02,
0x62, 0xf2, 0xfd, 0xcb, 0x55, 0x68, 0x01,
0x62, 0xf2, 0xfd, 0x9b, 0x55, 0x68, 0x08,
0x62, 0xf2, 0xfd, 0xbb, 0x55, 0x68, 0x08,
0x62, 0xf2, 0xfd, 0xdb, 0x55, 0x68, 0x08,
};
const size_t n = sizeof(tbl) / sizeof(tbl[0]);
CYBOZU_TEST_EQUAL(c.getSize(), n);
CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
}
// Encoding test for vpdpbusd, vpdpbusds, vpdpwssd and vpdpwssds at
// xmm/ymm/zmm width with opmask k3 and zeroing, each with a full-width (ptr)
// and a broadcast (ptr_b) memory operand. The second source is register 20.
// tbl holds one row per instruction, in emission order.
CYBOZU_TEST_AUTO(vpdpbus)
{
struct Code : Xbyak::CodeGenerator {
Code()
{
vpdpbusd(xmm5|k3|T_z, xmm20, ptr [rax + 0x40]);
vpdpbusd(ymm5|k3|T_z, ymm20, ptr [rax + 0x40]);
vpdpbusd(zmm5|k3|T_z, zmm20, ptr [rax + 0x40]);
vpdpbusd(xmm5|k3|T_z, xmm20, ptr_b [rax + 0x40]);
vpdpbusd(ymm5|k3|T_z, ymm20, ptr_b [rax + 0x40]);
vpdpbusd(zmm5|k3|T_z, zmm20, ptr_b [rax + 0x40]);
vpdpbusds(xmm5|k3|T_z, xmm20, ptr [rax + 0x40]);
vpdpbusds(ymm5|k3|T_z, ymm20, ptr [rax + 0x40]);
vpdpbusds(zmm5|k3|T_z, zmm20, ptr [rax + 0x40]);
vpdpbusds(xmm5|k3|T_z, xmm20, ptr_b [rax + 0x40]);
vpdpbusds(ymm5|k3|T_z, ymm20, ptr_b [rax + 0x40]);
vpdpbusds(zmm5|k3|T_z, zmm20, ptr_b [rax + 0x40]);
vpdpwssd(xmm5|k3|T_z, xmm20, ptr [rax + 0x40]);
vpdpwssd(ymm5|k3|T_z, ymm20, ptr [rax + 0x40]);
vpdpwssd(zmm5|k3|T_z, zmm20, ptr [rax + 0x40]);
vpdpwssd(xmm5|k3|T_z, xmm20, ptr_b [rax + 0x40]);
vpdpwssd(ymm5|k3|T_z, ymm20, ptr_b [rax + 0x40]);
vpdpwssd(zmm5|k3|T_z, zmm20, ptr_b [rax + 0x40]);
vpdpwssds(xmm5|k3|T_z, xmm20, ptr [rax + 0x40]);
vpdpwssds(ymm5|k3|T_z, ymm20, ptr [rax + 0x40]);
vpdpwssds(zmm5|k3|T_z, zmm20, ptr [rax + 0x40]);
vpdpwssds(xmm5|k3|T_z, xmm20, ptr_b [rax + 0x40]);
vpdpwssds(ymm5|k3|T_z, ymm20, ptr_b [rax + 0x40]);
vpdpwssds(zmm5|k3|T_z, zmm20, ptr_b [rax + 0x40]);
}
} c;
// The fifth byte of each row is 0x50..0x53, one value per mnemonic. Full-width
// rows end in 0x04 / 0x02 / 0x01 (displacement scaled by vector width);
// broadcast rows end in 0x10.
const uint8_t tbl[] = {
0x62, 0xf2, 0x5d, 0x83, 0x50, 0x68, 0x04,
0x62, 0xf2, 0x5d, 0xa3, 0x50, 0x68, 0x02,
0x62, 0xf2, 0x5d, 0xc3, 0x50, 0x68, 0x01,
0x62, 0xf2, 0x5d, 0x93, 0x50, 0x68, 0x10,
0x62, 0xf2, 0x5d, 0xb3, 0x50, 0x68, 0x10,
0x62, 0xf2, 0x5d, 0xd3, 0x50, 0x68, 0x10,
0x62, 0xf2, 0x5d, 0x83, 0x51, 0x68, 0x04,
0x62, 0xf2, 0x5d, 0xa3, 0x51, 0x68, 0x02,
0x62, 0xf2, 0x5d, 0xc3, 0x51, 0x68, 0x01,
0x62, 0xf2, 0x5d, 0x93, 0x51, 0x68, 0x10,
0x62, 0xf2, 0x5d, 0xb3, 0x51, 0x68, 0x10,
0x62, 0xf2, 0x5d, 0xd3, 0x51, 0x68, 0x10,
0x62, 0xf2, 0x5d, 0x83, 0x52, 0x68, 0x04,
0x62, 0xf2, 0x5d, 0xa3, 0x52, 0x68, 0x02,
0x62, 0xf2, 0x5d, 0xc3, 0x52, 0x68, 0x01,
0x62, 0xf2, 0x5d, 0x93, 0x52, 0x68, 0x10,
0x62, 0xf2, 0x5d, 0xb3, 0x52, 0x68, 0x10,
0x62, 0xf2, 0x5d, 0xd3, 0x52, 0x68, 0x10,
0x62, 0xf2, 0x5d, 0x83, 0x53, 0x68, 0x04,
0x62, 0xf2, 0x5d, 0xa3, 0x53, 0x68, 0x02,
0x62, 0xf2, 0x5d, 0xc3, 0x53, 0x68, 0x01,
0x62, 0xf2, 0x5d, 0x93, 0x53, 0x68, 0x10,
0x62, 0xf2, 0x5d, 0xb3, 0x53, 0x68, 0x10,
0x62, 0xf2, 0x5d, 0xd3, 0x53, 0x68, 0x10,
};
const size_t n = sizeof(tbl) / sizeof(tbl[0]);
CYBOZU_TEST_EQUAL(c.getSize(), n);
CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
}
// Encoding test for vpexpandb / vpexpandw (register and memory sources) and
// vpshufbitqmb (opmask destination k1 with write mask k2) at xmm/ymm/zmm
// width. tbl holds one row per instruction, in emission order.
CYBOZU_TEST_AUTO(vexpand_vpshufbitqmb)
{
struct Code : Xbyak::CodeGenerator {
Code()
{
vpexpandb(xmm5|k3|T_z, xmm30);
vpexpandb(ymm5|k3|T_z, ymm30);
vpexpandb(zmm5|k3|T_z, zmm30);
vpexpandb(xmm5|k3|T_z, ptr [rax + 0x40]);
vpexpandb(ymm5|k3|T_z, ptr [rax + 0x40]);
vpexpandb(zmm5|k3|T_z, ptr [rax + 0x40]);
vpexpandw(xmm5|k3|T_z, xmm30);
vpexpandw(ymm5|k3|T_z, ymm30);
vpexpandw(zmm5|k3|T_z, zmm30);
vpexpandw(xmm5|k3|T_z, ptr [rax + 0x40]);
vpexpandw(ymm5|k3|T_z, ptr [rax + 0x40]);
vpexpandw(zmm5|k3|T_z, ptr [rax + 0x40]);
vpshufbitqmb(k1|k2, xmm2, ptr [rax + 0x40]);
vpshufbitqmb(k1|k2, ymm2, ptr [rax + 0x40]);
vpshufbitqmb(k1|k2, zmm2, ptr [rax + 0x40]);
}
} c;
// vpexpand memory rows store displacement 0x40 scaled by element size (0x40
// for b, 0x20 for w); vpshufbitqmb rows store it scaled by vector width
// (0x04 / 0x02 / 0x01).
const uint8_t tbl[] = {
0x62, 0x92, 0x7d, 0x8b, 0x62, 0xee,
0x62, 0x92, 0x7d, 0xab, 0x62, 0xee,
0x62, 0x92, 0x7d, 0xcb, 0x62, 0xee,
0x62, 0xf2, 0x7d, 0x8b, 0x62, 0x68, 0x40,
0x62, 0xf2, 0x7d, 0xab, 0x62, 0x68, 0x40,
0x62, 0xf2, 0x7d, 0xcb, 0x62, 0x68, 0x40,
0x62, 0x92, 0xfd, 0x8b, 0x62, 0xee,
0x62, 0x92, 0xfd, 0xab, 0x62, 0xee,
0x62, 0x92, 0xfd, 0xcb, 0x62, 0xee,
0x62, 0xf2, 0xfd, 0x8b, 0x62, 0x68, 0x20,
0x62, 0xf2, 0xfd, 0xab, 0x62, 0x68, 0x20,
0x62, 0xf2, 0xfd, 0xcb, 0x62, 0x68, 0x20,
0x62, 0xf2, 0x6d, 0x0a, 0x8f, 0x48, 0x04,
0x62, 0xf2, 0x6d, 0x2a, 0x8f, 0x48, 0x02,
0x62, 0xf2, 0x6d, 0x4a, 0x8f, 0x48, 0x01,
};
const size_t n = sizeof(tbl) / sizeof(tbl[0]);
CYBOZU_TEST_EQUAL(c.getSize(), n);
CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
}
// Encoding test for gf2p8affineinvqb, gf2p8affineqb and gf2p8mulb. For each
// one the two-operand (non-v) form and the three-operand v-prefixed form are
// emitted, the latter with low registers, with registers 30/31, and with
// opmask k1 + zeroing on a memory operand.
// tbl holds one row per instruction, in emission order.
CYBOZU_TEST_AUTO(gf2)
{
struct Code : Xbyak::CodeGenerator {
Code()
{
// gf2p8affineinvqb / vgf2p8affineinvqb (15 instructions)
gf2p8affineinvqb(xmm1, xmm2, 3);
gf2p8affineinvqb(xmm1, ptr [rax + 0x40], 3);
vgf2p8affineinvqb(xmm1, xmm5, xmm2, 3);
vgf2p8affineinvqb(ymm1, ymm5, ymm2, 3);
vgf2p8affineinvqb(xmm1, xmm5, ptr [rax + 0x40], 3);
vgf2p8affineinvqb(ymm1, ymm5, ptr [rax + 0x40], 3);
vgf2p8affineinvqb(xmm30, xmm31, xmm4, 5);
vgf2p8affineinvqb(ymm30, ymm31, ymm4, 5);
vgf2p8affineinvqb(zmm30, zmm31, zmm4, 5);
vgf2p8affineinvqb(xmm30|k1|T_z, xmm5, ptr [rax + 0x40], 5);
vgf2p8affineinvqb(ymm30|k1|T_z, ymm5, ptr [rax + 0x40], 5);
vgf2p8affineinvqb(zmm30|k1|T_z, zmm5, ptr [rax + 0x40], 5);
vgf2p8affineinvqb(xmm30|k1|T_z, xmm5, ptr_b [rax + 0x40], 5);
vgf2p8affineinvqb(ymm30|k1|T_z, ymm5, ptr_b [rax + 0x40], 5);
vgf2p8affineinvqb(zmm30|k1|T_z, zmm5, ptr_b [rax + 0x40], 5);
// gf2p8affineqb / vgf2p8affineqb (15 instructions, same operand patterns)
gf2p8affineqb(xmm1, xmm2, 3);
gf2p8affineqb(xmm1, ptr [rax + 0x40], 3);
vgf2p8affineqb(xmm1, xmm5, xmm2, 3);
vgf2p8affineqb(ymm1, ymm5, ymm2, 3);
vgf2p8affineqb(xmm1, xmm5, ptr [rax + 0x40], 3);
vgf2p8affineqb(ymm1, ymm5, ptr [rax + 0x40], 3);
vgf2p8affineqb(xmm30, xmm31, xmm4, 5);
vgf2p8affineqb(ymm30, ymm31, ymm4, 5);
vgf2p8affineqb(zmm30, zmm31, zmm4, 5);
vgf2p8affineqb(xmm30|k1|T_z, xmm5, ptr [rax + 0x40], 5);
vgf2p8affineqb(ymm30|k1|T_z, ymm5, ptr [rax + 0x40], 5);
vgf2p8affineqb(zmm30|k1|T_z, zmm5, ptr [rax + 0x40], 5);
vgf2p8affineqb(xmm30|k1|T_z, xmm5, ptr_b [rax + 0x40], 5);
vgf2p8affineqb(ymm30|k1|T_z, ymm5, ptr_b [rax + 0x40], 5);
vgf2p8affineqb(zmm30|k1|T_z, zmm5, ptr_b [rax + 0x40], 5);
// gf2p8mulb / vgf2p8mulb (12 instructions; no immediate, no ptr_b form)
gf2p8mulb(xmm1, xmm2);
gf2p8mulb(xmm1, ptr [rax + 0x40]);
vgf2p8mulb(xmm1, xmm5, xmm2);
vgf2p8mulb(ymm1, ymm5, ymm2);
vgf2p8mulb(xmm1, xmm5, ptr [rax + 0x40]);
vgf2p8mulb(ymm1, ymm5, ptr [rax + 0x40]);
vgf2p8mulb(xmm30, xmm31, xmm4);
vgf2p8mulb(ymm30, ymm31, ymm4);
vgf2p8mulb(zmm30, zmm31, zmm4);
vgf2p8mulb(xmm30|k1|T_z, xmm5, ptr [rax + 0x40]);
vgf2p8mulb(ymm30|k1|T_z, ymm5, ptr [rax + 0x40]);
vgf2p8mulb(zmm30|k1|T_z, zmm5, ptr [rax + 0x40]);
}
} c;
// Expected rows come in three shapes: 0x66 0x0f ... for the non-v forms,
// 0xc4 ... for the v-forms on low xmm/ymm registers, and 0x62 ... for the
// forms using registers 30/31, zmm, or an opmask.
const uint8_t tbl[] = {
0x66, 0x0f, 0x3a, 0xcf, 0xca, 0x03,
0x66, 0x0f, 0x3a, 0xcf, 0x48, 0x40, 0x03,
0xc4, 0xe3, 0xd1, 0xcf, 0xca, 0x03,
0xc4, 0xe3, 0xd5, 0xcf, 0xca, 0x03,
0xc4, 0xe3, 0xd1, 0xcf, 0x48, 0x40, 0x03,
0xc4, 0xe3, 0xd5, 0xcf, 0x48, 0x40, 0x03,
0x62, 0x63, 0x85, 0x00, 0xcf, 0xf4, 0x05,
0x62, 0x63, 0x85, 0x20, 0xcf, 0xf4, 0x05,
0x62, 0x63, 0x85, 0x40, 0xcf, 0xf4, 0x05,
0x62, 0x63, 0xd5, 0x89, 0xcf, 0x70, 0x04, 0x05,
0x62, 0x63, 0xd5, 0xa9, 0xcf, 0x70, 0x02, 0x05,
0x62, 0x63, 0xd5, 0xc9, 0xcf, 0x70, 0x01, 0x05,
0x62, 0x63, 0xd5, 0x99, 0xcf, 0x70, 0x08, 0x05,
0x62, 0x63, 0xd5, 0xb9, 0xcf, 0x70, 0x08, 0x05,
0x62, 0x63, 0xd5, 0xd9, 0xcf, 0x70, 0x08, 0x05,
0x66, 0x0f, 0x3a, 0xce, 0xca, 0x03,
0x66, 0x0f, 0x3a, 0xce, 0x48, 0x40, 0x03,
0xc4, 0xe3, 0xd1, 0xce, 0xca, 0x03,
0xc4, 0xe3, 0xd5, 0xce, 0xca, 0x03,
0xc4, 0xe3, 0xd1, 0xce, 0x48, 0x40, 0x03,
0xc4, 0xe3, 0xd5, 0xce, 0x48, 0x40, 0x03,
0x62, 0x63, 0x85, 0x00, 0xce, 0xf4, 0x05,
0x62, 0x63, 0x85, 0x20, 0xce, 0xf4, 0x05,
0x62, 0x63, 0x85, 0x40, 0xce, 0xf4, 0x05,
0x62, 0x63, 0xd5, 0x89, 0xce, 0x70, 0x04, 0x05,
0x62, 0x63, 0xd5, 0xa9, 0xce, 0x70, 0x02, 0x05,
0x62, 0x63, 0xd5, 0xc9, 0xce, 0x70, 0x01, 0x05,
0x62, 0x63, 0xd5, 0x99, 0xce, 0x70, 0x08, 0x05,
0x62, 0x63, 0xd5, 0xb9, 0xce, 0x70, 0x08, 0x05,
0x62, 0x63, 0xd5, 0xd9, 0xce, 0x70, 0x08, 0x05,
0x66, 0x0f, 0x38, 0xcf, 0xca,
0x66, 0x0f, 0x38, 0xcf, 0x48, 0x40,
0xc4, 0xe2, 0x51, 0xcf, 0xca,
0xc4, 0xe2, 0x55, 0xcf, 0xca,
0xc4, 0xe2, 0x51, 0xcf, 0x48, 0x40,
0xc4, 0xe2, 0x55, 0xcf, 0x48, 0x40,
0x62, 0x62, 0x05, 0x00, 0xcf, 0xf4,
0x62, 0x62, 0x05, 0x20, 0xcf, 0xf4,
0x62, 0x62, 0x05, 0x40, 0xcf, 0xf4,
0x62, 0x62, 0x55, 0x89, 0xcf, 0x70, 0x04,
0x62, 0x62, 0x55, 0xa9, 0xcf, 0x70, 0x02,
0x62, 0x62, 0x55, 0xc9, 0xcf, 0x70, 0x01,
};
const size_t n = sizeof(tbl) / sizeof(tbl[0]);
CYBOZU_TEST_EQUAL(c.getSize(), n);
CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
}
// Encoding test for vcvtne2ps2bf16, vcvtneps2bf16 and vdpbf16ps.
// tbl holds one row per instruction, in emission order.
CYBOZU_TEST_AUTO(bf16)
{
struct Code : Xbyak::CodeGenerator {
Code()
{
vcvtne2ps2bf16(xmm0 | k1, xmm1, ptr [rax + 64]);
vcvtne2ps2bf16(ymm0 | k1 | T_z, ymm0, ptr [rax + 64]);
vcvtne2ps2bf16(zmm0 | k1, zmm1, ptr [rax + 64]);
// For vcvtneps2bf16 the memory operand size is given explicitly with
// xword / yword / zword. The last call uses plain ptr with a ymm
// destination and is expected to encode exactly like the zword form.
vcvtneps2bf16(xmm0, xword [rax + 64]);
vcvtneps2bf16(xmm0 | k1, yword [rax + 64]);
vcvtneps2bf16(ymm0 | k1, zword [rax + 64]);
vcvtneps2bf16(ymm0 | k1, ptr [rax + 64]);
vdpbf16ps(xmm0 | k1, xmm1, ptr [rax + 64]);
vdpbf16ps(ymm0 | k1, ymm1, ptr [rax + 64]);
vdpbf16ps(zmm0 | k1, zmm1, ptr [rax + 64]);
}
} c;
const uint8_t tbl[] = {
0x62, 0xf2, 0x77, 0x09, 0x72, 0x40, 0x04,
0x62, 0xf2, 0x7f, 0xa9, 0x72, 0x40, 0x02,
0x62, 0xf2, 0x77, 0x49, 0x72, 0x40, 0x01,
0x62, 0xf2, 0x7e, 0x08, 0x72, 0x40, 0x04,
0x62, 0xf2, 0x7e, 0x29, 0x72, 0x40, 0x02,
0x62, 0xf2, 0x7e, 0x49, 0x72, 0x40, 0x01,
0x62, 0xf2, 0x7e, 0x49, 0x72, 0x40, 0x01,
0x62, 0xf2, 0x76, 0x09, 0x52, 0x40, 0x04,
0x62, 0xf2, 0x76, 0x29, 0x52, 0x40, 0x02,
0x62, 0xf2, 0x76, 0x49, 0x52, 0x40, 0x01,
};
const size_t n = sizeof(tbl) / sizeof(tbl[0]);
CYBOZU_TEST_EQUAL(c.getSize(), n);
CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
}
// Encoding test for the tile instructions: ldtilecfg, sttilecfg, tileloadd,
// tileloaddt1, tilerelease, tilestored, tilezero and the tdpb* / tdpbf16ps
// tile multiply instructions. The generated code must match tbl
// byte-for-byte.
CYBOZU_TEST_AUTO(AMX)
{
struct Code : Xbyak::CodeGenerator {
Code()
{
ldtilecfg(ptr[rax + rcx * 4 + 64]);
sttilecfg(ptr[rsp + rax * 8 + 128]);
tileloadd(tmm3, ptr[rdi + rdx * 2 + 8]);
tileloaddt1(tmm4, ptr[r8 + r9 + 32]);
tilerelease();
tilestored(ptr[r10 + r11 * 2 + 32], tmm2);
tilezero(tmm7);
tdpbssd(tmm1, tmm2, tmm3);
tdpbsud(tmm2, tmm3, tmm4);
tdpbusd(tmm3, tmm4, tmm5);
tdpbuud(tmm4, tmm5, tmm6);
tdpbf16ps(tmm5, tmm6, tmm7);
}
} c;
// generated code by patch
// NOTE: unlike the other tables in this file, this is a flat dump with 16
// bytes per row; rows do not line up with instruction boundaries.
const uint8_t tbl[] = {
0xc4, 0xe2, 0x78, 0x49, 0x44, 0x88, 0x40, 0xc4, 0xe2, 0x79, 0x49, 0x84, 0xc4, 0x80, 0x00, 0x00,
0x00, 0xc4, 0xe2, 0x7b, 0x4b, 0x5c, 0x57, 0x08, 0xc4, 0x82, 0x79, 0x4b, 0x64, 0x08, 0x20, 0xc4,
0xe2, 0x78, 0x49, 0xc0, 0xc4, 0x82, 0x7a, 0x4b, 0x54, 0x5a, 0x20, 0xc4, 0xe2, 0x7b, 0x49, 0xf8,
0xc4, 0xe2, 0x63, 0x5e, 0xca, 0xc4, 0xe2, 0x5a, 0x5e, 0xd3, 0xc4, 0xe2, 0x51, 0x5e, 0xdc, 0xc4,
0xe2, 0x48, 0x5e, 0xe5, 0xc4, 0xe2, 0x42, 0x5c, 0xee,
};
const size_t n = sizeof(tbl) / sizeof(tbl[0]);
CYBOZU_TEST_EQUAL(c.getSize(), n);
CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
}
// Checks the address forms accepted by tileloadd: base+index (optionally with
// scale and displacement) encodes to the bytes in tbl, while a base-only and
// an index-only address are expected to throw.
CYBOZU_TEST_AUTO(tileloadd)
{
struct Code : Xbyak::CodeGenerator {
Code()
{
// Accepted forms; tbl holds one row per instruction, in emission order.
tileloadd(tmm1, ptr[r8+r8]);
tileloadd(tmm1, ptr[rax+rcx*4]);
tileloadd(tmm1, ptr[r8+r9*1+0x40]);
}
// Base register only, no index; called after construction and expected to
// throw.
void notSupported()
{
tileloadd(tmm1, ptr[r8]);
}
// Scaled index only, no base; called after construction and expected to
// throw.
void notSupported2()
{
tileloadd(tmm1, ptr[r8*2]);
}
} c;
const uint8_t tbl[] = {
0xC4, 0x82, 0x7B, 0x4B, 0x0C, 0x00,
0xC4, 0xE2, 0x7B, 0x4B, 0x0C, 0x88,
0xC4, 0x82, 0x7B, 0x4B, 0x4C, 0x08, 0x40,
};
const size_t n = sizeof(tbl) / sizeof(tbl[0]);
CYBOZU_TEST_EQUAL(c.getSize(), n);
CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
// current version does not support this sibmem format
CYBOZU_TEST_EXCEPTION(c.notSupported(), std::exception);
CYBOZU_TEST_EXCEPTION(c.notSupported2(), std::exception);
}
// Checks encoding selection for vpdpbusd: the per-call encoding argument,
// the generator-wide default set with setDefaultEncoding(), and rejection of
// a VEX request that uses xm31.
CYBOZU_TEST_AUTO(vnni)
{
struct Code : Xbyak::CodeGenerator {
Code()
{
// default encoding is EVEX
vpdpbusd(xm0, xm1, xm2);
vpdpbusd(xm0, xm1, xm2, EvexEncoding); // EVEX
vpdpbusd(xm0, xm1, xm2, VexEncoding); // VEX
setDefaultEncoding(VexEncoding);
vpdpbusd(xm0, xm1, xm2); // VEX
setDefaultEncoding(EvexEncoding);
vpdpbusd(xm0, xm1, xm2); // EVEX
}
// Requests VEX with xm31 as an operand; called after construction and
// expected to throw.
void badVex()
{
vpdpbusd(xm0, xm1, xm31, VexEncoding);
}
} c;
// One row per instruction above: EVEX rows start with 0x62 (6 bytes), VEX
// rows start with 0xC4 (5 bytes).
const uint8_t tbl[] = {
0x62, 0xF2, 0x75, 0x08, 0x50, 0xC2,
0x62, 0xF2, 0x75, 0x08, 0x50, 0xC2,
0xC4, 0xE2, 0x71, 0x50, 0xC2,
0xC4, 0xE2, 0x71, 0x50, 0xC2,
0x62, 0xF2, 0x75, 0x08, 0x50, 0xC2,
};
const size_t n = sizeof(tbl) / sizeof(tbl[0]);
CYBOZU_TEST_EQUAL(c.getSize(), n);
CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
CYBOZU_TEST_EXCEPTION(c.badVex(), std::exception);
}
// Encoding test for the half-precision (*ph / *sh) instructions and the
// conversions to and from them.
// The Code constructor emits every instruction form once; the generated bytes
// are then compared with the hand-written table `tbl`. The table is split into
// sections labeled with the mnemonic they belong to, and the sections appear in
// the same order as the emitting calls, so the two lists must be kept in sync.
// NOTE(review): presumably this covers the AVX512-FP16 instruction set - confirm.
CYBOZU_TEST_AUTO(vaddph)
{
struct Code : Xbyak::CodeGenerator {
Code()
{
// Each mnemonic is exercised with plain memory operands (ptr / xword / yword /
// zword), broadcast memory operands (the *_b variants), and register forms
// carrying opmask (k1..k5), T_z and T_sae / T_r*_sae decorations.
vaddph(zmm0, zmm1, ptr[rax+64]);
vaddph(ymm0, ymm1, ptr[rax+64]);
vaddph(xmm0, xmm1, ptr[rax+64]);
vaddph(zmm0, zmm1, ptr_b[rax+64]);
vaddph(ymm0, ymm1, ptr_b[rax+64]);
vaddph(xmm0, xmm1, ptr_b[rax+64]);
vaddsh(xmm0, xmm15, ptr[rax+64]);
vaddsh(xmm0|k5|T_z|T_rd_sae, xmm15, xmm3);
vcmpph(k1, xm15, ptr[rax+64], 1);
vcmpph(k2, ym15, ptr[rax+64], 2);
vcmpph(k3, zm15, ptr[rax+64], 3);
vcmpph(k1, xm15, ptr_b[rax+64], 1);
vcmpph(k2, ym15, ptr_b[rax+64], 2);
vcmpph(k3, zm15, ptr_b[rax+64], 3);
vcmpsh(k1, xm15, ptr[rax+64], 1);
vcmpsh(k3|k5, xmm1, xmm25|T_sae, 4);
vcomish(xmm1, ptr[rax+64]);
vcomish(xmm1|T_sae, xmm15);
vucomish(xmm1, ptr [rax+0x40]);
vucomish(xmm1|T_sae, xmm15);
vfmaddsub213ph(xmm1, xmm2, ptr [rax+0x40]);
vfmaddsub213ph(xmm1, xmm2, ptr_b [rax+0x40]);
vfmaddsub213ph(xmm1|k3, xmm2, xmm5);
vfmaddsub213ph(ymm1, ymm2, ptr [rax+0x40]);
vfmaddsub213ph(ymm1, ymm2, ptr_b[rax+0x40]);
vfmaddsub213ph(ymm1|k3, ymm2, ymm5);
vfmaddsub213ph(zmm1, zmm2, ptr [rax+0x40]);
vfmaddsub213ph(zmm1, zmm2, ptr_b [rax+0x40]);
vfmaddsub213ph(zmm1|T_ru_sae, zmm2, zmm5);
vfmsubadd132ph(xmm1, xmm2, ptr [rax+0x40]);
vfmsubadd132ph(xmm1, xmm2, ptr_b [rax+0x40]);
vfmsubadd132ph(ymm1, ymm2, ptr [rax+0x40]);
vfmsubadd132ph(ymm1, ymm2, ptr_b [rax+0x40]);
vfmsubadd132ph(zmm1, zmm2, ptr [rax+0x40]);
vfmsubadd132ph(zmm1, zmm2, ptr_b [rax+0x40]);
vfmsubadd132ph(zmm1|T_ru_sae, zmm2, zmm5);
vfmadd132ph(xmm1, xmm2, ptr [rax+0x40]);
vfmadd132ph(xmm1, xmm2, ptr_b [rax+0x40]);
vfmadd132ph(ymm1, ymm2, ptr [rax+0x40]);
vfmadd132ph(ymm1, ymm2, ptr_b [rax+0x40]);
vfmadd132ph(zmm1, zmm2, ptr [rax+0x40]);
vfmadd132ph(zmm1, zmm2, ptr_b [rax+0x40]);
vfmadd132ph(zmm1|T_rd_sae, zmm2, zmm5);
vfmsub231ph(xmm1, xmm2, ptr [rax+0x40]);
vfmsub231ph(xmm1, xmm2, ptr_b [rax+0x40]);
vfmsub231ph(ymm1, ymm2, ptr [rax+0x40]);
vfmsub231ph(ymm1, ymm2, ptr_b [rax+0x40]);
vfmsub231ph(zmm1, zmm2, ptr [rax+0x40]);
vfmsub231ph(zmm1, zmm2, ptr_b [rax+0x40]);
vfmsub231ph(zmm1|T_rd_sae, zmm2, zmm5);
vfnmsub231ph(xmm1, xmm2, ptr [rax+0x40]);
vfnmsub231ph(ymm1, ymm2, ptr_b [rax+0x40]);
vfnmsub231ph(zmm1, zmm2, ptr_b [rax+0x40]);
vfnmsub231ph(zmm1|T_rd_sae, zmm2, zmm5);
vfmadd132sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3);
vfmadd132sh(xmm1, xmm2, ptr [rax+0x40]);
vfnmadd132sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3);
vfnmadd132sh(xmm1, xmm2, ptr [rax+0x40]);
vfmsub132sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3);
vfmsub132sh(xmm1, xmm2, ptr [rax+0x40]);
vfnmsub132sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3);
vfnmsub132sh(xmm1, xmm2, ptr [rax+0x40]);
vfcmaddcph(xmm1|k1|T_z, xmm2, ptr [rax+0x40]);
vfcmaddcph(ymm1|k1|T_z, ymm2, ptr [rax+0x40]);
vfcmaddcph(zmm1|k1, zmm2, ptr [rax+0x40]);
vfcmaddcph(zmm1|k1|T_rd_sae, zmm2, zmm5);
vfcmaddcph(xmm1|k1|T_z, xmm2, ptr_b [rax+0x40]);
vfcmaddcph(ymm1|k1|T_z, ymm2, ptr_b [rax+0x40]);
vfcmaddcph(zmm1|k1|T_z, zmm2, ptr_b [rax+0x40]);
vfmaddcph(xm1, xm2, ptr[rax+0x40]);
vfmaddcph(ym1|k1|T_z, ym2, ptr_b[rax+0x40]);
vfmaddcph(zm1, zm2, ptr_b[rax+0x40]);
vfcmulcph(xmm1, xmm2, ptr [rax+0x40]);
vfcmulcph(ymm1|k1|T_z, ymm2, ptr_b [rax+0x40]);
vfcmulcph(zmm1, zmm2, ptr_b [rax+0x40]);
vfmulcph(xmm1, xmm2, ptr [rax+0x40]);
vfmulcph(ymm1|k1|T_z, ymm2, ptr_b [rax+0x40]);
vfmulcph(zmm1, zmm2, ptr_b [rax+0x40]);
vrcpph(xmm1, ptr [rax+0x40]);
vrcpph(xmm1, ptr_b [rax+0x40]);
vrcpph(ymm1, ptr [rax+0x40]);
vrcpph(ymm1, ptr_b [rax+0x40]);
vrcpph(zmm1, ptr [rax+0x40]);
vrcpph(zmm1, ptr_b [rax+0x40]);
vrcpsh(xmm1, xmm3, ptr [rax+0x40]);
vrsqrtph(xmm1, ptr [rax+0x40]);
vrsqrtph(xmm1, ptr_b [rax+0x40]);
vrsqrtph(ymm2, ptr [rax+0x40]);
vrsqrtph(ymm2, ptr_b [rax+0x40]);
vrsqrtph(zmm2, ptr [rax+0x40]);
vrsqrtph(zmm2, ptr_b [rax+0x40]);
vrsqrtsh(xmm1|k5|T_z, xmm7, ptr [rax+0x40]);
vsqrtph(xmm1|k4|T_z, ptr [rax+0x40]);
vsqrtph(xmm1|k4|T_z, ptr_b [rax+0x40]);
vsqrtph(ymm1|k4|T_z, ptr_b [rax+0x40]);
vsqrtph(zmm1|k4|T_z, ptr [rax+0x40]);
vsqrtph(zmm1|k4|T_z, ptr_b [rax+0x40]);
vsqrtsh(xmm1|k4|T_z, xmm5, ptr [rax+0x40]);
vsqrtsh(xmm1|k4|T_z|T_rd_sae, xmm5, xmm7);
vscalefph(xmm1, xmm5, ptr [rax+0x40]);
vscalefph(xmm1, xmm5, ptr_b [rax+0x40]);
vscalefph(ymm1, ymm5, ptr [rax+0x40]);
vscalefph(ymm1, ymm5, ptr_b [rax+0x40]);
vscalefph(zmm1, zmm5, ptr [rax+0x40]);
vscalefph(zmm1, zmm5, ptr_b [rax+0x40]);
vscalefph(zmm1|k1|T_z|T_rd_sae, zmm5, zmm7);
vscalefsh(xmm1, xmm5, ptr [rax+0x40]);
vscalefsh(xmm1|k1|T_z|T_rd_sae, xmm5, xmm7);
vreduceph(xmm1, ptr [rax+0x40], 0x1);
vreduceph(xmm1, ptr_b [rax+0x40], 0x2);
vreduceph(ymm1, ptr [rax+0x40], 0x3);
vreduceph(ymm1, ptr_b [rax+0x40], 0x4);
vreduceph(zmm1, ptr [rax+0x40], 0x5);
vreduceph(zmm1, ptr_b [rax+0x40], 0x6);
vreduceph(zmm1|k1|T_z|T_sae, zmm5, 0x7);
vreducesh(xmm1, xmm3, ptr [rax+0x40], 0x1);
vreducesh(xmm1|k1|T_z|T_sae, xmm5, xmm4, 0x2);
vrndscaleph(xmm1, ptr [rax+0x40], 0x1);
vrndscaleph(xmm1, ptr_b [rax+0x40], 0x2);
vrndscaleph(ymm1, ptr [rax+0x40], 0x3);
vrndscaleph(ymm1, ptr_b [rax+0x40], 0x4);
vrndscaleph(zmm1, ptr [rax+0x40], 0x5);
vrndscaleph(zmm1, ptr_b [rax+0x40], 0x6);
vrndscaleph(zmm1|k1|T_z|T_sae, zmm5, 0x7);
vrndscalesh(xmm1, xmm3, ptr [rax+0x40], 0x1);
vrndscalesh(xmm1|k1|T_z|T_sae, xmm5, xmm4, 0x2);
// vfpclassph has no vector register operand, so the memory operand is given
// an explicit width (xword / yword / zword and their broadcast variants).
vfpclassph(k1, xword [rax+0x40], 0x1);
vfpclassph(k1, xword_b[rax+0x40], 0x2);
vfpclassph(k1, yword [rax+0x40], 0x3);
vfpclassph(k1, yword_b[rax+0x40], 0x4);
vfpclassph(k1, zword [rax+0x40], 0x5);
vfpclassph(k1, zword_b[rax+0x40], 0x6);
vfpclasssh(k1|k2, xmm3, 0x5);
vfpclasssh(k1|k2, ptr [rax+0x40], 0x5);
vgetexpph(xmm1, ptr [rax+0x40]);
vgetexpph(ymm1, ptr_b [rax+0x40]);
vgetexpph(zmm1, ptr [rax+0x40]);
vgetexpph(zmm1|k1|T_z|T_sae, zmm5);
vgetexpsh(xmm1, xmm5, ptr [rax+0x40]);
vgetexpsh(xmm1|k1|T_z|T_sae, xmm3, xmm5);
vgetmantph(xmm1, ptr [rax+0x40], 0x1);
vgetmantph(ymm1, ptr_b [rax+0x40], 0x2);
vgetmantph(zmm1, ptr [rax+0x40], 0x3);
vgetmantph(zmm1|k1|T_z|T_sae, zmm5, 0x4);
vgetmantsh(xmm1, xmm5, ptr [rax+0x40], 0x5);
vgetmantsh(xmm1|k1|T_z|T_sae, xmm3, xmm5, 0x6);
vmovsh(xmm1|k1|T_z, ptr [rax+0x40]);
vmovsh(ptr [rax+0x40]|k1, xmm1);
vmovsh(xmm1|k2|T_z, xmm3, xmm5);
vmovw(xmm1, r13d);
vmovw(xmm3, ptr [rax+0x40]);
vmovw(r9d, xmm1);
vmovw(ptr [rax+0x40], xmm7);
vcvtsd2sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3);
vcvtsd2sh(xmm1, xmm2, ptr [rax+0x40]);
vcvtsh2sd(xmm1|k1|T_z|T_sae, xmm2, xmm3);
vcvtsh2sd(xmm1, xmm2, ptr [rax+0x40]);
vcvtsh2ss(xmm1|k1|T_z|T_sae, xmm2, xmm3);
vcvtsh2ss(xmm1, xmm2, ptr [rax+0x40]);
vcvtss2sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3);
vcvtss2sh(xmm1, xmm2, ptr [rax+0x40]);
vcvtsh2si(edx|T_rd_sae, xmm1);
vcvtsh2si(edx, ptr [rax+0x40]);
vcvtsh2si(rdx|T_rd_sae, xmm1);
vcvtsh2si(r8, ptr [rax+0x40]);
vcvtph2dq(xmm1, xmm5);
vcvtph2dq(xmm1, ptr [rax+0x40]);
vcvtph2dq(xmm1, ptr_b [rax+0x40]);
vcvtph2dq(ymm1|k2|T_z, xmm5);
vcvtph2dq(ymm1, ptr [rax+0x40]);
vcvtph2dq(ymm1, ptr_b [rax+0x40]);
vcvtph2dq(zmm1|k5|T_z|T_rd_sae, ymm3);
vcvtph2dq(zmm1|k5|T_z, ptr [rax+0x40]);
vcvtph2dq(zmm1|k5|T_z, ptr_b [rax+0x40]);
vcvtph2psx(xmm1, xmm5);
vcvtph2psx(xmm1, ptr [rax+0x40]);
vcvtph2psx(xmm1, ptr_b [rax+0x40]);
vcvtph2psx(ymm1|k2|T_z, xmm5);
vcvtph2psx(ymm1, ptr [rax+0x40]);
vcvtph2psx(ymm1, ptr_b [rax+0x40]);
vcvtph2psx(zmm1|k5|T_z|T_sae, ymm3);
vcvtph2psx(zmm1|k5|T_z, ptr [rax+0x40]);
vcvtph2psx(zmm1|k5|T_z, ptr_b [rax+0x40]);
vcvtph2udq(xmm1, xmm5);
vcvtph2udq(xmm1, ptr [rax+0x40]);
vcvtph2udq(xmm1, ptr_b [rax+0x40]);
vcvtph2udq(ymm1|k2|T_z, xmm5);
vcvtph2udq(ymm1, ptr [rax+0x40]);
vcvtph2udq(ymm1, ptr_b [rax+0x40]);
vcvtph2udq(zmm1|k5|T_z|T_rd_sae, ymm3);
vcvtph2udq(zmm1|k5|T_z, ptr [rax+0x40]);
vcvtph2udq(zmm1|k5|T_z, ptr_b [rax+0x40]);
vcvttph2dq(xmm1, xmm5);
vcvttph2dq(xmm1, ptr [rax+0x40]);
vcvttph2dq(xmm1, ptr_b [rax+0x40]);
vcvttph2dq(ymm1|k2|T_z, xmm5);
vcvttph2dq(ymm1, ptr [rax+0x40]);
vcvttph2dq(ymm1, ptr_b [rax+0x40]);
vcvttph2dq(zmm1|k5|T_z|T_sae, ymm3);
vcvttph2dq(zmm1|k5|T_z, ptr [rax+0x40]);
vcvttph2dq(zmm1|k5|T_z, ptr_b [rax+0x40]);
vcvttph2udq(xmm1, xmm5);
vcvttph2udq(xmm1, ptr [rax+0x40]);
vcvttph2udq(xmm1, ptr_b [rax+0x40]);
vcvttph2udq(ymm1|k2|T_z, xmm5);
vcvttph2udq(ymm1, ptr [rax+0x40]);
vcvttph2udq(ymm1, ptr_b [rax+0x40]);
vcvttph2udq(zmm1|k5|T_z|T_sae, ymm3);
vcvttph2udq(zmm1|k5|T_z, ptr [rax+0x40]);
vcvttph2udq(zmm1|k5|T_z, ptr_b [rax+0x40]);
vcvtph2pd(xmm1, xmm5);
vcvtph2pd(xmm1, ptr [rax+0x40]);
vcvtph2pd(xmm1, ptr_b [rax+0x40]);
vcvtph2pd(ymm1|k2|T_z, xmm5);
vcvtph2pd(ymm1, ptr [rax+0x40]);
vcvtph2pd(ymm1, ptr_b [rax+0x40]);
vcvtph2pd(zmm1|k5|T_z|T_sae, xmm3);
vcvtph2pd(zmm1|k5|T_z, ptr [rax+0x40]);
vcvtph2pd(zmm1|k5|T_z, ptr_b [rax+0x40]);
vcvtph2qq(xmm1, xmm5);
vcvtph2qq(xmm1, ptr [rax+0x40]);
vcvtph2qq(xmm1, ptr_b [rax+0x40]);
vcvtph2qq(ymm1|k2|T_z, xmm5);
vcvtph2qq(ymm1, ptr [rax+0x40]);
vcvtph2qq(ymm1, ptr_b [rax+0x40]);
vcvtph2qq(zmm1|k5|T_z|T_rd_sae, xmm3);
vcvtph2qq(zmm1|k5|T_z, ptr [rax+0x40]);
vcvtph2qq(zmm1|k5|T_z, ptr_b [rax+0x40]);
vcvtph2uqq(xmm1, xmm5);
vcvtph2uqq(xmm1, ptr [rax+0x40]);
vcvtph2uqq(xmm1, ptr_b [rax+0x40]);
vcvtph2uqq(ymm1|k2|T_z, xmm5);
vcvtph2uqq(ymm1, ptr [rax+0x40]);
vcvtph2uqq(ymm1, ptr_b [rax+0x40]);
vcvtph2uqq(zmm1|k5|T_z|T_rd_sae, xmm3);
vcvtph2uqq(zmm1|k5|T_z, ptr [rax+0x40]);
vcvtph2uqq(zmm1|k5|T_z, ptr_b [rax+0x40]);
vcvttph2uqq(xmm1, xmm5);
vcvttph2uqq(xmm1, ptr [rax+0x40]);
vcvttph2uqq(xmm1, ptr_b [rax+0x40]);
vcvttph2uqq(ymm1|k2|T_z, xmm5);
vcvttph2uqq(ymm1, ptr [rax+0x40]);
vcvttph2uqq(ymm1, ptr_b [rax+0x40]);
vcvttph2uqq(zmm1|k5|T_z|T_sae, xmm3);
vcvttph2uqq(zmm1|k5|T_z, ptr [rax+0x40]);
vcvttph2uqq(zmm1|k5|T_z, ptr_b [rax+0x40]);
// The conversions to ph below have an xmm/ymm destination for several source
// widths, so memory sources are written with an explicit width where the
// destination alone does not identify it.
vcvtdq2ph(xmm1, xmm5);
vcvtdq2ph(xmm1, xword [rax+0x40]);
vcvtdq2ph(xmm1, xword_b [rax+0x40]);
vcvtdq2ph(xmm1, yword [rax+0x40]);
vcvtdq2ph(xmm1, yword_b [rax+0x40]);
vcvtdq2ph(ymm1|k2|T_z|T_rd_sae, zmm5);
vcvtdq2ph(ymm1, ptr [rax+0x40]);
vcvtdq2ph(ymm1, ptr_b [rax+0x40]);
vcvtps2phx(xmm1, xmm5);
vcvtps2phx(xmm1, xword [rax+0x40]);
vcvtps2phx(xmm1, xword_b [rax+0x40]);
vcvtps2phx(xmm1, yword [rax+0x40]);
vcvtps2phx(xmm1, yword_b [rax+0x40]);
vcvtps2phx(ymm1|k2|T_z|T_rd_sae, zmm5);
vcvtps2phx(ymm1, ptr [rax+0x40]);
vcvtps2phx(ymm1, ptr_b [rax+0x40]);
vcvtudq2ph(xmm1, xmm5);
vcvtudq2ph(xmm1, xword [rax+0x40]);
vcvtudq2ph(xmm1, xword_b [rax+0x40]);
vcvtudq2ph(xmm1, yword [rax+0x40]);
vcvtudq2ph(xmm1, yword_b [rax+0x40]);
vcvtudq2ph(ymm1|k2|T_z|T_rd_sae, zmm5);
vcvtudq2ph(ymm1, ptr [rax+0x40]);
vcvtudq2ph(ymm1, ptr_b [rax+0x40]);
vcvtpd2ph(xmm1, xmm5);
vcvtpd2ph(xmm1, ymm5);
vcvtpd2ph(xmm1|k2|T_z|T_rd_sae, zmm5);
vcvtpd2ph(xmm1, xword [rax+0x40]);
vcvtpd2ph(xmm1, xword_b [rax+0x40]);
vcvtpd2ph(xmm1, yword [rax+0x40]);
vcvtpd2ph(xmm1, yword_b [rax+0x40]);
vcvtpd2ph(xmm1, zword [rax+0x40]);
vcvtpd2ph(xmm1, zword_b [rax+0x40]);
vcvtqq2ph(xmm1, xmm5);
vcvtqq2ph(xmm1, ymm5);
vcvtqq2ph(xmm1|k2|T_z|T_rd_sae, zmm5);
vcvtqq2ph(xmm1, xword [rax+0x40]);
vcvtqq2ph(xmm1, xword_b [rax+0x40]);
vcvtqq2ph(xmm1, yword [rax+0x40]);
vcvtqq2ph(xmm1, yword_b [rax+0x40]);
vcvtqq2ph(xmm1, zword [rax+0x40]);
vcvtqq2ph(xmm1, zword_b [rax+0x40]);
vcvtuqq2ph(xmm1, xmm5);
vcvtuqq2ph(xmm1, ymm5);
vcvtuqq2ph(xmm1|k2|T_z|T_rd_sae, zmm5);
vcvtuqq2ph(xmm1, xword [rax+0x40]);
vcvtuqq2ph(xmm1, xword_b [rax+0x40]);
vcvtuqq2ph(xmm1, yword [rax+0x40]);
vcvtuqq2ph(xmm1, yword_b [rax+0x40]);
vcvtuqq2ph(xmm1, zword [rax+0x40]);
vcvtuqq2ph(xmm1, zword_b [rax+0x40]);
vcvtph2uw(xmm1, xmm5);
vcvtph2uw(xmm1, ptr [rax+0x40]);
vcvtph2uw(xmm1, ptr_b [rax+0x40]);
vcvtph2uw(ymm1, ptr [rax+0x40]);
vcvtph2uw(ymm1, ptr_b [rax+0x40]);
vcvtph2uw(zmm1|k2|T_z|T_rd_sae, zmm5);
vcvtph2uw(zmm1, ptr [rax+0x40]);
vcvtph2uw(zmm1, ptr_b [rax+0x40]);
vcvtph2w(xmm1, xmm5);
vcvtph2w(xmm1, ptr [rax+0x40]);
vcvtph2w(xmm1, ptr_b [rax+0x40]);
vcvtph2w(ymm1, ptr [rax+0x40]);
vcvtph2w(ymm1, ptr_b [rax+0x40]);
vcvtph2w(zmm1|k2|T_z|T_rd_sae, zmm5);
vcvtph2w(zmm1, ptr [rax+0x40]);
vcvtph2w(zmm1, ptr_b [rax+0x40]);
vcvttph2uw(xmm1, xmm5);
vcvttph2uw(xmm1, ptr [rax+0x40]);
vcvttph2uw(xmm1, ptr_b [rax+0x40]);
vcvttph2uw(ymm1, ptr [rax+0x40]);
vcvttph2uw(ymm1, ptr_b [rax+0x40]);
vcvttph2uw(zmm1|k2|T_z|T_sae, zmm5);
vcvttph2uw(zmm1, ptr [rax+0x40]);
vcvttph2uw(zmm1, ptr_b [rax+0x40]);
vcvttph2w(xmm1, xmm5);
vcvttph2w(xmm1, ptr [rax+0x40]);
vcvttph2w(xmm1, ptr_b [rax+0x40]);
vcvttph2w(ymm1, ptr [rax+0x40]);
vcvttph2w(ymm1, ptr_b [rax+0x40]);
vcvttph2w(zmm1|k2|T_z|T_sae, zmm5);
vcvttph2w(zmm1, ptr [rax+0x40]);
vcvttph2w(zmm1, ptr_b [rax+0x40]);
vcvtuw2ph(xmm1, xmm5);
vcvtuw2ph(xmm1, ptr [rax+0x40]);
vcvtuw2ph(xmm1, ptr_b [rax+0x40]);
vcvtuw2ph(ymm1, ptr [rax+0x40]);
vcvtuw2ph(ymm1, ptr_b [rax+0x40]);
vcvtuw2ph(zmm1|k2|T_z|T_rd_sae, zmm5);
vcvtuw2ph(zmm1, ptr [rax+0x40]);
vcvtuw2ph(zmm1, ptr_b [rax+0x40]);
vcvtw2ph(xmm1, xmm5);
vcvtw2ph(xmm1, ptr [rax+0x40]);
vcvtw2ph(xmm1, ptr_b [rax+0x40]);
vcvtw2ph(ymm1, ptr [rax+0x40]);
vcvtw2ph(ymm1, ptr_b [rax+0x40]);
vcvtw2ph(zmm1|k2|T_z|T_rd_sae, zmm5);
vcvtw2ph(zmm1, ptr [rax+0x40]);
vcvtw2ph(zmm1, ptr_b [rax+0x40]);
// vcvtps2ph: in tbl the first four (undecorated) forms are expected to start
// with 0xc4, while the forms with an opmask or T_sae start with 0x62.
vcvtps2ph(xmm1, xmm2, 0x1);
vcvtps2ph(ptr [rax+0x40], xmm2, 0x2);
vcvtps2ph(xmm1, ymm2, 0x3);
vcvtps2ph(ptr [rax+0x40], ymm2, 0x4);
vcvtps2ph(xmm1|k1|T_z, xmm2, 0x5);
vcvtps2ph(ptr [rax+0x40]|k1, xmm3, 0x6);
vcvtps2ph(xmm1|k2, ymm4, 0x7);
vcvtps2ph(ptr [rax+0x40]|k2, ymm5, 0x8);
vcvtps2ph(ymm1|k2|T_sae, zmm5, 0x9);
vcvtps2ph(ptr [rax+0x40]|k5, zmm4, 0xa);
vcvtsh2usi(ecx|T_rd_sae, xmm1);
vcvtsh2usi(eax, ptr [rax+0x40]);
vcvtsh2usi(r9|T_rd_sae, xmm1);
vcvtsh2usi(r13, ptr [rax+0x40]);
vcvttsh2si(ecx|T_sae, xmm1);
vcvttsh2si(eax, ptr [rax+0x40]);
vcvttsh2si(r9|T_sae, xmm1);
vcvttsh2si(r13, ptr [rax+0x40]);
vcvttsh2usi(ecx|T_sae, xmm1);
vcvttsh2usi(eax, ptr [rax+0x40]);
vcvttsh2usi(r9|T_sae, xmm1);
vcvttsh2usi(r13, ptr [rax+0x40]);
vcvttph2qq(xmm1, xmm5);
vcvttph2qq(xmm1, ptr [rax+0x40]);
vcvttph2qq(xmm1, ptr_b [rax+0x40]);
vcvttph2qq(ymm1|k2|T_z, xmm5);
vcvttph2qq(ymm1, ptr [rax+0x40]);
vcvttph2qq(ymm1, ptr_b [rax+0x40]);
vcvttph2qq(zmm1|k5|T_z|T_sae, xmm3);
vcvttph2qq(zmm1|k5|T_z, ptr [rax+0x40]);
vcvttph2qq(zmm1|k5|T_z, ptr_b [rax+0x40]);
// Integer sources: memory operands are sized explicitly (dword / qword).
vcvtsi2sh(xmm1|T_rd_sae, xmm2, eax);
vcvtsi2sh(xmm1, xmm2, dword [rax+0x40]);
vcvtsi2sh(xmm1|T_rd_sae, xmm2, r9);
vcvtsi2sh(xmm1, xmm2, qword [rax+0x40]);
vcvtusi2sh(xmm1|T_rd_sae, xmm2, eax);
vcvtusi2sh(xmm1, xmm2, dword [rax+0x40]);
vcvtusi2sh(xmm1|T_rd_sae, xmm2, r9);
vcvtusi2sh(xmm1, xmm2, qword [rax+0x40]);
}
} c;
// Expected machine code, laid out one row per emitted instruction and grouped
// by mnemonic in the order used by the constructor above.
const uint8_t tbl[] = {
// vaddph
0x62, 0xF5, 0x74, 0x48, 0x58, 0x40, 0x01,
0x62, 0xF5, 0x74, 0x28, 0x58, 0x40, 0x02,
0x62, 0xF5, 0x74, 0x08, 0x58, 0x40, 0x04,
0x62, 0xF5, 0x74, 0x58, 0x58, 0x40, 0x20,
0x62, 0xF5, 0x74, 0x38, 0x58, 0x40, 0x20,
0x62, 0xF5, 0x74, 0x18, 0x58, 0x40, 0x20,
// vaddsh
0x62, 0xF5, 0x06, 0x08, 0x58, 0x40, 0x20,
0x62, 0xF5, 0x06, 0xBD, 0x58, 0xC3,
// vcmpph
0x62, 0xf3, 0x04, 0x08, 0xc2, 0x48, 0x04, 0x01,
0x62, 0xf3, 0x04, 0x28, 0xc2, 0x50, 0x02, 0x02,
0x62, 0xf3, 0x04, 0x48, 0xc2, 0x58, 0x01, 0x03,
0x62, 0xf3, 0x04, 0x18, 0xc2, 0x48, 0x20, 0x01,
0x62, 0xf3, 0x04, 0x38, 0xc2, 0x50, 0x20, 0x02,
0x62, 0xf3, 0x04, 0x58, 0xc2, 0x58, 0x20, 0x03,
// vcmpsh
0x62, 0xf3, 0x06, 0x08, 0xc2, 0x48, 0x20, 0x01,
0x62, 0x93, 0x76, 0x1d, 0xc2, 0xd9, 0x04,
// vcomish
0x62, 0xf5, 0x7c, 0x08, 0x2f, 0x48, 0x20,
0x62, 0xd5, 0x7c, 0x18, 0x2f, 0xcf,
// vucomish
0x62, 0xf5, 0x7c, 0x08, 0x2e, 0x48, 0x20,
0x62, 0xd5, 0x7c, 0x18, 0x2e, 0xcf,
// vfmaddsub213ph
0x62, 0xf6, 0x6d, 0x08, 0xa6, 0x48, 0x04,
0x62, 0xf6, 0x6d, 0x18, 0xa6, 0x48, 0x20,
0x62, 0xf6, 0x6d, 0x0b, 0xa6, 0xcd,
0x62, 0xf6, 0x6d, 0x28, 0xa6, 0x48, 0x02,
0x62, 0xf6, 0x6d, 0x38, 0xa6, 0x48, 0x20,
0x62, 0xf6, 0x6d, 0x2b, 0xa6, 0xcd,
0x62, 0xf6, 0x6d, 0x48, 0xa6, 0x48, 0x01,
0x62, 0xf6, 0x6d, 0x58, 0xa6, 0x48, 0x20,
0x62, 0xf6, 0x6d, 0x58, 0xa6, 0xcd,
// vfmsubadd132ph
0x62, 0xf6, 0x6d, 0x08, 0x97, 0x48, 0x04,
0x62, 0xf6, 0x6d, 0x18, 0x97, 0x48, 0x20,
0x62, 0xf6, 0x6d, 0x28, 0x97, 0x48, 0x02,
0x62, 0xf6, 0x6d, 0x38, 0x97, 0x48, 0x20,
0x62, 0xf6, 0x6d, 0x48, 0x97, 0x48, 0x01,
0x62, 0xf6, 0x6d, 0x58, 0x97, 0x48, 0x20,
0x62, 0xf6, 0x6d, 0x58, 0x97, 0xcd,
// vfmadd132ph
0x62, 0xf6, 0x6d, 0x08, 0x98, 0x48, 0x04,
0x62, 0xf6, 0x6d, 0x18, 0x98, 0x48, 0x20,
0x62, 0xf6, 0x6d, 0x28, 0x98, 0x48, 0x02,
0x62, 0xf6, 0x6d, 0x38, 0x98, 0x48, 0x20,
0x62, 0xf6, 0x6d, 0x48, 0x98, 0x48, 0x01,
0x62, 0xf6, 0x6d, 0x58, 0x98, 0x48, 0x20,
0x62, 0xf6, 0x6d, 0x38, 0x98, 0xcd,
// vfmsub231ph
0x62, 0xf6, 0x6d, 0x08, 0xba, 0x48, 0x04,
0x62, 0xf6, 0x6d, 0x18, 0xba, 0x48, 0x20,
0x62, 0xf6, 0x6d, 0x28, 0xba, 0x48, 0x02,
0x62, 0xf6, 0x6d, 0x38, 0xba, 0x48, 0x20,
0x62, 0xf6, 0x6d, 0x48, 0xba, 0x48, 0x01,
0x62, 0xf6, 0x6d, 0x58, 0xba, 0x48, 0x20,
0x62, 0xf6, 0x6d, 0x38, 0xba, 0xcd,
// vfnmsub231ph
0x62, 0xf6, 0x6d, 0x08, 0xbe, 0x48, 0x04,
0x62, 0xf6, 0x6d, 0x38, 0xbe, 0x48, 0x20,
0x62, 0xf6, 0x6d, 0x58, 0xbe, 0x48, 0x20,
0x62, 0xf6, 0x6d, 0x38, 0xbe, 0xcd,
// vfmadd132sh
0x62, 0xf6, 0x6d, 0xb9, 0x99, 0xcb,
0x62, 0xf6, 0x6d, 0x08, 0x99, 0x48, 0x20,
// vfnmadd132sh
0x62, 0xf6, 0x6d, 0xb9, 0x9d, 0xcb,
0x62, 0xf6, 0x6d, 0x08, 0x9d, 0x48, 0x20,
// vfmsub132sh
0x62, 0xf6, 0x6d, 0xb9, 0x9b, 0xcb,
0x62, 0xf6, 0x6d, 0x08, 0x9b, 0x48, 0x20,
// vfnmsub132sh
0x62, 0xf6, 0x6d, 0xb9, 0x9f, 0xcb,
0x62, 0xf6, 0x6d, 0x08, 0x9f, 0x48, 0x20,
// vfcmaddcph
0x62, 0xf6, 0x6f, 0x89, 0x56, 0x48, 0x04,
0x62, 0xf6, 0x6f, 0xa9, 0x56, 0x48, 0x02,
0x62, 0xf6, 0x6f, 0x49, 0x56, 0x48, 0x01,
0x62, 0xf6, 0x6f, 0x39, 0x56, 0xcd,
0x62, 0xf6, 0x6f, 0x99, 0x56, 0x48, 0x10,
0x62, 0xf6, 0x6f, 0xb9, 0x56, 0x48, 0x10,
0x62, 0xf6, 0x6f, 0xd9, 0x56, 0x48, 0x10,
// vfmaddcph
0x62, 0xf6, 0x6e, 0x08, 0x56, 0x48, 0x04,
0x62, 0xf6, 0x6e, 0xb9, 0x56, 0x48, 0x10,
0x62, 0xf6, 0x6e, 0x58, 0x56, 0x48, 0x10,
// vfcmulcph
0x62, 0xf6, 0x6f, 0x08, 0xd6, 0x48, 0x04,
0x62, 0xf6, 0x6f, 0xb9, 0xd6, 0x48, 0x10,
0x62, 0xf6, 0x6f, 0x58, 0xd6, 0x48, 0x10,
// vfmulcph
0x62, 0xf6, 0x6e, 0x08, 0xd6, 0x48, 0x04,
0x62, 0xf6, 0x6e, 0xb9, 0xd6, 0x48, 0x10,
0x62, 0xf6, 0x6e, 0x58, 0xd6, 0x48, 0x10,
// vrcpph
0x62, 0xf6, 0x7d, 0x08, 0x4c, 0x48, 0x04,
0x62, 0xf6, 0x7d, 0x18, 0x4c, 0x48, 0x20,
0x62, 0xf6, 0x7d, 0x28, 0x4c, 0x48, 0x02,
0x62, 0xf6, 0x7d, 0x38, 0x4c, 0x48, 0x20,
0x62, 0xf6, 0x7d, 0x48, 0x4c, 0x48, 0x01,
0x62, 0xf6, 0x7d, 0x58, 0x4c, 0x48, 0x20,
// vrcpsh
0x62, 0xf6, 0x65, 0x08, 0x4d, 0x48, 0x20,
// vrsqrtph
0x62, 0xf6, 0x7d, 0x08, 0x4e, 0x48, 0x04,
0x62, 0xf6, 0x7d, 0x18, 0x4e, 0x48, 0x20,
0x62, 0xf6, 0x7d, 0x28, 0x4e, 0x50, 0x02,
0x62, 0xf6, 0x7d, 0x38, 0x4e, 0x50, 0x20,
0x62, 0xf6, 0x7d, 0x48, 0x4e, 0x50, 0x01,
0x62, 0xf6, 0x7d, 0x58, 0x4e, 0x50, 0x20,
// vrsqrtsh
0x62, 0xf6, 0x45, 0x8d, 0x4f, 0x48, 0x20,
// vsqrtph
0x62, 0xf5, 0x7c, 0x8c, 0x51, 0x48, 0x04,
0x62, 0xf5, 0x7c, 0x9c, 0x51, 0x48, 0x20,
0x62, 0xf5, 0x7c, 0xbc, 0x51, 0x48, 0x20,
0x62, 0xf5, 0x7c, 0xcc, 0x51, 0x48, 0x01,
0x62, 0xf5, 0x7c, 0xdc, 0x51, 0x48, 0x20,
// vsqrtsh
0x62, 0xf5, 0x56, 0x8c, 0x51, 0x48, 0x20,
0x62, 0xf5, 0x56, 0xbc, 0x51, 0xcf,
// vscalefph
0x62, 0xf6, 0x55, 0x08, 0x2c, 0x48, 0x04,
0x62, 0xf6, 0x55, 0x18, 0x2c, 0x48, 0x20,
0x62, 0xf6, 0x55, 0x28, 0x2c, 0x48, 0x02,
0x62, 0xf6, 0x55, 0x38, 0x2c, 0x48, 0x20,
0x62, 0xf6, 0x55, 0x48, 0x2c, 0x48, 0x01,
0x62, 0xf6, 0x55, 0x58, 0x2c, 0x48, 0x20,
0x62, 0xf6, 0x55, 0xb9, 0x2c, 0xcf,
// vscalefsh
0x62, 0xf6, 0x55, 0x08, 0x2d, 0x48, 0x20,
0x62, 0xf6, 0x55, 0xb9, 0x2d, 0xcf,
// vreduceph
0x62, 0xf3, 0x7c, 0x08, 0x56, 0x48, 0x04, 0x01,
0x62, 0xf3, 0x7c, 0x18, 0x56, 0x48, 0x20, 0x02,
0x62, 0xf3, 0x7c, 0x28, 0x56, 0x48, 0x02, 0x03,
0x62, 0xf3, 0x7c, 0x38, 0x56, 0x48, 0x20, 0x04,
0x62, 0xf3, 0x7c, 0x48, 0x56, 0x48, 0x01, 0x05,
0x62, 0xf3, 0x7c, 0x58, 0x56, 0x48, 0x20, 0x06,
0x62, 0xf3, 0x7c, 0x99, 0x56, 0xcd, 0x07,
// vreducesh
0x62, 0xf3, 0x64, 0x08, 0x57, 0x48, 0x20, 0x01,
0x62, 0xf3, 0x54, 0x99, 0x57, 0xcc, 0x02,
// vrndscaleph
0x62, 0xf3, 0x7c, 0x08, 0x08, 0x48, 0x04, 0x01,
0x62, 0xf3, 0x7c, 0x18, 0x08, 0x48, 0x20, 0x02,
0x62, 0xf3, 0x7c, 0x28, 0x08, 0x48, 0x02, 0x03,
0x62, 0xf3, 0x7c, 0x38, 0x08, 0x48, 0x20, 0x04,
0x62, 0xf3, 0x7c, 0x48, 0x08, 0x48, 0x01, 0x05,
0x62, 0xf3, 0x7c, 0x58, 0x08, 0x48, 0x20, 0x06,
0x62, 0xf3, 0x7c, 0x99, 0x08, 0xcd, 0x07,
// vrndscalesh
0x62, 0xf3, 0x64, 0x08, 0x0a, 0x48, 0x20, 0x01,
0x62, 0xf3, 0x54, 0x99, 0x0a, 0xcc, 0x02,
// vfpclassph
0x62, 0xf3, 0x7c, 0x08, 0x66, 0x48, 0x04, 0x01,
0x62, 0xf3, 0x7c, 0x18, 0x66, 0x48, 0x20, 0x02,
0x62, 0xf3, 0x7c, 0x28, 0x66, 0x48, 0x02, 0x03,
0x62, 0xf3, 0x7c, 0x38, 0x66, 0x48, 0x20, 0x04,
0x62, 0xf3, 0x7c, 0x48, 0x66, 0x48, 0x01, 0x05,
0x62, 0xf3, 0x7c, 0x58, 0x66, 0x48, 0x20, 0x06,
// vfpclasssh
0x62, 0xf3, 0x7c, 0x0a, 0x67, 0xcb, 0x05,
0x62, 0xf3, 0x7c, 0x0a, 0x67, 0x48, 0x20, 0x05,
// vgetexpph
0x62, 0xf6, 0x7d, 0x08, 0x42, 0x48, 0x04,
0x62, 0xf6, 0x7d, 0x38, 0x42, 0x48, 0x20,
0x62, 0xf6, 0x7d, 0x48, 0x42, 0x48, 0x01,
0x62, 0xf6, 0x7d, 0x99, 0x42, 0xcd,
// vgetexpsh
0x62, 0xf6, 0x55, 0x08, 0x43, 0x48, 0x20,
0x62, 0xf6, 0x65, 0x99, 0x43, 0xcd,
// vgetmantph
0x62, 0xf3, 0x7c, 0x08, 0x26, 0x48, 0x04, 0x01,
0x62, 0xf3, 0x7c, 0x38, 0x26, 0x48, 0x20, 0x02,
0x62, 0xf3, 0x7c, 0x48, 0x26, 0x48, 0x01, 0x03,
0x62, 0xf3, 0x7c, 0x99, 0x26, 0xcd, 0x04,
// vgetmantsh
0x62, 0xf3, 0x54, 0x08, 0x27, 0x48, 0x20, 0x05,
0x62, 0xf3, 0x64, 0x99, 0x27, 0xcd, 0x06,
// vmovsh
0x62, 0xf5, 0x7e, 0x89, 0x10, 0x48, 0x20,
0x62, 0xf5, 0x7e, 0x09, 0x11, 0x48, 0x20,
0x62, 0xf5, 0x66, 0x8a, 0x10, 0xcd,
// vmovw
0x62, 0xd5, 0x7d, 0x08, 0x6e, 0xcd,
0x62, 0xf5, 0x7d, 0x08, 0x6e, 0x58, 0x20,
0x62, 0xd5, 0x7d, 0x08, 0x7e, 0xc9,
0x62, 0xf5, 0x7d, 0x08, 0x7e, 0x78, 0x20,
// vcvtsd2sh
0x62, 0xf5, 0xef, 0xb9, 0x5a, 0xcb,
0x62, 0xf5, 0xef, 0x08, 0x5a, 0x48, 0x08,
// vcvtsh2sd
0x62, 0xf5, 0x6e, 0x99, 0x5a, 0xcb,
0x62, 0xf5, 0x6e, 0x08, 0x5a, 0x48, 0x20,
// vcvtsh2ss
0x62, 0xf6, 0x6c, 0x99, 0x13, 0xcb,
0x62, 0xf6, 0x6c, 0x08, 0x13, 0x48, 0x20,
// vcvtss2sh
0x62, 0xf5, 0x6c, 0xb9, 0x1d, 0xcb,
0x62, 0xf5, 0x6c, 0x08, 0x1d, 0x48, 0x10,
// vcvtsh2si
0x62, 0xf5, 0x7e, 0x38, 0x2d, 0xd1,
0x62, 0xf5, 0x7e, 0x08, 0x2d, 0x50, 0x20,
0x62, 0xf5, 0xfe, 0x38, 0x2d, 0xd1,
0x62, 0x75, 0xfe, 0x08, 0x2d, 0x40, 0x20,
// vcvtph2dq
0x62, 0xf5, 0x7d, 0x08, 0x5b, 0xcd,
0x62, 0xf5, 0x7d, 0x08, 0x5b, 0x48, 0x08,
0x62, 0xf5, 0x7d, 0x18, 0x5b, 0x48, 0x20,
0x62, 0xf5, 0x7d, 0xaa, 0x5b, 0xcd,
0x62, 0xf5, 0x7d, 0x28, 0x5b, 0x48, 0x04,
0x62, 0xf5, 0x7d, 0x38, 0x5b, 0x48, 0x20,
0x62, 0xf5, 0x7d, 0xbd, 0x5b, 0xcb,
0x62, 0xf5, 0x7d, 0xcd, 0x5b, 0x48, 0x02,
0x62, 0xf5, 0x7d, 0xdd, 0x5b, 0x48, 0x20,
// vcvtph2psx
0x62, 0xf6, 0x7d, 0x08, 0x13, 0xcd,
0x62, 0xf6, 0x7d, 0x08, 0x13, 0x48, 0x08,
0x62, 0xf6, 0x7d, 0x18, 0x13, 0x48, 0x20,
0x62, 0xf6, 0x7d, 0xaa, 0x13, 0xcd,
0x62, 0xf6, 0x7d, 0x28, 0x13, 0x48, 0x04,
0x62, 0xf6, 0x7d, 0x38, 0x13, 0x48, 0x20,
0x62, 0xf6, 0x7d, 0x9d, 0x13, 0xcb,
0x62, 0xf6, 0x7d, 0xcd, 0x13, 0x48, 0x02,
0x62, 0xf6, 0x7d, 0xdd, 0x13, 0x48, 0x20,
// vcvtph2udq
0x62, 0xf5, 0x7c, 0x08, 0x79, 0xcd,
0x62, 0xf5, 0x7c, 0x08, 0x79, 0x48, 0x08,
0x62, 0xf5, 0x7c, 0x18, 0x79, 0x48, 0x20,
0x62, 0xf5, 0x7c, 0xaa, 0x79, 0xcd,
0x62, 0xf5, 0x7c, 0x28, 0x79, 0x48, 0x04,
0x62, 0xf5, 0x7c, 0x38, 0x79, 0x48, 0x20,
0x62, 0xf5, 0x7c, 0xbd, 0x79, 0xcb,
0x62, 0xf5, 0x7c, 0xcd, 0x79, 0x48, 0x02,
0x62, 0xf5, 0x7c, 0xdd, 0x79, 0x48, 0x20,
// vcvttph2dq
0x62, 0xf5, 0x7e, 0x08, 0x5b, 0xcd,
0x62, 0xf5, 0x7e, 0x08, 0x5b, 0x48, 0x08,
0x62, 0xf5, 0x7e, 0x18, 0x5b, 0x48, 0x20,
0x62, 0xf5, 0x7e, 0xaa, 0x5b, 0xcd,
0x62, 0xf5, 0x7e, 0x28, 0x5b, 0x48, 0x04,
0x62, 0xf5, 0x7e, 0x38, 0x5b, 0x48, 0x20,
0x62, 0xf5, 0x7e, 0x9d, 0x5b, 0xcb,
0x62, 0xf5, 0x7e, 0xcd, 0x5b, 0x48, 0x02,
0x62, 0xf5, 0x7e, 0xdd, 0x5b, 0x48, 0x20,
// vcvttph2udq
0x62, 0xf5, 0x7c, 0x08, 0x78, 0xcd,
0x62, 0xf5, 0x7c, 0x08, 0x78, 0x48, 0x08,
0x62, 0xf5, 0x7c, 0x18, 0x78, 0x48, 0x20,
0x62, 0xf5, 0x7c, 0xaa, 0x78, 0xcd,
0x62, 0xf5, 0x7c, 0x28, 0x78, 0x48, 0x04,
0x62, 0xf5, 0x7c, 0x38, 0x78, 0x48, 0x20,
0x62, 0xf5, 0x7c, 0x9d, 0x78, 0xcb,
0x62, 0xf5, 0x7c, 0xcd, 0x78, 0x48, 0x02,
0x62, 0xf5, 0x7c, 0xdd, 0x78, 0x48, 0x20,
// vcvtph2pd
0x62, 0xf5, 0x7c, 0x08, 0x5a, 0xcd,
0x62, 0xf5, 0x7c, 0x08, 0x5a, 0x48, 0x10,
0x62, 0xf5, 0x7c, 0x18, 0x5a, 0x48, 0x20,
0x62, 0xf5, 0x7c, 0xaa, 0x5a, 0xcd,
0x62, 0xf5, 0x7c, 0x28, 0x5a, 0x48, 0x08,
0x62, 0xf5, 0x7c, 0x38, 0x5a, 0x48, 0x20,
0x62, 0xf5, 0x7c, 0x9d, 0x5a, 0xcb,
0x62, 0xf5, 0x7c, 0xcd, 0x5a, 0x48, 0x04,
0x62, 0xf5, 0x7c, 0xdd, 0x5a, 0x48, 0x20,
// vcvtph2qq
0x62, 0xf5, 0x7d, 0x08, 0x7b, 0xcd,
0x62, 0xf5, 0x7d, 0x08, 0x7b, 0x48, 0x10,
0x62, 0xf5, 0x7d, 0x18, 0x7b, 0x48, 0x20,
0x62, 0xf5, 0x7d, 0xaa, 0x7b, 0xcd,
0x62, 0xf5, 0x7d, 0x28, 0x7b, 0x48, 0x08,
0x62, 0xf5, 0x7d, 0x38, 0x7b, 0x48, 0x20,
0x62, 0xf5, 0x7d, 0xbd, 0x7b, 0xcb,
0x62, 0xf5, 0x7d, 0xcd, 0x7b, 0x48, 0x04,
0x62, 0xf5, 0x7d, 0xdd, 0x7b, 0x48, 0x20,
// vcvtph2uqq
0x62, 0xf5, 0x7d, 0x08, 0x79, 0xcd,
0x62, 0xf5, 0x7d, 0x08, 0x79, 0x48, 0x10,
0x62, 0xf5, 0x7d, 0x18, 0x79, 0x48, 0x20,
0x62, 0xf5, 0x7d, 0xaa, 0x79, 0xcd,
0x62, 0xf5, 0x7d, 0x28, 0x79, 0x48, 0x08,
0x62, 0xf5, 0x7d, 0x38, 0x79, 0x48, 0x20,
0x62, 0xf5, 0x7d, 0xbd, 0x79, 0xcb,
0x62, 0xf5, 0x7d, 0xcd, 0x79, 0x48, 0x04,
0x62, 0xf5, 0x7d, 0xdd, 0x79, 0x48, 0x20,
// vcvttph2uqq
0x62, 0xf5, 0x7d, 0x08, 0x78, 0xcd,
0x62, 0xf5, 0x7d, 0x08, 0x78, 0x48, 0x10,
0x62, 0xf5, 0x7d, 0x18, 0x78, 0x48, 0x20,
0x62, 0xf5, 0x7d, 0xaa, 0x78, 0xcd,
0x62, 0xf5, 0x7d, 0x28, 0x78, 0x48, 0x08,
0x62, 0xf5, 0x7d, 0x38, 0x78, 0x48, 0x20,
0x62, 0xf5, 0x7d, 0x9d, 0x78, 0xcb,
0x62, 0xf5, 0x7d, 0xcd, 0x78, 0x48, 0x04,
0x62, 0xf5, 0x7d, 0xdd, 0x78, 0x48, 0x20,
// vcvtdq2ph
0x62, 0xf5, 0x7c, 0x08, 0x5b, 0xcd,
0x62, 0xf5, 0x7c, 0x08, 0x5b, 0x48, 0x04,
0x62, 0xf5, 0x7c, 0x18, 0x5b, 0x48, 0x10,
0x62, 0xf5, 0x7c, 0x28, 0x5b, 0x48, 0x02,
0x62, 0xf5, 0x7c, 0x38, 0x5b, 0x48, 0x10,
0x62, 0xf5, 0x7c, 0xba, 0x5b, 0xcd,
0x62, 0xf5, 0x7c, 0x48, 0x5b, 0x48, 0x01,
0x62, 0xf5, 0x7c, 0x58, 0x5b, 0x48, 0x10,
// vcvtps2phx
0x62, 0xf5, 0x7d, 0x08, 0x1d, 0xcd,
0x62, 0xf5, 0x7d, 0x08, 0x1d, 0x48, 0x04,
0x62, 0xf5, 0x7d, 0x18, 0x1d, 0x48, 0x10,
0x62, 0xf5, 0x7d, 0x28, 0x1d, 0x48, 0x02,
0x62, 0xf5, 0x7d, 0x38, 0x1d, 0x48, 0x10,
0x62, 0xf5, 0x7d, 0xba, 0x1d, 0xcd,
0x62, 0xf5, 0x7d, 0x48, 0x1d, 0x48, 0x01,
0x62, 0xf5, 0x7d, 0x58, 0x1d, 0x48, 0x10,
// vcvtudq2ph
0x62, 0xf5, 0x7f, 0x08, 0x7a, 0xcd,
0x62, 0xf5, 0x7f, 0x08, 0x7a, 0x48, 0x04,
0x62, 0xf5, 0x7f, 0x18, 0x7a, 0x48, 0x10,
0x62, 0xf5, 0x7f, 0x28, 0x7a, 0x48, 0x02,
0x62, 0xf5, 0x7f, 0x38, 0x7a, 0x48, 0x10,
0x62, 0xf5, 0x7f, 0xba, 0x7a, 0xcd,
0x62, 0xf5, 0x7f, 0x48, 0x7a, 0x48, 0x01,
0x62, 0xf5, 0x7f, 0x58, 0x7a, 0x48, 0x10,
// vcvtpd2ph
0x62, 0xf5, 0xfd, 0x08, 0x5a, 0xcd,
0x62, 0xf5, 0xfd, 0x28, 0x5a, 0xcd,
0x62, 0xf5, 0xfd, 0xba, 0x5a, 0xcd,
0x62, 0xf5, 0xfd, 0x08, 0x5a, 0x48, 0x04,
0x62, 0xf5, 0xfd, 0x18, 0x5a, 0x48, 0x08,
0x62, 0xf5, 0xfd, 0x28, 0x5a, 0x48, 0x02,
0x62, 0xf5, 0xfd, 0x38, 0x5a, 0x48, 0x08,
0x62, 0xf5, 0xfd, 0x48, 0x5a, 0x48, 0x01,
0x62, 0xf5, 0xfd, 0x58, 0x5a, 0x48, 0x08,
// vcvtqq2ph
0x62, 0xf5, 0xfc, 0x08, 0x5b, 0xcd,
0x62, 0xf5, 0xfc, 0x28, 0x5b, 0xcd,
0x62, 0xf5, 0xfc, 0xba, 0x5b, 0xcd,
0x62, 0xf5, 0xfc, 0x08, 0x5b, 0x48, 0x04,
0x62, 0xf5, 0xfc, 0x18, 0x5b, 0x48, 0x08,
0x62, 0xf5, 0xfc, 0x28, 0x5b, 0x48, 0x02,
0x62, 0xf5, 0xfc, 0x38, 0x5b, 0x48, 0x08,
0x62, 0xf5, 0xfc, 0x48, 0x5b, 0x48, 0x01,
0x62, 0xf5, 0xfc, 0x58, 0x5b, 0x48, 0x08,
// vcvtuqq2ph
0x62, 0xf5, 0xff, 0x08, 0x7a, 0xcd,
0x62, 0xf5, 0xff, 0x28, 0x7a, 0xcd,
0x62, 0xf5, 0xff, 0xba, 0x7a, 0xcd,
0x62, 0xf5, 0xff, 0x08, 0x7a, 0x48, 0x04,
0x62, 0xf5, 0xff, 0x18, 0x7a, 0x48, 0x08,
0x62, 0xf5, 0xff, 0x28, 0x7a, 0x48, 0x02,
0x62, 0xf5, 0xff, 0x38, 0x7a, 0x48, 0x08,
0x62, 0xf5, 0xff, 0x48, 0x7a, 0x48, 0x01,
0x62, 0xf5, 0xff, 0x58, 0x7a, 0x48, 0x08,
// vcvtph2uw
0x62, 0xf5, 0x7c, 0x08, 0x7d, 0xcd,
0x62, 0xf5, 0x7c, 0x08, 0x7d, 0x48, 0x04,
0x62, 0xf5, 0x7c, 0x18, 0x7d, 0x48, 0x20,
0x62, 0xf5, 0x7c, 0x28, 0x7d, 0x48, 0x02,
0x62, 0xf5, 0x7c, 0x38, 0x7d, 0x48, 0x20,
0x62, 0xf5, 0x7c, 0xba, 0x7d, 0xcd,
0x62, 0xf5, 0x7c, 0x48, 0x7d, 0x48, 0x01,
0x62, 0xf5, 0x7c, 0x58, 0x7d, 0x48, 0x20,
// vcvtph2w
0x62, 0xf5, 0x7d, 0x08, 0x7d, 0xcd,
0x62, 0xf5, 0x7d, 0x08, 0x7d, 0x48, 0x04,
0x62, 0xf5, 0x7d, 0x18, 0x7d, 0x48, 0x20,
0x62, 0xf5, 0x7d, 0x28, 0x7d, 0x48, 0x02,
0x62, 0xf5, 0x7d, 0x38, 0x7d, 0x48, 0x20,
0x62, 0xf5, 0x7d, 0xba, 0x7d, 0xcd,
0x62, 0xf5, 0x7d, 0x48, 0x7d, 0x48, 0x01,
0x62, 0xf5, 0x7d, 0x58, 0x7d, 0x48, 0x20,
// vcvttph2uw
0x62, 0xf5, 0x7c, 0x08, 0x7c, 0xcd,
0x62, 0xf5, 0x7c, 0x08, 0x7c, 0x48, 0x04,
0x62, 0xf5, 0x7c, 0x18, 0x7c, 0x48, 0x20,
0x62, 0xf5, 0x7c, 0x28, 0x7c, 0x48, 0x02,
0x62, 0xf5, 0x7c, 0x38, 0x7c, 0x48, 0x20,
0x62, 0xf5, 0x7c, 0x9a, 0x7c, 0xcd,
0x62, 0xf5, 0x7c, 0x48, 0x7c, 0x48, 0x01,
0x62, 0xf5, 0x7c, 0x58, 0x7c, 0x48, 0x20,
// vcvttph2w
0x62, 0xf5, 0x7d, 0x08, 0x7c, 0xcd,
0x62, 0xf5, 0x7d, 0x08, 0x7c, 0x48, 0x04,
0x62, 0xf5, 0x7d, 0x18, 0x7c, 0x48, 0x20,
0x62, 0xf5, 0x7d, 0x28, 0x7c, 0x48, 0x02,
0x62, 0xf5, 0x7d, 0x38, 0x7c, 0x48, 0x20,
0x62, 0xf5, 0x7d, 0x9a, 0x7c, 0xcd,
0x62, 0xf5, 0x7d, 0x48, 0x7c, 0x48, 0x01,
0x62, 0xf5, 0x7d, 0x58, 0x7c, 0x48, 0x20,
// vcvtuw2ph
0x62, 0xf5, 0x7f, 0x08, 0x7d, 0xcd,
0x62, 0xf5, 0x7f, 0x08, 0x7d, 0x48, 0x04,
0x62, 0xf5, 0x7f, 0x18, 0x7d, 0x48, 0x20,
0x62, 0xf5, 0x7f, 0x28, 0x7d, 0x48, 0x02,
0x62, 0xf5, 0x7f, 0x38, 0x7d, 0x48, 0x20,
0x62, 0xf5, 0x7f, 0xba, 0x7d, 0xcd,
0x62, 0xf5, 0x7f, 0x48, 0x7d, 0x48, 0x01,
0x62, 0xf5, 0x7f, 0x58, 0x7d, 0x48, 0x20,
// vcvtw2ph
0x62, 0xf5, 0x7e, 0x08, 0x7d, 0xcd,
0x62, 0xf5, 0x7e, 0x08, 0x7d, 0x48, 0x04,
0x62, 0xf5, 0x7e, 0x18, 0x7d, 0x48, 0x20,
0x62, 0xf5, 0x7e, 0x28, 0x7d, 0x48, 0x02,
0x62, 0xf5, 0x7e, 0x38, 0x7d, 0x48, 0x20,
0x62, 0xf5, 0x7e, 0xba, 0x7d, 0xcd,
0x62, 0xf5, 0x7e, 0x48, 0x7d, 0x48, 0x01,
0x62, 0xf5, 0x7e, 0x58, 0x7d, 0x48, 0x20,
// vcvtps2ph
0xc4, 0xe3, 0x79, 0x1d, 0xd1, 0x01,
0xc4, 0xe3, 0x79, 0x1d, 0x50, 0x40, 0x02,
0xc4, 0xe3, 0x7d, 0x1d, 0xd1, 0x03,
0xc4, 0xe3, 0x7d, 0x1d, 0x50, 0x40, 0x04,
0x62, 0xf3, 0x7d, 0x89, 0x1d, 0xd1, 0x05,
0x62, 0xf3, 0x7d, 0x09, 0x1d, 0x58, 0x08, 0x06,
0x62, 0xf3, 0x7d, 0x2a, 0x1d, 0xe1, 0x07,
0x62, 0xf3, 0x7d, 0x2a, 0x1d, 0x68, 0x04, 0x08,
0x62, 0xf3, 0x7d, 0x1a, 0x1d, 0xe9, 0x09,
0x62, 0xf3, 0x7d, 0x4d, 0x1d, 0x60, 0x02, 0x0a,
// vcvtsh2usi
0x62, 0xf5, 0x7e, 0x38, 0x79, 0xc9,
0x62, 0xf5, 0x7e, 0x08, 0x79, 0x40, 0x20,
0x62, 0x75, 0xfe, 0x38, 0x79, 0xc9,
0x62, 0x75, 0xfe, 0x08, 0x79, 0x68, 0x20,
// vcvttsh2si
0x62, 0xf5, 0x7e, 0x18, 0x2c, 0xc9,
0x62, 0xf5, 0x7e, 0x08, 0x2c, 0x40, 0x20,
0x62, 0x75, 0xfe, 0x18, 0x2c, 0xc9,
0x62, 0x75, 0xfe, 0x08, 0x2c, 0x68, 0x20,
// vcvttsh2usi
0x62, 0xf5, 0x7e, 0x18, 0x78, 0xc9,
0x62, 0xf5, 0x7e, 0x08, 0x78, 0x40, 0x20,
0x62, 0x75, 0xfe, 0x18, 0x78, 0xc9,
0x62, 0x75, 0xfe, 0x08, 0x78, 0x68, 0x20,
// vcvttph2qq
0x62, 0xf5, 0x7d, 0x08, 0x7a, 0xcd,
0x62, 0xf5, 0x7d, 0x08, 0x7a, 0x48, 0x10,
0x62, 0xf5, 0x7d, 0x18, 0x7a, 0x48, 0x20,
0x62, 0xf5, 0x7d, 0xaa, 0x7a, 0xcd,
0x62, 0xf5, 0x7d, 0x28, 0x7a, 0x48, 0x08,
0x62, 0xf5, 0x7d, 0x38, 0x7a, 0x48, 0x20,
0x62, 0xf5, 0x7d, 0x9d, 0x7a, 0xcb,
0x62, 0xf5, 0x7d, 0xcd, 0x7a, 0x48, 0x04,
0x62, 0xf5, 0x7d, 0xdd, 0x7a, 0x48, 0x20,
// vcvtsi2sh
0x62, 0xf5, 0x6e, 0x38, 0x2a, 0xc8,
0x62, 0xf5, 0x6e, 0x08, 0x2a, 0x48, 0x10,
0x62, 0xd5, 0xee, 0x38, 0x2a, 0xc9,
0x62, 0xf5, 0xee, 0x08, 0x2a, 0x48, 0x08,
// vcvtusi2sh
0x62, 0xf5, 0x6e, 0x38, 0x7b, 0xc8,
0x62, 0xf5, 0x6e, 0x08, 0x7b, 0x48, 0x10,
0x62, 0xd5, 0xee, 0x38, 0x7b, 0xc9,
0x62, 0xf5, 0xee, 0x08, 0x7b, 0x48, 0x08,
};
// The generated size must equal the table length before the bytes themselves
// are compared element by element.
const size_t n = sizeof(tbl) / sizeof(tbl[0]);
CYBOZU_TEST_EQUAL(c.getSize(), n);
CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
}
#endif
// Checks the bytes emitted for tpause, umonitor and umwait. The umonitor
// operands differ between 32-bit and 64-bit builds, but both variants are
// compared against the same expected bytes.
CYBOZU_TEST_AUTO(waitpkg)
{
	struct WaitGen : Xbyak::CodeGenerator {
		WaitGen()
		{
			tpause(eax);
			tpause(ebx);
			// first call uses the narrower address register, second the native one
#ifdef XBYAK32
			umonitor(cx);
			umonitor(ecx);
#else
			umonitor(ecx);
			umonitor(rcx);
#endif
			umwait(eax);
			umwait(ebx);
		}
	} gen;
	// expected output, one row per call in the constructor
	const uint8_t expected[] = {
		// tpause
		0x66, 0x0f, 0xae, 0xf0,
		0x66, 0x0f, 0xae, 0xf3,
		// umonitor
		0x67, 0xf3, 0x0f, 0xae, 0xf1,
		0xf3, 0x0f, 0xae, 0xf1,
		// umwait
		0xf2, 0x0f, 0xae, 0xf0,
		0xf2, 0x0f, 0xae, 0xf3,
	};
	const size_t expectedSize = sizeof(expected) / sizeof(*expected);
	CYBOZU_TEST_EQUAL(gen.getSize(), expectedSize);
	CYBOZU_TEST_EQUAL_ARRAY(gen.getCode(), expected, expectedSize);
}
// Checks the bytes emitted for a set of miscellaneous instructions: cldemote,
// movdiri, movdir64b, xresldtrk and xsusldtrk in every build, plus the forms
// using 64-bit registers and the clui / senduipi / stui / testui / uiret group
// when XBYAK64 is defined.
CYBOZU_TEST_AUTO(misc)
{
	struct MiscGen : Xbyak::CodeGenerator {
		MiscGen()
		{
			// forms addressed through 32-bit registers (emitted in both builds)
			cldemote(ptr[eax+esi*4+0x12]);
			movdiri(ptr[edx+esi*2+4], eax);
			movdir64b(eax, ptr[edx]);
			xresldtrk();
			xsusldtrk();
#ifdef XBYAK64
			// forms that need 64-bit registers
			cldemote(ptr[rax+rdi*8+0x123]);
			movdiri(ptr[rax+r12], r9);
			movdiri(ptr[rax+r12*2+4], r9d);
			movdir64b(r10, ptr[r8]);
			clui();
			senduipi(rax);
			senduipi(r10);
			stui();
			testui();
			uiret();
#endif
		}
	} gen;
	// Expected output. For the 32-bit addressed forms the 64-bit build expects
	// one additional 0x67 byte, inserted through the #ifdef sections below.
	const uint8_t expected[] = {
#ifdef XBYAK64
		0x67,
#endif
		0x0f, 0x1c, 0x44, 0xb0, 0x12, // cldemote
#ifdef XBYAK64
		0x67,
#endif
		0x0f, 0x38, 0xf9, 0x44, 0x72, 0x04, // movdiri
		0x66,
#ifdef XBYAK64
		0x67,
#endif
		0x0f, 0x38, 0xf8, 0x02, // movdir64b
		0xf2, 0x0f, 0x01, 0xe9, // xresldtrk
		0xf2, 0x0f, 0x01, 0xe8, // xsusldtrk
#ifdef XBYAK64
		0x0f, 0x1c, 0x84, 0xf8, 0x23, 0x01, 0x00, 0x00, // cldemote
		0x4e, 0x0f, 0x38, 0xf9, 0x0c, 0x20, // movdiri
		0x46, 0x0f, 0x38, 0xf9, 0x4c, 0x60, 0x04, // movdiri
		0x66, 0x45, 0x0f, 0x38, 0xf8, 0x10, // movdir64b
		0xf3, 0x0f, 0x01, 0xee, // clui
		0xf3, 0x0f, 0xc7, 0xf0, // senduipi rax
		0xf3, 0x41, 0x0f, 0xc7, 0xf2, // senduipi r10
		0xf3, 0x0f, 0x01, 0xef, // stui
		0xf3, 0x0f, 0x01, 0xed, // testui
		0xf3, 0x0f, 0x01, 0xec, // uiret
#endif
	};
	const size_t expectedSize = sizeof(expected) / sizeof(*expected);
	CYBOZU_TEST_EQUAL(gen.getSize(), expectedSize);
	CYBOZU_TEST_EQUAL_ARRAY(gen.getCode(), expected, expectedSize);
}
CYBOZU_TEST_AUTO(cpu)
{
	// Regression check for https://github.com/herumi/xbyak/issues/148:
	// asking for the combined tINTEL | tAMD value must give the same answer
	// as asking for each flag separately and and-ing the two results.
	using namespace Xbyak::util;
	Cpu cpu;
	const bool hasEachFlag = cpu.has(Cpu::tINTEL) && cpu.has(Cpu::tAMD);
	const bool hasCombinedFlags = cpu.has(Cpu::tINTEL | Cpu::tAMD);
	CYBOZU_TEST_EQUAL(hasEachFlag, hasCombinedFlags);
}
CYBOZU_TEST_AUTO(minmax)
{
	// local::min_ / local::max_ must agree with std::min / std::max on a
	// simple pair of ints. std::min/std::max are parenthesised so that a
	// function-like min/max macro, if one is defined, is not expanded.
	using namespace Xbyak::util;
	const int smaller = 3;
	const int larger = 4;
	const int stdMin = (std::min)(smaller, larger);
	const int stdMax = (std::max)(smaller, larger);
	CYBOZU_TEST_EQUAL(stdMin, local::min_(smaller, larger));
	CYBOZU_TEST_EQUAL(stdMax, local::max_(smaller, larger));
}
CYBOZU_TEST_AUTO(rao_int)
{
// Emits aadd, aand, aor and axor with a memory destination and a register
// source, then compares the generated bytes with tbl. XBYAK64 builds emit
// three forms per instruction; other builds emit one form each.
struct Code : Xbyak::CodeGenerator {
Code()
{
#ifdef XBYAK64
// Per instruction: [rax] with ecx, [eax] with ecx, [rax] with r10.
aadd(ptr[rax], ecx);
aadd(ptr[eax], ecx);
aadd(ptr[rax], r10);
aand(ptr[rax], ecx);
aand(ptr[eax], ecx);
aand(ptr[rax], r10);
aor(ptr[rax], ecx);
aor(ptr[eax], ecx);
aor(ptr[rax], r10);
axor(ptr[rax], ecx);
axor(ptr[eax], ecx);
axor(ptr[rax], r10);
#else
// Per instruction: a single [eax] with ecx form.
aadd(ptr[eax], ecx);
aand(ptr[eax], ecx);
aor(ptr[eax], ecx);
axor(ptr[eax], ecx);
#endif
}
} c;
// Expected bytes, one row per call above and in the same order. All rows
// share the 0x0f 0x38 0xfc byte sequence; the four instructions differ in
// the leading byte (none, 0x66, 0xf2, 0xf3).
const uint8_t tbl[] = {
#ifdef XBYAK64
// aadd
0x0f, 0x38, 0xfc, 0x08,
0x67, 0x0f, 0x38, 0xfc, 0x08,
0x4c, 0x0f, 0x38, 0xfc, 0x10,
// aand
0x66, 0x0f, 0x38, 0xfc, 0x08,
0x66, 0x67, 0x0f, 0x38, 0xfc, 0x08,
0x66, 0x4c, 0x0f, 0x38, 0xfc, 0x10,
// aor
0xf2, 0x0f, 0x38, 0xfc, 0x08,
0xf2, 0x67, 0x0f, 0x38, 0xfc, 0x08,
0xf2, 0x4c, 0x0f, 0x38, 0xfc, 0x10,
// axor
0xf3, 0x0f, 0x38, 0xfc, 0x08,
0xf3, 0x67, 0x0f, 0x38, 0xfc, 0x08,
0xf3, 0x4c, 0x0f, 0x38, 0xfc, 0x10,
#else
// aadd
0x0f, 0x38, 0xfc, 0x08,
// aand
0x66, 0x0f, 0x38, 0xfc, 0x08,
// aor
0xf2, 0x0f, 0x38, 0xfc, 0x08,
// axor
0xf3, 0x0f, 0x38, 0xfc, 0x08,
#endif
};
// The generated code must match tbl in both length and content.
const size_t n = sizeof(tbl) / sizeof(tbl[0]);
CYBOZU_TEST_EQUAL(c.getSize(), n);
CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
}
#ifdef XBYAK64
CYBOZU_TEST_AUTO(CMPccXADD)
{
// Emits all sixteen cmp<cc>xadd variants on ptr[rax+r10*4], first with
// 32-bit register operands and then with 64-bit ones, and compares the
// generated bytes with tbl.
struct Code : Xbyak::CodeGenerator {
Code()
{
// 32bit reg
cmpbexadd(ptr[rax+r10*4], ecx, edx);
cmpbxadd(ptr[rax+r10*4], ecx, edx);
cmplexadd(ptr[rax+r10*4], ecx, edx);
cmplxadd(ptr[rax+r10*4], ecx, edx);
cmpnbexadd(ptr[rax+r10*4], ecx, edx);
cmpnbxadd(ptr[rax+r10*4], ecx, edx);
cmpnlexadd(ptr[rax+r10*4], ecx, edx);
cmpnlxadd(ptr[rax+r10*4], ecx, edx);
cmpnoxadd(ptr[rax+r10*4], ecx, edx);
cmpnpxadd(ptr[rax+r10*4], ecx, edx);
cmpnsxadd(ptr[rax+r10*4], ecx, edx);
cmpnzxadd(ptr[rax+r10*4], ecx, edx);
cmpoxadd(ptr[rax+r10*4], ecx, edx);
cmppxadd(ptr[rax+r10*4], ecx, edx);
cmpsxadd(ptr[rax+r10*4], ecx, edx);
cmpzxadd(ptr[rax+r10*4], ecx, edx);
// 64bit reg
cmpbexadd(ptr[rax+r10*4], rcx, rdx);
cmpbxadd(ptr[rax+r10*4], rcx, rdx);
cmplexadd(ptr[rax+r10*4], rcx, rdx);
cmplxadd(ptr[rax+r10*4], rcx, rdx);
cmpnbexadd(ptr[rax+r10*4], rcx, rdx);
cmpnbxadd(ptr[rax+r10*4], rcx, rdx);
cmpnlexadd(ptr[rax+r10*4], rcx, rdx);
cmpnlxadd(ptr[rax+r10*4], rcx, rdx);
cmpnoxadd(ptr[rax+r10*4], rcx, rdx);
cmpnpxadd(ptr[rax+r10*4], rcx, rdx);
cmpnsxadd(ptr[rax+r10*4], rcx, rdx);
cmpnzxadd(ptr[rax+r10*4], rcx, rdx);
cmpoxadd(ptr[rax+r10*4], rcx, rdx);
cmppxadd(ptr[rax+r10*4], rcx, rdx);
cmpsxadd(ptr[rax+r10*4], rcx, rdx);
cmpzxadd(ptr[rax+r10*4], rcx, rdx);
}
} c;
// Expected bytes: one 6-byte row per call above, in the same order. Within
// a group only the fourth byte changes between variants; the 32-bit and
// 64-bit groups differ only in the third byte (0x69 vs 0xe9).
const uint8_t tbl[] = {
// 32bit reg
0xc4, 0xa2, 0x69, 0xe6, 0x0c, 0x90,
0xc4, 0xa2, 0x69, 0xe2, 0x0c, 0x90,
0xc4, 0xa2, 0x69, 0xee, 0x0c, 0x90,
0xc4, 0xa2, 0x69, 0xec, 0x0c, 0x90,
0xc4, 0xa2, 0x69, 0xe7, 0x0c, 0x90,
0xc4, 0xa2, 0x69, 0xe3, 0x0c, 0x90,
0xc4, 0xa2, 0x69, 0xef, 0x0c, 0x90,
0xc4, 0xa2, 0x69, 0xed, 0x0c, 0x90,
0xc4, 0xa2, 0x69, 0xe1, 0x0c, 0x90,
0xc4, 0xa2, 0x69, 0xeb, 0x0c, 0x90,
0xc4, 0xa2, 0x69, 0xe9, 0x0c, 0x90,
0xc4, 0xa2, 0x69, 0xe5, 0x0c, 0x90,
0xc4, 0xa2, 0x69, 0xe0, 0x0c, 0x90,
0xc4, 0xa2, 0x69, 0xea, 0x0c, 0x90,
0xc4, 0xa2, 0x69, 0xe8, 0x0c, 0x90,
0xc4, 0xa2, 0x69, 0xe4, 0x0c, 0x90,
// 64bit reg
0xc4, 0xa2, 0xe9, 0xe6, 0x0c, 0x90,
0xc4, 0xa2, 0xe9, 0xe2, 0x0c, 0x90,
0xc4, 0xa2, 0xe9, 0xee, 0x0c, 0x90,
0xc4, 0xa2, 0xe9, 0xec, 0x0c, 0x90,
0xc4, 0xa2, 0xe9, 0xe7, 0x0c, 0x90,
0xc4, 0xa2, 0xe9, 0xe3, 0x0c, 0x90,
0xc4, 0xa2, 0xe9, 0xef, 0x0c, 0x90,
0xc4, 0xa2, 0xe9, 0xed, 0x0c, 0x90,
0xc4, 0xa2, 0xe9, 0xe1, 0x0c, 0x90,
0xc4, 0xa2, 0xe9, 0xeb, 0x0c, 0x90,
0xc4, 0xa2, 0xe9, 0xe9, 0x0c, 0x90,
0xc4, 0xa2, 0xe9, 0xe5, 0x0c, 0x90,
0xc4, 0xa2, 0xe9, 0xe0, 0x0c, 0x90,
0xc4, 0xa2, 0xe9, 0xea, 0x0c, 0x90,
0xc4, 0xa2, 0xe9, 0xe8, 0x0c, 0x90,
0xc4, 0xa2, 0xe9, 0xe4, 0x0c, 0x90,
};
// The generated code must match tbl in both length and content.
const size_t n = sizeof(tbl) / sizeof(tbl[0]);
CYBOZU_TEST_EQUAL(c.getSize(), n);
CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
}
CYBOZU_TEST_AUTO(prefetchiti)
{
	// Emits prefetchit0 and prefetchit1 on ptr[rax] and compares the
	// generated bytes with the expected encodings.
	struct PrefetchGen : Xbyak::CodeGenerator {
		PrefetchGen()
		{
			prefetchit0(ptr[rax]);
			prefetchit1(ptr[rax]);
		}
	} gen;
	const uint8_t expected[] = {
		0x0f, 0x18, 0x38, // prefetchit0
		0x0f, 0x18, 0x30, // prefetchit1
	};
	const size_t expectedSize = sizeof(expected) / sizeof(expected[0]);
	CYBOZU_TEST_EQUAL(gen.getSize(), expectedSize);
	CYBOZU_TEST_EQUAL_ARRAY(gen.getCode(), expected, expectedSize);
}
CYBOZU_TEST_AUTO(crypto)
{
// Emits the vsha512*, vsm3* and vsm4* instructions in register and memory
// forms and compares the generated bytes with tbl.
struct Code : Xbyak::CodeGenerator {
Code()
{
vsha512msg1(ymm3, xmm5);
vsha512msg2(ymm9, ymm10);
vsha512rnds2(ymm1, ymm3, xmm2);
vsm3msg1(xmm1, xmm2, xmm3);
vsm3msg1(xmm1, xmm2, ptr [rax]);
vsm3msg2(xmm5, xmm7, xmm3);
vsm3msg2(xmm5, xmm6, ptr [rax]);
// vsm3rnds2 takes an immediate; it appears as the last byte of its rows.
vsm3rnds2(xmm5, xmm7, xmm3, 0x12);
vsm3rnds2(xmm5, xmm7, ptr [rcx], 0x34);
vsm4key4(xmm1, xmm2, xmm3);
vsm4key4(xmm1, xmm2, ptr [rdx]);
vsm4rnds4(xmm1, xmm2, xmm3);
vsm4rnds4(xmm5, xmm6, ptr [rcx+rax*4]);
}
} c;
// Expected bytes: one row per call above, in the same order.
const uint8_t tbl[] = {
// sha512
0xc4, 0xe2, 0x7f, 0xcc, 0xdd,
0xc4, 0x42, 0x7f, 0xcd, 0xca,
0xc4, 0xe2, 0x67, 0xcb, 0xca,
// sm3
0xC4, 0xE2, 0x68, 0xDA, 0xCB,
0xC4, 0xE2, 0x68, 0xDA, 0x08,
0xC4, 0xE2, 0x41, 0xDA, 0xEB,
0xC4, 0xE2, 0x49, 0xDA, 0x28,
0xC4, 0xE3, 0x41, 0xDE, 0xEB, 0x12,
0xC4, 0xE3, 0x41, 0xDE, 0x29, 0x34,
// sm4
0xc4, 0xe2, 0x6a, 0xda, 0xcb,
0xc4, 0xe2, 0x6a, 0xda, 0x0a,
0xc4, 0xe2, 0x6b, 0xda, 0xcb,
0xc4, 0xe2, 0x4b, 0xda, 0x2c, 0x81,
};
// The generated code must match tbl in both length and content.
const size_t n = sizeof(tbl) / sizeof(tbl[0]);
CYBOZU_TEST_EQUAL(c.getSize(), n);
CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
}
CYBOZU_TEST_AUTO(avx_vnni_int)
{
// Emits each vpdpb* / vpdpw* instruction twice -- an xmm register form and
// a ymm form with a ptr[rax] source -- and compares the generated bytes
// with tbl.
struct Code : Xbyak::CodeGenerator {
Code()
{
vpdpbssd(xmm1, xmm2, xmm3);
vpdpbssd(ymm1, ymm2, ptr [rax]);
vpdpbssds(xmm1, xmm2, xmm3);
vpdpbssds(ymm1, ymm2, ptr [rax]);
vpdpbsud(xmm1, xmm2, xmm3);
vpdpbsud(ymm1, ymm2, ptr [rax]);
vpdpbsuds(xmm1, xmm2, xmm3);
vpdpbsuds(ymm1, ymm2, ptr [rax]);
vpdpbuud(xmm1, xmm2, xmm3);
vpdpbuud(ymm1, ymm2, ptr [rax]);
vpdpbuuds(xmm1, xmm2, xmm3);
vpdpbuuds(ymm1, ymm2, ptr [rax]);
vpdpwsud(xmm1, xmm2, xmm3);
vpdpwsud(ymm1, ymm2, ptr [rax]);
vpdpwsuds(xmm1, xmm2, xmm3);
vpdpwsuds(ymm1, ymm2, ptr [rax]);
vpdpwusd(xmm1, xmm2, xmm3);
vpdpwusd(ymm1, ymm2, ptr [rax]);
vpdpwusds(xmm1, xmm2, xmm3);
vpdpwusds(ymm1, ymm2, ptr [rax]);
vpdpwuud(xmm1, xmm2, xmm3);
vpdpwuud(ymm1, ymm2, ptr [rax]);
vpdpwuuds(xmm1, xmm2, xmm3);
vpdpwuuds(ymm1, ymm2, ptr [rax]);
}
} c;
// Expected bytes: one 5-byte row per call above, in the same order
// (register form first, then memory form, for each instruction).
const uint8_t tbl[] = {
// vpdpbssd, vpdpbssds
0xc4, 0xe2, 0x6b, 0x50, 0xcb,
0xc4, 0xe2, 0x6f, 0x50, 0x08,
0xc4, 0xe2, 0x6b, 0x51, 0xcb,
0xc4, 0xe2, 0x6f, 0x51, 0x08,
// vpdpbsud, vpdpbsuds
0xc4, 0xe2, 0x6a, 0x50, 0xcb,
0xc4, 0xe2, 0x6e, 0x50, 0x08,
0xc4, 0xe2, 0x6a, 0x51, 0xcb,
0xc4, 0xe2, 0x6e, 0x51, 0x08,
// vpdpbuud, vpdpbuuds
0xc4, 0xe2, 0x68, 0x50, 0xcb,
0xc4, 0xe2, 0x6c, 0x50, 0x08,
0xc4, 0xe2, 0x68, 0x51, 0xcb,
0xc4, 0xe2, 0x6c, 0x51, 0x08,
// vpdpwsud, vpdpwsuds
0xc4, 0xe2, 0x6a, 0xd2, 0xcb,
0xc4, 0xe2, 0x6e, 0xd2, 0x08,
0xc4, 0xe2, 0x6a, 0xd3, 0xcb,
0xc4, 0xe2, 0x6e, 0xd3, 0x08,
// vpdpwusd, vpdpwusds
0xc4, 0xe2, 0x69, 0xd2, 0xcb,
0xc4, 0xe2, 0x6d, 0xd2, 0x08,
0xc4, 0xe2, 0x69, 0xd3, 0xcb,
0xc4, 0xe2, 0x6d, 0xd3, 0x08,
// vpdpwuud, vpdpwuuds
0xc4, 0xe2, 0x68, 0xd2, 0xcb,
0xc4, 0xe2, 0x6c, 0xd2, 0x08,
0xc4, 0xe2, 0x68, 0xd3, 0xcb,
0xc4, 0xe2, 0x6c, 0xd3, 0x08,
};
// The generated code must match tbl in both length and content.
const size_t n = sizeof(tbl) / sizeof(tbl[0]);
CYBOZU_TEST_EQUAL(c.getSize(), n);
CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
}
#endif