Merge branch 'dev'
Some checks are pending
test / test (push) Waiting to run

This commit is contained in:
MITSUNARI Shigeo 2024-10-30 06:39:15 +09:00
commit 97b66116ff
15 changed files with 222 additions and 63 deletions

View file

@ -1,6 +1,6 @@
cmake_minimum_required(VERSION 3.5) cmake_minimum_required(VERSION 3.5)
project(xbyak LANGUAGES CXX VERSION 7.20) project(xbyak LANGUAGES CXX VERSION 7.20.1)
file(GLOB headers xbyak/*.h) file(GLOB headers xbyak/*.h)

View file

@ -1,5 +1,6 @@
# History # History
* 2024/Oct/17 ver 7.20.1 Updated to comply with AVX10.2 specification rev 2.0
* 2024/Oct/15 ver 7.20 Fixed the specification of setDefaultEncoding, setDefaultEncodingAVX10. * 2024/Oct/15 ver 7.20 Fixed the specification of setDefaultEncoding, setDefaultEncodingAVX10.
* 2024/Oct/15 ver 7.11 Added full support for AVX10.2 * 2024/Oct/15 ver 7.11 Added full support for AVX10.2
* 2024/Oct/13 ver 7.10 support AVX10 integer and fp16 vnni, media new instructions. setDefaultEncoding is extended. * 2024/Oct/13 ver 7.10 support AVX10 integer and fp16 vnni, media new instructions. setDefaultEncoding is extended.

View file

@ -110,6 +110,15 @@ vfpclasspd k5{k3}, [rax+64]{1to2}, 5 --> vfpclasspd(k5|k3, xword_b [rax+64],
vfpclassps k5{k3}, [rax+64]{1to4}, 5 --> vfpclassps(k5|k3, yword_b [rax+64], 5); // broadcast 64-bit to 256-bit vfpclassps k5{k3}, [rax+64]{1to4}, 5 --> vfpclassps(k5|k3, yword_b [rax+64], 5); // broadcast 64-bit to 256-bit
``` ```
### Remark
* `k1`, ..., `k7` are opmask registers.
- `k0` is dealt as no mask.
- e.g. `vmovaps(zmm0|k0, ptr[rax]);` and `vmovaps(zmm0|T_z, ptr[rax]);` are same to `vmovaps(zmm0, ptr[rax]);`.
* use `| T_z`, `| T_sae`, `| T_rn_sae`, `| T_rd_sae`, `| T_ru_sae`, `| T_rz_sae` instead of `,{z}`, `,{sae}`, `,{rn-sae}`, `,{rd-sae}`, `,{ru-sae}`, `,{rz-sae}` respectively.
* `k4 | k3` is different from `k3 | k4`.
* use `ptr_b` for broadcast `{1toX}`. X is automatically determined.
* specify `xword`/`yword`/`zword(_b)` for m128/m256/m512 if necessary.
## Selecting AVX512-VNNI, AVX-VNNI, AVX-VNNI-INT8, AVX10.2. ## Selecting AVX512-VNNI, AVX-VNNI, AVX-VNNI-INT8, AVX10.2.
Some mnemonics have some types of encodings: VEX, EVEX, AVX10.2. Some mnemonics have some types of encodings: VEX, EVEX, AVX10.2.
The functions for these mnemonics include an optional parameter as the last argument to specify the encoding. The functions for these mnemonics include an optional parameter as the last argument to specify the encoding.
@ -145,20 +154,17 @@ feature|AVX512-VNNI|AVX-VNNI
-|-|- -|-|-
feature|AVX-VNNI-INT8, AVX512-FP16|AVX10.2 feature|AVX-VNNI-INT8, AVX512-FP16|AVX10.2
- Target functions: vmpsadbw, vpdpbssd, vpdpbssds, vpdpbsud, vpdpbsuds, vpdpbuud, vpdpbuuds, vpdpwsud vpdpwsuds vpdpwusd vpdpwusds vpdpwuud, vpdpwuuds, vmovd, vmovw - Target functions: vmpsadbw, vpdpbssd, vpdpbssds, vpdpbsud, vpdpbsuds, vpdpbuud, vpdpbuuds, vpdpwsud vpdpwsuds vpdpwusd vpdpwusds vpdpwuud, vpdpwuuds and vmovd, vmovw with MEM-to-MEM.
- Remark: vmovd and vmovw several kinds of encoding such as AVX/AVX512F/AVX512-FP16/AVX10.2.
At first, I attempted to use EvexEncoding (resp. VexEncoding) instead of AVX10v2Encoding (resp. EvexEncoding) for `setDefaultEncodingAVX10`.
But I abandoned this idea when I found that `vmovd` and `vmovw` had different EVEX encodings in AVX512 and AVX10.2
### Remark ### Remark
* `k1`, ..., `k7` are opmask registers.
- `k0` is dealt as no mask. 1. `vmovd` and `vmovw` instructions with REG-to-XMM or XMM-to-REG operands are always encoded using AVX10.1.
- e.g. `vmovaps(zmm0|k0, ptr[rax]);` and `vmovaps(zmm0|T_z, ptr[rax]);` are same to `vmovaps(zmm0, ptr[rax]);`. When used with XMM-to-XMM operands, these instructions are always encoded using AVX10.2.
* use `| T_z`, `| T_sae`, `| T_rn_sae`, `| T_rd_sae`, `| T_ru_sae`, `| T_rz_sae` instead of `,{z}`, `,{sae}`, `,{rn-sae}`, `,{rd-sae}`, `,{ru-sae}`, `,{rz-sae}` respectively.
* `k4 | k3` is different from `k3 | k4`. 2. `vmovd` and `vmovw` instructions with XMM-to-MEM or MEM-to-XMM operands support multiple encoding formats, including AVX, AVX512F, AVX512-FP16, and AVX10.2.
* use `ptr_b` for broadcast `{1toX}`. X is automatically determined.
* specify `xword`/`yword`/`zword(_b)` for m128/m256/m512 if necessary. Initially, I tried implementing `setDefaultEncodingAVX10` using `EvexEncoding` (resp. `VexEncoding`) instead of `AVX10v2Encoding` (resp. `EvexEncoding`).
However, I abandoned this approach after discovering the complexity of the encoding requirements of `vmovd` and `vmovw`.
## APX ## APX
[Advanced Performance Extensions (APX) Architecture Specification](https://www.intel.com/content/www/us/en/content-details/786223/intel-advanced-performance-extensions-intel-apx-architecture-specification.html) [Advanced Performance Extensions (APX) Architecture Specification](https://www.intel.com/content/www/us/en/content-details/786223/intel-advanced-performance-extensions-intel-apx-architecture-specification.html)

View file

@ -202,13 +202,13 @@ void putX_XM()
{ 0x2F, "vcomish", T_MUST_EVEX | T_MAP5 | T_EW0 | T_SAE_X | T_N2 }, { 0x2F, "vcomish", T_MUST_EVEX | T_MAP5 | T_EW0 | T_SAE_X | T_N2 },
{ 0x2E, "vucomish", T_MUST_EVEX | T_MAP5 | T_EW0 | T_SAE_X | T_N2 }, { 0x2E, "vucomish", T_MUST_EVEX | T_MAP5 | T_EW0 | T_SAE_X | T_N2 },
{ 0x2F, "vcomxsd", T_MUST_EVEX | T_F3 | T_0F | T_EW1 | T_SAE_X | T_N8 }, { 0x2F, "vcomxsd", T_MUST_EVEX | T_F2 | T_0F | T_EW1 | T_SAE_X | T_N8 },
{ 0x2F, "vcomxsh", T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_SAE_X | T_N2 }, { 0x2F, "vcomxsh", T_MUST_EVEX | T_F3 | T_MAP5 | T_EW0 | T_SAE_X | T_N2 },
{ 0x2F, "vcomxss", T_MUST_EVEX | T_F2 | T_0F | T_EW0 | T_SAE_X | T_N4 }, { 0x2F, "vcomxss", T_MUST_EVEX | T_F3 | T_0F | T_EW0 | T_SAE_X | T_N4 },
{ 0x2E, "vucomxsd", T_MUST_EVEX | T_F3 | T_0F | T_EW1 | T_SAE_X | T_N8 }, { 0x2E, "vucomxsd", T_MUST_EVEX | T_F2 | T_0F | T_EW1 | T_SAE_X | T_N8 },
{ 0x2E, "vucomxsh", T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_SAE_X | T_N2 }, { 0x2E, "vucomxsh", T_MUST_EVEX | T_F3 | T_MAP5 | T_EW0 | T_SAE_X | T_N2 },
{ 0x2E, "vucomxss", T_MUST_EVEX | T_F2 | T_0F | T_EW0 | T_SAE_X | T_N4 }, { 0x2E, "vucomxss", T_MUST_EVEX | T_F3 | T_0F | T_EW0 | T_SAE_X | T_N4 },
// 13.1 // 13.1
{ 0x69, "vcvtnebf162ibs", T_MUST_EVEX | T_YMM | T_F2 | T_MAP5 | T_EW0 | T_B16 }, { 0x69, "vcvtnebf162ibs", T_MUST_EVEX | T_YMM | T_F2 | T_MAP5 | T_EW0 | T_B16 },
@ -893,7 +893,7 @@ void putX_XM_IMM()
{ 0x62, "vpexpandw", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_SAE_Z | T_N2, false }, { 0x62, "vpexpandw", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_SAE_Z | T_N2, false },
{ 0x2F, "vcomsbf16", T_MUST_EVEX | T_66 | T_MAP5 | T_EW0 | T_N2, false }, { 0x2F, "vcomsbf16", T_MUST_EVEX | T_66 | T_MAP5 | T_EW0 | T_N2, false },
{ 0x42, "vgetexppbf16", T_MUST_EVEX | T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, false }, { 0x42, "vgetexppbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false },
{ 0x26, "vgetmantpbf16", T_MUST_EVEX | T_F2 | T_0F3A | T_EW0 | T_YMM | T_B16, true }, { 0x26, "vgetmantpbf16", T_MUST_EVEX | T_F2 | T_0F3A | T_EW0 | T_YMM | T_B16, true },
{ 0x4C, "vrcppbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false }, { 0x4C, "vrcppbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false },
{ 0x56, "vreducenepbf16", T_MUST_EVEX | T_F2 | T_0F3A | T_EW0 | T_YMM | T_B16, true }, { 0x56, "vreducenepbf16", T_MUST_EVEX | T_F2 | T_0F3A | T_EW0 | T_YMM | T_B16, true },

View file

@ -5,7 +5,7 @@
project( project(
'xbyak', 'xbyak',
'cpp', 'cpp',
version: '7.20', version: '7.20.1',
license: 'BSD-3-Clause', license: 'BSD-3-Clause',
default_options: 'b_ndebug=if-release' default_options: 'b_ndebug=if-release'
) )

View file

@ -1,5 +1,5 @@
# Xbyak 7.20 [![Badge Build]][Build Status] # Xbyak 7.20.1 [![Badge Build]][Build Status]
*A JIT assembler for x86/x64 architectures supporting advanced instruction sets up to AVX10.2* *A JIT assembler for x86/x64 architectures supporting advanced instruction sets up to AVX10.2*

View file

@ -1,5 +1,5 @@
C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 7.20 C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 7.20.1
----------------------------------------------------------------------------- -----------------------------------------------------------------------------
◎概要 ◎概要
@ -404,6 +404,7 @@ sample/{echo,hello}.bfは http://www.kmonos.net/alang/etc/brainfuck.php から
----------------------------------------------------------------------------- -----------------------------------------------------------------------------
◎履歴 ◎履歴
2024/10/17 ver 7.20.1 AVX10.2 rev 2.0仕様書の変更に追従
2024/10/15 ver 7.20 setDefaultEncoding/setDefaultEncodingAVX10の仕様確定 2024/10/15 ver 7.20 setDefaultEncoding/setDefaultEncodingAVX10の仕様確定
2024/10/15 ver 7.11 AVX10.2完全サポート 2024/10/15 ver 7.11 AVX10.2完全サポート
2024/10/13 ver 7.10 AVX10 integer and fp16 vnni, mediaの新命令対応. setDefaultEncodingの拡張. 2024/10/13 ver 7.10 AVX10 integer and fp16 vnni, mediaの新命令対応. setDefaultEncodingの拡張.

View file

@ -60,7 +60,8 @@ apx: apx.cpp $(XBYAK_INC)
avx10_test: avx10_test.cpp $(XBYAK_INC) avx10_test: avx10_test.cpp $(XBYAK_INC)
$(CXX) $(CFLAGS) avx10_test.cpp -o $@ -DXBYAK64 $(CXX) $(CFLAGS) avx10_test.cpp -o $@ -DXBYAK64
TEST_FILES=old.txt new-ymm.txt bf16.txt comp.txt misc.txt convert.txt minmax.txt saturation.txt #TEST_FILES=old.txt new-ymm.txt bf16.txt comp.txt misc.txt convert.txt minmax.txt saturation.txt
TEST_FILES=old.txt new-ymm.txt bf16.txt misc.txt convert.txt minmax.txt saturation.txt
xed_test: xed_test:
@set -e; \ @set -e; \
for target in $(addprefix avx10/, $(TEST_FILES)); do \ for target in $(addprefix avx10/, $(TEST_FILES)); do \

View file

@ -113,17 +113,17 @@ vfpclasspbf16(k7|k5, zword_b[rax+128], 13);
vcomsbf16(xm2, xm3); vcomsbf16(xm2, xm3);
vcomsbf16(xm2, ptr[rax+128]); vcomsbf16(xm2, ptr[rax+128]);
vgetexppbf16(xm1|k3, xmm2); //vgetexppbf16(xm1|k3, xmm2);
vgetexppbf16(xm1|k3, ptr[rax+128]); //vgetexppbf16(xm1|k3, ptr[rax+128]);
vgetexppbf16(xm1|k3, ptr_b[rax+128]); //vgetexppbf16(xm1|k3, ptr_b[rax+128]);
vgetexppbf16(ym1|k3, ymm2); //vgetexppbf16(ym1|k3, ymm2);
vgetexppbf16(ym1|k3, ptr[rax+128]); //vgetexppbf16(ym1|k3, ptr[rax+128]);
vgetexppbf16(ym1|k3, ptr_b[rax+128]); //vgetexppbf16(ym1|k3, ptr_b[rax+128]);
vgetexppbf16(zm1|k3, zmm2); //vgetexppbf16(zm1|k3, zmm2);
vgetexppbf16(zm1|k3, ptr[rax+128]); //vgetexppbf16(zm1|k3, ptr[rax+128]);
vgetexppbf16(zm1|k3, ptr_b[rax+128]); //vgetexppbf16(zm1|k3, ptr_b[rax+128]);
vgetmantpbf16(xm1|k3, xmm2, 3); vgetmantpbf16(xm1|k3, xmm2, 3);
vgetmantpbf16(xm1|k3, ptr[rax+128], 5); vgetmantpbf16(xm1|k3, ptr[rax+128], 5);

View file

@ -2284,4 +2284,100 @@ CYBOZU_TEST_AUTO(avx_vnni_int)
CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
} }
CYBOZU_TEST_AUTO(vmovd)
{
struct Code : Xbyak::CodeGenerator {
Code()
{
setDefaultEncodingAVX10(PreAVX10v2Encoding);
vmovd(eax, xm1); // always AVX10.1
vmovd(xm1, eax); // always AVX10.1
vmovd(xm3, xm1); // always AVX10.2
// AVX-512 (AVX10.1)
vmovd(ptr[rax+128], xm1);
vmovd(xm1, ptr[rax+128]);
vmovd(ptr[rax+128], xm30);
vmovd(xm30, ptr[rax+128]);
setDefaultEncodingAVX10(AVX10v2Encoding);
vmovd(eax, xm1); // always AVX10.1
vmovd(xm1, eax); // always AVX10.1
vmovd(xm3, xm1); // always AVX10.2
// AVX10.2
vmovd(ptr[rax+128], xm1);
vmovd(xm1, ptr[rax+128]);
vmovd(ptr[rax+128], xm30);
vmovd(xm30, ptr[rax+128]);
}
} c;
const uint8_t tbl[] = {
0xc5, 0xf9, 0x7e, 0xc8, // avx10.1
0xc5, 0xf9, 0x6e, 0xc8, // avx10.1
0x62, 0xf1, 0x7e, 0x08, 0x7e, 0xd9, // avx10.2
0xc5, 0xf9, 0x7e, 0x88, 0x80, 0x00, 0x00, 0x00, // avx
0xc5, 0xf9, 0x6e, 0x88, 0x80, 0x00, 0x00, 0x00, // avx
0x62, 0x61, 0x7d, 0x08, 0x7e, 0x70, 0x20, // avx10.1
0x62, 0x61, 0x7d, 0x08, 0x6e, 0x70, 0x20, // avx10.1
0xc5, 0xf9, 0x7e, 0xc8, // avx10.1
0xc5, 0xf9, 0x6e, 0xc8, // avx10.1
0x62, 0xf1, 0x7e, 0x08, 0x7e, 0xd9, // avx10.2
0x62, 0xf1, 0x7d, 0x08, 0xd6, 0x48, 0x20, // avx10.2
0x62, 0xf1, 0x7e, 0x08, 0x7e, 0x48, 0x20, // avx10.2
0x62, 0x61, 0x7d, 0x08, 0xd6, 0x70, 0x20, // avx10.2
0x62, 0x61, 0x7e, 0x08, 0x7e, 0x70, 0x20, // avx10.2
};
const size_t n = sizeof(tbl) / sizeof(tbl[0]);
CYBOZU_TEST_EQUAL(c.getSize(), n);
CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
}
CYBOZU_TEST_AUTO(vmovw)
{
struct Code : Xbyak::CodeGenerator {
Code()
{
setDefaultEncodingAVX10(PreAVX10v2Encoding);
vmovw(eax, xm1); // always avx10.1
vmovw(xm1, eax); // always avx10.1
vmovw(xm3, xm1); // always avx10.2
// AVX10.1
vmovw(ptr[rax+128], xm1);
vmovw(xm1, ptr[rax+128]);
vmovw(ptr[rax+128], xm30);
vmovw(xm30, ptr[rax+128]);
setDefaultEncodingAVX10(AVX10v2Encoding);
vmovw(eax, xm1); // always avx10.1
vmovw(xm1, eax); // always avx10.1
vmovw(xm3, xm1); // always avx10.2
// AVX10.2
vmovw(ptr[rax+128], xm1);
vmovw(xm1, ptr[rax+128]);
vmovw(ptr[rax+128], xm30);
vmovw(xm30, ptr[rax+128]);
}
} c;
const uint8_t tbl[] = {
0x62, 0xf5, 0x7d, 0x08, 0x7e, 0xc8,
0x62, 0xf5, 0x7d, 0x08, 0x6e, 0xc8,
0x62, 0xf5, 0x7e, 0x08, 0x6e, 0xd9,
0x62, 0xf5, 0x7d, 0x08, 0x7e, 0x48, 0x40,
0x62, 0xf5, 0x7d, 0x08, 0x6e, 0x48, 0x40,
0x62, 0x65, 0x7d, 0x08, 0x7e, 0x70, 0x40,
0x62, 0x65, 0x7d, 0x08, 0x6e, 0x70, 0x40,
0x62, 0xf5, 0x7d, 0x08, 0x7e, 0xc8,
0x62, 0xf5, 0x7d, 0x08, 0x6e, 0xc8,
0x62, 0xf5, 0x7e, 0x08, 0x6e, 0xd9,
0x62, 0xf5, 0x7e, 0x08, 0x7e, 0x48, 0x40,
0x62, 0xf5, 0x7e, 0x08, 0x6e, 0x48, 0x40,
0x62, 0x65, 0x7e, 0x08, 0x7e, 0x70, 0x40,
0x62, 0x65, 0x7e, 0x08, 0x6e, 0x70, 0x40,
};
const size_t n = sizeof(tbl) / sizeof(tbl[0]);
CYBOZU_TEST_EQUAL(c.getSize(), n);
CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
}
#endif #endif

6
test/test_by_xed.bat Normal file
View file

@ -0,0 +1,6 @@
@echo off
set CFLAGS=-I ../ /EHsc /nologo
copy %1% tmp.cpp
cl %CFLAGS% test_by_xed.cpp && test_by_xed.exe
%XED% -64 -ir bin > out.txt
python3 test_by_xed.py %1% out.txt

View file

@ -76,7 +76,7 @@ def newReg(s):
return s return s
class Memory: class Memory:
def __init__(self, size=0, base=None, index=None, scale=0, disp=0, broadcast=False): def __init__(self, size=0, base=None, index=None, scale=0, disp=0, broadcast=0):
self.size = size self.size = size
self.base = newReg(base) self.base = newReg(base)
self.index = newReg(index) self.index = newReg(index)
@ -85,8 +85,12 @@ class Memory:
self.broadcast = broadcast self.broadcast = broadcast
def __str__(self): def __str__(self):
s = 'ptr' if self.size == 0 else g_sizeTbl[int(math.log2(self.size))] if self.size == 0:
if self.broadcast: s = 'ptr'
else:
idx = self.size * max(self.broadcast, 1)
s = g_sizeTbl[int(math.log2(idx))]
if self.broadcast > 0:
s += '_b' s += '_b'
s += ' [' s += ' ['
needPlus = False needPlus = False
@ -107,23 +111,36 @@ class Memory:
s += ']' s += ']'
return s return s
# Xbyak uses 'ptr' when it can be automatically detected, so we should consider this in the comparison.
def __eq__(self, rhs): def __eq__(self, rhs):
# xbyak uses ptr if it is automatically detected, so xword == ptr is true if self.broadcast > rhs.broadcast:
if self.broadcast != rhs.broadcast: return False return rhs == self
# if not self.broadcast and 0 < self.size <= 8 and 0 < rhs.size <= 8 and self.size != rhs.size: return False assert(self.broadcast <= rhs.broadcast)
if not self.broadcast and self.size > 0 and rhs.size > 0 and self.size != rhs.size: return False if self.broadcast == 0:
if rhs.broadcast > 0: return False
# Xbyak uses 'ptr' when it is automatically detected.
# Therefore, the comparison is true if 'ptr' (i.e., size = 0) is used.
if 0 < self.size and 0 < rhs.size and self.size != rhs.size: return False
if self.broadcast == 1: # _b
if rhs.broadcast == 1: # compare ptr_b with ptr_b
if self.size != rhs.size:
return False
if self.size > 0 and (self.size != rhs.size * rhs.broadcast): # compare ptr_b with {1toX}
return False
else:
if self.broadcast != rhs.broadcast: return False
r = self.base == rhs.base and self.index == rhs.index and self.scale == rhs.scale and self.disp == rhs.disp r = self.base == rhs.base and self.index == rhs.index and self.scale == rhs.scale and self.disp == rhs.disp
return r return r
def parseBroadcast(s): def parseBroadcast(s):
if '_b' in s: if '_b' in s:
return (s.replace('_b', ''), True) return (s.replace('_b', ''), 1)
r = re.search(r'({1to\d+})', s) r = re.search(r'({1to(\d+)})', s)
if not r: if not r:
return (s, False) return (s, 0)
return (s.replace(r.group(1), ''), True) return (s.replace(r.group(1), ''), int(r.group(2)))
def parseMemory(s, broadcast=False): def parseMemory(s, broadcast=0):
org_s = s org_s = s
s = s.replace(' ', '').lower() s = s.replace(' ', '').lower()
@ -133,7 +150,7 @@ def parseMemory(s, broadcast=False):
scale = 0 scale = 0
disp = 0 disp = 0
if not broadcast: if broadcast == 0:
(s, broadcast) = parseBroadcast(s) (s, broadcast) = parseBroadcast(s)
# Parse size # Parse size
@ -157,7 +174,7 @@ def parseMemory(s, broadcast=False):
s = s[3:] s = s[3:]
if s.startswith('_b'): if s.startswith('_b'):
broadcast = True broadcast = 1
s = s[2:] s = s[2:]
# Extract the content inside brackets # Extract the content inside brackets
@ -335,7 +352,7 @@ def parseMemoryTest():
('[]', Memory()), ('[]', Memory()),
('[rax]', Memory(0, rax)), ('[rax]', Memory(0, rax)),
('ptr[rax]', Memory(0, rax)), ('ptr[rax]', Memory(0, rax)),
('ptr_b[rax]', Memory(0, rax, broadcast=True)), ('ptr_b[rax]', Memory(0, rax, broadcast=1)),
('dword[rbx]', Memory(4, rbx)), ('dword[rbx]', Memory(4, rbx)),
('xword ptr[rcx]', Memory(16, rcx)), ('xword ptr[rcx]', Memory(16, rcx)),
('xmmword ptr[rcx]', Memory(16, rcx)), ('xmmword ptr[rcx]', Memory(16, rcx)),
@ -344,11 +361,36 @@ def parseMemoryTest():
('[0x12345]', Memory(0, None, None, 0, 0x12345)), ('[0x12345]', Memory(0, None, None, 0, 0x12345)),
('yword [rax+rdx*4]', Memory(32, rax, rdx, 4)), ('yword [rax+rdx*4]', Memory(32, rax, rdx, 4)),
('zword [rax+rdx*4+123]', Memory(64, rax, rdx, 4, 123)), ('zword [rax+rdx*4+123]', Memory(64, rax, rdx, 4, 123)),
('xword_b [rax]', Memory(16, rax, None, 0, 0, 1)),
('dword [rax]{1to4}', Memory(16, rax, None, 0, 0, 1)),
('yword_b [rax]', Memory(32, rax, None, 0, 0, 1)),
('dword [rax]{1to8}', Memory(32, rax, None, 0, 0, 1)),
] ]
for (s, expected) in tbl: for (s, expected) in tbl:
my = parseMemory(s) my = parseMemory(s)
assertEqualStr(my, expected) assertEqualStr(my, expected)
print('compare test')
tbl = [
('ptr[rax]', 'dword[rax]', True),
('byte[rax]', 'dword[rax]', False),
('yword_b[rax]', 'dword [rax]{1to8}', True),
('yword_b[rax]', 'word [rax]{1to16}', True),
('zword_b[rax]', 'word [rax]{1to32}', True),
('zword_b[rax]', 'word [rax]{1to16}', False),
('dword [rax]{1to2}', 'dword [rax] {1to4}', False),
('zword_b[rax]', 'xword_b [rax]', False),
('ptr_b[rax]', 'word [rax]{1to32}', True), # ignore size
]
for (lhs, rhs, eq) in tbl:
a = parseMemory(lhs)
b = parseMemory(rhs)
if eq:
assertEqual(a, b)
assertEqual(b, a)
else:
assert(parseMemory(lhs) != parseMemory(rhs))
def parseNmemonicTest(): def parseNmemonicTest():
print('parseNmemonicTest') print('parseNmemonicTest')
tbl = [ tbl = [
@ -364,8 +406,8 @@ def parseNmemonicTest():
('vpcompressw(zmm30 | k2 |T_z, zmm1);', Nmemonic('vpcompressw', [zmm30, zmm1], [k2, T_z])), ('vpcompressw(zmm30 | k2 |T_z, zmm1);', Nmemonic('vpcompressw', [zmm30, zmm1], [k2, T_z])),
('vpcompressw zmm30{k2}{z}, zmm1', Nmemonic('vpcompressw', [zmm30, zmm1], [k2, T_z])), ('vpcompressw zmm30{k2}{z}, zmm1', Nmemonic('vpcompressw', [zmm30, zmm1], [k2, T_z])),
('vpshldw(xmm9|k3|T_z, xmm2, ptr [rax + 0x40], 5);', Nmemonic('vpshldw', [xmm9, xmm2, Memory(0, rax, None, 0, 0x40), 5], [k3, T_z])), ('vpshldw(xmm9|k3|T_z, xmm2, ptr [rax + 0x40], 5);', Nmemonic('vpshldw', [xmm9, xmm2, Memory(0, rax, None, 0, 0x40), 5], [k3, T_z])),
('vpshrdd(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40], 5);', Nmemonic('vpshrdd', [xmm5, xmm2, Memory(0, rax, None, 0, 0x40, True), 5], [k3, T_z])), ('vpshrdd(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40], 5);', Nmemonic('vpshrdd', [xmm5, xmm2, Memory(0, rax, None, 0, 0x40, 1), 5], [k3, T_z])),
('vpshrdd xmm5{k3}{z}, xmm2, dword ptr [rax+0x40]{1to4}, 0x5', Nmemonic('vpshrdd', [xmm5, xmm2, Memory(0, rax, None, 0, 0x40, True), 5], [k3, T_z])), ('vpshrdd xmm5{k3}{z}, xmm2, dword ptr [rax+0x40]{1to4}, 0x5', Nmemonic('vpshrdd', [xmm5, xmm2, Memory(0, rax, None, 0, 0x40, 4), 5], [k3, T_z])),
('vcmpph(k1, xmm15, ptr[rax+64], 1);', Nmemonic('vcmpph', [k1, xmm15, Memory(0, rax, None, 0, 64), 1])), ('vcmpph(k1, xmm15, ptr[rax+64], 1);', Nmemonic('vcmpph', [k1, xmm15, Memory(0, rax, None, 0, 64), 1])),
] ]
for (s, expected) in tbl: for (s, expected) in tbl:

5
test/test_by_xed_all.bat Normal file
View file

@ -0,0 +1,5 @@
set TARGETS=old.txt new-ymm.txt bf16.txt misc.txt convert.txt minmax.txt saturation.txt
for %%f in (%TARGETS%) do (
echo %%f
call test_by_xed.bat avx10\%%f
)

View file

@ -155,7 +155,7 @@ namespace Xbyak {
enum { enum {
DEFAULT_MAX_CODE_SIZE = 4096, DEFAULT_MAX_CODE_SIZE = 4096,
VERSION = 0x7200 /* 0xABCD = A.BC(.D) */ VERSION = 0x7201 /* 0xABCD = A.BC(.D) */
}; };
#ifndef MIE_INTEGER_TYPE_DEFINED #ifndef MIE_INTEGER_TYPE_DEFINED
@ -2809,11 +2809,12 @@ private:
std::swap(p1, p2); std::swap(p1, p2);
rev = !rev; rev = !rev;
} }
enc = getEncoding(enc, 1);
int sel = -1; int sel = -1;
if (getEncoding(enc, 1) == AVX10v2Encoding) { if (p1->isXMM() || (p1->isMEM() && enc == AVX10v2Encoding)) {
if ((p1->isXMM() || p1->isMEM()) && p2->isXMM()) sel = 2 + int(rev); sel = 2 + int(rev);
} else { } else if (p1->isREG(bit) || p1->isMEM()) {
if ((p1->isREG(bit) || p1->isMEM()) && p2->isXMM()) sel = int(rev); sel = int(rev);
} }
if (sel == -1) XBYAK_THROW(ERR_BAD_COMBINATION) if (sel == -1) XBYAK_THROW(ERR_BAD_COMBINATION)
opAVX_X_X_XM(*static_cast<const Xmm*>(p2), xm0, *p1, typeTbl[sel], codeTbl[sel]); opAVX_X_X_XM(*static_cast<const Xmm*>(p2), xm0, *p1, typeTbl[sel], codeTbl[sel]);

View file

@ -1,4 +1,4 @@
const char *getVersionString() const { return "7.20"; } const char *getVersionString() const { return "7.20.1"; }
void aadd(const Address& addr, const Reg32e &reg) { opMR(addr, reg, T_0F38, 0x0FC, T_APX); } void aadd(const Address& addr, const Reg32e &reg) { opMR(addr, reg, T_0F38, 0x0FC, T_APX); }
void aand(const Address& addr, const Reg32e &reg) { opMR(addr, reg, T_0F38|T_66, 0x0FC, T_APX|T_66); } void aand(const Address& addr, const Reg32e &reg) { opMR(addr, reg, T_0F38|T_66, 0x0FC, T_APX|T_66); }
void adc(const Operand& op, uint32_t imm) { opOI(op, imm, 0x10, 2); } void adc(const Operand& op, uint32_t imm) { opOI(op, imm, 0x10, 2); }
@ -2186,9 +2186,9 @@ void vcomish(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N2|T_MAP
void vcompresspd(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N8|T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX, 0x8A); } void vcompresspd(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N8|T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX, 0x8A); }
void vcompressps(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N4|T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX, 0x8A); } void vcompressps(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N4|T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX, 0x8A); }
void vcomsbf16(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N2|T_66|T_MAP5|T_EW0|T_MUST_EVEX, 0x2F); } void vcomsbf16(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N2|T_66|T_MAP5|T_EW0|T_MUST_EVEX, 0x2F); }
void vcomxsd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N8|T_F3|T_0F|T_EW1|T_SAE_X|T_MUST_EVEX, 0x2F); } void vcomxsd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N8|T_F2|T_0F|T_EW1|T_SAE_X|T_MUST_EVEX, 0x2F); }
void vcomxsh(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N2|T_F2|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX, 0x2F); } void vcomxsh(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N2|T_F3|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX, 0x2F); }
void vcomxss(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N4|T_F2|T_0F|T_EW0|T_SAE_X|T_MUST_EVEX, 0x2F); } void vcomxss(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N4|T_F3|T_0F|T_EW0|T_SAE_X|T_MUST_EVEX, 0x2F); }
void vcvt2ps2phx(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B32, 0x67); } void vcvt2ps2phx(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B32, 0x67); }
void vcvtbiasph2bf8(const Xmm& x1, const Xmm& x2, const Operand& op) { opCvt6(x1, x2, op, T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x74); } void vcvtbiasph2bf8(const Xmm& x1, const Xmm& x2, const Operand& op) { opCvt6(x1, x2, op, T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x74); }
void vcvtbiasph2bf8s(const Xmm& x1, const Xmm& x2, const Operand& op) { opCvt6(x1, x2, op, T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x74); } void vcvtbiasph2bf8s(const Xmm& x1, const Xmm& x2, const Operand& op) { opCvt6(x1, x2, op, T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x74); }
@ -2372,7 +2372,7 @@ void vgatherpf1qpd(const Address& addr) { opGatherFetch(addr, zm2, T_N8|T_66|T_0
void vgatherpf1qps(const Address& addr) { opGatherFetch(addr, zm2, T_N4|T_66|T_0F38|T_EW0|T_MUST_EVEX|T_M_K|T_VSIB, 0xC7, Operand::ZMM); } void vgatherpf1qps(const Address& addr) { opGatherFetch(addr, zm2, T_N4|T_66|T_0F38|T_EW0|T_MUST_EVEX|T_M_K|T_VSIB, 0xC7, Operand::ZMM); }
void vgatherqpd(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N8|T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX|T_VSIB, 0x93, 0); } void vgatherqpd(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N8|T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX|T_VSIB, 0x93, 0); }
void vgatherqps(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N4|T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_VSIB, 0x93, 2); } void vgatherqps(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N4|T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_VSIB, 0x93, 2); }
void vgetexppbf16(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x42); } void vgetexppbf16(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x42); }
void vgetexppd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F38|T_EW1|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B64, 0x42); } void vgetexppd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F38|T_EW1|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B64, 0x42); }
void vgetexpph(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP6|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B16, 0x42); } void vgetexpph(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP6|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B16, 0x42); }
void vgetexpps(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x42); } void vgetexpps(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x42); }
@ -2656,9 +2656,9 @@ void vsubnepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM
void vsubph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_ER_Z | T_B16, 0x5C); } void vsubph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_ER_Z | T_B16, 0x5C); }
void vsubsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_ER_X | T_N2, 0x5C); } void vsubsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_ER_X | T_N2, 0x5C); }
void vucomish(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N2|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX, 0x2E); } void vucomish(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N2|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX, 0x2E); }
void vucomxsd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N8|T_F3|T_0F|T_EW1|T_SAE_X|T_MUST_EVEX, 0x2E); } void vucomxsd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N8|T_F2|T_0F|T_EW1|T_SAE_X|T_MUST_EVEX, 0x2E); }
void vucomxsh(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N2|T_F2|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX, 0x2E); } void vucomxsh(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N2|T_F3|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX, 0x2E); }
void vucomxss(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N4|T_F2|T_0F|T_EW0|T_SAE_X|T_MUST_EVEX, 0x2E); } void vucomxss(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N4|T_F3|T_0F|T_EW0|T_SAE_X|T_MUST_EVEX, 0x2E); }
#ifdef XBYAK64 #ifdef XBYAK64
void kmovq(const Reg64& r, const Opmask& k) { opKmov(k, r, true, 64); } void kmovq(const Reg64& r, const Opmask& k) { opKmov(k, r, true, 64); }
void vpbroadcastq(const Xmm& x, const Reg64& r) { opVex(x, 0, r, T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX, 0x7C); } void vpbroadcastq(const Xmm& x, const Reg64& r) { opVex(x, 0, r, T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX, 0x7C); }