Merge branch 'dev'
Some checks are pending
test / test (push) Waiting to run

This commit is contained in:
MITSUNARI Shigeo 2024-10-15 10:02:23 +09:00
commit 3ee31be62d
18 changed files with 624 additions and 131 deletions

View file

@ -1,6 +1,6 @@
cmake_minimum_required(VERSION 3.5)
project(xbyak LANGUAGES CXX VERSION 7.10)
project(xbyak LANGUAGES CXX VERSION 7.20)
file(GLOB headers xbyak/*.h)

View file

@ -1,5 +1,7 @@
# History
* 2024/Oct/15 ver 7.20 Fixed the specification of setDefaultEncoding, setDefaultEncodingAVX10.
* 2024/Oct/15 ver 7.11 Added full support for AVX10.2
* 2024/Oct/13 ver 7.10 support AVX10 integer and fp16 vnni, media new instructions. setDefaultEncoding is extended.
* 2024/Oct/10 ver 7.09.1 fix the names of vpcompressb and vpcompressw
* 2024/Oct/08 ver 7.09 support YMM embedded rounding of AVX10.2 and fix some mnemonics with {sae}/{er}.

View file

@ -1,7 +1,7 @@
# Usage
Inherit `Xbyak::CodeGenerator` class and make the class method.
```
```cpp
#include <xbyak/xbyak.h>
struct Code : Xbyak::CodeGenerator {
@ -13,7 +13,7 @@ struct Code : Xbyak::CodeGenerator {
};
```
Or you can pass the instance of CodeGenerator without inheriting.
```
```cpp
void genCode(Xbyak::CodeGenerator& code, int x) {
using namespace Xbyak::util;
code.mov(eax, x);
@ -23,7 +23,7 @@ void genCode(Xbyak::CodeGenerator& code, int x) {
Make an instance of the class and get the function
pointer by calling `getCode()` and call it.
```
```cpp
Code c(5);
int (*f)() = c.getCode<int (*)()>();
printf("ret=%d\n", f()); // ret = 5
@ -32,7 +32,7 @@ printf("ret=%d\n", f()); // ret = 5
## Syntax
Similar to MASM/NASM syntax with parentheses.
```
```cpp
NASM Xbyak
mov eax, ebx --> mov(eax, ebx);
inc ecx inc(ecx);
@ -43,7 +43,7 @@ ret --> ret();
Use `qword`, `dword`, `word` and `byte` if it is necessary to specify the size of memory,
otherwise use `ptr`.
```
```cpp
(ptr|qword|dword|word|byte) [base + index * (1|2|4|8) + displacement]
[rip + 32bit disp] ; x64 only
@ -53,19 +53,21 @@ mov al, [ebx+ecx] --> mov(al, ptr [ebx + ecx]);
test byte [esp], 4 --> test(byte [esp], 4);
inc qword [rax] --> inc(qword [rax]);
```
**Note**: `qword`, ... are member variables, so don't use `dword` as an unsigned int type.
### How to use Selector (Segment Register)
```
```cpp
mov eax, [fs:eax] --> putSeg(fs);
mov(eax, ptr [eax]);
mov ax, cs --> mov(ax, cs);
```
**Note**: Segment class is not derived from `Operand`.
## AVX
```
```cpp
vaddps(xmm1, xmm2, xmm3); // xmm1 <- xmm2 + xmm3
vaddps(xmm2, xmm3, ptr [rax]); // use ptr to access memory
vgatherdpd(xmm1, ptr [ebp + 256 + xmm2*4], xmm3);
@ -74,13 +76,13 @@ vgatherdpd(xmm1, ptr [ebp + 256 + xmm2*4], xmm3);
**Note**:
If `XBYAK_ENABLE_OMITTED_OPERAND` is defined, then you can use two operand version for backward compatibility.
But the newer version will not support it.
```
```cpp
vaddps(xmm2, xmm3); // xmm2 <- xmm2 + xmm3
```
## AVX-512
```
```cpp
vaddpd zmm2, zmm5, zmm30 --> vaddpd(zmm2, zmm5, zmm30);
vaddpd xmm30, xmm20, [rax] --> vaddpd(xmm30, xmm20, ptr [rax]);
vaddps xmm30, xmm20, [rax] --> vaddps(xmm30, xmm20, ptr [rax]);
@ -108,35 +110,44 @@ vfpclasspd k5{k3}, [rax+64]{1to2}, 5 --> vfpclasspd(k5|k3, xword_b [rax+64],
vfpclassps k5{k3}, [rax+64]{1to4}, 5 --> vfpclassps(k5|k3, yword_b [rax+64], 5); // broadcast 64-bit to 256-bit
```
## Selecting AVX512-VNNI, AVX-VNNI, AVX-VNNI-INT8 etc.
Some mnemonics have two types of encodings: VEX and EVEX.
## Selecting AVX512-VNNI, AVX-VNNI, AVX-VNNI-INT8, AVX10.2.
Some mnemonics have several types of encodings: VEX, EVEX, AVX10.2.
The functions for these mnemonics include an optional parameter as the last argument to specify the encoding.
The default behavior depends on the order in which the instruction was introduced (whether VEX or EVEX came first),
The default behavior depends on the order in which the instruction was introduced (whether VEX, EVEX or AVX10.2 came first),
and can be specified using setDefaultEncoding.
```
```cpp
vpdpbusd(xm0, xm1, xm2); // default encoding: EVEX (AVX512-VNNI)
vpdpbusd(xm0, xm1, xm2, EvexEncoding); // same as the above
vpdpbusd(xm0, xm1, xm2, VexEncoding); // VEX (AVX-VNNI)
setDefaultEncoding(VexEncoding); // default encoding is VEX
setDefaultEncoding(VexEncoding); // change default encoding
vpdpbusd(xm0, xm1, xm2); // VEX
vmpsadbw(xm1, xm3, xm15, 3); // default encoding: VEX (AVX-VNNI)
vmpsadbw(xm1, xm3, xm15, 3, VexEncoding); // same as the above
vmpsadbw(xm1, xm3, xm15, 3, EvexEncoding); // EVEX (AVX10.2)
setDefaultEncoding(VexEncoding, EvexEncoding); // use 2nd argument.
vmpsadbw(xm1, xm3, xm15, 3); // EVEX
vmpsadbw(xm1, xm3, xm15, 3); // default encoding: AVX
vmpsadbw(xm1, xm3, xm15, 3, PreAVX10v2Encoding); // same as the above
vmpsadbw(xm1, xm3, xm15, 3, AVX10v2Encoding); // AVX10.2
setDefaultEncodingAVX10(AVX10v2Encoding); // change default encoding
vmpsadbw(xm1, xm3, xm15, 3); // AVX10.2
```
- `setDefaultEncoding(PreferredEncoding vnniEnc = EvexEncoding, PreferredEncoding avx10Enc = VexEncoding)`
Control the default encoding of mnemonics with `Xbyak::PreferredEncoding` param.
- `setDefaultEncoding(PreferredEncoding enc = EvexEncoding)`
- Configure encoding for AVX512-VNNI or AVX-VNNI instructions.
- `setDefaultEncodingAVX10(PreferredEncoding enc = PreAVX10v2Encoding)`
- Configure encoding for pre-AVX10.2 and AVX10.2 instructions.
param|vnniEnc|avx10Enc
`setDefaultEncoding`|EvexEncoding (default)|VexEncoding
-|-|-
EvexEncoding|AVX512-VNNI|AVX10.2
VexEncoding|AVX-VNNI|AVX-VNNI-INT8
default|EvexEncoding|VexEncoding
mnemonic|vpdpbusd, vpdpbusds, vpdpwssd, vpdpwssds|vmpsadbw, vpdpbssd, vpdpbssds, vpdpbsud, vpdpbsuds, vpdpbuud, vpdpbuuds, vpdpwsud vpdpwsuds vpdpwusd vpdpwusds vpdpwuud, vpdpwuuds
feature|AVX512-VNNI|AVX-VNNI
- Target functions: vpdpbusd, vpdpbusds, vpdpwssd, vpdpwssds
`setDefaultEncodingAVX10`|PreAVX10v2Encoding (default)|AVX10v2Encoding
-|-|-
feature|AVX-VNNI-INT8, AVX512-FP16|AVX10.2
- Target functions: vmpsadbw, vpdpbssd, vpdpbssds, vpdpbsud, vpdpbsuds, vpdpbuud, vpdpbuuds, vpdpwsud, vpdpwsuds, vpdpwusd, vpdpwusds, vpdpwuud, vpdpwuuds, vmovd, vmovw
- Remark: vmovd and vmovw have several kinds of encoding such as AVX/AVX512F/AVX512-FP16/AVX10.2.
### Remark
* `k1`, ..., `k7` are opmask registers.
@ -179,7 +190,7 @@ mnemonic|vpdpbusd, vpdpbusds, vpdpwssd, vpdpwssds|vmpsadbw, vpdpbssd, vpdpbssds,
Two kinds of Label are supported. (String literal and Label class).
### String literal
```
```cpp
L("L1");
jmp("L1");
@ -201,7 +212,7 @@ L("L3");
### Support `@@`, `@f`, `@b` like MASM
```
```cpp
L("@@"); // <A>
jmp("@b"); // jmp to <A>
jmp("@f"); // jmp to <B>
@ -217,7 +228,7 @@ Label symbols beginning with a period between `inLocalLabel()` and `outLocalLabe
are treated as a local label.
`inLocalLabel()` and `outLocalLabel()` can be nested.
```
```cpp
void func1()
{
inLocalLabel();
@ -240,7 +251,7 @@ void func1()
Xbyak treats a jump to an undefined label as a short jump if no type is specified.
So if the distance between jmp and label is larger than 127 bytes, then xbyak will cause an error.
```
```cpp
jmp("short-jmp"); // short jmp
// small code
L("short-jmp");
@ -249,14 +260,16 @@ jmp("long-jmp");
// long code
L("long-jmp"); // throw exception
```
Then specify T_NEAR for jmp.
```
```cpp
jmp("long-jmp", T_NEAR); // long jmp
// long code
L("long-jmp");
```
Or call `setDefaultJmpNEAR(true);` once, then the default type is set to T_NEAR.
```
```cpp
jmp("long-jmp"); // long jmp
// long code
L("long-jmp");
@ -266,7 +279,7 @@ L("long-jmp");
`L()` and `jxx()` support Label class.
```
```cpp
Xbyak::Label label1, label2;
L(label1);
...
@ -278,7 +291,7 @@ L(label2);
```
Use `putL` for jmp table
```
```cpp
Label labelTbl, L0, L1, L2;
mov(rax, labelTbl);
// rdx is an index of jump table
@ -295,7 +308,7 @@ L(L1);
`assignL(dstLabel, srcLabel)` binds dstLabel with srcLabel.
```
```cpp
Label label2;
Label label1 = L(); // make label1 ; same to Label label1; L(label1);
...
@ -310,7 +323,7 @@ The `jmp` in the above code jumps to label1 assigned by `assignL`.
* dstLabel must not be used in `L()`.
`Label::getAddress()` returns the address specified by the label instance and 0 if not specified.
```
```cpp
// not AutoGrow mode
Label label;
assert(label.getAddress() == 0);
@ -319,7 +332,7 @@ assert(label.getAddress() == getCurr());
```
### Rip ; relative addressing
```
```cpp
Label label;
mov(eax, ptr [rip + label]); // eax = 4
...
@ -327,7 +340,7 @@ mov(eax, ptr [rip + label]); // eax = 4
L(label);
dd(4);
```
```
```cpp
int x;
...
mov(eax, ptr[rip + &x]); // throw exception if the difference between &x and current position is larger than 2GiB
@ -338,13 +351,13 @@ int x;
Use `word|dword|qword` instead of `ptr` to specify the address size.
### 32 bit mode
```
```cpp
jmp(word[eax], T_FAR); // jmp m16:16(FF /5)
jmp(dword[eax], T_FAR); // jmp m16:32(FF /5)
```
### 64 bit mode
```
```cpp
jmp(word[rax], T_FAR); // jmp m16:16(FF /5)
jmp(dword[rax], T_FAR); // jmp m16:32(FF /5)
jmp(qword[rax], T_FAR); // jmp m16:64(REX.W FF /5)
@ -355,7 +368,7 @@ The same applies to `call`.
The default max code size is 4096 bytes.
Specify the size in constructor of `CodeGenerator()` if necessary.
```
```cpp
class Quantize : public Xbyak::CodeGenerator {
public:
Quantize()
@ -372,7 +385,7 @@ You can make jit code on prepared memory.
Call `setProtectModeRE` yourself to change memory mode if using the prepared memory.
```
```cpp
uint8_t alignas(4096) buf[8192]; // C++11 or later
struct Code : Xbyak::CodeGenerator {
@ -398,7 +411,7 @@ int main()
The memory region for jit is automatically extended if necessary when `AutoGrow` is specified in a constructor of `CodeGenerator`.
Call `ready()` or `readyRE()` before calling `getCode()` to fix jump address.
```
```cpp
struct Code : Xbyak::CodeGenerator {
Code()
: Xbyak::CodeGenerator(<default memory size>, Xbyak::AutoGrow)
@ -419,7 +432,7 @@ Xbyak set Read/Write/Exec mode to memory to run jit code.
If you want to use Read/Exec mode for security, then specify `DontSetProtectRWE` for `CodeGenerator` and
call `setProtectModeRE()` after generating jit code.
```
```cpp
struct Code : Xbyak::CodeGenerator {
Code()
: Xbyak::CodeGenerator(4096, Xbyak::DontSetProtectRWE)

View file

@ -209,6 +209,30 @@ void putX_XM()
{ 0x2E, "vucomxsd", T_MUST_EVEX | T_F3 | T_0F | T_EW1 | T_SAE_X | T_N8 },
{ 0x2E, "vucomxsh", T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_SAE_X | T_N2 },
{ 0x2E, "vucomxss", T_MUST_EVEX | T_F2 | T_0F | T_EW0 | T_SAE_X | T_N4 },
// 13.1
{ 0x69, "vcvtnebf162ibs", T_MUST_EVEX | T_YMM | T_F2 | T_MAP5 | T_EW0 | T_B16 },
{ 0x6B, "vcvtnebf162iubs", T_MUST_EVEX | T_YMM | T_F2 | T_MAP5 | T_EW0 | T_B16 },
{ 0x68, "vcvttnebf162ibs", T_MUST_EVEX | T_YMM | T_F2 | T_MAP5 | T_EW0 | T_B16 },
{ 0x6A, "vcvttnebf162iubs", T_MUST_EVEX | T_YMM | T_F2 | T_MAP5 | T_EW0 | T_B16 },
// 13.3
{ 0x6D, "vcvttpd2qqs", T_MUST_EVEX | T_YMM | T_66 | T_MAP5 | T_EW1 | T_B64 | T_SAE_Y | T_SAE_Z },
// 13.5
{ 0x6C, "vcvttpd2uqqs", T_MUST_EVEX | T_YMM | T_66 | T_MAP5 | T_EW1 | T_B64 | T_SAE_Y | T_SAE_Z },
// 13.6
{ 0x69, "vcvtph2ibs", T_MUST_EVEX | T_YMM | T_MAP5 | T_EW0 | T_B16 | T_ER_Y | T_ER_Z },
{ 0x6B, "vcvtph2iubs", T_MUST_EVEX | T_YMM | T_MAP5 | T_EW0 | T_B16 | T_ER_Y | T_ER_Z },
{ 0x68, "vcvttph2ibs", T_MUST_EVEX | T_YMM | T_MAP5 | T_EW0 | T_B16 | T_ER_Y | T_ER_Z },
{ 0x6A, "vcvttph2iubs", T_MUST_EVEX | T_YMM | T_MAP5 | T_EW0 | T_B16 | T_ER_Y | T_ER_Z },
// 13.7
{ 0x6D, "vcvttps2dqs", T_MUST_EVEX | T_YMM | T_MAP5 | T_EW0 | T_B32 | T_SAE_Y | T_SAE_Z },
// 13.8
{ 0x69, "vcvtps2ibs", T_MUST_EVEX | T_YMM | T_66 | T_MAP5 | T_EW0 | T_B32 | T_ER_Y | T_ER_Z },
{ 0x6B, "vcvtps2iubs", T_MUST_EVEX | T_YMM | T_66 | T_MAP5 | T_EW0 | T_B32 | T_ER_Y | T_ER_Z },
{ 0x68, "vcvttps2ibs", T_MUST_EVEX | T_YMM | T_66 | T_MAP5 | T_EW0 | T_B32 | T_ER_Y | T_ER_Z },
{ 0x6A, "vcvttps2iubs", T_MUST_EVEX | T_YMM | T_66 | T_MAP5 | T_EW0 | T_B32 | T_ER_Y | T_ER_Z },
// 13.10
{ 0x6C, "vcvttps2udqs", T_MUST_EVEX | T_YMM | T_MAP5 | T_EW0 | T_B32 | T_SAE_Y | T_SAE_Z },
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl *p = &tbl[i];
@ -240,7 +264,6 @@ void putM_X()
{ 0x7F, "vmovdqu32", T_F3 | T_0F | T_MUST_EVEX | T_YMM | T_EW0 | T_ER_X | T_ER_Y | T_ER_Z | T_M_K },
{ 0x7F, "vmovdqu64", T_F3 | T_0F | T_MUST_EVEX | T_YMM | T_EW1 | T_ER_X | T_ER_Y | T_ER_Z | T_M_K },
{ 0x11, "vmovsh", T_F3 | T_MAP5 | T_MUST_EVEX | T_EW0 | T_N2 | T_M_K },
{ 0x7E, "vmovw", T_66 | T_MAP5 | T_MUST_EVEX | T_N2 },
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl *p = &tbl[i];
@ -447,6 +470,13 @@ void putX_X_XM_IMM()
{ 0x1B, "vcvtne2ph2hf8s", T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_YMM | T_B16 | T_N1, false },
{ 0x52, "vdpphps", T_MUST_EVEX | T_0F38 | T_EW0 | T_YMM | T_B32, false },
{ 0x52, "vminmaxnepbf16", T_MUST_EVEX | T_F2 | T_0F3A | T_EW0 | T_YMM | T_B16, true },
{ 0x52, "vminmaxpd", T_MUST_EVEX | T_66 | T_0F3A | T_EW1 | T_YMM | T_B64 | T_SAE_Y | T_SAE_Z, true },
{ 0x52, "vminmaxph", T_MUST_EVEX | T_0F3A | T_EW0 | T_YMM | T_B16 | T_SAE_Y | T_SAE_Z, true },
{ 0x52, "vminmaxps", T_MUST_EVEX | T_66 | T_0F3A | T_EW0 | T_YMM | T_B32 | T_SAE_Y | T_SAE_Z, true },
{ 0x53, "vminmaxsd", T_MUST_EVEX | T_66 | T_0F3A | T_EW1 | T_SAE_X | T_N8, true },
{ 0x53, "vminmaxsh", T_MUST_EVEX | T_0F3A | T_EW0 | T_SAE_X | T_N2, true },
{ 0x53, "vminmaxss", T_MUST_EVEX | T_66 | T_0F3A | T_EW0 | T_SAE_X | T_N4, true },
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl *p = &tbl[i];
@ -658,6 +688,22 @@ void putCvt()
{ 0x7B, "vcvtusi2sh", T_F3 | T_MAP5 | T_MUST_EVEX | T_ER_R | T_M_K, 6 },
{ 0x72, "vcvtneps2bf16", T_MUST_EVEX | T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 2 },
// 13.2
{ 0x6D, "vcvttpd2dqs", T_MUST_EVEX | T_YMM | T_MAP5 | T_EW1 | T_B64 | T_SAE_Y | T_SAE_Z, 2 },
// 13.4
{ 0x6C, "vcvttpd2udqs", T_MUST_EVEX | T_YMM | T_MAP5 | T_EW1 | T_B64 | T_SAE_Y | T_SAE_Z, 2 },
// 13.9
{ 0x6D, "vcvttps2qqs", T_MUST_EVEX | T_YMM | T_66 | T_MAP5 | T_EW0 | T_B32 | T_SAE_X | T_SAE_Y | T_N8 | T_N_VL, 1 },
// 13.11
{ 0x6C, "vcvttps2uqqs", T_MUST_EVEX | T_YMM | T_66 | T_MAP5 | T_EW0 | T_B32 | T_SAE_X | T_SAE_Y | T_N8 | T_N_VL, 1 },
// 13.12
{ 0x6D, "vcvttsd2sis", T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_SAE_X | T_N8, 0 },
// 13.13
{ 0x6C, "vcvttsd2usis", T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_SAE_X | T_N8, 0 },
// 13.14
{ 0x6D, "vcvttss2sis", T_MUST_EVEX | T_F3 | T_MAP5 | T_EW0 | T_SAE_X | T_N4, 0 },
// 13.15
{ 0x6C, "vcvttss2usis", T_MUST_EVEX | T_F3 | T_MAP5 | T_EW0 | T_SAE_X | T_N4, 0 },
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl& p = tbl[i];
@ -666,10 +712,10 @@ void putCvt()
case 0:
printf("void %s(const Reg32e& r, const Operand& op) { uint64_t type = (%s) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x%02X); }\n", p.name, s.c_str(), p.code);
break;
case 1:
case 1: // (x, x/m), (y, x/m256), (z, y/m)
printf("void %s(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, %s, 0x%02X); }\n", p.name, s.c_str(), p.code);
break;
case 2:
case 2: // (x, x/m), (x, y/m256), (y, z/m)
printf("void %s(const Xmm& x, const Operand& op) { opCvt2(x, op, %s, 0x%02X); }\n", p.name, s.c_str(), p.code);
break;
case 3:
@ -1032,12 +1078,6 @@ void putFP16_2()
printf("void vmovsh(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, xm0, addr, %s, 0x10); }\n", s.c_str());
printf("void vmovsh(const Xmm& x1, const Xmm& x2, const Xmm& x3) { opAVX_X_X_XM(x1, x2, x3, %s, 0x10); }\n", s.c_str());
}
{
uint64_t type = T_66 | T_MAP5 | T_MUST_EVEX | T_N2;
std::string s = type2String(type);
printf("void vmovw(const Xmm& x, const Operand& op) { if (!op.isREG(32|64) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_X_XM(x, xm0, op, %s, 0x6E); }\n", s.c_str());
printf("void vmovw(const Reg32e& r, const Xmm& x) { opAVX_X_X_XM(x, xm0, r, %s, 0x7E); }\n", s.c_str());
}
}
void putFP16()

View file

@ -1443,6 +1443,7 @@ void put()
printf("void %s(const Xmm& x, const Operand& op) { opSSE_APX(x, op, T_0F38, 0x%02X, T_MUST_EVEX, 0x%02X); }\n", p->name, p->code, p->code2);
}
puts("void sha1rnds4(const Xmm& x, const Operand& op, uint8_t imm) { opSSE_APX(x, op, T_0F3A, 0xCC, T_MUST_EVEX, 0xD4, imm); }");
puts("void sha1msg12(const Xmm& x, const Operand& op) { opROO(Reg(), op, x, T_MUST_EVEX, 0xD9); }");
}
// (m, x), (m, y)
{
@ -1733,9 +1734,6 @@ void put()
}
// mov
{
printf("void vmovd(const Xmm& x, const Operand& op) { if (!op.isREG(32) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_X_XM(x, xm0, op, T_0F | T_66 | T_W0 | T_EVEX | T_N4, 0x6E); }\n");
printf("void vmovd(const Operand& op, const Xmm& x) { if (!op.isREG(32) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_X_XM(x, xm0, op, T_0F | T_66 | T_W0 | T_EVEX | T_N4, 0x7E); }\n");
printf("void vmovq(const Xmm& x, const Address& addr) { uint64_t type; uint8_t code; if (x.getIdx() < 16) { type = T_0F | T_F3; code = 0x7E; } else { type = T_0F | T_66 | T_EVEX | T_EW1 | T_N8; code = 0x6E; } opAVX_X_X_XM(x, xm0, addr, type, code); }\n");
printf("void vmovq(const Address& addr, const Xmm& x) { opAVX_X_X_XM(x, xm0, addr, T_0F | T_66 | T_EVEX | T_EW1 | T_N8, x.getIdx() < 16 ? 0xD6 : 0x7E); }\n");
printf("void vmovq(const Xmm& x1, const Xmm& x2) { opAVX_X_X_XM(x1, xm0, x2, T_0F | T_F3 | T_EVEX | T_EW1 | T_N8, 0x7E); }\n");
@ -1899,36 +1897,6 @@ void put()
printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, %s, 0x%02X, encoding); }\n", p->name, s.c_str(), p->code);
}
}
// avx-vnni-int8
// avx-vnni-int16
#if 0
{
const struct Tbl {
uint8_t code;
const char *name;
uint64_t type;
} tbl[] = {
// { 0x50, "vpdpbssd", T_F2 | T_0F38 | T_W0 | T_YMM },
// { 0x51, "vpdpbssds", T_F2 | T_0F38 | T_W0 | T_YMM },
// { 0x50, "vpdpbsud", T_F3 | T_0F38 | T_W0 | T_YMM },
// { 0x51, "vpdpbsuds", T_F3 | T_0F38 | T_W0 | T_YMM },
// { 0x50, "vpdpbuud", T_0F38 | T_W0 | T_YMM },
// { 0x51, "vpdpbuuds", T_0F38 | T_W0 | T_YMM },
// { 0xD2, "vpdpwsud", T_F3 | T_0F38 | T_W0 | T_YMM },
// { 0xD3, "vpdpwsuds", T_F3 | T_0F38 | T_W0 | T_YMM },
// { 0xD2, "vpdpwusd", T_66 | T_0F38 | T_W0 | T_YMM },
// { 0xD3, "vpdpwusds", T_66 | T_0F38 | T_W0 | T_YMM },
// { 0xD2, "vpdpwuud", T_0F38 | T_W0 | T_YMM },
// { 0xD3, "vpdpwuuds", T_0F38 | T_W0 | T_YMM },
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl *p = &tbl[i];
std::string s = type2String(p->type);
printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, %s, 0x%02X); }\n", p->name, s.c_str(), p->code);
}
}
#endif
}
void put32()

View file

@ -5,7 +5,7 @@
project(
'xbyak',
'cpp',
version: '7.10',
version: '7.20',
license: 'BSD-3-Clause',
default_options: 'b_ndebug=if-release'
)

View file

@ -1,5 +1,5 @@
# Xbyak 7.10 [![Badge Build]][Build Status]
# Xbyak 7.20 [![Badge Build]][Build Status]
*A C++ JIT assembler for x86 (IA32), x64 (AMD64, x86-64)*
@ -20,8 +20,7 @@ It is named from a Japanese word [開闢](https://translate.google.com/?hl=ja&sl
- header file only
- Intel/MASM like syntax
- fully support AVX-512
- support APX/AVX10
- Full support for AVX-512, APX, and AVX10.2
**Note**:
Use `and_()`, `or_()`, ... instead of `and()`, `or()`.
@ -33,6 +32,7 @@ If you want to use them, then specify `-fno-operator-names` option to gcc/clang.
### News
- support AVX10.2
- support xresldtrk/xsusldtrk
- support RAO-INT for APX
- support AVX10 detection, AESKLE, WIDE_KL, KEYLOCKER, KEYLOCKER_WIDE

View file

@ -1,5 +1,5 @@
C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 7.10
C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 7.20
-----------------------------------------------------------------------------
◎概要
@ -14,7 +14,7 @@
xbyak.hをインクルードするだけですぐ利用することができます。
C++の枠組み内で閉じているため、外部アセンブラは不要です。
32bit/64bit両対応です。
対応ニーモニック:特権命令除くx86, MMX/MMX2/SSE/SSE2/SSE3/SSSE3/SSE4/FPU(一部)/AVX/AVX2/FMA/VEX-encoded GPR
対応ニーモニック:特権命令除くx86, MMX/MMX2/SSE/SSE2/SSE3/SSSE3/SSE4/FPU(一部)/AVX/AVX2/FMA/AVX-512/APX/AVX10.2
・Windows Xp(32bit, 64bit), Windows 7/Linux(32bit, 64bit)/Intel Mac対応
Windows Xp, Windows 7上ではVC2008, VC2010, VC2012
@ -46,7 +46,7 @@ Linuxではmake installで/usr/local/include/xbyakにコピーされます。
-----------------------------------------------------------------------------
◎新機能
APX/AVX10対応
APX/AVX10.2対応
例外なしモード追加
XBYAK_NO_EXCEPTIONを定義してコンパイルするとgcc/clangで-fno-exceptionsオプションでコンパイルできます。
@ -404,6 +404,9 @@ sample/{echo,hello}.bfは http://www.kmonos.net/alang/etc/brainfuck.php から
-----------------------------------------------------------------------------
◎履歴
2024/10/15 ver 7.20 setDefaultEncoding/setDefaultEncodingAVX10の仕様確定
2024/10/15 ver 7.11 AVX10.2完全サポート
2024/10/13 ver 7.10 AVX10 integer and fp16 vnni, mediaの新命令対応. setDefaultEncodingの拡張.
2024/10/10 ver 7.09.1 vpcompressbとvpcompresswの名前修正
2024/10/08 ver 7.09 AVX10.2のYMMレジスタの埋め込み丸め対応
2024/10/07 ver 7.08 rdfabaseなどサポート

View file

@ -60,9 +60,12 @@ apx: apx.cpp $(XBYAK_INC)
avx10_test: avx10_test.cpp $(XBYAK_INC)
$(CXX) $(CFLAGS) avx10_test.cpp -o $@ -DXBYAK64
TEST_FILES=old.txt new-ymm.txt bf16.txt comp.txt convert.txt
TEST_FILES=old.txt new-ymm.txt bf16.txt comp.txt misc.txt convert.txt minmax.txt saturation.txt
xed_test:
@for target in $(addprefix avx10/, $(TEST_FILES)); do ./test_by_xed.sh $$target; done
@set -e; \
for target in $(addprefix avx10/, $(TEST_FILES)); do \
./test_by_xed.sh $$target || exit 1; \
done
test_nm: normalize_prefix $(TARGET)
$(MAKE) -C ../gen

66
test/avx10/minmax.txt Normal file
View file

@ -0,0 +1,66 @@
vminmaxnepbf16(xm1|k3|T_z, xm2, xm3, 5);
vminmaxnepbf16(xm1|k3|T_z, xm2, ptr[rax+128], 5);
vminmaxnepbf16(xm1|k3|T_z, xm2, ptr_b[rax+128], 5);
vminmaxnepbf16(ym1|k3|T_z, ym2, ym3, 5);
vminmaxnepbf16(ym1|k3|T_z, ym2, ptr[rax+128], 5);
vminmaxnepbf16(ym1|k3|T_z, ym2, ptr_b[rax+128], 5);
vminmaxnepbf16(zm1|k3|T_z, zm2, zm3, 5);
vminmaxnepbf16(zm1|k3|T_z, zm2, ptr[rax+128], 5);
vminmaxnepbf16(zm1|k3|T_z, zm2, ptr_b[rax+128], 5);
//
vminmaxpd(xm1|k3|T_z, xm2, xm3, 5);
vminmaxpd(xm1|k3|T_z, xm2, ptr[rax+128], 5);
vminmaxpd(xm1|k3|T_z, xm2, ptr_b[rax+128], 5);
vminmaxpd(ym1|k3|T_z, ym2, ym3, 5);
vminmaxpd(ym1|k3|T_z, ym2, ym3|T_sae, 5);
vminmaxpd(ym1|k3|T_z, ym2, ptr[rax+128], 5);
vminmaxpd(ym1|k3|T_z, ym2, ptr_b[rax+128], 5);
vminmaxpd(zm1|k3|T_z, zm2, zm3, 5);
vminmaxpd(zm1|k3|T_z, zm2, zm3|T_sae, 5);
vminmaxpd(zm1|k3|T_z, zm2, ptr[rax+128], 5);
vminmaxpd(zm1|k3|T_z, zm2, ptr_b[rax+128], 5);
//
vminmaxph(xm1|k3|T_z, xm2, xm3, 5);
vminmaxph(xm1|k3|T_z, xm2, ptr[rax+128], 5);
vminmaxph(xm1|k3|T_z, xm2, ptr[rax+128], 5);
vminmaxph(xm1|k3|T_z, xm2, ptr_b[rax+128], 5);
vminmaxph(ym1|k3|T_z, ym2, ym3, 5);
vminmaxph(ym1|k3|T_z, ym2, ym3|T_sae, 5);
vminmaxph(ym1|k3|T_z, ym2, ptr[rax+128], 5);
vminmaxph(ym1|k3|T_z, ym2, ptr_b[rax+128], 5);
vminmaxph(zm1|k3|T_z, zm2, zm3, 5);
vminmaxph(zm1|k3|T_z, zm2, zm3|T_sae, 5);
vminmaxph(zm1|k3|T_z, zm2, ptr[rax+128], 5);
vminmaxph(zm1|k3|T_z, zm2, ptr_b[rax+128], 5);
//
vminmaxps(xm1|k3|T_z, xm2, xm3, 5);
vminmaxps(xm1|k3|T_z, xm2, ptr[rax+128], 5);
vminmaxps(xm1|k3|T_z, xm2, ptr_b[rax+128], 5);
vminmaxps(ym1|k3|T_z, ym2, ym3, 5);
vminmaxps(ym1|k3|T_z, ym2, ym3|T_sae, 5);
vminmaxps(ym1|k3|T_z, ym2, ptr[rax+128], 5);
vminmaxps(ym1|k3|T_z, ym2, ptr_b[rax+128], 5);
vminmaxps(zm1|k3|T_z, zm2, zm3, 5);
vminmaxps(zm1|k3|T_z, zm2, zm3|T_sae, 5);
vminmaxps(zm1|k3|T_z, zm2, ptr[rax+128], 5);
vminmaxps(zm1|k3|T_z, zm2, ptr_b[rax+128], 5);
//
vminmaxsd(xm1|k3|T_z, xm2, xm3, 5);
vminmaxsd(xm1|k3|T_z, xm2, xm3|T_sae, 5);
vminmaxsd(xm1|k3|T_z, xm2, ptr[rax+128], 5);
//
vminmaxsh(xm1|k3|T_z, xm2, xm3, 5);
vminmaxsh(xm1|k3|T_z, xm2, xm3|T_sae, 5);
vminmaxsh(xm1|k3|T_z, xm2, ptr[rax+128], 5);
//
vminmaxss(xm1|k3|T_z, xm2, xm3, 5);
vminmaxss(xm1|k3|T_z, xm2, xm3|T_sae, 5);
vminmaxss(xm1|k3|T_z, xm2, ptr[rax+128], 5);

View file

@ -1,3 +1,4 @@
// AVX10 integer and FP16 VNNI, media and zero-extending
vdpphps(xm1, xm2, xm3);
vdpphps(xm1, xm2, ptr[rax+128]);
vdpphps(xm1, xm2, ptr_b[rax+128]);
@ -165,3 +166,14 @@ vpdpwuuds(ym1, ym2, ptr_b[rax+128]);
vpdpwuuds(zm1, zm2, zm3);
vpdpwuuds(zm1, zm2, ptr[rax+128]);
vpdpwuuds(zm1, zm2, ptr_b[rax+128]);
//
vmovd(xm10, xm20);
vmovd(xm1, xm2);
vmovd(xm10, ptr[rax+128]);
vmovd(ptr[rax+128], xm30);
//
vmovw(xm1, xm20);
vmovw(xm1, xm2);
vmovw(xm3, ptr [rax+0x40]);
vmovw(ptr [rax+0x40], xm7);

View file

@ -355,10 +355,6 @@ vgetmantsh(xmm1|k1|T_z|T_sae, xmm3, xmm5, 0x6);
vmovsh(xmm1|k1|T_z, ptr [rax+0x40]);
vmovsh(ptr [rax+0x40]|k1, xmm1);
vmovsh(xmm1|k2|T_z, xmm3, xmm5);
vmovw(xmm1, r13d);
vmovw(xmm3, ptr [rax+0x40]);
vmovw(r9d, xmm1);
vmovw(ptr [rax+0x40], xmm7);
vcvtsd2sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3);
vcvtsd2sh(xmm1, xmm2, ptr [rax+0x40]);
vcvtsh2sd(xmm1|k1|T_z|T_sae, xmm2, xmm3);

310
test/avx10/saturation.txt Normal file
View file

@ -0,0 +1,310 @@
//
vcvtnebf162ibs(xm1, xm2);
vcvtnebf162ibs(xm1, ptr[rax+128]);
vcvtnebf162ibs(xm1, ptr_b[rax+128]);
vcvtnebf162ibs(ym1, ym2);
vcvtnebf162ibs(ym1, ptr[rax+128]);
vcvtnebf162ibs(ym1, ptr_b[rax+128]);
vcvtnebf162ibs(zm1, zm2);
vcvtnebf162ibs(zm1, ptr[rax+128]);
vcvtnebf162ibs(zm1, ptr_b[rax+128]);
//
vcvtnebf162iubs(xm1, xm2);
vcvtnebf162iubs(xm1, ptr[rax+128]);
vcvtnebf162iubs(xm1, ptr_b[rax+128]);
vcvtnebf162iubs(ym1, ym2);
vcvtnebf162iubs(ym1, ptr[rax+128]);
vcvtnebf162iubs(ym1, ptr_b[rax+128]);
vcvtnebf162iubs(zm1, zm2);
vcvtnebf162iubs(zm1, ptr[rax+128]);
vcvtnebf162iubs(zm1, ptr_b[rax+128]);
//
vcvttnebf162ibs(xm1, xm2);
vcvttnebf162ibs(xm1, ptr[rax+128]);
vcvttnebf162ibs(xm1, ptr_b[rax+128]);
vcvttnebf162ibs(ym1, ym2);
vcvttnebf162ibs(ym1, ptr[rax+128]);
vcvttnebf162ibs(ym1, ptr_b[rax+128]);
vcvttnebf162ibs(zm1, zm2);
vcvttnebf162ibs(zm1, ptr[rax+128]);
vcvttnebf162ibs(zm1, ptr_b[rax+128]);
//
vcvttnebf162iubs(xm1, xm2);
vcvttnebf162iubs(xm1, ptr[rax+128]);
vcvttnebf162iubs(xm1, ptr_b[rax+128]);
vcvttnebf162iubs(ym1, ym2);
vcvttnebf162iubs(ym1, ptr[rax+128]);
vcvttnebf162iubs(ym1, ptr_b[rax+128]);
vcvttnebf162iubs(zm1, zm2);
vcvttnebf162iubs(zm1, ptr[rax+128]);
vcvttnebf162iubs(zm1, ptr_b[rax+128]);
//
vcvttpd2qqs(xm1, xm2);
vcvttpd2qqs(xm1, ptr[rax+128]);
vcvttpd2qqs(xm1, ptr_b[rax+128]);
vcvttpd2qqs(ym1, ym2);
vcvttpd2qqs(ym1, ym2|T_sae);
vcvttpd2qqs(ym1, ptr[rax+128]);
vcvttpd2qqs(ym1, ptr_b[rax+128]);
vcvttpd2qqs(zm1, zm2);
vcvttpd2qqs(zm1, zm2|T_sae);
vcvttpd2qqs(zm1, ptr[rax+128]);
vcvttpd2qqs(zm1, ptr_b[rax+128]);
//
vcvttpd2uqqs(xm1, xm2);
vcvttpd2uqqs(xm1, ptr[rax+128]);
vcvttpd2uqqs(xm1, ptr_b[rax+128]);
vcvttpd2uqqs(ym1, ym2);
vcvttpd2uqqs(ym1, ym2|T_sae);
vcvttpd2uqqs(ym1, ptr[rax+128]);
vcvttpd2uqqs(ym1, ptr_b[rax+128]);
vcvttpd2uqqs(zm1, zm2);
vcvttpd2uqqs(zm1, zm2|T_sae);
vcvttpd2uqqs(zm1, ptr[rax+128]);
vcvttpd2uqqs(zm1, ptr_b[rax+128]);
//
vcvtph2ibs(xm1, xm2);
vcvtph2ibs(xm1, ptr[rax+128]);
vcvtph2ibs(xm1, ptr_b[rax+128]);
vcvtph2ibs(ym1, ym2);
vcvtph2ibs(ym1, ym2|T_rd_sae);
vcvtph2ibs(ym1, ptr[rax+128]);
vcvtph2ibs(ym1, ptr_b[rax+128]);
vcvtph2ibs(zm1, zm2);
vcvtph2ibs(zm1, zm2|T_ru_sae);
vcvtph2ibs(zm1, ptr[rax+128]);
vcvtph2ibs(zm1, ptr_b[rax+128]);
//
vcvtph2iubs(xm1, xm2);
vcvtph2iubs(xm1, ptr[rax+128]);
vcvtph2iubs(xm1, ptr_b[rax+128]);
vcvtph2iubs(ym1, ym2);
vcvtph2iubs(ym1, ym2|T_rd_sae);
vcvtph2iubs(ym1, ptr[rax+128]);
vcvtph2iubs(ym1, ptr_b[rax+128]);
vcvtph2iubs(zm1, zm2);
vcvtph2iubs(zm1, zm2|T_ru_sae);
vcvtph2iubs(zm1, ptr[rax+128]);
vcvtph2iubs(zm1, ptr_b[rax+128]);
//
vcvttph2ibs(xm1, xm2);
vcvttph2ibs(xm1, ptr[rax+128]);
vcvttph2ibs(xm1, ptr_b[rax+128]);
vcvttph2ibs(ym1, ym2);
vcvttph2ibs(ym1, ym2|T_rd_sae);
vcvttph2ibs(ym1, ptr[rax+128]);
vcvttph2ibs(ym1, ptr_b[rax+128]);
vcvttph2ibs(zm1, zm2);
vcvttph2ibs(zm1, zm2|T_ru_sae);
vcvttph2ibs(zm1, ptr[rax+128]);
vcvttph2ibs(zm1, ptr_b[rax+128]);
//
vcvttph2iubs(xm1, xm2);
vcvttph2iubs(xm1, ptr[rax+128]);
vcvttph2iubs(xm1, ptr_b[rax+128]);
vcvttph2iubs(ym1, ym2);
vcvttph2iubs(ym1, ym2|T_rd_sae);
vcvttph2iubs(ym1, ptr[rax+128]);
vcvttph2iubs(ym1, ptr_b[rax+128]);
vcvttph2iubs(zm1, zm2);
vcvttph2iubs(zm1, zm2|T_ru_sae);
vcvttph2iubs(zm1, ptr[rax+128]);
vcvttph2iubs(zm1, ptr_b[rax+128]);
//
vcvttps2dqs(xm1, xm2);
vcvttps2dqs(xm1, ptr[rax+128]);
vcvttps2dqs(xm1, ptr_b[rax+128]);
vcvttps2dqs(ym1, ym2);
vcvttps2dqs(ym1, ym2|T_sae);
vcvttps2dqs(ym1, ptr[rax+128]);
vcvttps2dqs(ym1, ptr_b[rax+128]);
vcvttps2dqs(zm1, zm2);
vcvttps2dqs(zm1, zm2|T_sae);
vcvttps2dqs(zm1, ptr[rax+128]);
vcvttps2dqs(zm1, ptr_b[rax+128]);
//
vcvtps2ibs(xm1, xm2);
vcvtps2ibs(xm1, ptr[rax+128]);
vcvtps2ibs(xm1, ptr_b[rax+128]);
vcvtps2ibs(ym1, ym2);
vcvtps2ibs(ym1, ym2|T_rd_sae);
vcvtps2ibs(ym1, ptr[rax+128]);
vcvtps2ibs(ym1, ptr_b[rax+128]);
vcvtps2ibs(zm1, zm2);
vcvtps2ibs(zm1, zm2|T_ru_sae);
vcvtps2ibs(zm1, ptr[rax+128]);
vcvtps2ibs(zm1, ptr_b[rax+128]);
//
vcvtps2iubs(xm1, xm2);
vcvtps2iubs(xm1, ptr[rax+128]);
vcvtps2iubs(xm1, ptr_b[rax+128]);
vcvtps2iubs(ym1, ym2);
vcvtps2iubs(ym1, ym2|T_rd_sae);
vcvtps2iubs(ym1, ptr[rax+128]);
vcvtps2iubs(ym1, ptr_b[rax+128]);
vcvtps2iubs(zm1, zm2);
vcvtps2iubs(zm1, zm2|T_ru_sae);
vcvtps2iubs(zm1, ptr[rax+128]);
vcvtps2iubs(zm1, ptr_b[rax+128]);
//
vcvttps2ibs(xm1, xm2);
vcvttps2ibs(xm1, ptr[rax+128]);
vcvttps2ibs(xm1, ptr_b[rax+128]);
vcvttps2ibs(ym1, ym2);
vcvttps2ibs(ym1, ym2|T_rd_sae);
vcvttps2ibs(ym1, ptr[rax+128]);
vcvttps2ibs(ym1, ptr_b[rax+128]);
vcvttps2ibs(zm1, zm2);
vcvttps2ibs(zm1, zm2|T_ru_sae);
vcvttps2ibs(zm1, ptr[rax+128]);
vcvttps2ibs(zm1, ptr_b[rax+128]);
//
vcvttps2iubs(xm1, xm2);
vcvttps2iubs(xm1, ptr[rax+128]);
vcvttps2iubs(xm1, ptr_b[rax+128]);
vcvttps2iubs(ym1, ym2);
vcvttps2iubs(ym1, ym2|T_rd_sae);
vcvttps2iubs(ym1, ptr[rax+128]);
vcvttps2iubs(ym1, ptr_b[rax+128]);
vcvttps2iubs(zm1, zm2);
vcvttps2iubs(zm1, zm2|T_ru_sae);
vcvttps2iubs(zm1, ptr[rax+128]);
vcvttps2iubs(zm1, ptr_b[rax+128]);
//
vcvttps2udqs(xm1, xm2);
vcvttps2udqs(xm1, ptr[rax+128]);
vcvttps2udqs(xm1, ptr_b[rax+128]);
vcvttps2udqs(ym1, ym2);
vcvttps2udqs(ym1, ym2|T_sae);
vcvttps2udqs(ym1, ptr[rax+128]);
vcvttps2udqs(ym1, ptr_b[rax+128]);
vcvttps2udqs(zm1, zm2);
vcvttps2udqs(zm1, zm2|T_sae);
vcvttps2udqs(zm1, ptr[rax+128]);
vcvttps2udqs(zm1, ptr_b[rax+128]);
//
vcvttpd2dqs(xm1|k1|T_z, xm2);
vcvttpd2dqs(xm1|k1|T_z, xword [rax+128]);
vcvttpd2dqs(xm1|k1|T_z, xword_b[rax+128]);
vcvttpd2dqs(xm1|k1|T_z, ym2);
vcvttpd2dqs(xm1|k1|T_z, ym2|T_sae);
vcvttpd2dqs(xm1|k1|T_z, yword [rax+128]);
vcvttpd2dqs(xm1|k1|T_z, yword_b[rax+128]);
vcvttpd2dqs(ym1|k1|T_z, zm2);
vcvttpd2dqs(ym1|k1|T_z, zm2|T_sae);
vcvttpd2dqs(ym1|k1|T_z, zword [rax+128]);
vcvttpd2dqs(ym1|k1|T_z, zword_b[rax+128]);
//
vcvttpd2udqs(xm1|k1|T_z, xm2);
vcvttpd2udqs(xm1|k1|T_z, xword [rax+128]);
vcvttpd2udqs(xm1|k1|T_z, xword_b[rax+128]);
vcvttpd2udqs(xm1|k1|T_z, ym2);
vcvttpd2udqs(xm1|k1|T_z, ym2|T_sae);
vcvttpd2udqs(xm1|k1|T_z, yword [rax+128]);
vcvttpd2udqs(xm1|k1|T_z, yword_b[rax+128]);
vcvttpd2udqs(ym1|k1|T_z, zm2);
vcvttpd2udqs(ym1|k1|T_z, zm2|T_sae);
vcvttpd2udqs(ym1|k1|T_z, zword [rax+128]);
vcvttpd2udqs(ym1|k1|T_z, zword_b[rax+128]);
//
vcvttps2qqs(xm1|k1|T_z, xm2);
vcvttps2qqs(xm1|k1|T_z, ptr [rax+128]);
vcvttps2qqs(xm1|k1|T_z, ptr_b[rax+128]);
vcvttps2qqs(ym1|k1|T_z, xm2);
vcvttps2qqs(ym1|k1|T_z, xm2|T_sae);
vcvttps2qqs(ym1|k1|T_z, ptr [rax+128]);
vcvttps2qqs(ym1|k1|T_z, ptr_b[rax+128]);
vcvttps2qqs(zm1, ym2);
vcvttps2qqs(zm1|k1|T_z, ym2);
vcvttps2qqs(zm1|k1|T_z|T_sae, ym2);
vcvttps2qqs(zm1|k1|T_z, ptr [rax+128]);
vcvttps2qqs(zm1|k1|T_z, ptr_b[rax+128]);
//
vcvttps2uqqs(xm1|k1|T_z, xm2);
vcvttps2uqqs(xm1|k1|T_z, ptr [rax+128]);
vcvttps2uqqs(xm1|k1|T_z, ptr_b[rax+128]);
vcvttps2uqqs(ym1|k1|T_z, xm2);
vcvttps2uqqs(ym1|k1|T_z, xm2|T_sae);
vcvttps2uqqs(ym1|k1|T_z, ptr [rax+128]);
vcvttps2uqqs(ym1|k1|T_z, ptr_b[rax+128]);
vcvttps2uqqs(zm1, ym2);
vcvttps2uqqs(zm1|k1|T_z, ym2);
vcvttps2uqqs(zm1|k1|T_z|T_sae, ym2);
vcvttps2uqqs(zm1|k1|T_z, ptr [rax+128]);
vcvttps2uqqs(zm1|k1|T_z, ptr_b[rax+128]);
//
vcvttsd2sis(eax, xm1);
vcvttsd2sis(eax, xm1|T_sae);
vcvttsd2sis(eax, ptr[rax+128]);
vcvttsd2sis(r30, xm1);
vcvttsd2sis(r30, xm1|T_sae);
vcvttsd2sis(r30, ptr[rax+128]);
//
vcvttsd2usis(eax, xm1);
vcvttsd2usis(eax, xm1|T_sae);
vcvttsd2usis(eax, ptr[rax+128]);
vcvttsd2usis(r30, xm1);
vcvttsd2usis(r30, xm1|T_sae);
vcvttsd2usis(r30, ptr[rax+128]);
//
vcvttss2sis(eax, xm1);
vcvttss2sis(eax, xm1|T_sae);
vcvttss2sis(eax, ptr[rax+128]);
vcvttss2sis(r30, xm1);
vcvttss2sis(r30, xm1|T_sae);
vcvttss2sis(r30, ptr[rax+128]);
//
vcvttss2usis(eax, xm1);
vcvttss2usis(eax, xm1|T_sae);
vcvttss2usis(eax, ptr[rax+128]);
vcvttss2usis(r30, xm1);
vcvttss2usis(r30, xm1|T_sae);
vcvttss2usis(r30, ptr[rax+128]);

View file

@ -234,10 +234,10 @@ CYBOZU_TEST_AUTO(vmpsadbw)
struct Code : Xbyak::CodeGenerator {
Code()
{
setDefaultEncoding();
setDefaultEncodingAVX10();
vmpsadbw(xm1, xm3, xm15, 3); // vex(avx)
vmpsadbw(ym1, ym3, ptr[rax+128], 3); // vex(avx2)
setDefaultEncoding(VexEncoding, EvexEncoding);
setDefaultEncodingAVX10(AVX10v2Encoding);
vmpsadbw(ym1, ym3, ym15, 3); // evex(avx10.2)
vmpsadbw(ym1, ym3, ptr[rax+128], 3); // evex(avx10.2)
}

View file

@ -7,7 +7,7 @@ struct Code : Xbyak::CodeGenerator {
Code()
: Xbyak::CodeGenerator(4096*8)
{
setDefaultEncoding(VexEncoding, EvexEncoding);
setDefaultEncodingAVX10(AVX10v2Encoding);
#include "tmp.cpp"
}
};

View file

@ -366,7 +366,7 @@ def parseNmemonicTest():
('vpshldw(xmm9|k3|T_z, xmm2, ptr [rax + 0x40], 5);', Nmemonic('vpshldw', [xmm9, xmm2, Memory(0, rax, None, 0, 0x40), 5], [k3, T_z])),
('vpshrdd(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40], 5);', Nmemonic('vpshrdd', [xmm5, xmm2, Memory(0, rax, None, 0, 0x40, True), 5], [k3, T_z])),
('vpshrdd xmm5{k3}{z}, xmm2, dword ptr [rax+0x40]{1to4}, 0x5', Nmemonic('vpshrdd', [xmm5, xmm2, Memory(0, rax, None, 0, 0x40, True), 5], [k3, T_z])),
('vcmpph(k1, xm15, ptr[rax+64], 1);', Nmemonic('vcmpph', [k1, xm15, Memory(0, rax, None, 0, 64), 1])),
('vcmpph(k1, xmm15, ptr[rax+64], 1);', Nmemonic('vcmpph', [k1, xmm15, Memory(0, rax, None, 0, 64), 1])),
]
for (s, expected) in tbl:
e = parseNmemonic(s)

View file

@ -155,7 +155,7 @@ namespace Xbyak {
enum {
DEFAULT_MAX_CODE_SIZE = 4096,
VERSION = 0x7100 /* 0xABCD = A.BC(.D) */
VERSION = 0x7200 /* 0xABCD = A.BC(.D) */
};
#ifndef MIE_INTEGER_TYPE_DEFINED
@ -232,6 +232,7 @@ enum {
ERR_CANT_USE_REX2,
ERR_INVALID_DFV,
ERR_INVALID_REG_IDX,
ERR_BAD_ENCODING_MODE,
ERR_INTERNAL // Put it at last.
};
@ -290,6 +291,7 @@ inline const char *ConvertErrorToString(int err)
"can't use rex2",
"invalid dfv",
"invalid reg index",
"bad encoding mode",
"internal error"
};
assert(ERR_INTERNAL + 1 == sizeof(errTbl) / sizeof(*errTbl));
@ -1673,7 +1675,9 @@ inline const uint8_t* Label::getAddress() const
typedef enum {
DefaultEncoding,
VexEncoding,
EvexEncoding
EvexEncoding,
PreAVX10v2Encoding,
AVX10v2Encoding
} PreferredEncoding;
class CodeGenerator : public CodeArray {
@ -2661,21 +2665,24 @@ private:
if (addr.getRegExp().getIndex().getKind() != kind) XBYAK_THROW(ERR_BAD_VSIB_ADDRESSING)
opVex(x, 0, addr, type, code);
}
void opEncoding(const Xmm& x1, const Xmm& x2, const Operand& op, uint64_t type, int code, PreferredEncoding encoding, int imm = NONE, uint64_t typeVex = 0, uint64_t typeEvex = 0, int sel = 0)
void opEncoding(const Xmm& x1, const Xmm& x2, const Operand& op, uint64_t type, int code, PreferredEncoding enc, int imm = NONE, uint64_t typeVex = 0, uint64_t typeEvex = 0, int sel = 0)
{
opAVX_X_X_XM(x1, x2, op, type | orEvexIf(encoding, typeVex, typeEvex, sel), code, imm);
opAVX_X_X_XM(x1, x2, op, type | orEvexIf(enc, typeVex, typeEvex, sel), code, imm);
}
int orEvexIf(PreferredEncoding encoding, uint64_t typeVex, uint64_t typeEvex, int sel) {
if (encoding == DefaultEncoding) {
encoding = defaultEncoding_[sel];
// Resolve the encoding to use for the instruction group selected by `sel`.
// sel == 0 : group controlled by setDefaultEncoding (VexEncoding / EvexEncoding)
// sel == 1 : group controlled by setDefaultEncodingAVX10 (PreAVX10v2Encoding / AVX10v2Encoding)
// enc : DefaultEncoding is replaced by defaultEncoding_[sel] before validation
// return : the resolved encoding
// error : ERR_BAD_ENCODING_MODE if the resolved value does not belong to the group,
//         ERR_EVEX_IS_INVALID if an EVEX-only mode is requested while XBYAK_DISABLE_AVX512 is defined
PreferredEncoding getEncoding(PreferredEncoding enc, int sel) const
{
	if (enc == DefaultEncoding) {
		enc = defaultEncoding_[sel];
	}
	if ((sel == 0 && enc != VexEncoding && enc != EvexEncoding) || (sel == 1 && enc != PreAVX10v2Encoding && enc != AVX10v2Encoding)) XBYAK_THROW_RET(ERR_BAD_ENCODING_MODE, VexEncoding)
#ifdef XBYAK_DISABLE_AVX512
	// this function returns a value, so use the value-returning error macro (same fallback as above)
	if (enc == EvexEncoding || enc == AVX10v2Encoding) XBYAK_THROW_RET(ERR_EVEX_IS_INVALID, VexEncoding)
#endif
	return enc;
}
// Choose the type flags for an instruction that has both a VEX-style and an EVEX-style form.
// enc is resolved by getEncoding(enc, sel) first.
// return : typeVex when the resolved encoding is VexEncoding (sel == 0) or is not
//          AVX10v2Encoding (sel == 1); otherwise T_MUST_EVEX | typeEvex
uint64_t orEvexIf(PreferredEncoding enc, uint64_t typeVex, uint64_t typeEvex, int sel) {
	const PreferredEncoding resolved = getEncoding(enc, sel);
	const bool useVexType = (sel == 0) ? (resolved == VexEncoding) : (sel == 1 && resolved != AVX10v2Encoding);
	if (useVexType) {
		return typeVex;
	}
	return T_MUST_EVEX | typeEvex;
}
void opInOut(const Reg& a, const Reg& d, uint8_t code)
{
@ -3132,8 +3139,8 @@ public:
#endif
, isDefaultJmpNEAR_(false)
{
// select avx512-vnni, vmpsadbw(avx)
setDefaultEncoding();
setDefaultEncodingAVX10();
labelMgr_.set(this);
}
void reset()
@ -3170,16 +3177,20 @@ public:
#undef jnl
#endif
// set default encoding
// vnniEnc : control AVX512_VNNI (evex:default) or AVX-VNNI (vex)
// avx10Enc : control mpsadbw, AVX-VNNI-INT8 (vex:default) or AVX10.2 (evex)
void setDefaultEncoding(PreferredEncoding vnniEnc = EvexEncoding, PreferredEncoding avx10Enc = VexEncoding)
{ defaultEncoding_[0] = vnniEnc; defaultEncoding_[1] = avx10Enc; }
void sha1msg12(const Xmm& x, const Operand& op)
// set default encoding of VNNI
// EvexEncoding : AVX512_VNNI, VexEncoding : AVX-VNNI
void setDefaultEncoding(PreferredEncoding enc = EvexEncoding)
{
opROO(Reg(), op, x, T_MUST_EVEX, 0xD9);
if (enc != VexEncoding && enc != EvexEncoding) XBYAK_THROW(ERR_BAD_ENCODING_MODE)
defaultEncoding_[0] = enc;
}
// set default encoding of the AVX10-related instructions
// PreAVX10v2Encoding (default) : AVX-VNNI-INT8/AVX512-FP16, AVX10v2Encoding : AVX10.2
// any other value reports ERR_BAD_ENCODING_MODE and leaves the current default unchanged
void setDefaultEncodingAVX10(PreferredEncoding enc = PreAVX10v2Encoding)
{
	const bool isValid = (enc == PreAVX10v2Encoding) || (enc == AVX10v2Encoding);
	if (!isValid) XBYAK_THROW(ERR_BAD_ENCODING_MODE)
	defaultEncoding_[1] = enc;
}
void bswap(const Reg32e& r)
{
int idx = r.getIdx();
@ -3192,6 +3203,48 @@ public:
}
db(0xC8 + (idx & 7));
}
// AVX10 zero-extending for vmovd, vmovw
// Shared encoder for the two-operand forms of vmovd/vmovw.
// op1, op2 : operands in the order given by the caller; one of them must be an XMM register
// typeTbl, codeTbl : 4-entry tables. [0],[1] are used unless the resolved encoding is AVX10v2Encoding,
//                    [2],[3] are used when it is. The second entry of each pair is chosen when rev is true.
// enc : requested encoding, resolved through getEncoding(enc, 1)
// bit : register-size mask passed to isREG() for the non-XMM operand in the non-AVX10.2 path
// error : ERR_BAD_COMBINATION when both operands are memory or when no table entry matches the operands
void opAVX10ZeroExt(const Operand& op1, const Operand& op2, const uint64_t typeTbl[4], const int codeTbl[4], PreferredEncoding enc, int bit)
{
const Operand *p1 = &op1;
const Operand *p2 = &op2;
// every swap below also toggles rev, so rev is true exactly when p2 points at op1
bool rev = false;
// step 1: if the first operand is memory, look at the other one instead
if (p1->isMEM()) {
std::swap(p1, p2);
rev = true;
}
// still memory after the swap means both operands are memory
if (p1->isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION)
// step 2: if p1 is an XMM register, move it to p2 so that p2 is the XMM operand given to the encoder
if (p1->isXMM()) {
std::swap(p1, p2);
rev = !rev;
}
int sel = -1;
if (getEncoding(enc, 1) == AVX10v2Encoding) {
// AVX10.2 form: the other operand must be an XMM register or memory
if ((p1->isXMM() || p1->isMEM()) && p2->isXMM()) sel = 2 + int(rev);
} else {
// pre-AVX10.2 form: the other operand must be a general register of an accepted size or memory
if ((p1->isREG(bit) || p1->isMEM()) && p2->isXMM()) sel = int(rev);
}
if (sel == -1) XBYAK_THROW(ERR_BAD_COMBINATION)
// sel != -1 implies p2->isXMM() was checked above, which is what the downcast relies on
opAVX_X_X_XM(*static_cast<const Xmm*>(p2), xm0, *p1, typeTbl[sel], codeTbl[sel]);
}
// vmovd between an XMM register and a 32-bit general register or memory operand
// (with AVX10v2Encoding: between an XMM register and another XMM register or memory).
// enc : DefaultEncoding uses the value set by setDefaultEncodingAVX10
// Table layout consumed by opAVX10ZeroExt: [0],[1] = pre-AVX10.2 forms, [2],[3] = AVX10.2 forms;
// within each pair the second entry is used when op1 is the XMM register.
void vmovd(const Operand& op1, const Operand& op2, PreferredEncoding enc = DefaultEncoding)
{
	const uint64_t types[4] = {
		// legacy, avx, avx512
		T_EVEX|T_66|T_0F|T_W0|T_N4,
		T_EVEX|T_66|T_0F|T_W0|T_N4,
		// avx10.2
		T_MUST_EVEX|T_66|T_0F|T_EW0|T_N4,
		T_MUST_EVEX|T_F3|T_0F|T_EW0|T_N4,
	};
	const int opcodes[4] = { 0x7E, 0x6E, 0xD6, 0x7E };
	opAVX10ZeroExt(op1, op2, types, opcodes, enc, 32);
}
// vmovw between an XMM register and a 16/32/64-bit general register or memory operand
// (with AVX10v2Encoding: between an XMM register and another XMM register or memory).
// enc : DefaultEncoding uses the value set by setDefaultEncodingAVX10
// Table layout consumed by opAVX10ZeroExt: [0],[1] = pre-AVX10.2 forms, [2],[3] = AVX10.2 forms;
// within each pair the second entry is used when op1 is the XMM register.
void vmovw(const Operand& op1, const Operand& op2, PreferredEncoding enc = DefaultEncoding)
{
	const uint64_t types[4] = {
		// avx512-fp16
		T_MUST_EVEX|T_66|T_MAP5|T_N2,
		T_MUST_EVEX|T_66|T_MAP5|T_N2,
		// avx10.2
		T_MUST_EVEX|T_F3|T_MAP5|T_EW0|T_N2,
		T_MUST_EVEX|T_F3|T_MAP5|T_EW0|T_N2,
	};
	const int opcodes[4] = { 0x7E, 0x6E, 0x7E, 0x6E };
	const int acceptedRegBits = 16|32|64;
	opAVX10ZeroExt(op1, op2, types, opcodes, enc, acceptedRegBits);
}
/*
use single byte nop if useMultiByteNop = false
*/

View file

@ -1,4 +1,4 @@
const char *getVersionString() const { return "7.10"; }
const char *getVersionString() const { return "7.20"; }
void aadd(const Address& addr, const Reg32e &reg) { opMR(addr, reg, T_0F38, 0x0FC, T_APX); }
void aand(const Address& addr, const Reg32e &reg) { opMR(addr, reg, T_0F38|T_66, 0x0FC, T_APX|T_66); }
void adc(const Operand& op, uint32_t imm) { opOI(op, imm, 0x10, 2); }
@ -988,6 +988,7 @@ void sets(const Operand& op) { opSetCC(op, 8); }//-V524
void setz(const Operand& op) { opSetCC(op, 4); }//-V524
void sfence() { db(0x0F); db(0xAE); db(0xF8); }
void sha1msg1(const Xmm& x, const Operand& op) { opSSE_APX(x, op, T_0F38, 0xC9, T_MUST_EVEX, 0xD9); }
void sha1msg12(const Xmm& x, const Operand& op) { opROO(Reg(), op, x, T_MUST_EVEX, 0xD9); }
void sha1msg2(const Xmm& x, const Operand& op) { opSSE_APX(x, op, T_0F38, 0xCA, T_MUST_EVEX, 0xDA); }
void sha1nexte(const Xmm& x, const Operand& op) { opSSE_APX(x, op, T_0F38, 0xC8, T_MUST_EVEX, 0xD8); }
void sha1rnds4(const Xmm& x, const Operand& op, uint8_t imm) { opSSE_APX(x, op, T_0F3A, 0xCC, T_MUST_EVEX, 0xD4, imm); }
@ -1331,8 +1332,6 @@ void vmovapd(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, T_
void vmovapd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_66|T_0F|T_EW1|T_YMM|T_EVEX, 0x28); }
void vmovaps(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, T_0F|T_EW0|T_YMM|T_EVEX|T_M_K, 0x29); }
void vmovaps(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_0F|T_EW0|T_YMM|T_EVEX, 0x28); }
void vmovd(const Operand& op, const Xmm& x) { if (!op.isREG(32) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_X_XM(x, xm0, op, T_0F | T_66 | T_W0 | T_EVEX | T_N4, 0x7E); }
void vmovd(const Xmm& x, const Operand& op) { if (!op.isREG(32) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_X_XM(x, xm0, op, T_0F | T_66 | T_W0 | T_EVEX | T_N4, 0x6E); }
void vmovddup(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_DUP|T_F2|T_0F|T_EW1|T_YMM|T_EVEX|T_ER_X|T_ER_Y|T_ER_Z, 0x12); }
void vmovdqa(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, T_66|T_0F|T_YMM, 0x7F); }
void vmovdqa(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_66|T_0F|T_YMM, 0x6F); }
@ -2202,6 +2201,8 @@ void vcvtne2ph2bf8s(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X
void vcvtne2ph2hf8(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N1|T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x18); }
void vcvtne2ph2hf8s(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N1|T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x1B); }
void vcvtne2ps2bf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x72); }
void vcvtnebf162ibs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x69); }
void vcvtnebf162iubs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x6B); }
void vcvtneph2bf8(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3|T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x74); }
void vcvtneph2bf8s(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x74); }
void vcvtneph2hf8(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x18); }
@ -2212,6 +2213,8 @@ void vcvtpd2qq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0
void vcvtpd2udq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F|T_EW1|T_YMM|T_ER_Z|T_MUST_EVEX|T_B64, 0x79); }
void vcvtpd2uqq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F|T_EW1|T_YMM|T_ER_Z|T_MUST_EVEX|T_B64, 0x79); }
void vcvtph2dq(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_66|T_MAP5|T_EW0|T_YMM|T_ER_Y|T_MUST_EVEX|T_B16, 0x5B); }
void vcvtph2ibs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B16, 0x69); }
void vcvtph2iubs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B16, 0x6B); }
void vcvtph2pd(const Xmm& x, const Operand& op) { if (!op.isXMM() && !op.isMEM()) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(x, 0, op, T_N4|T_N_VL|T_MAP5|T_EW0|T_YMM|T_SAE_X|T_MUST_EVEX|T_B16, 0x5A); }
void vcvtph2psx(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_66|T_MAP6|T_EW0|T_YMM|T_SAE_Y|T_MUST_EVEX|T_B16, 0x13); }
void vcvtph2qq(const Xmm& x, const Operand& op) { if (!op.isXMM() && !op.isMEM()) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(x, 0, op, T_N4|T_N_VL|T_66|T_MAP5|T_EW0|T_YMM|T_ER_X|T_MUST_EVEX|T_B16, 0x7B); }
@ -2219,6 +2222,8 @@ void vcvtph2udq(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0,
void vcvtph2uqq(const Xmm& x, const Operand& op) { if (!op.isXMM() && !op.isMEM()) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(x, 0, op, T_N4|T_N_VL|T_66|T_MAP5|T_EW0|T_YMM|T_ER_X|T_MUST_EVEX|T_B16, 0x79); }
void vcvtph2uw(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x7D); }
void vcvtph2w(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x7D); }
void vcvtps2ibs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B32, 0x69); }
void vcvtps2iubs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B32, 0x6B); }
void vcvtps2phx(const Xmm& x, const Operand& op) { checkCvt4(x, op); opCvt(x, op, T_N16|T_N_VL|T_66|T_MAP5|T_EW0|T_ER_Z|T_MUST_EVEX|T_B32, 0x1D); }
void vcvtps2qq(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_66|T_0F|T_EW0|T_YMM|T_ER_Y|T_MUST_EVEX|T_B32, 0x7B); }
void vcvtps2udq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_0F|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B32, 0x79); }
@ -2235,22 +2240,40 @@ void vcvtsh2usi(const Reg32e& r, const Operand& op) { uint64_t type = (T_N2|T_F3
void vcvtsi2sh(const Xmm& x1, const Xmm& x2, const Operand& op) { if (!(x1.isXMM() && x2.isXMM() && op.isBit(32|64))) XBYAK_THROW(ERR_BAD_COMBINATION) uint64_t type = (T_F3|T_MAP5|T_ER_R|T_MUST_EVEX|T_M_K) | (op.isBit(32) ? (T_EW0 | T_N4) : (T_EW1 | T_N8)); opVex(x1, &x2, op, type, 0x2A); }
void vcvtss2sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_MAP5|T_EW0|T_ER_X|T_MUST_EVEX, 0x1D); }
void vcvtss2usi(const Reg32e& r, const Operand& op) { uint64_t type = (T_N4|T_F3|T_0F|T_ER_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x79); }
void vcvttnebf162ibs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x68); }
void vcvttnebf162iubs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x6A); }
void vcvttpd2dqs(const Xmm& x, const Operand& op) { opCvt2(x, op, T_MAP5|T_EW1|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B64, 0x6D); }
void vcvttpd2qq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F|T_EW1|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B64, 0x7A); }
void vcvttpd2qqs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW1|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B64, 0x6D); }
void vcvttpd2udq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F|T_EW1|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B64, 0x78); }
void vcvttpd2udqs(const Xmm& x, const Operand& op) { opCvt2(x, op, T_MAP5|T_EW1|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B64, 0x6C); }
void vcvttpd2uqq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F|T_EW1|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B64, 0x78); }
void vcvttpd2uqqs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW1|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B64, 0x6C); }
void vcvttph2dq(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_F3|T_MAP5|T_EW0|T_YMM|T_SAE_Y|T_MUST_EVEX|T_B16, 0x5B); }
void vcvttph2ibs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B16, 0x68); }
void vcvttph2iubs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B16, 0x6A); }
void vcvttph2qq(const Xmm& x, const Operand& op) { if (!op.isXMM() && !op.isMEM()) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(x, 0, op, T_N4|T_N_VL|T_66|T_MAP5|T_EW0|T_YMM|T_SAE_X|T_MUST_EVEX|T_B16, 0x7A); }
void vcvttph2udq(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_MAP5|T_EW0|T_YMM|T_SAE_Y|T_MUST_EVEX|T_B16, 0x78); }
void vcvttph2uqq(const Xmm& x, const Operand& op) { if (!op.isXMM() && !op.isMEM()) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(x, 0, op, T_N4|T_N_VL|T_66|T_MAP5|T_EW0|T_YMM|T_SAE_X|T_MUST_EVEX|T_B16, 0x78); }
void vcvttph2uw(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B16, 0x7C); }
void vcvttph2w(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B16, 0x7C); }
void vcvttps2dqs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B32, 0x6D); }
void vcvttps2ibs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B32, 0x68); }
void vcvttps2iubs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B32, 0x6A); }
void vcvttps2qq(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_66|T_0F|T_EW0|T_YMM|T_SAE_Y|T_MUST_EVEX|T_B32, 0x7A); }
void vcvttps2qqs(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_66|T_MAP5|T_EW0|T_YMM|T_SAE_X|T_SAE_Y|T_MUST_EVEX|T_B32, 0x6D); }
void vcvttps2udq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_0F|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x78); }
void vcvttps2udqs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B32, 0x6C); }
void vcvttps2uqq(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_66|T_0F|T_EW0|T_YMM|T_SAE_Y|T_MUST_EVEX|T_B32, 0x78); }
void vcvttps2uqqs(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_66|T_MAP5|T_EW0|T_YMM|T_SAE_X|T_SAE_Y|T_MUST_EVEX|T_B32, 0x6C); }
void vcvttsd2sis(const Reg32e& r, const Operand& op) { uint64_t type = (T_N8|T_F2|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x6D); }
void vcvttsd2usi(const Reg32e& r, const Operand& op) { uint64_t type = (T_N8|T_F2|T_0F|T_SAE_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x78); }
void vcvttsd2usis(const Reg32e& r, const Operand& op) { uint64_t type = (T_N8|T_F2|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x6C); }
void vcvttsh2si(const Reg32e& r, const Operand& op) { uint64_t type = (T_N2|T_F3|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x2C); }
void vcvttsh2usi(const Reg32e& r, const Operand& op) { uint64_t type = (T_N2|T_F3|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x78); }
void vcvttss2sis(const Reg32e& r, const Operand& op) { uint64_t type = (T_N4|T_F3|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x6D); }
void vcvttss2usi(const Reg32e& r, const Operand& op) { uint64_t type = (T_N4|T_F3|T_0F|T_SAE_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x78); }
void vcvttss2usis(const Reg32e& r, const Operand& op) { uint64_t type = (T_N4|T_F3|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x6C); }
void vcvtudq2pd(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_F3|T_0F|T_EW0|T_YMM|T_MUST_EVEX|T_B32, 0x7A); }
void vcvtudq2ph(const Xmm& x, const Operand& op) { checkCvt4(x, op); opCvt(x, op, T_N16|T_N_VL|T_F2|T_MAP5|T_EW0|T_ER_Z|T_MUST_EVEX|T_B32, 0x7A); }
void vcvtudq2ps(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2|T_0F|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B32, 0x7A); }
@ -2374,6 +2397,13 @@ void vinserti64x4(const Zmm& r1, const Zmm& r2, const Operand& op, uint8_t imm)
void vmaxpbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x5F); }
void vmaxph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_SAE_Z | T_B16, 0x5F); }
void vmaxsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_SAE_X | T_N2, 0x5F); }
void vminmaxnepbf16(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_F2|T_0F3A|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x52, imm); }
void vminmaxpd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F3A|T_EW1|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B64, 0x52, imm); }
void vminmaxph(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_0F3A|T_EW0|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B16, 0x52, imm); }
void vminmaxps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F3A|T_EW0|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B32, 0x52, imm); }
void vminmaxsd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F3A|T_EW1|T_SAE_X|T_MUST_EVEX, 0x53, imm); }
void vminmaxsh(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_N2|T_0F3A|T_EW0|T_SAE_X|T_MUST_EVEX, 0x53, imm); }
void vminmaxss(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F3A|T_EW0|T_SAE_X|T_MUST_EVEX, 0x53, imm); }
void vminpbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x5D); }
void vminph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_SAE_Z | T_B16, 0x5D); }
void vminsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_SAE_X | T_N2, 0x5D); }
@ -2392,9 +2422,6 @@ void vmovdqu8(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2|T_0F
void vmovsh(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, T_N2|T_F3|T_MAP5|T_EW0|T_MUST_EVEX|T_M_K, 0x11); }
void vmovsh(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, xm0, addr, T_N2|T_F3|T_MAP5|T_EW0|T_MUST_EVEX, 0x10); }
void vmovsh(const Xmm& x1, const Xmm& x2, const Xmm& x3) { opAVX_X_X_XM(x1, x2, x3, T_N2|T_F3|T_MAP5|T_EW0|T_MUST_EVEX, 0x10); }
void vmovw(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, T_N2|T_66|T_MAP5|T_MUST_EVEX, 0x7E); }
void vmovw(const Reg32e& r, const Xmm& x) { opAVX_X_X_XM(x, xm0, r, T_N2|T_66|T_MAP5|T_MUST_EVEX, 0x7E); }
void vmovw(const Xmm& x, const Operand& op) { if (!op.isREG(32|64) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_X_XM(x, xm0, op, T_N2|T_66|T_MAP5|T_MUST_EVEX, 0x6E); }
void vmpsadbw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F3A|T_YMM, 0x42, encoding, imm, T_66|T_W0|T_YMM, T_F3|T_0F3A|T_EW0|T_B32, 1); }
void vmulnepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x59); }
void vmulph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_ER_Z | T_B16, 0x59); }