mirror of
https://github.com/herumi/xbyak
synced 2024-11-20 16:06:14 -07:00
This commit is contained in:
commit
3ee31be62d
18 changed files with 624 additions and 131 deletions
|
@ -1,6 +1,6 @@
|
||||||
cmake_minimum_required(VERSION 3.5)
|
cmake_minimum_required(VERSION 3.5)
|
||||||
|
|
||||||
project(xbyak LANGUAGES CXX VERSION 7.10)
|
project(xbyak LANGUAGES CXX VERSION 7.20)
|
||||||
|
|
||||||
file(GLOB headers xbyak/*.h)
|
file(GLOB headers xbyak/*.h)
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,7 @@
|
||||||
# History
|
# History
|
||||||
|
|
||||||
|
* 2024/Oct/15 ver 7.20 Fixed the specification of setDefaultEncoding, setDefaultEncodingAVX10.
|
||||||
|
* 2024/Oct/15 ver 7.11 Added full support for AVX10.2
|
||||||
* 2024/Oct/13 ver 7.10 support AVX10 integer and fp16 vnni, media new instructions. setDefaultEncoding is extended.
|
* 2024/Oct/13 ver 7.10 support AVX10 integer and fp16 vnni, media new instructions. setDefaultEncoding is extended.
|
||||||
* 2024/Oct/10 ver 7.09.1 fix the names of vpcompressb and vpcompressw
|
* 2024/Oct/10 ver 7.09.1 fix the names of vpcompressb and vpcompressw
|
||||||
* 2024/Oct/08 ver 7.09 support YMM embedded rounding of AVX10.2 and fix some mnemonics with {sae}/{er}.
|
* 2024/Oct/08 ver 7.09 support YMM embedded rounding of AVX10.2 and fix some mnemonics with {sae}/{er}.
|
||||||
|
|
101
doc/usage.md
101
doc/usage.md
|
@ -1,7 +1,7 @@
|
||||||
# Usage
|
# Usage
|
||||||
|
|
||||||
Inherit `Xbyak::CodeGenerator` class and make the class method.
|
Inherit `Xbyak::CodeGenerator` class and make the class method.
|
||||||
```
|
```cpp
|
||||||
#include <xbyak/xbyak.h>
|
#include <xbyak/xbyak.h>
|
||||||
|
|
||||||
struct Code : Xbyak::CodeGenerator {
|
struct Code : Xbyak::CodeGenerator {
|
||||||
|
@ -13,7 +13,7 @@ struct Code : Xbyak::CodeGenerator {
|
||||||
};
|
};
|
||||||
```
|
```
|
||||||
Or you can pass the instance of CodeGenerator without inheriting.
|
Or you can pass the instance of CodeGenerator without inheriting.
|
||||||
```
|
```cpp
|
||||||
void genCode(Xbyak::CodeGenerator& code, int x) {
|
void genCode(Xbyak::CodeGenerator& code, int x) {
|
||||||
using namespace Xbyak::util;
|
using namespace Xbyak::util;
|
||||||
code.mov(eax, x);
|
code.mov(eax, x);
|
||||||
|
@ -23,7 +23,7 @@ void genCode(Xbyak::CodeGenerator& code, int x) {
|
||||||
|
|
||||||
Make an instance of the class and get the function
|
Make an instance of the class and get the function
|
||||||
pointer by calling `getCode()` and call it.
|
pointer by calling `getCode()` and call it.
|
||||||
```
|
```cpp
|
||||||
Code c(5);
|
Code c(5);
|
||||||
int (*f)() = c.getCode<int (*)()>();
|
int (*f)() = c.getCode<int (*)()>();
|
||||||
printf("ret=%d\n", f()); // ret = 5
|
printf("ret=%d\n", f()); // ret = 5
|
||||||
|
@ -32,7 +32,7 @@ printf("ret=%d\n", f()); // ret = 5
|
||||||
## Syntax
|
## Syntax
|
||||||
Similar to MASM/NASM syntax with parentheses.
|
Similar to MASM/NASM syntax with parentheses.
|
||||||
|
|
||||||
```
|
```cpp
|
||||||
NASM Xbyak
|
NASM Xbyak
|
||||||
mov eax, ebx --> mov(eax, ebx);
|
mov eax, ebx --> mov(eax, ebx);
|
||||||
inc ecx inc(ecx);
|
inc ecx inc(ecx);
|
||||||
|
@ -43,7 +43,7 @@ ret --> ret();
|
||||||
Use `qword`, `dword`, `word` and `byte` if it is necessary to specify the size of memory,
|
Use `qword`, `dword`, `word` and `byte` if it is necessary to specify the size of memory,
|
||||||
otherwise use `ptr`.
|
otherwise use `ptr`.
|
||||||
|
|
||||||
```
|
```cpp
|
||||||
(ptr|qword|dword|word|byte) [base + index * (1|2|4|8) + displacement]
|
(ptr|qword|dword|word|byte) [base + index * (1|2|4|8) + displacement]
|
||||||
[rip + 32bit disp] ; x64 only
|
[rip + 32bit disp] ; x64 only
|
||||||
|
|
||||||
|
@ -53,19 +53,21 @@ mov al, [ebx+ecx] --> mov(al, ptr [ebx + ecx]);
|
||||||
test byte [esp], 4 --> test(byte [esp], 4);
|
test byte [esp], 4 --> test(byte [esp], 4);
|
||||||
inc qword [rax] --> inc(qword [rax]);
|
inc qword [rax] --> inc(qword [rax]);
|
||||||
```
|
```
|
||||||
|
|
||||||
**Note**: `qword`, ... are member variables, then don't use `dword` as unsigned int type.
|
**Note**: `qword`, ... are member variables, then don't use `dword` as unsigned int type.
|
||||||
|
|
||||||
### How to use Selector (Segment Register)
|
### How to use Selector (Segment Register)
|
||||||
```
|
```cpp
|
||||||
mov eax, [fs:eax] --> putSeg(fs);
|
mov eax, [fs:eax] --> putSeg(fs);
|
||||||
mov(eax, ptr [eax]);
|
mov(eax, ptr [eax]);
|
||||||
mov ax, cs --> mov(ax, cs);
|
mov ax, cs --> mov(ax, cs);
|
||||||
```
|
```
|
||||||
|
|
||||||
**Note**: Segment class is not derived from `Operand`.
|
**Note**: Segment class is not derived from `Operand`.
|
||||||
|
|
||||||
## AVX
|
## AVX
|
||||||
|
|
||||||
```
|
```cpp
|
||||||
vaddps(xmm1, xmm2, xmm3); // xmm1 <- xmm2 + xmm3
|
vaddps(xmm1, xmm2, xmm3); // xmm1 <- xmm2 + xmm3
|
||||||
vaddps(xmm2, xmm3, ptr [rax]); // use ptr to access memory
|
vaddps(xmm2, xmm3, ptr [rax]); // use ptr to access memory
|
||||||
vgatherdpd(xmm1, ptr [ebp + 256 + xmm2*4], xmm3);
|
vgatherdpd(xmm1, ptr [ebp + 256 + xmm2*4], xmm3);
|
||||||
|
@ -74,13 +76,13 @@ vgatherdpd(xmm1, ptr [ebp + 256 + xmm2*4], xmm3);
|
||||||
**Note**:
|
**Note**:
|
||||||
If `XBYAK_ENABLE_OMITTED_OPERAND` is defined, then you can use two operand version for backward compatibility.
|
If `XBYAK_ENABLE_OMITTED_OPERAND` is defined, then you can use two operand version for backward compatibility.
|
||||||
But the newer version will not support it.
|
But the newer version will not support it.
|
||||||
```
|
```cpp
|
||||||
vaddps(xmm2, xmm3); // xmm2 <- xmm2 + xmm3
|
vaddps(xmm2, xmm3); // xmm2 <- xmm2 + xmm3
|
||||||
```
|
```
|
||||||
|
|
||||||
## AVX-512
|
## AVX-512
|
||||||
|
|
||||||
```
|
```cpp
|
||||||
vaddpd zmm2, zmm5, zmm30 --> vaddpd(zmm2, zmm5, zmm30);
|
vaddpd zmm2, zmm5, zmm30 --> vaddpd(zmm2, zmm5, zmm30);
|
||||||
vaddpd xmm30, xmm20, [rax] --> vaddpd(xmm30, xmm20, ptr [rax]);
|
vaddpd xmm30, xmm20, [rax] --> vaddpd(xmm30, xmm20, ptr [rax]);
|
||||||
vaddps xmm30, xmm20, [rax] --> vaddps(xmm30, xmm20, ptr [rax]);
|
vaddps xmm30, xmm20, [rax] --> vaddps(xmm30, xmm20, ptr [rax]);
|
||||||
|
@ -108,35 +110,44 @@ vfpclasspd k5{k3}, [rax+64]{1to2}, 5 --> vfpclasspd(k5|k3, xword_b [rax+64],
|
||||||
vfpclassps k5{k3}, [rax+64]{1to4}, 5 --> vfpclassps(k5|k3, yword_b [rax+64], 5); // broadcast 64-bit to 256-bit
|
vfpclassps k5{k3}, [rax+64]{1to4}, 5 --> vfpclassps(k5|k3, yword_b [rax+64], 5); // broadcast 64-bit to 256-bit
|
||||||
```
|
```
|
||||||
|
|
||||||
## Selecting AVX512-VNNI, AVX-VNNI, AVX-VNNI-INT8 etc.
|
## Selecting AVX512-VNNI, AVX-VNNI, AVX-VNNI-INT8, AVX10.2.
|
||||||
Some mnemonics have two types of encodings: VEX and EVEX.
|
Some mnemonics have multiple types of encodings: VEX, EVEX, and AVX10.2.
|
||||||
The functions for these mnemonics include an optional parameter as the last argument to specify the encoding.
|
The functions for these mnemonics include an optional parameter as the last argument to specify the encoding.
|
||||||
The default behavior depends on the order in which the instruction was introduced (whether VEX or EVEX came first),
|
The default behavior depends on the order in which the instruction was introduced (whether VEX, EVEX or AVX10.2 came first),
|
||||||
and can be specified using setDefaultEncoding.
|
and can be specified using setDefaultEncoding.
|
||||||
|
|
||||||
```
|
```cpp
|
||||||
vpdpbusd(xm0, xm1, xm2); // default encoding: EVEX (AVX512-VNNI)
|
vpdpbusd(xm0, xm1, xm2); // default encoding: EVEX (AVX512-VNNI)
|
||||||
vpdpbusd(xm0, xm1, xm2, EvexEncoding); // same as the above
|
vpdpbusd(xm0, xm1, xm2, EvexEncoding); // same as the above
|
||||||
vpdpbusd(xm0, xm1, xm2, VexEncoding); // VEX (AVX-VNNI)
|
vpdpbusd(xm0, xm1, xm2, VexEncoding); // VEX (AVX-VNNI)
|
||||||
setDefaultEncoding(VexEncoding); // default encoding is VEX
|
setDefaultEncoding(VexEncoding); // change default encoding
|
||||||
vpdpbusd(xm0, xm1, xm2); // VEX
|
vpdpbusd(xm0, xm1, xm2); // VEX
|
||||||
|
|
||||||
vmpsadbw(xm1, xm3, xm15, 3); // default encoding: VEX (AVX-VNNI)
|
vmpsadbw(xm1, xm3, xm15, 3); // default encoding: AVX
|
||||||
vmpsadbw(xm1, xm3, xm15, 3, VexEncoding); // same as the above
|
vmpsadbw(xm1, xm3, xm15, 3, PreAVX10v2Encoding); // same as the above
|
||||||
vmpsadbw(xm1, xm3, xm15, 3, EvexEncoding); // EVEX (AVX10.2)
|
vmpsadbw(xm1, xm3, xm15, 3, AVX10v2Encoding); // AVX10.2
|
||||||
setDefaultEncoding(VexEncoding, EvexEncoding); // use 2nd argument.
|
setDefaultEncodingAVX10(AVX10v2Encoding); // change default encoding
|
||||||
vmpsadbw(xm1, xm3, xm15, 3); // EVEX
|
vmpsadbw(xm1, xm3, xm15, 3); // AVX10.2
|
||||||
```
|
```
|
||||||
|
|
||||||
- `setDefaultEncoding(PreferredEncoding vnniEnc = EvexEncoding, PreferredEncoding avx10Enc = VexEncoding)`
|
- `setDefaultEncoding(PreferredEncoding enc = EvexEncoding)`
|
||||||
Control the default encoding of mnemonics with `Xbyak::PreferredEncoding` param.
|
- Configure encoding for AVX512-VNNI or AVX-VNNI instructions.
|
||||||
|
- `setDefaultEncodingAVX10(PreferredEncoding enc = PreAVX10v2Encoding)`
|
||||||
|
- Configure encoding for pre-AVX10.2 and AVX10.2 instructions.
|
||||||
|
|
||||||
param|vnniEnc|avx10Enc
|
`setDefaultEncoding`|EvexEncoding (default)|VexEncoding
|
||||||
-|-|-
|
-|-|-
|
||||||
EvexEncoding|AVX512-VNNI|AVX10.2
|
feature|AVX512-VNNI|AVX-VNNI
|
||||||
VexEncoding|AVX-VNNI|AVX-VNNI-INT8
|
|
||||||
default|EvexEncoding|VexEncoding
|
- Target functions: vpdpbusd, vpdpbusds, vpdpwssd, vpdpwssds
|
||||||
mnemonic|vpdpbusd, vpdpbusds, vpdpwssd, vpdpwssds|vmpsadbw, vpdpbssd, vpdpbssds, vpdpbsud, vpdpbsuds, vpdpbuud, vpdpbuuds, vpdpwsud vpdpwsuds vpdpwusd vpdpwusds vpdpwuud, vpdpwuuds
|
|
||||||
|
`setDefaultEncodingAVX10`|PreAVX10v2Encoding (default)|AVX10v2Encoding
|
||||||
|
-|-|-
|
||||||
|
feature|AVX-VNNI-INT8, AVX512-FP16|AVX10.2
|
||||||
|
|
||||||
|
- Target functions: vmpsadbw, vpdpbssd, vpdpbssds, vpdpbsud, vpdpbsuds, vpdpbuud, vpdpbuuds, vpdpwsud, vpdpwsuds, vpdpwusd, vpdpwusds, vpdpwuud, vpdpwuuds, vmovd, vmovw
|
||||||
|
|
||||||
|
- Remark: vmovd and vmovw have several kinds of encoding, such as AVX/AVX512F/AVX512-FP16/AVX10.2.
|
||||||
|
|
||||||
### Remark
|
### Remark
|
||||||
* `k1`, ..., `k7` are opmask registers.
|
* `k1`, ..., `k7` are opmask registers.
|
||||||
|
@ -179,7 +190,7 @@ mnemonic|vpdpbusd, vpdpbusds, vpdpwssd, vpdpwssds|vmpsadbw, vpdpbssd, vpdpbssds,
|
||||||
Two kinds of Label are supported. (String literal and Label class).
|
Two kinds of Label are supported. (String literal and Label class).
|
||||||
|
|
||||||
### String literal
|
### String literal
|
||||||
```
|
```cpp
|
||||||
L("L1");
|
L("L1");
|
||||||
jmp("L1");
|
jmp("L1");
|
||||||
|
|
||||||
|
@ -201,7 +212,7 @@ L("L3");
|
||||||
|
|
||||||
### Support `@@`, `@f`, `@b` like MASM
|
### Support `@@`, `@f`, `@b` like MASM
|
||||||
|
|
||||||
```
|
```cpp
|
||||||
L("@@"); // <A>
|
L("@@"); // <A>
|
||||||
jmp("@b"); // jmp to <A>
|
jmp("@b"); // jmp to <A>
|
||||||
jmp("@f"); // jmp to <B>
|
jmp("@f"); // jmp to <B>
|
||||||
|
@ -217,7 +228,7 @@ Label symbols beginning with a period between `inLocalLabel()` and `outLocalLabe
|
||||||
are treated as a local label.
|
are treated as a local label.
|
||||||
`inLocalLabel()` and `outLocalLabel()` can be nested.
|
`inLocalLabel()` and `outLocalLabel()` can be nested.
|
||||||
|
|
||||||
```
|
```cpp
|
||||||
void func1()
|
void func1()
|
||||||
{
|
{
|
||||||
inLocalLabel();
|
inLocalLabel();
|
||||||
|
@ -240,7 +251,7 @@ void func1()
|
||||||
Xbyak deals with jump mnemonics of an undefined label as short jump if no type is specified.
|
Xbyak deals with jump mnemonics of an undefined label as short jump if no type is specified.
|
||||||
So if the size between jmp and label is larger than 127 byte, then xbyak will cause an error.
|
So if the size between jmp and label is larger than 127 byte, then xbyak will cause an error.
|
||||||
|
|
||||||
```
|
```cpp
|
||||||
jmp("short-jmp"); // short jmp
|
jmp("short-jmp"); // short jmp
|
||||||
// small code
|
// small code
|
||||||
L("short-jmp");
|
L("short-jmp");
|
||||||
|
@ -249,14 +260,16 @@ jmp("long-jmp");
|
||||||
// long code
|
// long code
|
||||||
L("long-jmp"); // throw exception
|
L("long-jmp"); // throw exception
|
||||||
```
|
```
|
||||||
|
|
||||||
Then specify T_NEAR for jmp.
|
Then specify T_NEAR for jmp.
|
||||||
```
|
```cpp
|
||||||
jmp("long-jmp", T_NEAR); // long jmp
|
jmp("long-jmp", T_NEAR); // long jmp
|
||||||
// long code
|
// long code
|
||||||
L("long-jmp");
|
L("long-jmp");
|
||||||
```
|
```
|
||||||
|
|
||||||
Or call `setDefaultJmpNEAR(true);` once, then the default type is set to T_NEAR.
|
Or call `setDefaultJmpNEAR(true);` once, then the default type is set to T_NEAR.
|
||||||
```
|
```cpp
|
||||||
jmp("long-jmp"); // long jmp
|
jmp("long-jmp"); // long jmp
|
||||||
// long code
|
// long code
|
||||||
L("long-jmp");
|
L("long-jmp");
|
||||||
|
@ -266,7 +279,7 @@ L("long-jmp");
|
||||||
|
|
||||||
`L()` and `jxx()` support Label class.
|
`L()` and `jxx()` support Label class.
|
||||||
|
|
||||||
```
|
```cpp
|
||||||
Xbyak::Label label1, label2;
|
Xbyak::Label label1, label2;
|
||||||
L(label1);
|
L(label1);
|
||||||
...
|
...
|
||||||
|
@ -278,7 +291,7 @@ L(label2);
|
||||||
```
|
```
|
||||||
|
|
||||||
Use `putL` for jmp table
|
Use `putL` for jmp table
|
||||||
```
|
```cpp
|
||||||
Label labelTbl, L0, L1, L2;
|
Label labelTbl, L0, L1, L2;
|
||||||
mov(rax, labelTbl);
|
mov(rax, labelTbl);
|
||||||
// rdx is an index of jump table
|
// rdx is an index of jump table
|
||||||
|
@ -295,7 +308,7 @@ L(L1);
|
||||||
|
|
||||||
`assignL(dstLabel, srcLabel)` binds dstLabel with srcLabel.
|
`assignL(dstLabel, srcLabel)` binds dstLabel with srcLabel.
|
||||||
|
|
||||||
```
|
```cpp
|
||||||
Label label2;
|
Label label2;
|
||||||
Label label1 = L(); // make label1 ; same to Label label1; L(label1);
|
Label label1 = L(); // make label1 ; same to Label label1; L(label1);
|
||||||
...
|
...
|
||||||
|
@ -310,7 +323,7 @@ The `jmp` in the above code jumps to label1 assigned by `assignL`.
|
||||||
* dstLabel must not be used in `L()`.
|
* dstLabel must not be used in `L()`.
|
||||||
|
|
||||||
`Label::getAddress()` returns the address specified by the label instance and 0 if not specified.
|
`Label::getAddress()` returns the address specified by the label instance and 0 if not specified.
|
||||||
```
|
```cpp
|
||||||
// not AutoGrow mode
|
// not AutoGrow mode
|
||||||
Label label;
|
Label label;
|
||||||
assert(label.getAddress() == 0);
|
assert(label.getAddress() == 0);
|
||||||
|
@ -319,7 +332,7 @@ assert(label.getAddress() == getCurr());
|
||||||
```
|
```
|
||||||
|
|
||||||
### Rip ; relative addressing
|
### Rip ; relative addressing
|
||||||
```
|
```cpp
|
||||||
Label label;
|
Label label;
|
||||||
mov(eax, ptr [rip + label]); // eax = 4
|
mov(eax, ptr [rip + label]); // eax = 4
|
||||||
...
|
...
|
||||||
|
@ -327,7 +340,7 @@ mov(eax, ptr [rip + label]); // eax = 4
|
||||||
L(label);
|
L(label);
|
||||||
dd(4);
|
dd(4);
|
||||||
```
|
```
|
||||||
```
|
```cpp
|
||||||
int x;
|
int x;
|
||||||
...
|
...
|
||||||
mov(eax, ptr[rip + &x]); // throw exception if the difference between &x and current position is larger than 2GiB
|
mov(eax, ptr[rip + &x]); // throw exception if the difference between &x and current position is larger than 2GiB
|
||||||
|
@ -338,13 +351,13 @@ int x;
|
||||||
Use `word|dword|qword` instead of `ptr` to specify the address size.
|
Use `word|dword|qword` instead of `ptr` to specify the address size.
|
||||||
|
|
||||||
### 32 bit mode
|
### 32 bit mode
|
||||||
```
|
```cpp
|
||||||
jmp(word[eax], T_FAR); // jmp m16:16(FF /5)
|
jmp(word[eax], T_FAR); // jmp m16:16(FF /5)
|
||||||
jmp(dword[eax], T_FAR); // jmp m16:32(FF /5)
|
jmp(dword[eax], T_FAR); // jmp m16:32(FF /5)
|
||||||
```
|
```
|
||||||
|
|
||||||
### 64 bit mode
|
### 64 bit mode
|
||||||
```
|
```cpp
|
||||||
jmp(word[rax], T_FAR); // jmp m16:16(FF /5)
|
jmp(word[rax], T_FAR); // jmp m16:16(FF /5)
|
||||||
jmp(dword[rax], T_FAR); // jmp m16:32(FF /5)
|
jmp(dword[rax], T_FAR); // jmp m16:32(FF /5)
|
||||||
jmp(qword[rax], T_FAR); // jmp m16:64(REX.W FF /5)
|
jmp(qword[rax], T_FAR); // jmp m16:64(REX.W FF /5)
|
||||||
|
@ -355,7 +368,7 @@ The same applies to `call`.
|
||||||
The default max code size is 4096 bytes.
|
The default max code size is 4096 bytes.
|
||||||
Specify the size in constructor of `CodeGenerator()` if necessary.
|
Specify the size in constructor of `CodeGenerator()` if necessary.
|
||||||
|
|
||||||
```
|
```cpp
|
||||||
class Quantize : public Xbyak::CodeGenerator {
|
class Quantize : public Xbyak::CodeGenerator {
|
||||||
public:
|
public:
|
||||||
Quantize()
|
Quantize()
|
||||||
|
@ -372,7 +385,7 @@ You can make jit code on prepared memory.
|
||||||
|
|
||||||
Call `setProtectModeRE` yourself to change memory mode if using the prepared memory.
|
Call `setProtectModeRE` yourself to change memory mode if using the prepared memory.
|
||||||
|
|
||||||
```
|
```cpp
|
||||||
uint8_t alignas(4096) buf[8192]; // C++11 or later
|
uint8_t alignas(4096) buf[8192]; // C++11 or later
|
||||||
|
|
||||||
struct Code : Xbyak::CodeGenerator {
|
struct Code : Xbyak::CodeGenerator {
|
||||||
|
@ -398,7 +411,7 @@ int main()
|
||||||
The memory region for jit is automatically extended if necessary when `AutoGrow` is specified in a constructor of `CodeGenerator`.
|
The memory region for jit is automatically extended if necessary when `AutoGrow` is specified in a constructor of `CodeGenerator`.
|
||||||
|
|
||||||
Call `ready()` or `readyRE()` before calling `getCode()` to fix jump address.
|
Call `ready()` or `readyRE()` before calling `getCode()` to fix jump address.
|
||||||
```
|
```cpp
|
||||||
struct Code : Xbyak::CodeGenerator {
|
struct Code : Xbyak::CodeGenerator {
|
||||||
Code()
|
Code()
|
||||||
: Xbyak::CodeGenerator(<default memory size>, Xbyak::AutoGrow)
|
: Xbyak::CodeGenerator(<default memory size>, Xbyak::AutoGrow)
|
||||||
|
@ -419,7 +432,7 @@ Xbyak set Read/Write/Exec mode to memory to run jit code.
|
||||||
If you want to use Read/Exec mode for security, then specify `DontSetProtectRWE` for `CodeGenerator` and
|
If you want to use Read/Exec mode for security, then specify `DontSetProtectRWE` for `CodeGenerator` and
|
||||||
call `setProtectModeRE()` after generating jit code.
|
call `setProtectModeRE()` after generating jit code.
|
||||||
|
|
||||||
```
|
```cpp
|
||||||
struct Code : Xbyak::CodeGenerator {
|
struct Code : Xbyak::CodeGenerator {
|
||||||
Code()
|
Code()
|
||||||
: Xbyak::CodeGenerator(4096, Xbyak::DontSetProtectRWE)
|
: Xbyak::CodeGenerator(4096, Xbyak::DontSetProtectRWE)
|
||||||
|
|
|
@ -209,6 +209,30 @@ void putX_XM()
|
||||||
{ 0x2E, "vucomxsd", T_MUST_EVEX | T_F3 | T_0F | T_EW1 | T_SAE_X | T_N8 },
|
{ 0x2E, "vucomxsd", T_MUST_EVEX | T_F3 | T_0F | T_EW1 | T_SAE_X | T_N8 },
|
||||||
{ 0x2E, "vucomxsh", T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_SAE_X | T_N2 },
|
{ 0x2E, "vucomxsh", T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_SAE_X | T_N2 },
|
||||||
{ 0x2E, "vucomxss", T_MUST_EVEX | T_F2 | T_0F | T_EW0 | T_SAE_X | T_N4 },
|
{ 0x2E, "vucomxss", T_MUST_EVEX | T_F2 | T_0F | T_EW0 | T_SAE_X | T_N4 },
|
||||||
|
|
||||||
|
// 13.1
|
||||||
|
{ 0x69, "vcvtnebf162ibs", T_MUST_EVEX | T_YMM | T_F2 | T_MAP5 | T_EW0 | T_B16 },
|
||||||
|
{ 0x6B, "vcvtnebf162iubs", T_MUST_EVEX | T_YMM | T_F2 | T_MAP5 | T_EW0 | T_B16 },
|
||||||
|
{ 0x68, "vcvttnebf162ibs", T_MUST_EVEX | T_YMM | T_F2 | T_MAP5 | T_EW0 | T_B16 },
|
||||||
|
{ 0x6A, "vcvttnebf162iubs", T_MUST_EVEX | T_YMM | T_F2 | T_MAP5 | T_EW0 | T_B16 },
|
||||||
|
// 13.3
|
||||||
|
{ 0x6D, "vcvttpd2qqs", T_MUST_EVEX | T_YMM | T_66 | T_MAP5 | T_EW1 | T_B64 | T_SAE_Y | T_SAE_Z },
|
||||||
|
// 13.5
|
||||||
|
{ 0x6C, "vcvttpd2uqqs", T_MUST_EVEX | T_YMM | T_66 | T_MAP5 | T_EW1 | T_B64 | T_SAE_Y | T_SAE_Z },
|
||||||
|
// 13.6
|
||||||
|
{ 0x69, "vcvtph2ibs", T_MUST_EVEX | T_YMM | T_MAP5 | T_EW0 | T_B16 | T_ER_Y | T_ER_Z },
|
||||||
|
{ 0x6B, "vcvtph2iubs", T_MUST_EVEX | T_YMM | T_MAP5 | T_EW0 | T_B16 | T_ER_Y | T_ER_Z },
|
||||||
|
{ 0x68, "vcvttph2ibs", T_MUST_EVEX | T_YMM | T_MAP5 | T_EW0 | T_B16 | T_ER_Y | T_ER_Z },
|
||||||
|
{ 0x6A, "vcvttph2iubs", T_MUST_EVEX | T_YMM | T_MAP5 | T_EW0 | T_B16 | T_ER_Y | T_ER_Z },
|
||||||
|
// 13.7
|
||||||
|
{ 0x6D, "vcvttps2dqs", T_MUST_EVEX | T_YMM | T_MAP5 | T_EW0 | T_B32 | T_SAE_Y | T_SAE_Z },
|
||||||
|
// 13.8
|
||||||
|
{ 0x69, "vcvtps2ibs", T_MUST_EVEX | T_YMM | T_66 | T_MAP5 | T_EW0 | T_B32 | T_ER_Y | T_ER_Z },
|
||||||
|
{ 0x6B, "vcvtps2iubs", T_MUST_EVEX | T_YMM | T_66 | T_MAP5 | T_EW0 | T_B32 | T_ER_Y | T_ER_Z },
|
||||||
|
{ 0x68, "vcvttps2ibs", T_MUST_EVEX | T_YMM | T_66 | T_MAP5 | T_EW0 | T_B32 | T_ER_Y | T_ER_Z },
|
||||||
|
{ 0x6A, "vcvttps2iubs", T_MUST_EVEX | T_YMM | T_66 | T_MAP5 | T_EW0 | T_B32 | T_ER_Y | T_ER_Z },
|
||||||
|
// 13.10
|
||||||
|
{ 0x6C, "vcvttps2udqs", T_MUST_EVEX | T_YMM | T_MAP5 | T_EW0 | T_B32 | T_SAE_Y | T_SAE_Z },
|
||||||
};
|
};
|
||||||
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
|
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
|
||||||
const Tbl *p = &tbl[i];
|
const Tbl *p = &tbl[i];
|
||||||
|
@ -240,7 +264,6 @@ void putM_X()
|
||||||
{ 0x7F, "vmovdqu32", T_F3 | T_0F | T_MUST_EVEX | T_YMM | T_EW0 | T_ER_X | T_ER_Y | T_ER_Z | T_M_K },
|
{ 0x7F, "vmovdqu32", T_F3 | T_0F | T_MUST_EVEX | T_YMM | T_EW0 | T_ER_X | T_ER_Y | T_ER_Z | T_M_K },
|
||||||
{ 0x7F, "vmovdqu64", T_F3 | T_0F | T_MUST_EVEX | T_YMM | T_EW1 | T_ER_X | T_ER_Y | T_ER_Z | T_M_K },
|
{ 0x7F, "vmovdqu64", T_F3 | T_0F | T_MUST_EVEX | T_YMM | T_EW1 | T_ER_X | T_ER_Y | T_ER_Z | T_M_K },
|
||||||
{ 0x11, "vmovsh", T_F3 | T_MAP5 | T_MUST_EVEX | T_EW0 | T_N2 | T_M_K },
|
{ 0x11, "vmovsh", T_F3 | T_MAP5 | T_MUST_EVEX | T_EW0 | T_N2 | T_M_K },
|
||||||
{ 0x7E, "vmovw", T_66 | T_MAP5 | T_MUST_EVEX | T_N2 },
|
|
||||||
};
|
};
|
||||||
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
|
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
|
||||||
const Tbl *p = &tbl[i];
|
const Tbl *p = &tbl[i];
|
||||||
|
@ -447,6 +470,13 @@ void putX_X_XM_IMM()
|
||||||
{ 0x1B, "vcvtne2ph2hf8s", T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_YMM | T_B16 | T_N1, false },
|
{ 0x1B, "vcvtne2ph2hf8s", T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_YMM | T_B16 | T_N1, false },
|
||||||
|
|
||||||
{ 0x52, "vdpphps", T_MUST_EVEX | T_0F38 | T_EW0 | T_YMM | T_B32, false },
|
{ 0x52, "vdpphps", T_MUST_EVEX | T_0F38 | T_EW0 | T_YMM | T_B32, false },
|
||||||
|
{ 0x52, "vminmaxnepbf16", T_MUST_EVEX | T_F2 | T_0F3A | T_EW0 | T_YMM | T_B16, true },
|
||||||
|
{ 0x52, "vminmaxpd", T_MUST_EVEX | T_66 | T_0F3A | T_EW1 | T_YMM | T_B64 | T_SAE_Y | T_SAE_Z, true },
|
||||||
|
{ 0x52, "vminmaxph", T_MUST_EVEX | T_0F3A | T_EW0 | T_YMM | T_B16 | T_SAE_Y | T_SAE_Z, true },
|
||||||
|
{ 0x52, "vminmaxps", T_MUST_EVEX | T_66 | T_0F3A | T_EW0 | T_YMM | T_B32 | T_SAE_Y | T_SAE_Z, true },
|
||||||
|
{ 0x53, "vminmaxsd", T_MUST_EVEX | T_66 | T_0F3A | T_EW1 | T_SAE_X | T_N8, true },
|
||||||
|
{ 0x53, "vminmaxsh", T_MUST_EVEX | T_0F3A | T_EW0 | T_SAE_X | T_N2, true },
|
||||||
|
{ 0x53, "vminmaxss", T_MUST_EVEX | T_66 | T_0F3A | T_EW0 | T_SAE_X | T_N4, true },
|
||||||
};
|
};
|
||||||
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
|
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
|
||||||
const Tbl *p = &tbl[i];
|
const Tbl *p = &tbl[i];
|
||||||
|
@ -658,6 +688,22 @@ void putCvt()
|
||||||
{ 0x7B, "vcvtusi2sh", T_F3 | T_MAP5 | T_MUST_EVEX | T_ER_R | T_M_K, 6 },
|
{ 0x7B, "vcvtusi2sh", T_F3 | T_MAP5 | T_MUST_EVEX | T_ER_R | T_M_K, 6 },
|
||||||
|
|
||||||
{ 0x72, "vcvtneps2bf16", T_MUST_EVEX | T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 2 },
|
{ 0x72, "vcvtneps2bf16", T_MUST_EVEX | T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 2 },
|
||||||
|
// 13.2
|
||||||
|
{ 0x6D, "vcvttpd2dqs", T_MUST_EVEX | T_YMM | T_MAP5 | T_EW1 | T_B64 | T_SAE_Y | T_SAE_Z, 2 },
|
||||||
|
// 13.4
|
||||||
|
{ 0x6C, "vcvttpd2udqs", T_MUST_EVEX | T_YMM | T_MAP5 | T_EW1 | T_B64 | T_SAE_Y | T_SAE_Z, 2 },
|
||||||
|
// 13.9
|
||||||
|
{ 0x6D, "vcvttps2qqs", T_MUST_EVEX | T_YMM | T_66 | T_MAP5 | T_EW0 | T_B32 | T_SAE_X | T_SAE_Y | T_N8 | T_N_VL, 1 },
|
||||||
|
// 13.11
|
||||||
|
{ 0x6C, "vcvttps2uqqs", T_MUST_EVEX | T_YMM | T_66 | T_MAP5 | T_EW0 | T_B32 | T_SAE_X | T_SAE_Y | T_N8 | T_N_VL, 1 },
|
||||||
|
// 13.12
|
||||||
|
{ 0x6D, "vcvttsd2sis", T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_SAE_X | T_N8, 0 },
|
||||||
|
// 13.13
|
||||||
|
{ 0x6C, "vcvttsd2usis", T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_SAE_X | T_N8, 0 },
|
||||||
|
// 13.14
|
||||||
|
{ 0x6D, "vcvttss2sis", T_MUST_EVEX | T_F3 | T_MAP5 | T_EW0 | T_SAE_X | T_N4, 0 },
|
||||||
|
// 13.15
|
||||||
|
{ 0x6C, "vcvttss2usis", T_MUST_EVEX | T_F3 | T_MAP5 | T_EW0 | T_SAE_X | T_N4, 0 },
|
||||||
};
|
};
|
||||||
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
|
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
|
||||||
const Tbl& p = tbl[i];
|
const Tbl& p = tbl[i];
|
||||||
|
@ -666,10 +712,10 @@ void putCvt()
|
||||||
case 0:
|
case 0:
|
||||||
printf("void %s(const Reg32e& r, const Operand& op) { uint64_t type = (%s) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x%02X); }\n", p.name, s.c_str(), p.code);
|
printf("void %s(const Reg32e& r, const Operand& op) { uint64_t type = (%s) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x%02X); }\n", p.name, s.c_str(), p.code);
|
||||||
break;
|
break;
|
||||||
case 1:
|
case 1: // (x, x/m), (y, x/m256), (z, y/m)
|
||||||
printf("void %s(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, %s, 0x%02X); }\n", p.name, s.c_str(), p.code);
|
printf("void %s(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, %s, 0x%02X); }\n", p.name, s.c_str(), p.code);
|
||||||
break;
|
break;
|
||||||
case 2:
|
case 2: // (x, x/m), (x, y/m256), (y, z/m)
|
||||||
printf("void %s(const Xmm& x, const Operand& op) { opCvt2(x, op, %s, 0x%02X); }\n", p.name, s.c_str(), p.code);
|
printf("void %s(const Xmm& x, const Operand& op) { opCvt2(x, op, %s, 0x%02X); }\n", p.name, s.c_str(), p.code);
|
||||||
break;
|
break;
|
||||||
case 3:
|
case 3:
|
||||||
|
@ -1032,12 +1078,6 @@ void putFP16_2()
|
||||||
printf("void vmovsh(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, xm0, addr, %s, 0x10); }\n", s.c_str());
|
printf("void vmovsh(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, xm0, addr, %s, 0x10); }\n", s.c_str());
|
||||||
printf("void vmovsh(const Xmm& x1, const Xmm& x2, const Xmm& x3) { opAVX_X_X_XM(x1, x2, x3, %s, 0x10); }\n", s.c_str());
|
printf("void vmovsh(const Xmm& x1, const Xmm& x2, const Xmm& x3) { opAVX_X_X_XM(x1, x2, x3, %s, 0x10); }\n", s.c_str());
|
||||||
}
|
}
|
||||||
{
|
|
||||||
uint64_t type = T_66 | T_MAP5 | T_MUST_EVEX | T_N2;
|
|
||||||
std::string s = type2String(type);
|
|
||||||
printf("void vmovw(const Xmm& x, const Operand& op) { if (!op.isREG(32|64) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_X_XM(x, xm0, op, %s, 0x6E); }\n", s.c_str());
|
|
||||||
printf("void vmovw(const Reg32e& r, const Xmm& x) { opAVX_X_X_XM(x, xm0, r, %s, 0x7E); }\n", s.c_str());
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void putFP16()
|
void putFP16()
|
||||||
|
|
|
@ -1443,6 +1443,7 @@ void put()
|
||||||
printf("void %s(const Xmm& x, const Operand& op) { opSSE_APX(x, op, T_0F38, 0x%02X, T_MUST_EVEX, 0x%02X); }\n", p->name, p->code, p->code2);
|
printf("void %s(const Xmm& x, const Operand& op) { opSSE_APX(x, op, T_0F38, 0x%02X, T_MUST_EVEX, 0x%02X); }\n", p->name, p->code, p->code2);
|
||||||
}
|
}
|
||||||
puts("void sha1rnds4(const Xmm& x, const Operand& op, uint8_t imm) { opSSE_APX(x, op, T_0F3A, 0xCC, T_MUST_EVEX, 0xD4, imm); }");
|
puts("void sha1rnds4(const Xmm& x, const Operand& op, uint8_t imm) { opSSE_APX(x, op, T_0F3A, 0xCC, T_MUST_EVEX, 0xD4, imm); }");
|
||||||
|
puts("void sha1msg12(const Xmm& x, const Operand& op) { opROO(Reg(), op, x, T_MUST_EVEX, 0xD9); }");
|
||||||
}
|
}
|
||||||
// (m, x), (m, y)
|
// (m, x), (m, y)
|
||||||
{
|
{
|
||||||
|
@ -1733,9 +1734,6 @@ void put()
|
||||||
}
|
}
|
||||||
// mov
|
// mov
|
||||||
{
|
{
|
||||||
printf("void vmovd(const Xmm& x, const Operand& op) { if (!op.isREG(32) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_X_XM(x, xm0, op, T_0F | T_66 | T_W0 | T_EVEX | T_N4, 0x6E); }\n");
|
|
||||||
printf("void vmovd(const Operand& op, const Xmm& x) { if (!op.isREG(32) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_X_XM(x, xm0, op, T_0F | T_66 | T_W0 | T_EVEX | T_N4, 0x7E); }\n");
|
|
||||||
|
|
||||||
printf("void vmovq(const Xmm& x, const Address& addr) { uint64_t type; uint8_t code; if (x.getIdx() < 16) { type = T_0F | T_F3; code = 0x7E; } else { type = T_0F | T_66 | T_EVEX | T_EW1 | T_N8; code = 0x6E; } opAVX_X_X_XM(x, xm0, addr, type, code); }\n");
|
printf("void vmovq(const Xmm& x, const Address& addr) { uint64_t type; uint8_t code; if (x.getIdx() < 16) { type = T_0F | T_F3; code = 0x7E; } else { type = T_0F | T_66 | T_EVEX | T_EW1 | T_N8; code = 0x6E; } opAVX_X_X_XM(x, xm0, addr, type, code); }\n");
|
||||||
printf("void vmovq(const Address& addr, const Xmm& x) { opAVX_X_X_XM(x, xm0, addr, T_0F | T_66 | T_EVEX | T_EW1 | T_N8, x.getIdx() < 16 ? 0xD6 : 0x7E); }\n");
|
printf("void vmovq(const Address& addr, const Xmm& x) { opAVX_X_X_XM(x, xm0, addr, T_0F | T_66 | T_EVEX | T_EW1 | T_N8, x.getIdx() < 16 ? 0xD6 : 0x7E); }\n");
|
||||||
printf("void vmovq(const Xmm& x1, const Xmm& x2) { opAVX_X_X_XM(x1, xm0, x2, T_0F | T_F3 | T_EVEX | T_EW1 | T_N8, 0x7E); }\n");
|
printf("void vmovq(const Xmm& x1, const Xmm& x2) { opAVX_X_X_XM(x1, xm0, x2, T_0F | T_F3 | T_EVEX | T_EW1 | T_N8, 0x7E); }\n");
|
||||||
|
@ -1899,36 +1897,6 @@ void put()
|
||||||
printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, %s, 0x%02X, encoding); }\n", p->name, s.c_str(), p->code);
|
printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, %s, 0x%02X, encoding); }\n", p->name, s.c_str(), p->code);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// avx-vnni-int8
|
|
||||||
// avx-vnni-int16
|
|
||||||
#if 0
|
|
||||||
{
|
|
||||||
const struct Tbl {
|
|
||||||
uint8_t code;
|
|
||||||
const char *name;
|
|
||||||
uint64_t type;
|
|
||||||
} tbl[] = {
|
|
||||||
// { 0x50, "vpdpbssd", T_F2 | T_0F38 | T_W0 | T_YMM },
|
|
||||||
// { 0x51, "vpdpbssds", T_F2 | T_0F38 | T_W0 | T_YMM },
|
|
||||||
// { 0x50, "vpdpbsud", T_F3 | T_0F38 | T_W0 | T_YMM },
|
|
||||||
// { 0x51, "vpdpbsuds", T_F3 | T_0F38 | T_W0 | T_YMM },
|
|
||||||
// { 0x50, "vpdpbuud", T_0F38 | T_W0 | T_YMM },
|
|
||||||
// { 0x51, "vpdpbuuds", T_0F38 | T_W0 | T_YMM },
|
|
||||||
|
|
||||||
// { 0xD2, "vpdpwsud", T_F3 | T_0F38 | T_W0 | T_YMM },
|
|
||||||
// { 0xD3, "vpdpwsuds", T_F3 | T_0F38 | T_W0 | T_YMM },
|
|
||||||
// { 0xD2, "vpdpwusd", T_66 | T_0F38 | T_W0 | T_YMM },
|
|
||||||
// { 0xD3, "vpdpwusds", T_66 | T_0F38 | T_W0 | T_YMM },
|
|
||||||
// { 0xD2, "vpdpwuud", T_0F38 | T_W0 | T_YMM },
|
|
||||||
// { 0xD3, "vpdpwuuds", T_0F38 | T_W0 | T_YMM },
|
|
||||||
};
|
|
||||||
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
|
|
||||||
const Tbl *p = &tbl[i];
|
|
||||||
std::string s = type2String(p->type);
|
|
||||||
printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, %s, 0x%02X); }\n", p->name, s.c_str(), p->code);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void put32()
|
void put32()
|
||||||
|
|
|
@ -5,7 +5,7 @@
|
||||||
project(
|
project(
|
||||||
'xbyak',
|
'xbyak',
|
||||||
'cpp',
|
'cpp',
|
||||||
version: '7.10',
|
version: '7.20',
|
||||||
license: 'BSD-3-Clause',
|
license: 'BSD-3-Clause',
|
||||||
default_options: 'b_ndebug=if-release'
|
default_options: 'b_ndebug=if-release'
|
||||||
)
|
)
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
|
|
||||||
# Xbyak 7.10 [![Badge Build]][Build Status]
|
# Xbyak 7.20 [![Badge Build]][Build Status]
|
||||||
|
|
||||||
*A C++ JIT assembler for x86 (IA32), x64 (AMD64, x86-64)*
|
*A C++ JIT assembler for x86 (IA32), x64 (AMD64, x86-64)*
|
||||||
|
|
||||||
|
@ -20,8 +20,7 @@ It is named from a Japanese word [開闢](https://translate.google.com/?hl=ja&sl
|
||||||
|
|
||||||
- header file only
|
- header file only
|
||||||
- Intel/MASM like syntax
|
- Intel/MASM like syntax
|
||||||
- fully support AVX-512
|
- Full support for AVX-512, APX, and AVX10.2
|
||||||
- support APX/AVX10
|
|
||||||
|
|
||||||
**Note**:
|
**Note**:
|
||||||
Use `and_()`, `or_()`, ... instead of `and()`, `or()`.
|
Use `and_()`, `or_()`, ... instead of `and()`, `or()`.
|
||||||
|
@ -33,6 +32,7 @@ If you want to use them, then specify `-fno-operator-names` option to gcc/clang.
|
||||||
|
|
||||||
### News
|
### News
|
||||||
|
|
||||||
|
- support AVX10.2
|
||||||
- support xresldtrk/xsusldtrk
|
- support xresldtrk/xsusldtrk
|
||||||
- support RAO-INT for APX
|
- support RAO-INT for APX
|
||||||
- support AVX10 detection, AESKLE, WIDE_KL, KEYLOCKER, KEYLOCKER_WIDE
|
- support AVX10 detection, AESKLE, WIDE_KL, KEYLOCKER, KEYLOCKER_WIDE
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
|
|
||||||
C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 7.10
|
C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 7.20
|
||||||
|
|
||||||
-----------------------------------------------------------------------------
|
-----------------------------------------------------------------------------
|
||||||
◎概要
|
◎概要
|
||||||
|
@ -14,7 +14,7 @@
|
||||||
xbyak.hをインクルードするだけですぐ利用することができます。
|
xbyak.hをインクルードするだけですぐ利用することができます。
|
||||||
C++の枠組み内で閉じているため、外部アセンブラは不要です。
|
C++の枠組み内で閉じているため、外部アセンブラは不要です。
|
||||||
32bit/64bit両対応です。
|
32bit/64bit両対応です。
|
||||||
対応ニーモニック:特権命令除くx86, MMX/MMX2/SSE/SSE2/SSE3/SSSE3/SSE4/FPU(一部)/AVX/AVX2/FMA/VEX-encoded GPR
|
対応ニーモニック:特権命令除くx86, MMX/MMX2/SSE/SSE2/SSE3/SSSE3/SSE4/FPU(一部)/AVX/AVX2/FMA/AVX-512/APX/AVX10.2
|
||||||
|
|
||||||
・Windows Xp(32bit, 64bit), Windows 7/Linux(32bit, 64bit)/Intel Mac対応
|
・Windows Xp(32bit, 64bit), Windows 7/Linux(32bit, 64bit)/Intel Mac対応
|
||||||
Windows Xp, Windows 7上ではVC2008, VC2010, VC2012
|
Windows Xp, Windows 7上ではVC2008, VC2010, VC2012
|
||||||
|
@ -46,7 +46,7 @@ Linuxではmake installで/usr/local/include/xbyakにコピーされます。
|
||||||
-----------------------------------------------------------------------------
|
-----------------------------------------------------------------------------
|
||||||
◎新機能
|
◎新機能
|
||||||
|
|
||||||
APX/AVX10対応
|
APX/AVX10.2対応
|
||||||
|
|
||||||
例外なしモード追加
|
例外なしモード追加
|
||||||
XBYAK_NO_EXCEPTIONを定義してコンパイルするとgcc/clangで-fno-exceptionsオプションでコンパイルできます。
|
XBYAK_NO_EXCEPTIONを定義してコンパイルするとgcc/clangで-fno-exceptionsオプションでコンパイルできます。
|
||||||
|
@ -404,6 +404,9 @@ sample/{echo,hello}.bfは http://www.kmonos.net/alang/etc/brainfuck.php から
|
||||||
-----------------------------------------------------------------------------
|
-----------------------------------------------------------------------------
|
||||||
◎履歴
|
◎履歴
|
||||||
|
|
||||||
|
2024/10/15 ver 7.20 setDefaultEncoding/setDefaultEncodingAVX10の仕様確定
|
||||||
|
2024/10/15 ver 7.11 AVX10.2完全サポート
|
||||||
|
2024/10/13 ver 7.10 AVX10 integer and fp16 vnni, mediaの新命令対応. setDefaultEncodingの拡張.
|
||||||
2024/10/10 ver 7.09.1 vpcompressbとvpcompresswの名前修正
|
2024/10/10 ver 7.09.1 vpcompressbとvpcompresswの名前修正
|
||||||
2024/10/08 ver 7.09 AVX10.2のYMMレジスタの埋め込み丸め対応
|
2024/10/08 ver 7.09 AVX10.2のYMMレジスタの埋め込み丸め対応
|
||||||
2024/10/07 ver 7.08 rdfabaseなどサポート
|
2024/10/07 ver 7.08 rdfabaseなどサポート
|
||||||
|
|
|
@ -60,9 +60,12 @@ apx: apx.cpp $(XBYAK_INC)
|
||||||
avx10_test: avx10_test.cpp $(XBYAK_INC)
|
avx10_test: avx10_test.cpp $(XBYAK_INC)
|
||||||
$(CXX) $(CFLAGS) avx10_test.cpp -o $@ -DXBYAK64
|
$(CXX) $(CFLAGS) avx10_test.cpp -o $@ -DXBYAK64
|
||||||
|
|
||||||
TEST_FILES=old.txt new-ymm.txt bf16.txt comp.txt convert.txt
|
TEST_FILES=old.txt new-ymm.txt bf16.txt comp.txt misc.txt convert.txt minmax.txt saturation.txt
|
||||||
xed_test:
|
xed_test:
|
||||||
@for target in $(addprefix avx10/, $(TEST_FILES)); do ./test_by_xed.sh $$target; done
|
@set -e; \
|
||||||
|
for target in $(addprefix avx10/, $(TEST_FILES)); do \
|
||||||
|
./test_by_xed.sh $$target || exit 1; \
|
||||||
|
done
|
||||||
|
|
||||||
test_nm: normalize_prefix $(TARGET)
|
test_nm: normalize_prefix $(TARGET)
|
||||||
$(MAKE) -C ../gen
|
$(MAKE) -C ../gen
|
||||||
|
|
66
test/avx10/minmax.txt
Normal file
66
test/avx10/minmax.txt
Normal file
|
@ -0,0 +1,66 @@
|
||||||
|
vminmaxnepbf16(xm1|k3|T_z, xm2, xm3, 5);
|
||||||
|
vminmaxnepbf16(xm1|k3|T_z, xm2, ptr[rax+128], 5);
|
||||||
|
vminmaxnepbf16(xm1|k3|T_z, xm2, ptr_b[rax+128], 5);
|
||||||
|
|
||||||
|
vminmaxnepbf16(ym1|k3|T_z, ym2, ym3, 5);
|
||||||
|
vminmaxnepbf16(ym1|k3|T_z, ym2, ptr[rax+128], 5);
|
||||||
|
vminmaxnepbf16(ym1|k3|T_z, ym2, ptr_b[rax+128], 5);
|
||||||
|
|
||||||
|
vminmaxnepbf16(zm1|k3|T_z, zm2, zm3, 5);
|
||||||
|
vminmaxnepbf16(zm1|k3|T_z, zm2, ptr[rax+128], 5);
|
||||||
|
vminmaxnepbf16(zm1|k3|T_z, zm2, ptr_b[rax+128], 5);
|
||||||
|
//
|
||||||
|
vminmaxpd(xm1|k3|T_z, xm2, xm3, 5);
|
||||||
|
vminmaxpd(xm1|k3|T_z, xm2, ptr[rax+128], 5);
|
||||||
|
vminmaxpd(xm1|k3|T_z, xm2, ptr_b[rax+128], 5);
|
||||||
|
|
||||||
|
vminmaxpd(ym1|k3|T_z, ym2, ym3, 5);
|
||||||
|
vminmaxpd(ym1|k3|T_z, ym2, ym3|T_sae, 5);
|
||||||
|
vminmaxpd(ym1|k3|T_z, ym2, ptr[rax+128], 5);
|
||||||
|
vminmaxpd(ym1|k3|T_z, ym2, ptr_b[rax+128], 5);
|
||||||
|
|
||||||
|
vminmaxpd(zm1|k3|T_z, zm2, zm3, 5);
|
||||||
|
vminmaxpd(zm1|k3|T_z, zm2, zm3|T_sae, 5);
|
||||||
|
vminmaxpd(zm1|k3|T_z, zm2, ptr[rax+128], 5);
|
||||||
|
vminmaxpd(zm1|k3|T_z, zm2, ptr_b[rax+128], 5);
|
||||||
|
//
|
||||||
|
vminmaxph(xm1|k3|T_z, xm2, xm3, 5);
|
||||||
|
vminmaxph(xm1|k3|T_z, xm2, ptr[rax+128], 5);
|
||||||
|
vminmaxph(xm1|k3|T_z, xm2, ptr[rax+128], 5);
|
||||||
|
vminmaxph(xm1|k3|T_z, xm2, ptr_b[rax+128], 5);
|
||||||
|
|
||||||
|
vminmaxph(ym1|k3|T_z, ym2, ym3, 5);
|
||||||
|
vminmaxph(ym1|k3|T_z, ym2, ym3|T_sae, 5);
|
||||||
|
vminmaxph(ym1|k3|T_z, ym2, ptr[rax+128], 5);
|
||||||
|
vminmaxph(ym1|k3|T_z, ym2, ptr_b[rax+128], 5);
|
||||||
|
|
||||||
|
vminmaxph(zm1|k3|T_z, zm2, zm3, 5);
|
||||||
|
vminmaxph(zm1|k3|T_z, zm2, zm3|T_sae, 5);
|
||||||
|
vminmaxph(zm1|k3|T_z, zm2, ptr[rax+128], 5);
|
||||||
|
vminmaxph(zm1|k3|T_z, zm2, ptr_b[rax+128], 5);
|
||||||
|
//
|
||||||
|
vminmaxps(xm1|k3|T_z, xm2, xm3, 5);
|
||||||
|
vminmaxps(xm1|k3|T_z, xm2, ptr[rax+128], 5);
|
||||||
|
vminmaxps(xm1|k3|T_z, xm2, ptr_b[rax+128], 5);
|
||||||
|
|
||||||
|
vminmaxps(ym1|k3|T_z, ym2, ym3, 5);
|
||||||
|
vminmaxps(ym1|k3|T_z, ym2, ym3|T_sae, 5);
|
||||||
|
vminmaxps(ym1|k3|T_z, ym2, ptr[rax+128], 5);
|
||||||
|
vminmaxps(ym1|k3|T_z, ym2, ptr_b[rax+128], 5);
|
||||||
|
|
||||||
|
vminmaxps(zm1|k3|T_z, zm2, zm3, 5);
|
||||||
|
vminmaxps(zm1|k3|T_z, zm2, zm3|T_sae, 5);
|
||||||
|
vminmaxps(zm1|k3|T_z, zm2, ptr[rax+128], 5);
|
||||||
|
vminmaxps(zm1|k3|T_z, zm2, ptr_b[rax+128], 5);
|
||||||
|
//
|
||||||
|
vminmaxsd(xm1|k3|T_z, xm2, xm3, 5);
|
||||||
|
vminmaxsd(xm1|k3|T_z, xm2, xm3|T_sae, 5);
|
||||||
|
vminmaxsd(xm1|k3|T_z, xm2, ptr[rax+128], 5);
|
||||||
|
//
|
||||||
|
vminmaxsh(xm1|k3|T_z, xm2, xm3, 5);
|
||||||
|
vminmaxsh(xm1|k3|T_z, xm2, xm3|T_sae, 5);
|
||||||
|
vminmaxsh(xm1|k3|T_z, xm2, ptr[rax+128], 5);
|
||||||
|
//
|
||||||
|
vminmaxss(xm1|k3|T_z, xm2, xm3, 5);
|
||||||
|
vminmaxss(xm1|k3|T_z, xm2, xm3|T_sae, 5);
|
||||||
|
vminmaxss(xm1|k3|T_z, xm2, ptr[rax+128], 5);
|
|
@ -1,3 +1,4 @@
|
||||||
|
// AVX10 integer and FP16 VNNI, media and zero-extending
|
||||||
vdpphps(xm1, xm2, xm3);
|
vdpphps(xm1, xm2, xm3);
|
||||||
vdpphps(xm1, xm2, ptr[rax+128]);
|
vdpphps(xm1, xm2, ptr[rax+128]);
|
||||||
vdpphps(xm1, xm2, ptr_b[rax+128]);
|
vdpphps(xm1, xm2, ptr_b[rax+128]);
|
||||||
|
@ -165,3 +166,14 @@ vpdpwuuds(ym1, ym2, ptr_b[rax+128]);
|
||||||
vpdpwuuds(zm1, zm2, zm3);
|
vpdpwuuds(zm1, zm2, zm3);
|
||||||
vpdpwuuds(zm1, zm2, ptr[rax+128]);
|
vpdpwuuds(zm1, zm2, ptr[rax+128]);
|
||||||
vpdpwuuds(zm1, zm2, ptr_b[rax+128]);
|
vpdpwuuds(zm1, zm2, ptr_b[rax+128]);
|
||||||
|
|
||||||
|
//
|
||||||
|
vmovd(xm10, xm20);
|
||||||
|
vmovd(xm1, xm2);
|
||||||
|
vmovd(xm10, ptr[rax+128]);
|
||||||
|
vmovd(ptr[rax+128], xm30);
|
||||||
|
//
|
||||||
|
vmovw(xm1, xm20);
|
||||||
|
vmovw(xm1, xm2);
|
||||||
|
vmovw(xm3, ptr [rax+0x40]);
|
||||||
|
vmovw(ptr [rax+0x40], xm7);
|
||||||
|
|
|
@ -355,10 +355,6 @@ vgetmantsh(xmm1|k1|T_z|T_sae, xmm3, xmm5, 0x6);
|
||||||
vmovsh(xmm1|k1|T_z, ptr [rax+0x40]);
|
vmovsh(xmm1|k1|T_z, ptr [rax+0x40]);
|
||||||
vmovsh(ptr [rax+0x40]|k1, xmm1);
|
vmovsh(ptr [rax+0x40]|k1, xmm1);
|
||||||
vmovsh(xmm1|k2|T_z, xmm3, xmm5);
|
vmovsh(xmm1|k2|T_z, xmm3, xmm5);
|
||||||
vmovw(xmm1, r13d);
|
|
||||||
vmovw(xmm3, ptr [rax+0x40]);
|
|
||||||
vmovw(r9d, xmm1);
|
|
||||||
vmovw(ptr [rax+0x40], xmm7);
|
|
||||||
vcvtsd2sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3);
|
vcvtsd2sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3);
|
||||||
vcvtsd2sh(xmm1, xmm2, ptr [rax+0x40]);
|
vcvtsd2sh(xmm1, xmm2, ptr [rax+0x40]);
|
||||||
vcvtsh2sd(xmm1|k1|T_z|T_sae, xmm2, xmm3);
|
vcvtsh2sd(xmm1|k1|T_z|T_sae, xmm2, xmm3);
|
||||||
|
|
310
test/avx10/saturation.txt
Normal file
310
test/avx10/saturation.txt
Normal file
|
@ -0,0 +1,310 @@
|
||||||
|
//
|
||||||
|
vcvtnebf162ibs(xm1, xm2);
|
||||||
|
vcvtnebf162ibs(xm1, ptr[rax+128]);
|
||||||
|
vcvtnebf162ibs(xm1, ptr_b[rax+128]);
|
||||||
|
|
||||||
|
vcvtnebf162ibs(ym1, ym2);
|
||||||
|
vcvtnebf162ibs(ym1, ptr[rax+128]);
|
||||||
|
vcvtnebf162ibs(ym1, ptr_b[rax+128]);
|
||||||
|
|
||||||
|
vcvtnebf162ibs(zm1, zm2);
|
||||||
|
vcvtnebf162ibs(zm1, ptr[rax+128]);
|
||||||
|
vcvtnebf162ibs(zm1, ptr_b[rax+128]);
|
||||||
|
//
|
||||||
|
vcvtnebf162iubs(xm1, xm2);
|
||||||
|
vcvtnebf162iubs(xm1, ptr[rax+128]);
|
||||||
|
vcvtnebf162iubs(xm1, ptr_b[rax+128]);
|
||||||
|
|
||||||
|
vcvtnebf162iubs(ym1, ym2);
|
||||||
|
vcvtnebf162iubs(ym1, ptr[rax+128]);
|
||||||
|
vcvtnebf162iubs(ym1, ptr_b[rax+128]);
|
||||||
|
|
||||||
|
vcvtnebf162iubs(zm1, zm2);
|
||||||
|
vcvtnebf162iubs(zm1, ptr[rax+128]);
|
||||||
|
vcvtnebf162iubs(zm1, ptr_b[rax+128]);
|
||||||
|
//
|
||||||
|
vcvttnebf162ibs(xm1, xm2);
|
||||||
|
vcvttnebf162ibs(xm1, ptr[rax+128]);
|
||||||
|
vcvttnebf162ibs(xm1, ptr_b[rax+128]);
|
||||||
|
|
||||||
|
vcvttnebf162ibs(ym1, ym2);
|
||||||
|
vcvttnebf162ibs(ym1, ptr[rax+128]);
|
||||||
|
vcvttnebf162ibs(ym1, ptr_b[rax+128]);
|
||||||
|
|
||||||
|
vcvttnebf162ibs(zm1, zm2);
|
||||||
|
vcvttnebf162ibs(zm1, ptr[rax+128]);
|
||||||
|
vcvttnebf162ibs(zm1, ptr_b[rax+128]);
|
||||||
|
//
|
||||||
|
vcvttnebf162iubs(xm1, xm2);
|
||||||
|
vcvttnebf162iubs(xm1, ptr[rax+128]);
|
||||||
|
vcvttnebf162iubs(xm1, ptr_b[rax+128]);
|
||||||
|
|
||||||
|
vcvttnebf162iubs(ym1, ym2);
|
||||||
|
vcvttnebf162iubs(ym1, ptr[rax+128]);
|
||||||
|
vcvttnebf162iubs(ym1, ptr_b[rax+128]);
|
||||||
|
|
||||||
|
vcvttnebf162iubs(zm1, zm2);
|
||||||
|
vcvttnebf162iubs(zm1, ptr[rax+128]);
|
||||||
|
vcvttnebf162iubs(zm1, ptr_b[rax+128]);
|
||||||
|
//
|
||||||
|
vcvttpd2qqs(xm1, xm2);
|
||||||
|
vcvttpd2qqs(xm1, ptr[rax+128]);
|
||||||
|
vcvttpd2qqs(xm1, ptr_b[rax+128]);
|
||||||
|
|
||||||
|
vcvttpd2qqs(ym1, ym2);
|
||||||
|
vcvttpd2qqs(ym1, ym2|T_sae);
|
||||||
|
vcvttpd2qqs(ym1, ptr[rax+128]);
|
||||||
|
vcvttpd2qqs(ym1, ptr_b[rax+128]);
|
||||||
|
|
||||||
|
vcvttpd2qqs(zm1, zm2);
|
||||||
|
vcvttpd2qqs(zm1, zm2|T_sae);
|
||||||
|
vcvttpd2qqs(zm1, ptr[rax+128]);
|
||||||
|
vcvttpd2qqs(zm1, ptr_b[rax+128]);
|
||||||
|
//
|
||||||
|
vcvttpd2uqqs(xm1, xm2);
|
||||||
|
vcvttpd2uqqs(xm1, ptr[rax+128]);
|
||||||
|
vcvttpd2uqqs(xm1, ptr_b[rax+128]);
|
||||||
|
|
||||||
|
vcvttpd2uqqs(ym1, ym2);
|
||||||
|
vcvttpd2uqqs(ym1, ym2|T_sae);
|
||||||
|
vcvttpd2uqqs(ym1, ptr[rax+128]);
|
||||||
|
vcvttpd2uqqs(ym1, ptr_b[rax+128]);
|
||||||
|
|
||||||
|
vcvttpd2uqqs(zm1, zm2);
|
||||||
|
vcvttpd2uqqs(zm1, zm2|T_sae);
|
||||||
|
vcvttpd2uqqs(zm1, ptr[rax+128]);
|
||||||
|
vcvttpd2uqqs(zm1, ptr_b[rax+128]);
|
||||||
|
//
|
||||||
|
vcvtph2ibs(xm1, xm2);
|
||||||
|
vcvtph2ibs(xm1, ptr[rax+128]);
|
||||||
|
vcvtph2ibs(xm1, ptr_b[rax+128]);
|
||||||
|
|
||||||
|
vcvtph2ibs(ym1, ym2);
|
||||||
|
vcvtph2ibs(ym1, ym2|T_rd_sae);
|
||||||
|
vcvtph2ibs(ym1, ptr[rax+128]);
|
||||||
|
vcvtph2ibs(ym1, ptr_b[rax+128]);
|
||||||
|
|
||||||
|
vcvtph2ibs(zm1, zm2);
|
||||||
|
vcvtph2ibs(zm1, zm2|T_ru_sae);
|
||||||
|
vcvtph2ibs(zm1, ptr[rax+128]);
|
||||||
|
vcvtph2ibs(zm1, ptr_b[rax+128]);
|
||||||
|
//
|
||||||
|
vcvtph2iubs(xm1, xm2);
|
||||||
|
vcvtph2iubs(xm1, ptr[rax+128]);
|
||||||
|
vcvtph2iubs(xm1, ptr_b[rax+128]);
|
||||||
|
|
||||||
|
vcvtph2iubs(ym1, ym2);
|
||||||
|
vcvtph2iubs(ym1, ym2|T_rd_sae);
|
||||||
|
vcvtph2iubs(ym1, ptr[rax+128]);
|
||||||
|
vcvtph2iubs(ym1, ptr_b[rax+128]);
|
||||||
|
|
||||||
|
vcvtph2iubs(zm1, zm2);
|
||||||
|
vcvtph2iubs(zm1, zm2|T_ru_sae);
|
||||||
|
vcvtph2iubs(zm1, ptr[rax+128]);
|
||||||
|
vcvtph2iubs(zm1, ptr_b[rax+128]);
|
||||||
|
//
|
||||||
|
vcvttph2ibs(xm1, xm2);
|
||||||
|
vcvttph2ibs(xm1, ptr[rax+128]);
|
||||||
|
vcvttph2ibs(xm1, ptr_b[rax+128]);
|
||||||
|
|
||||||
|
vcvttph2ibs(ym1, ym2);
|
||||||
|
vcvttph2ibs(ym1, ym2|T_rd_sae);
|
||||||
|
vcvttph2ibs(ym1, ptr[rax+128]);
|
||||||
|
vcvttph2ibs(ym1, ptr_b[rax+128]);
|
||||||
|
|
||||||
|
vcvttph2ibs(zm1, zm2);
|
||||||
|
vcvttph2ibs(zm1, zm2|T_ru_sae);
|
||||||
|
vcvttph2ibs(zm1, ptr[rax+128]);
|
||||||
|
vcvttph2ibs(zm1, ptr_b[rax+128]);
|
||||||
|
//
|
||||||
|
vcvttph2iubs(xm1, xm2);
|
||||||
|
vcvttph2iubs(xm1, ptr[rax+128]);
|
||||||
|
vcvttph2iubs(xm1, ptr_b[rax+128]);
|
||||||
|
|
||||||
|
vcvttph2iubs(ym1, ym2);
|
||||||
|
vcvttph2iubs(ym1, ym2|T_rd_sae);
|
||||||
|
vcvttph2iubs(ym1, ptr[rax+128]);
|
||||||
|
vcvttph2iubs(ym1, ptr_b[rax+128]);
|
||||||
|
|
||||||
|
vcvttph2iubs(zm1, zm2);
|
||||||
|
vcvttph2iubs(zm1, zm2|T_ru_sae);
|
||||||
|
vcvttph2iubs(zm1, ptr[rax+128]);
|
||||||
|
vcvttph2iubs(zm1, ptr_b[rax+128]);
|
||||||
|
//
|
||||||
|
vcvttps2dqs(xm1, xm2);
|
||||||
|
vcvttps2dqs(xm1, ptr[rax+128]);
|
||||||
|
vcvttps2dqs(xm1, ptr_b[rax+128]);
|
||||||
|
|
||||||
|
vcvttps2dqs(ym1, ym2);
|
||||||
|
vcvttps2dqs(ym1, ym2|T_sae);
|
||||||
|
vcvttps2dqs(ym1, ptr[rax+128]);
|
||||||
|
vcvttps2dqs(ym1, ptr_b[rax+128]);
|
||||||
|
|
||||||
|
vcvttps2dqs(zm1, zm2);
|
||||||
|
vcvttps2dqs(zm1, zm2|T_sae);
|
||||||
|
vcvttps2dqs(zm1, ptr[rax+128]);
|
||||||
|
vcvttps2dqs(zm1, ptr_b[rax+128]);
|
||||||
|
//
|
||||||
|
vcvtps2ibs(xm1, xm2);
|
||||||
|
vcvtps2ibs(xm1, ptr[rax+128]);
|
||||||
|
vcvtps2ibs(xm1, ptr_b[rax+128]);
|
||||||
|
|
||||||
|
vcvtps2ibs(ym1, ym2);
|
||||||
|
vcvtps2ibs(ym1, ym2|T_rd_sae);
|
||||||
|
vcvtps2ibs(ym1, ptr[rax+128]);
|
||||||
|
vcvtps2ibs(ym1, ptr_b[rax+128]);
|
||||||
|
|
||||||
|
vcvtps2ibs(zm1, zm2);
|
||||||
|
vcvtps2ibs(zm1, zm2|T_ru_sae);
|
||||||
|
vcvtps2ibs(zm1, ptr[rax+128]);
|
||||||
|
vcvtps2ibs(zm1, ptr_b[rax+128]);
|
||||||
|
//
|
||||||
|
vcvtps2iubs(xm1, xm2);
|
||||||
|
vcvtps2iubs(xm1, ptr[rax+128]);
|
||||||
|
vcvtps2iubs(xm1, ptr_b[rax+128]);
|
||||||
|
|
||||||
|
vcvtps2iubs(ym1, ym2);
|
||||||
|
vcvtps2iubs(ym1, ym2|T_rd_sae);
|
||||||
|
vcvtps2iubs(ym1, ptr[rax+128]);
|
||||||
|
vcvtps2iubs(ym1, ptr_b[rax+128]);
|
||||||
|
|
||||||
|
vcvtps2iubs(zm1, zm2);
|
||||||
|
vcvtps2iubs(zm1, zm2|T_ru_sae);
|
||||||
|
vcvtps2iubs(zm1, ptr[rax+128]);
|
||||||
|
vcvtps2iubs(zm1, ptr_b[rax+128]);
|
||||||
|
//
|
||||||
|
vcvttps2ibs(xm1, xm2);
|
||||||
|
vcvttps2ibs(xm1, ptr[rax+128]);
|
||||||
|
vcvttps2ibs(xm1, ptr_b[rax+128]);
|
||||||
|
|
||||||
|
vcvttps2ibs(ym1, ym2);
|
||||||
|
vcvttps2ibs(ym1, ym2|T_rd_sae);
|
||||||
|
vcvttps2ibs(ym1, ptr[rax+128]);
|
||||||
|
vcvttps2ibs(ym1, ptr_b[rax+128]);
|
||||||
|
|
||||||
|
vcvttps2ibs(zm1, zm2);
|
||||||
|
vcvttps2ibs(zm1, zm2|T_ru_sae);
|
||||||
|
vcvttps2ibs(zm1, ptr[rax+128]);
|
||||||
|
vcvttps2ibs(zm1, ptr_b[rax+128]);
|
||||||
|
//
|
||||||
|
vcvttps2iubs(xm1, xm2);
|
||||||
|
vcvttps2iubs(xm1, ptr[rax+128]);
|
||||||
|
vcvttps2iubs(xm1, ptr_b[rax+128]);
|
||||||
|
|
||||||
|
vcvttps2iubs(ym1, ym2);
|
||||||
|
vcvttps2iubs(ym1, ym2|T_rd_sae);
|
||||||
|
vcvttps2iubs(ym1, ptr[rax+128]);
|
||||||
|
vcvttps2iubs(ym1, ptr_b[rax+128]);
|
||||||
|
|
||||||
|
vcvttps2iubs(zm1, zm2);
|
||||||
|
vcvttps2iubs(zm1, zm2|T_ru_sae);
|
||||||
|
vcvttps2iubs(zm1, ptr[rax+128]);
|
||||||
|
vcvttps2iubs(zm1, ptr_b[rax+128]);
|
||||||
|
//
|
||||||
|
vcvttps2udqs(xm1, xm2);
|
||||||
|
vcvttps2udqs(xm1, ptr[rax+128]);
|
||||||
|
vcvttps2udqs(xm1, ptr_b[rax+128]);
|
||||||
|
|
||||||
|
vcvttps2udqs(ym1, ym2);
|
||||||
|
vcvttps2udqs(ym1, ym2|T_sae);
|
||||||
|
vcvttps2udqs(ym1, ptr[rax+128]);
|
||||||
|
vcvttps2udqs(ym1, ptr_b[rax+128]);
|
||||||
|
|
||||||
|
vcvttps2udqs(zm1, zm2);
|
||||||
|
vcvttps2udqs(zm1, zm2|T_sae);
|
||||||
|
vcvttps2udqs(zm1, ptr[rax+128]);
|
||||||
|
vcvttps2udqs(zm1, ptr_b[rax+128]);
|
||||||
|
|
||||||
|
//
|
||||||
|
vcvttpd2dqs(xm1|k1|T_z, xm2);
|
||||||
|
vcvttpd2dqs(xm1|k1|T_z, xword [rax+128]);
|
||||||
|
vcvttpd2dqs(xm1|k1|T_z, xword_b[rax+128]);
|
||||||
|
|
||||||
|
vcvttpd2dqs(xm1|k1|T_z, ym2);
|
||||||
|
vcvttpd2dqs(xm1|k1|T_z, ym2|T_sae);
|
||||||
|
vcvttpd2dqs(xm1|k1|T_z, yword [rax+128]);
|
||||||
|
vcvttpd2dqs(xm1|k1|T_z, yword_b[rax+128]);
|
||||||
|
|
||||||
|
vcvttpd2dqs(ym1|k1|T_z, zm2);
|
||||||
|
vcvttpd2dqs(ym1|k1|T_z, zm2|T_sae);
|
||||||
|
vcvttpd2dqs(ym1|k1|T_z, zword [rax+128]);
|
||||||
|
vcvttpd2dqs(ym1|k1|T_z, zword_b[rax+128]);
|
||||||
|
|
||||||
|
//
|
||||||
|
vcvttpd2udqs(xm1|k1|T_z, xm2);
|
||||||
|
vcvttpd2udqs(xm1|k1|T_z, xword [rax+128]);
|
||||||
|
vcvttpd2udqs(xm1|k1|T_z, xword_b[rax+128]);
|
||||||
|
|
||||||
|
vcvttpd2udqs(xm1|k1|T_z, ym2);
|
||||||
|
vcvttpd2udqs(xm1|k1|T_z, ym2|T_sae);
|
||||||
|
vcvttpd2udqs(xm1|k1|T_z, yword [rax+128]);
|
||||||
|
vcvttpd2udqs(xm1|k1|T_z, yword_b[rax+128]);
|
||||||
|
|
||||||
|
vcvttpd2udqs(ym1|k1|T_z, zm2);
|
||||||
|
vcvttpd2udqs(ym1|k1|T_z, zm2|T_sae);
|
||||||
|
vcvttpd2udqs(ym1|k1|T_z, zword [rax+128]);
|
||||||
|
vcvttpd2udqs(ym1|k1|T_z, zword_b[rax+128]);
|
||||||
|
//
|
||||||
|
vcvttps2qqs(xm1|k1|T_z, xm2);
|
||||||
|
vcvttps2qqs(xm1|k1|T_z, ptr [rax+128]);
|
||||||
|
vcvttps2qqs(xm1|k1|T_z, ptr_b[rax+128]);
|
||||||
|
|
||||||
|
vcvttps2qqs(ym1|k1|T_z, xm2);
|
||||||
|
vcvttps2qqs(ym1|k1|T_z, xm2|T_sae);
|
||||||
|
vcvttps2qqs(ym1|k1|T_z, ptr [rax+128]);
|
||||||
|
vcvttps2qqs(ym1|k1|T_z, ptr_b[rax+128]);
|
||||||
|
|
||||||
|
vcvttps2qqs(zm1, ym2);
|
||||||
|
vcvttps2qqs(zm1|k1|T_z, ym2);
|
||||||
|
vcvttps2qqs(zm1|k1|T_z|T_sae, ym2);
|
||||||
|
vcvttps2qqs(zm1|k1|T_z, ptr [rax+128]);
|
||||||
|
vcvttps2qqs(zm1|k1|T_z, ptr_b[rax+128]);
|
||||||
|
|
||||||
|
//
|
||||||
|
vcvttps2uqqs(xm1|k1|T_z, xm2);
|
||||||
|
vcvttps2uqqs(xm1|k1|T_z, ptr [rax+128]);
|
||||||
|
vcvttps2uqqs(xm1|k1|T_z, ptr_b[rax+128]);
|
||||||
|
|
||||||
|
vcvttps2uqqs(ym1|k1|T_z, xm2);
|
||||||
|
vcvttps2uqqs(ym1|k1|T_z, xm2|T_sae);
|
||||||
|
vcvttps2uqqs(ym1|k1|T_z, ptr [rax+128]);
|
||||||
|
vcvttps2uqqs(ym1|k1|T_z, ptr_b[rax+128]);
|
||||||
|
|
||||||
|
vcvttps2uqqs(zm1, ym2);
|
||||||
|
vcvttps2uqqs(zm1|k1|T_z, ym2);
|
||||||
|
vcvttps2uqqs(zm1|k1|T_z|T_sae, ym2);
|
||||||
|
vcvttps2uqqs(zm1|k1|T_z, ptr [rax+128]);
|
||||||
|
vcvttps2uqqs(zm1|k1|T_z, ptr_b[rax+128]);
|
||||||
|
|
||||||
|
//
|
||||||
|
vcvttsd2sis(eax, xm1);
|
||||||
|
vcvttsd2sis(eax, xm1|T_sae);
|
||||||
|
vcvttsd2sis(eax, ptr[rax+128]);
|
||||||
|
|
||||||
|
vcvttsd2sis(r30, xm1);
|
||||||
|
vcvttsd2sis(r30, xm1|T_sae);
|
||||||
|
vcvttsd2sis(r30, ptr[rax+128]);
|
||||||
|
//
|
||||||
|
vcvttsd2usis(eax, xm1);
|
||||||
|
vcvttsd2usis(eax, xm1|T_sae);
|
||||||
|
vcvttsd2usis(eax, ptr[rax+128]);
|
||||||
|
|
||||||
|
vcvttsd2usis(r30, xm1);
|
||||||
|
vcvttsd2usis(r30, xm1|T_sae);
|
||||||
|
vcvttsd2usis(r30, ptr[rax+128]);
|
||||||
|
//
|
||||||
|
vcvttss2sis(eax, xm1);
|
||||||
|
vcvttss2sis(eax, xm1|T_sae);
|
||||||
|
vcvttss2sis(eax, ptr[rax+128]);
|
||||||
|
|
||||||
|
vcvttss2sis(r30, xm1);
|
||||||
|
vcvttss2sis(r30, xm1|T_sae);
|
||||||
|
vcvttss2sis(r30, ptr[rax+128]);
|
||||||
|
//
|
||||||
|
vcvttss2usis(eax, xm1);
|
||||||
|
vcvttss2usis(eax, xm1|T_sae);
|
||||||
|
vcvttss2usis(eax, ptr[rax+128]);
|
||||||
|
|
||||||
|
vcvttss2usis(r30, xm1);
|
||||||
|
vcvttss2usis(r30, xm1|T_sae);
|
||||||
|
vcvttss2usis(r30, ptr[rax+128]);
|
|
@ -234,10 +234,10 @@ CYBOZU_TEST_AUTO(vmpsadbw)
|
||||||
struct Code : Xbyak::CodeGenerator {
|
struct Code : Xbyak::CodeGenerator {
|
||||||
Code()
|
Code()
|
||||||
{
|
{
|
||||||
setDefaultEncoding();
|
setDefaultEncodingAVX10();
|
||||||
vmpsadbw(xm1, xm3, xm15, 3); // vex(avx)
|
vmpsadbw(xm1, xm3, xm15, 3); // vex(avx)
|
||||||
vmpsadbw(ym1, ym3, ptr[rax+128], 3); // vex(avx2)
|
vmpsadbw(ym1, ym3, ptr[rax+128], 3); // vex(avx2)
|
||||||
setDefaultEncoding(VexEncoding, EvexEncoding);
|
setDefaultEncodingAVX10(AVX10v2Encoding);
|
||||||
vmpsadbw(ym1, ym3, ym15, 3); // evex(avx10.2)
|
vmpsadbw(ym1, ym3, ym15, 3); // evex(avx10.2)
|
||||||
vmpsadbw(ym1, ym3, ptr[rax+128], 3); // evex(avx10.2)
|
vmpsadbw(ym1, ym3, ptr[rax+128], 3); // evex(avx10.2)
|
||||||
}
|
}
|
||||||
|
|
|
@ -7,7 +7,7 @@ struct Code : Xbyak::CodeGenerator {
|
||||||
Code()
|
Code()
|
||||||
: Xbyak::CodeGenerator(4096*8)
|
: Xbyak::CodeGenerator(4096*8)
|
||||||
{
|
{
|
||||||
setDefaultEncoding(VexEncoding, EvexEncoding);
|
setDefaultEncodingAVX10(AVX10v2Encoding);
|
||||||
#include "tmp.cpp"
|
#include "tmp.cpp"
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
|
@ -366,7 +366,7 @@ def parseNmemonicTest():
|
||||||
('vpshldw(xmm9|k3|T_z, xmm2, ptr [rax + 0x40], 5);', Nmemonic('vpshldw', [xmm9, xmm2, Memory(0, rax, None, 0, 0x40), 5], [k3, T_z])),
|
('vpshldw(xmm9|k3|T_z, xmm2, ptr [rax + 0x40], 5);', Nmemonic('vpshldw', [xmm9, xmm2, Memory(0, rax, None, 0, 0x40), 5], [k3, T_z])),
|
||||||
('vpshrdd(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40], 5);', Nmemonic('vpshrdd', [xmm5, xmm2, Memory(0, rax, None, 0, 0x40, True), 5], [k3, T_z])),
|
('vpshrdd(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40], 5);', Nmemonic('vpshrdd', [xmm5, xmm2, Memory(0, rax, None, 0, 0x40, True), 5], [k3, T_z])),
|
||||||
('vpshrdd xmm5{k3}{z}, xmm2, dword ptr [rax+0x40]{1to4}, 0x5', Nmemonic('vpshrdd', [xmm5, xmm2, Memory(0, rax, None, 0, 0x40, True), 5], [k3, T_z])),
|
('vpshrdd xmm5{k3}{z}, xmm2, dword ptr [rax+0x40]{1to4}, 0x5', Nmemonic('vpshrdd', [xmm5, xmm2, Memory(0, rax, None, 0, 0x40, True), 5], [k3, T_z])),
|
||||||
('vcmpph(k1, xm15, ptr[rax+64], 1);', Nmemonic('vcmpph', [k1, xm15, Memory(0, rax, None, 0, 64), 1])),
|
('vcmpph(k1, xmm15, ptr[rax+64], 1);', Nmemonic('vcmpph', [k1, xmm15, Memory(0, rax, None, 0, 64), 1])),
|
||||||
]
|
]
|
||||||
for (s, expected) in tbl:
|
for (s, expected) in tbl:
|
||||||
e = parseNmemonic(s)
|
e = parseNmemonic(s)
|
||||||
|
|
|
@ -155,7 +155,7 @@ namespace Xbyak {
|
||||||
|
|
||||||
enum {
|
enum {
|
||||||
DEFAULT_MAX_CODE_SIZE = 4096,
|
DEFAULT_MAX_CODE_SIZE = 4096,
|
||||||
VERSION = 0x7100 /* 0xABCD = A.BC(.D) */
|
VERSION = 0x7200 /* 0xABCD = A.BC(.D) */
|
||||||
};
|
};
|
||||||
|
|
||||||
#ifndef MIE_INTEGER_TYPE_DEFINED
|
#ifndef MIE_INTEGER_TYPE_DEFINED
|
||||||
|
@ -232,6 +232,7 @@ enum {
|
||||||
ERR_CANT_USE_REX2,
|
ERR_CANT_USE_REX2,
|
||||||
ERR_INVALID_DFV,
|
ERR_INVALID_DFV,
|
||||||
ERR_INVALID_REG_IDX,
|
ERR_INVALID_REG_IDX,
|
||||||
|
ERR_BAD_ENCODING_MODE,
|
||||||
ERR_INTERNAL // Put it at last.
|
ERR_INTERNAL // Put it at last.
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -290,6 +291,7 @@ inline const char *ConvertErrorToString(int err)
|
||||||
"can't use rex2",
|
"can't use rex2",
|
||||||
"invalid dfv",
|
"invalid dfv",
|
||||||
"invalid reg index",
|
"invalid reg index",
|
||||||
|
"bad encoding mode",
|
||||||
"internal error"
|
"internal error"
|
||||||
};
|
};
|
||||||
assert(ERR_INTERNAL + 1 == sizeof(errTbl) / sizeof(*errTbl));
|
assert(ERR_INTERNAL + 1 == sizeof(errTbl) / sizeof(*errTbl));
|
||||||
|
@ -1673,7 +1675,9 @@ inline const uint8_t* Label::getAddress() const
|
||||||
typedef enum {
|
typedef enum {
|
||||||
DefaultEncoding,
|
DefaultEncoding,
|
||||||
VexEncoding,
|
VexEncoding,
|
||||||
EvexEncoding
|
EvexEncoding,
|
||||||
|
PreAVX10v2Encoding,
|
||||||
|
AVX10v2Encoding
|
||||||
} PreferredEncoding;
|
} PreferredEncoding;
|
||||||
|
|
||||||
class CodeGenerator : public CodeArray {
|
class CodeGenerator : public CodeArray {
|
||||||
|
@ -2661,21 +2665,24 @@ private:
|
||||||
if (addr.getRegExp().getIndex().getKind() != kind) XBYAK_THROW(ERR_BAD_VSIB_ADDRESSING)
|
if (addr.getRegExp().getIndex().getKind() != kind) XBYAK_THROW(ERR_BAD_VSIB_ADDRESSING)
|
||||||
opVex(x, 0, addr, type, code);
|
opVex(x, 0, addr, type, code);
|
||||||
}
|
}
|
||||||
void opEncoding(const Xmm& x1, const Xmm& x2, const Operand& op, uint64_t type, int code, PreferredEncoding encoding, int imm = NONE, uint64_t typeVex = 0, uint64_t typeEvex = 0, int sel = 0)
|
void opEncoding(const Xmm& x1, const Xmm& x2, const Operand& op, uint64_t type, int code, PreferredEncoding enc, int imm = NONE, uint64_t typeVex = 0, uint64_t typeEvex = 0, int sel = 0)
|
||||||
{
|
{
|
||||||
opAVX_X_X_XM(x1, x2, op, type | orEvexIf(encoding, typeVex, typeEvex, sel), code, imm);
|
opAVX_X_X_XM(x1, x2, op, type | orEvexIf(enc, typeVex, typeEvex, sel), code, imm);
|
||||||
}
|
}
|
||||||
int orEvexIf(PreferredEncoding encoding, uint64_t typeVex, uint64_t typeEvex, int sel) {
|
PreferredEncoding getEncoding(PreferredEncoding enc, int sel) const
|
||||||
if (encoding == DefaultEncoding) {
|
{
|
||||||
encoding = defaultEncoding_[sel];
|
if (enc == DefaultEncoding) {
|
||||||
|
enc = defaultEncoding_[sel];
|
||||||
}
|
}
|
||||||
if (encoding == EvexEncoding) {
|
if ((sel == 0 && enc != VexEncoding && enc != EvexEncoding) || (sel == 1 && enc != PreAVX10v2Encoding && enc != AVX10v2Encoding)) XBYAK_THROW_RET(ERR_BAD_ENCODING_MODE, VexEncoding)
|
||||||
#ifdef XBYAK_DISABLE_AVX512
|
#ifdef XBYAK_DISABLE_AVX512
|
||||||
XBYAK_THROW(ERR_EVEX_IS_INVALID)
|
if (enc == EvexEncoding || enc == AVX10v2Encoding) XBYAK_THROW(ERR_EVEX_IS_INVALID)
|
||||||
#endif
|
#endif
|
||||||
return T_MUST_EVEX | typeEvex;
|
return enc;
|
||||||
}
|
}
|
||||||
return typeVex;
|
uint64_t orEvexIf(PreferredEncoding enc, uint64_t typeVex, uint64_t typeEvex, int sel) {
|
||||||
|
enc = getEncoding(enc, sel);
|
||||||
|
return ((sel == 0 && enc == VexEncoding) || (sel == 1 && enc != AVX10v2Encoding)) ? typeVex : (T_MUST_EVEX | typeEvex);
|
||||||
}
|
}
|
||||||
void opInOut(const Reg& a, const Reg& d, uint8_t code)
|
void opInOut(const Reg& a, const Reg& d, uint8_t code)
|
||||||
{
|
{
|
||||||
|
@ -3132,8 +3139,8 @@ public:
|
||||||
#endif
|
#endif
|
||||||
, isDefaultJmpNEAR_(false)
|
, isDefaultJmpNEAR_(false)
|
||||||
{
|
{
|
||||||
// select avx512-vnni, vmpsadbw(avx)
|
|
||||||
setDefaultEncoding();
|
setDefaultEncoding();
|
||||||
|
setDefaultEncodingAVX10();
|
||||||
labelMgr_.set(this);
|
labelMgr_.set(this);
|
||||||
}
|
}
|
||||||
void reset()
|
void reset()
|
||||||
|
@ -3170,16 +3177,20 @@ public:
|
||||||
#undef jnl
|
#undef jnl
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// set default encoding
|
// set default encoding of VNNI
|
||||||
// vnniEnc : control AVX512_VNNI (evex:default) or AVX-VNNI (vex)
|
// EvexEncoding : AVX512_VNNI, VexEncoding : AVX-VNNI
|
||||||
// avx10Enc : control mpsadbw, AVX-VNNI-INT8 (vex:default) or AVX10.2 (evex)
|
void setDefaultEncoding(PreferredEncoding enc = EvexEncoding)
|
||||||
void setDefaultEncoding(PreferredEncoding vnniEnc = EvexEncoding, PreferredEncoding avx10Enc = VexEncoding)
|
|
||||||
{ defaultEncoding_[0] = vnniEnc; defaultEncoding_[1] = avx10Enc; }
|
|
||||||
|
|
||||||
void sha1msg12(const Xmm& x, const Operand& op)
|
|
||||||
{
|
{
|
||||||
opROO(Reg(), op, x, T_MUST_EVEX, 0xD9);
|
if (enc != VexEncoding && enc != EvexEncoding) XBYAK_THROW(ERR_BAD_ENCODING_MODE)
|
||||||
|
defaultEncoding_[0] = enc;
|
||||||
}
|
}
|
||||||
|
// default : PreAVX10v2Encoding : AVX-VNNI-INT8/AVX512-FP16
|
||||||
|
void setDefaultEncodingAVX10(PreferredEncoding enc = PreAVX10v2Encoding)
|
||||||
|
{
|
||||||
|
if (enc != PreAVX10v2Encoding && enc != AVX10v2Encoding) XBYAK_THROW(ERR_BAD_ENCODING_MODE)
|
||||||
|
defaultEncoding_[1] = enc;
|
||||||
|
}
|
||||||
|
|
||||||
void bswap(const Reg32e& r)
|
void bswap(const Reg32e& r)
|
||||||
{
|
{
|
||||||
int idx = r.getIdx();
|
int idx = r.getIdx();
|
||||||
|
@ -3192,6 +3203,48 @@ public:
|
||||||
}
|
}
|
||||||
db(0xC8 + (idx & 7));
|
db(0xC8 + (idx & 7));
|
||||||
}
|
}
|
||||||
|
// AVX10 zero-extending for vmovd, vmovw
|
||||||
|
void opAVX10ZeroExt(const Operand& op1, const Operand& op2, const uint64_t typeTbl[4], const int codeTbl[4], PreferredEncoding enc, int bit)
|
||||||
|
{
|
||||||
|
const Operand *p1 = &op1;
|
||||||
|
const Operand *p2 = &op2;
|
||||||
|
bool rev = false;
|
||||||
|
if (p1->isMEM()) {
|
||||||
|
std::swap(p1, p2);
|
||||||
|
rev = true;
|
||||||
|
}
|
||||||
|
if (p1->isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION)
|
||||||
|
if (p1->isXMM()) {
|
||||||
|
std::swap(p1, p2);
|
||||||
|
rev = !rev;
|
||||||
|
}
|
||||||
|
int sel = -1;
|
||||||
|
if (getEncoding(enc, 1) == AVX10v2Encoding) {
|
||||||
|
if ((p1->isXMM() || p1->isMEM()) && p2->isXMM()) sel = 2 + int(rev);
|
||||||
|
} else {
|
||||||
|
if ((p1->isREG(bit) || p1->isMEM()) && p2->isXMM()) sel = int(rev);
|
||||||
|
}
|
||||||
|
if (sel == -1) XBYAK_THROW(ERR_BAD_COMBINATION)
|
||||||
|
opAVX_X_X_XM(*static_cast<const Xmm*>(p2), xm0, *p1, typeTbl[sel], codeTbl[sel]);
|
||||||
|
}
|
||||||
|
void vmovd(const Operand& op1, const Operand& op2, PreferredEncoding enc = DefaultEncoding)
|
||||||
|
{
|
||||||
|
const uint64_t typeTbl[] = {
|
||||||
|
T_EVEX|T_66|T_0F|T_W0|T_N4, T_EVEX|T_66|T_0F|T_W0|T_N4, // legacy, avx, avx512
|
||||||
|
T_MUST_EVEX|T_66|T_0F|T_EW0|T_N4, T_MUST_EVEX|T_F3|T_0F|T_EW0|T_N4, // avx10.2
|
||||||
|
};
|
||||||
|
const int codeTbl[] = { 0x7E, 0x6E, 0xD6, 0x7E };
|
||||||
|
opAVX10ZeroExt(op1, op2, typeTbl, codeTbl, enc, 32);
|
||||||
|
}
|
||||||
|
void vmovw(const Operand& op1, const Operand& op2, PreferredEncoding enc = DefaultEncoding)
|
||||||
|
{
|
||||||
|
const uint64_t typeTbl[] = {
|
||||||
|
T_MUST_EVEX|T_66|T_MAP5|T_N2, T_MUST_EVEX|T_66|T_MAP5|T_N2, // avx512-fp16
|
||||||
|
T_MUST_EVEX|T_F3|T_MAP5|T_EW0|T_N2, T_MUST_EVEX|T_F3|T_MAP5|T_EW0|T_N2, // avx10.2
|
||||||
|
};
|
||||||
|
const int codeTbl[] = { 0x7E, 0x6E, 0x7E, 0x6E };
|
||||||
|
opAVX10ZeroExt(op1, op2, typeTbl, codeTbl, enc, 16|32|64);
|
||||||
|
}
|
||||||
/*
|
/*
|
||||||
use single byte nop if useMultiByteNop = false
|
use single byte nop if useMultiByteNop = false
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
const char *getVersionString() const { return "7.10"; }
|
const char *getVersionString() const { return "7.20"; }
|
||||||
void aadd(const Address& addr, const Reg32e &reg) { opMR(addr, reg, T_0F38, 0x0FC, T_APX); }
|
void aadd(const Address& addr, const Reg32e &reg) { opMR(addr, reg, T_0F38, 0x0FC, T_APX); }
|
||||||
void aand(const Address& addr, const Reg32e &reg) { opMR(addr, reg, T_0F38|T_66, 0x0FC, T_APX|T_66); }
|
void aand(const Address& addr, const Reg32e &reg) { opMR(addr, reg, T_0F38|T_66, 0x0FC, T_APX|T_66); }
|
||||||
void adc(const Operand& op, uint32_t imm) { opOI(op, imm, 0x10, 2); }
|
void adc(const Operand& op, uint32_t imm) { opOI(op, imm, 0x10, 2); }
|
||||||
|
@ -988,6 +988,7 @@ void sets(const Operand& op) { opSetCC(op, 8); }//-V524
|
||||||
void setz(const Operand& op) { opSetCC(op, 4); }//-V524
|
void setz(const Operand& op) { opSetCC(op, 4); }//-V524
|
||||||
void sfence() { db(0x0F); db(0xAE); db(0xF8); }
|
void sfence() { db(0x0F); db(0xAE); db(0xF8); }
|
||||||
void sha1msg1(const Xmm& x, const Operand& op) { opSSE_APX(x, op, T_0F38, 0xC9, T_MUST_EVEX, 0xD9); }
|
void sha1msg1(const Xmm& x, const Operand& op) { opSSE_APX(x, op, T_0F38, 0xC9, T_MUST_EVEX, 0xD9); }
|
||||||
|
void sha1msg12(const Xmm& x, const Operand& op) { opROO(Reg(), op, x, T_MUST_EVEX, 0xD9); }
|
||||||
void sha1msg2(const Xmm& x, const Operand& op) { opSSE_APX(x, op, T_0F38, 0xCA, T_MUST_EVEX, 0xDA); }
|
void sha1msg2(const Xmm& x, const Operand& op) { opSSE_APX(x, op, T_0F38, 0xCA, T_MUST_EVEX, 0xDA); }
|
||||||
void sha1nexte(const Xmm& x, const Operand& op) { opSSE_APX(x, op, T_0F38, 0xC8, T_MUST_EVEX, 0xD8); }
|
void sha1nexte(const Xmm& x, const Operand& op) { opSSE_APX(x, op, T_0F38, 0xC8, T_MUST_EVEX, 0xD8); }
|
||||||
void sha1rnds4(const Xmm& x, const Operand& op, uint8_t imm) { opSSE_APX(x, op, T_0F3A, 0xCC, T_MUST_EVEX, 0xD4, imm); }
|
void sha1rnds4(const Xmm& x, const Operand& op, uint8_t imm) { opSSE_APX(x, op, T_0F3A, 0xCC, T_MUST_EVEX, 0xD4, imm); }
|
||||||
|
@@ -1331,8 +1332,6 @@ void vmovapd(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, T_
|
||||||
void vmovapd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_66|T_0F|T_EW1|T_YMM|T_EVEX, 0x28); }
|
void vmovapd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_66|T_0F|T_EW1|T_YMM|T_EVEX, 0x28); }
|
||||||
void vmovaps(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, T_0F|T_EW0|T_YMM|T_EVEX|T_M_K, 0x29); }
|
void vmovaps(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, T_0F|T_EW0|T_YMM|T_EVEX|T_M_K, 0x29); }
|
||||||
void vmovaps(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_0F|T_EW0|T_YMM|T_EVEX, 0x28); }
|
void vmovaps(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_0F|T_EW0|T_YMM|T_EVEX, 0x28); }
|
||||||
void vmovd(const Operand& op, const Xmm& x) { if (!op.isREG(32) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_X_XM(x, xm0, op, T_0F | T_66 | T_W0 | T_EVEX | T_N4, 0x7E); }
|
|
||||||
void vmovd(const Xmm& x, const Operand& op) { if (!op.isREG(32) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_X_XM(x, xm0, op, T_0F | T_66 | T_W0 | T_EVEX | T_N4, 0x6E); }
|
|
||||||
void vmovddup(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_DUP|T_F2|T_0F|T_EW1|T_YMM|T_EVEX|T_ER_X|T_ER_Y|T_ER_Z, 0x12); }
|
void vmovddup(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_DUP|T_F2|T_0F|T_EW1|T_YMM|T_EVEX|T_ER_X|T_ER_Y|T_ER_Z, 0x12); }
|
||||||
void vmovdqa(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, T_66|T_0F|T_YMM, 0x7F); }
|
void vmovdqa(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, T_66|T_0F|T_YMM, 0x7F); }
|
||||||
void vmovdqa(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_66|T_0F|T_YMM, 0x6F); }
|
void vmovdqa(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_66|T_0F|T_YMM, 0x6F); }
|
||||||
|
@@ -2202,6 +2201,8 @@ void vcvtne2ph2bf8s(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X
|
||||||
void vcvtne2ph2hf8(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N1|T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x18); }
|
void vcvtne2ph2hf8(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N1|T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x18); }
|
||||||
void vcvtne2ph2hf8s(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N1|T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x1B); }
|
void vcvtne2ph2hf8s(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N1|T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x1B); }
|
||||||
void vcvtne2ps2bf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x72); }
|
void vcvtne2ps2bf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x72); }
|
||||||
|
void vcvtnebf162ibs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x69); }
|
||||||
|
void vcvtnebf162iubs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x6B); }
|
||||||
void vcvtneph2bf8(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3|T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x74); }
|
void vcvtneph2bf8(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3|T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x74); }
|
||||||
void vcvtneph2bf8s(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x74); }
|
void vcvtneph2bf8s(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x74); }
|
||||||
void vcvtneph2hf8(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x18); }
|
void vcvtneph2hf8(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x18); }
|
||||||
|
@@ -2212,6 +2213,8 @@ void vcvtpd2qq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0
|
||||||
void vcvtpd2udq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F|T_EW1|T_YMM|T_ER_Z|T_MUST_EVEX|T_B64, 0x79); }
|
void vcvtpd2udq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F|T_EW1|T_YMM|T_ER_Z|T_MUST_EVEX|T_B64, 0x79); }
|
||||||
void vcvtpd2uqq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F|T_EW1|T_YMM|T_ER_Z|T_MUST_EVEX|T_B64, 0x79); }
|
void vcvtpd2uqq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F|T_EW1|T_YMM|T_ER_Z|T_MUST_EVEX|T_B64, 0x79); }
|
||||||
void vcvtph2dq(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_66|T_MAP5|T_EW0|T_YMM|T_ER_Y|T_MUST_EVEX|T_B16, 0x5B); }
|
void vcvtph2dq(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_66|T_MAP5|T_EW0|T_YMM|T_ER_Y|T_MUST_EVEX|T_B16, 0x5B); }
|
||||||
|
void vcvtph2ibs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B16, 0x69); }
|
||||||
|
void vcvtph2iubs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B16, 0x6B); }
|
||||||
void vcvtph2pd(const Xmm& x, const Operand& op) { if (!op.isXMM() && !op.isMEM()) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(x, 0, op, T_N4|T_N_VL|T_MAP5|T_EW0|T_YMM|T_SAE_X|T_MUST_EVEX|T_B16, 0x5A); }
|
void vcvtph2pd(const Xmm& x, const Operand& op) { if (!op.isXMM() && !op.isMEM()) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(x, 0, op, T_N4|T_N_VL|T_MAP5|T_EW0|T_YMM|T_SAE_X|T_MUST_EVEX|T_B16, 0x5A); }
|
||||||
void vcvtph2psx(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_66|T_MAP6|T_EW0|T_YMM|T_SAE_Y|T_MUST_EVEX|T_B16, 0x13); }
|
void vcvtph2psx(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_66|T_MAP6|T_EW0|T_YMM|T_SAE_Y|T_MUST_EVEX|T_B16, 0x13); }
|
||||||
void vcvtph2qq(const Xmm& x, const Operand& op) { if (!op.isXMM() && !op.isMEM()) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(x, 0, op, T_N4|T_N_VL|T_66|T_MAP5|T_EW0|T_YMM|T_ER_X|T_MUST_EVEX|T_B16, 0x7B); }
|
void vcvtph2qq(const Xmm& x, const Operand& op) { if (!op.isXMM() && !op.isMEM()) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(x, 0, op, T_N4|T_N_VL|T_66|T_MAP5|T_EW0|T_YMM|T_ER_X|T_MUST_EVEX|T_B16, 0x7B); }
|
||||||
|
@@ -2219,6 +2222,8 @@ void vcvtph2udq(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0,
|
||||||
void vcvtph2uqq(const Xmm& x, const Operand& op) { if (!op.isXMM() && !op.isMEM()) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(x, 0, op, T_N4|T_N_VL|T_66|T_MAP5|T_EW0|T_YMM|T_ER_X|T_MUST_EVEX|T_B16, 0x79); }
|
void vcvtph2uqq(const Xmm& x, const Operand& op) { if (!op.isXMM() && !op.isMEM()) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(x, 0, op, T_N4|T_N_VL|T_66|T_MAP5|T_EW0|T_YMM|T_ER_X|T_MUST_EVEX|T_B16, 0x79); }
|
||||||
void vcvtph2uw(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x7D); }
|
void vcvtph2uw(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x7D); }
|
||||||
void vcvtph2w(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x7D); }
|
void vcvtph2w(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x7D); }
|
||||||
|
void vcvtps2ibs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B32, 0x69); }
|
||||||
|
void vcvtps2iubs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B32, 0x6B); }
|
||||||
void vcvtps2phx(const Xmm& x, const Operand& op) { checkCvt4(x, op); opCvt(x, op, T_N16|T_N_VL|T_66|T_MAP5|T_EW0|T_ER_Z|T_MUST_EVEX|T_B32, 0x1D); }
|
void vcvtps2phx(const Xmm& x, const Operand& op) { checkCvt4(x, op); opCvt(x, op, T_N16|T_N_VL|T_66|T_MAP5|T_EW0|T_ER_Z|T_MUST_EVEX|T_B32, 0x1D); }
|
||||||
void vcvtps2qq(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_66|T_0F|T_EW0|T_YMM|T_ER_Y|T_MUST_EVEX|T_B32, 0x7B); }
|
void vcvtps2qq(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_66|T_0F|T_EW0|T_YMM|T_ER_Y|T_MUST_EVEX|T_B32, 0x7B); }
|
||||||
void vcvtps2udq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_0F|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B32, 0x79); }
|
void vcvtps2udq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_0F|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B32, 0x79); }
|
||||||
|
@@ -2235,22 +2240,40 @@ void vcvtsh2usi(const Reg32e& r, const Operand& op) { uint64_t type = (T_N2|T_F3
|
||||||
void vcvtsi2sh(const Xmm& x1, const Xmm& x2, const Operand& op) { if (!(x1.isXMM() && x2.isXMM() && op.isBit(32|64))) XBYAK_THROW(ERR_BAD_COMBINATION) uint64_t type = (T_F3|T_MAP5|T_ER_R|T_MUST_EVEX|T_M_K) | (op.isBit(32) ? (T_EW0 | T_N4) : (T_EW1 | T_N8)); opVex(x1, &x2, op, type, 0x2A); }
|
void vcvtsi2sh(const Xmm& x1, const Xmm& x2, const Operand& op) { if (!(x1.isXMM() && x2.isXMM() && op.isBit(32|64))) XBYAK_THROW(ERR_BAD_COMBINATION) uint64_t type = (T_F3|T_MAP5|T_ER_R|T_MUST_EVEX|T_M_K) | (op.isBit(32) ? (T_EW0 | T_N4) : (T_EW1 | T_N8)); opVex(x1, &x2, op, type, 0x2A); }
|
||||||
void vcvtss2sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_MAP5|T_EW0|T_ER_X|T_MUST_EVEX, 0x1D); }
|
void vcvtss2sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_MAP5|T_EW0|T_ER_X|T_MUST_EVEX, 0x1D); }
|
||||||
void vcvtss2usi(const Reg32e& r, const Operand& op) { uint64_t type = (T_N4|T_F3|T_0F|T_ER_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x79); }
|
void vcvtss2usi(const Reg32e& r, const Operand& op) { uint64_t type = (T_N4|T_F3|T_0F|T_ER_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x79); }
|
||||||
|
void vcvttnebf162ibs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x68); }
|
||||||
|
void vcvttnebf162iubs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x6A); }
|
||||||
|
void vcvttpd2dqs(const Xmm& x, const Operand& op) { opCvt2(x, op, T_MAP5|T_EW1|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B64, 0x6D); }
|
||||||
void vcvttpd2qq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F|T_EW1|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B64, 0x7A); }
|
void vcvttpd2qq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F|T_EW1|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B64, 0x7A); }
|
||||||
|
void vcvttpd2qqs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW1|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B64, 0x6D); }
|
||||||
void vcvttpd2udq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F|T_EW1|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B64, 0x78); }
|
void vcvttpd2udq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F|T_EW1|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B64, 0x78); }
|
||||||
|
void vcvttpd2udqs(const Xmm& x, const Operand& op) { opCvt2(x, op, T_MAP5|T_EW1|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B64, 0x6C); }
|
||||||
void vcvttpd2uqq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F|T_EW1|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B64, 0x78); }
|
void vcvttpd2uqq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F|T_EW1|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B64, 0x78); }
|
||||||
|
void vcvttpd2uqqs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW1|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B64, 0x6C); }
|
||||||
void vcvttph2dq(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_F3|T_MAP5|T_EW0|T_YMM|T_SAE_Y|T_MUST_EVEX|T_B16, 0x5B); }
|
void vcvttph2dq(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_F3|T_MAP5|T_EW0|T_YMM|T_SAE_Y|T_MUST_EVEX|T_B16, 0x5B); }
|
||||||
|
void vcvttph2ibs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B16, 0x68); }
|
||||||
|
void vcvttph2iubs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B16, 0x6A); }
|
||||||
void vcvttph2qq(const Xmm& x, const Operand& op) { if (!op.isXMM() && !op.isMEM()) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(x, 0, op, T_N4|T_N_VL|T_66|T_MAP5|T_EW0|T_YMM|T_SAE_X|T_MUST_EVEX|T_B16, 0x7A); }
|
void vcvttph2qq(const Xmm& x, const Operand& op) { if (!op.isXMM() && !op.isMEM()) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(x, 0, op, T_N4|T_N_VL|T_66|T_MAP5|T_EW0|T_YMM|T_SAE_X|T_MUST_EVEX|T_B16, 0x7A); }
|
||||||
void vcvttph2udq(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_MAP5|T_EW0|T_YMM|T_SAE_Y|T_MUST_EVEX|T_B16, 0x78); }
|
void vcvttph2udq(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_MAP5|T_EW0|T_YMM|T_SAE_Y|T_MUST_EVEX|T_B16, 0x78); }
|
||||||
void vcvttph2uqq(const Xmm& x, const Operand& op) { if (!op.isXMM() && !op.isMEM()) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(x, 0, op, T_N4|T_N_VL|T_66|T_MAP5|T_EW0|T_YMM|T_SAE_X|T_MUST_EVEX|T_B16, 0x78); }
|
void vcvttph2uqq(const Xmm& x, const Operand& op) { if (!op.isXMM() && !op.isMEM()) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(x, 0, op, T_N4|T_N_VL|T_66|T_MAP5|T_EW0|T_YMM|T_SAE_X|T_MUST_EVEX|T_B16, 0x78); }
|
||||||
void vcvttph2uw(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B16, 0x7C); }
|
void vcvttph2uw(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B16, 0x7C); }
|
||||||
void vcvttph2w(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B16, 0x7C); }
|
void vcvttph2w(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B16, 0x7C); }
|
||||||
|
void vcvttps2dqs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B32, 0x6D); }
|
||||||
|
void vcvttps2ibs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B32, 0x68); }
|
||||||
|
void vcvttps2iubs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B32, 0x6A); }
|
||||||
void vcvttps2qq(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_66|T_0F|T_EW0|T_YMM|T_SAE_Y|T_MUST_EVEX|T_B32, 0x7A); }
|
void vcvttps2qq(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_66|T_0F|T_EW0|T_YMM|T_SAE_Y|T_MUST_EVEX|T_B32, 0x7A); }
|
||||||
|
void vcvttps2qqs(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_66|T_MAP5|T_EW0|T_YMM|T_SAE_X|T_SAE_Y|T_MUST_EVEX|T_B32, 0x6D); }
|
||||||
void vcvttps2udq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_0F|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x78); }
|
void vcvttps2udq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_0F|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x78); }
|
||||||
|
void vcvttps2udqs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B32, 0x6C); }
|
||||||
void vcvttps2uqq(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_66|T_0F|T_EW0|T_YMM|T_SAE_Y|T_MUST_EVEX|T_B32, 0x78); }
|
void vcvttps2uqq(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_66|T_0F|T_EW0|T_YMM|T_SAE_Y|T_MUST_EVEX|T_B32, 0x78); }
|
||||||
|
void vcvttps2uqqs(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_66|T_MAP5|T_EW0|T_YMM|T_SAE_X|T_SAE_Y|T_MUST_EVEX|T_B32, 0x6C); }
|
||||||
|
void vcvttsd2sis(const Reg32e& r, const Operand& op) { uint64_t type = (T_N8|T_F2|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x6D); }
|
||||||
void vcvttsd2usi(const Reg32e& r, const Operand& op) { uint64_t type = (T_N8|T_F2|T_0F|T_SAE_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x78); }
|
void vcvttsd2usi(const Reg32e& r, const Operand& op) { uint64_t type = (T_N8|T_F2|T_0F|T_SAE_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x78); }
|
||||||
|
void vcvttsd2usis(const Reg32e& r, const Operand& op) { uint64_t type = (T_N8|T_F2|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x6C); }
|
||||||
void vcvttsh2si(const Reg32e& r, const Operand& op) { uint64_t type = (T_N2|T_F3|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x2C); }
|
void vcvttsh2si(const Reg32e& r, const Operand& op) { uint64_t type = (T_N2|T_F3|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x2C); }
|
||||||
void vcvttsh2usi(const Reg32e& r, const Operand& op) { uint64_t type = (T_N2|T_F3|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x78); }
|
void vcvttsh2usi(const Reg32e& r, const Operand& op) { uint64_t type = (T_N2|T_F3|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x78); }
|
||||||
|
void vcvttss2sis(const Reg32e& r, const Operand& op) { uint64_t type = (T_N4|T_F3|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x6D); }
|
||||||
void vcvttss2usi(const Reg32e& r, const Operand& op) { uint64_t type = (T_N4|T_F3|T_0F|T_SAE_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x78); }
|
void vcvttss2usi(const Reg32e& r, const Operand& op) { uint64_t type = (T_N4|T_F3|T_0F|T_SAE_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x78); }
|
||||||
|
void vcvttss2usis(const Reg32e& r, const Operand& op) { uint64_t type = (T_N4|T_F3|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x6C); }
|
||||||
void vcvtudq2pd(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_F3|T_0F|T_EW0|T_YMM|T_MUST_EVEX|T_B32, 0x7A); }
|
void vcvtudq2pd(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_F3|T_0F|T_EW0|T_YMM|T_MUST_EVEX|T_B32, 0x7A); }
|
||||||
void vcvtudq2ph(const Xmm& x, const Operand& op) { checkCvt4(x, op); opCvt(x, op, T_N16|T_N_VL|T_F2|T_MAP5|T_EW0|T_ER_Z|T_MUST_EVEX|T_B32, 0x7A); }
|
void vcvtudq2ph(const Xmm& x, const Operand& op) { checkCvt4(x, op); opCvt(x, op, T_N16|T_N_VL|T_F2|T_MAP5|T_EW0|T_ER_Z|T_MUST_EVEX|T_B32, 0x7A); }
|
||||||
void vcvtudq2ps(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2|T_0F|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B32, 0x7A); }
|
void vcvtudq2ps(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2|T_0F|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B32, 0x7A); }
|
||||||
|
@@ -2374,6 +2397,13 @@ void vinserti64x4(const Zmm& r1, const Zmm& r2, const Operand& op, uint8_t imm)
|
||||||
void vmaxpbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x5F); }
|
void vmaxpbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x5F); }
|
||||||
void vmaxph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_SAE_Z | T_B16, 0x5F); }
|
void vmaxph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_SAE_Z | T_B16, 0x5F); }
|
||||||
void vmaxsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_SAE_X | T_N2, 0x5F); }
|
void vmaxsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_SAE_X | T_N2, 0x5F); }
|
||||||
|
void vminmaxnepbf16(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_F2|T_0F3A|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x52, imm); }
|
||||||
|
void vminmaxpd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F3A|T_EW1|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B64, 0x52, imm); }
|
||||||
|
void vminmaxph(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_0F3A|T_EW0|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B16, 0x52, imm); }
|
||||||
|
void vminmaxps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F3A|T_EW0|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B32, 0x52, imm); }
|
||||||
|
void vminmaxsd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F3A|T_EW1|T_SAE_X|T_MUST_EVEX, 0x53, imm); }
|
||||||
|
void vminmaxsh(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_N2|T_0F3A|T_EW0|T_SAE_X|T_MUST_EVEX, 0x53, imm); }
|
||||||
|
void vminmaxss(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F3A|T_EW0|T_SAE_X|T_MUST_EVEX, 0x53, imm); }
|
||||||
void vminpbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x5D); }
|
void vminpbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x5D); }
|
||||||
void vminph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_SAE_Z | T_B16, 0x5D); }
|
void vminph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_SAE_Z | T_B16, 0x5D); }
|
||||||
void vminsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_SAE_X | T_N2, 0x5D); }
|
void vminsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_SAE_X | T_N2, 0x5D); }
|
||||||
|
@@ -2392,9 +2422,6 @@ void vmovdqu8(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2|T_0F
|
||||||
void vmovsh(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, T_N2|T_F3|T_MAP5|T_EW0|T_MUST_EVEX|T_M_K, 0x11); }
|
void vmovsh(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, T_N2|T_F3|T_MAP5|T_EW0|T_MUST_EVEX|T_M_K, 0x11); }
|
||||||
void vmovsh(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, xm0, addr, T_N2|T_F3|T_MAP5|T_EW0|T_MUST_EVEX, 0x10); }
|
void vmovsh(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, xm0, addr, T_N2|T_F3|T_MAP5|T_EW0|T_MUST_EVEX, 0x10); }
|
||||||
void vmovsh(const Xmm& x1, const Xmm& x2, const Xmm& x3) { opAVX_X_X_XM(x1, x2, x3, T_N2|T_F3|T_MAP5|T_EW0|T_MUST_EVEX, 0x10); }
|
void vmovsh(const Xmm& x1, const Xmm& x2, const Xmm& x3) { opAVX_X_X_XM(x1, x2, x3, T_N2|T_F3|T_MAP5|T_EW0|T_MUST_EVEX, 0x10); }
|
||||||
void vmovw(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, T_N2|T_66|T_MAP5|T_MUST_EVEX, 0x7E); }
|
|
||||||
void vmovw(const Reg32e& r, const Xmm& x) { opAVX_X_X_XM(x, xm0, r, T_N2|T_66|T_MAP5|T_MUST_EVEX, 0x7E); }
|
|
||||||
void vmovw(const Xmm& x, const Operand& op) { if (!op.isREG(32|64) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_X_XM(x, xm0, op, T_N2|T_66|T_MAP5|T_MUST_EVEX, 0x6E); }
|
|
||||||
void vmpsadbw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F3A|T_YMM, 0x42, encoding, imm, T_66|T_W0|T_YMM, T_F3|T_0F3A|T_EW0|T_B32, 1); }
|
void vmpsadbw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F3A|T_YMM, 0x42, encoding, imm, T_66|T_W0|T_YMM, T_F3|T_0F3A|T_EW0|T_B32, 1); }
|
||||||
void vmulnepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x59); }
|
void vmulnepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x59); }
|
||||||
void vmulph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_ER_Z | T_B16, 0x59); }
|
void vmulph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_ER_Z | T_B16, 0x59); }
|
||||||
|
|
Loading…
Reference in a new issue