mirror of
https://github.com/herumi/xbyak
synced 2024-11-20 16:06:14 -07:00
This commit is contained in:
commit
3ee31be62d
18 changed files with 624 additions and 131 deletions
|
@ -1,6 +1,6 @@
|
|||
cmake_minimum_required(VERSION 3.5)
|
||||
|
||||
project(xbyak LANGUAGES CXX VERSION 7.10)
|
||||
project(xbyak LANGUAGES CXX VERSION 7.20)
|
||||
|
||||
file(GLOB headers xbyak/*.h)
|
||||
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
# History
|
||||
|
||||
* 2024/Oct/15 ver 7.20 Fixed the specification of setDefaultEncoding, setDefaultEncodingAVX10.
|
||||
* 2024/Oct/15 ver 7.11 Added full support for AVX10.2
|
||||
* 2024/Oct/13 ver 7.10 support AVX10 integer and fp16 vnni, media new instructions. setDefaultEncoding is extended.
|
||||
* 2024/Oct/10 ver 7.09.1 fix the names of vpcompressb and vpcompressw
|
||||
* 2024/Oct/08 ver 7.09 support YMM embedded rounding of AVX10.2 and fix some mnemonics with {sae}/{er}.
|
||||
|
|
101
doc/usage.md
101
doc/usage.md
|
@ -1,7 +1,7 @@
|
|||
# Usage
|
||||
|
||||
Inherit `Xbyak::CodeGenerator` class and make the class method.
|
||||
```
|
||||
```cpp
|
||||
#include <xbyak/xbyak.h>
|
||||
|
||||
struct Code : Xbyak::CodeGenerator {
|
||||
|
@ -13,7 +13,7 @@ struct Code : Xbyak::CodeGenerator {
|
|||
};
|
||||
```
|
||||
Or you can pass the instance of CodeGenerator without inheriting.
|
||||
```
|
||||
```cpp
|
||||
void genCode(Xbyak::CodeGenerator& code, int x) {
|
||||
using namespace Xbyak::util;
|
||||
code.mov(eax, x);
|
||||
|
@ -23,7 +23,7 @@ void genCode(Xbyak::CodeGenerator& code, int x) {
|
|||
|
||||
Make an instance of the class and get the function
|
||||
pointer by calling `getCode()` and call it.
|
||||
```
|
||||
```cpp
|
||||
Code c(5);
|
||||
int (*f)() = c.getCode<int (*)()>();
|
||||
printf("ret=%d\n", f()); // ret = 5
|
||||
|
@ -32,7 +32,7 @@ printf("ret=%d\n", f()); // ret = 5
|
|||
## Syntax
|
||||
Similar to MASM/NASM syntax with parentheses.
|
||||
|
||||
```
|
||||
```cpp
|
||||
NASM Xbyak
|
||||
mov eax, ebx --> mov(eax, ebx);
|
||||
inc ecx inc(ecx);
|
||||
|
@ -43,7 +43,7 @@ ret --> ret();
|
|||
Use `qword`, `dword`, `word` and `byte` if it is necessary to specify the size of memory,
|
||||
otherwise use `ptr`.
|
||||
|
||||
```
|
||||
```cpp
|
||||
(ptr|qword|dword|word|byte) [base + index * (1|2|4|8) + displacement]
|
||||
[rip + 32bit disp] ; x64 only
|
||||
|
||||
|
@ -53,19 +53,21 @@ mov al, [ebx+ecx] --> mov(al, ptr [ebx + ecx]);
|
|||
test byte [esp], 4 --> test(byte [esp], 4);
|
||||
inc qword [rax] --> inc(qword [rax]);
|
||||
```
|
||||
|
||||
**Note**: `qword`, `dword`, `word` and `byte` are member variables, so do not use `dword` as an unsigned int type name.
|
||||
|
||||
### How to use Selector (Segment Register)
|
||||
```
|
||||
```cpp
|
||||
mov eax, [fs:eax] --> putSeg(fs);
|
||||
mov(eax, ptr [eax]);
|
||||
mov ax, cs --> mov(ax, cs);
|
||||
```
|
||||
|
||||
**Note**: Segment class is not derived from `Operand`.
|
||||
|
||||
## AVX
|
||||
|
||||
```
|
||||
```cpp
|
||||
vaddps(xmm1, xmm2, xmm3); // xmm1 <- xmm2 + xmm3
|
||||
vaddps(xmm2, xmm3, ptr [rax]); // use ptr to access memory
|
||||
vgatherdpd(xmm1, ptr [ebp + 256 + xmm2*4], xmm3);
|
||||
|
@ -74,13 +76,13 @@ vgatherdpd(xmm1, ptr [ebp + 256 + xmm2*4], xmm3);
|
|||
**Note**:
|
||||
If `XBYAK_ENABLE_OMITTED_OPERAND` is defined, then you can use two operand version for backward compatibility.
|
||||
But the newer version will not support it.
|
||||
```
|
||||
```cpp
|
||||
vaddps(xmm2, xmm3); // xmm2 <- xmm2 + xmm3
|
||||
```
|
||||
|
||||
## AVX-512
|
||||
|
||||
```
|
||||
```cpp
|
||||
vaddpd zmm2, zmm5, zmm30 --> vaddpd(zmm2, zmm5, zmm30);
|
||||
vaddpd xmm30, xmm20, [rax] --> vaddpd(xmm30, xmm20, ptr [rax]);
|
||||
vaddps xmm30, xmm20, [rax] --> vaddps(xmm30, xmm20, ptr [rax]);
|
||||
|
@ -108,35 +110,44 @@ vfpclasspd k5{k3}, [rax+64]{1to2}, 5 --> vfpclasspd(k5|k3, xword_b [rax+64],
|
|||
vfpclassps k5{k3}, [rax+64]{1to4}, 5 --> vfpclassps(k5|k3, yword_b [rax+64], 5); // broadcast 64-bit to 256-bit
|
||||
```
|
||||
|
||||
## Selecting AVX512-VNNI, AVX-VNNI, AVX-VNNI-INT8 etc.
|
||||
Some mnemonics have two types of encodings: VEX and EVEX.
|
||||
## Selecting AVX512-VNNI, AVX-VNNI, AVX-VNNI-INT8, AVX10.2.
|
||||
Some mnemonics have several types of encodings: VEX, EVEX, AVX10.2.
|
||||
The functions for these mnemonics include an optional parameter as the last argument to specify the encoding.
|
||||
The default behavior depends on the order in which the instruction was introduced (whether VEX or EVEX came first),
|
||||
The default behavior depends on the order in which the instruction was introduced (whether VEX, EVEX or AVX10.2 came first),
|
||||
and can be specified using setDefaultEncoding.
|
||||
|
||||
```
|
||||
```cpp
|
||||
vpdpbusd(xm0, xm1, xm2); // default encoding: EVEX (AVX512-VNNI)
|
||||
vpdpbusd(xm0, xm1, xm2, EvexEncoding); // same as the above
|
||||
vpdpbusd(xm0, xm1, xm2, VexEncoding); // VEX (AVX-VNNI)
|
||||
setDefaultEncoding(VexEncoding); // default encoding is VEX
|
||||
setDefaultEncoding(VexEncoding); // change default encoding
|
||||
vpdpbusd(xm0, xm1, xm2); // VEX
|
||||
|
||||
vmpsadbw(xm1, xm3, xm15, 3); // default encoding: VEX (AVX-VNNI)
|
||||
vmpsadbw(xm1, xm3, xm15, 3, VexEncoding); // same as the above
|
||||
vmpsadbw(xm1, xm3, xm15, 3, EvexEncoding); // EVEX (AVX10.2)
|
||||
setDefaultEncoding(VexEncoding, EvexEncoding); // use 2nd argument.
|
||||
vmpsadbw(xm1, xm3, xm15, 3); // EVEX
|
||||
vmpsadbw(xm1, xm3, xm15, 3); // default encoding: AVX
|
||||
vmpsadbw(xm1, xm3, xm15, 3, PreAVX10v2Encoding); // same as the above
|
||||
vmpsadbw(xm1, xm3, xm15, 3, AVX10v2Encoding); // AVX10.2
|
||||
setDefaultEncodingAVX10(AVX10v2Encoding); // change default encoding
|
||||
vmpsadbw(xm1, xm3, xm15, 3); // AVX10.2
|
||||
```
|
||||
|
||||
- `setDefaultEncoding(PreferredEncoding vnniEnc = EvexEncoding, PreferredEncoding avx10Enc = VexEncoding)`
|
||||
Control the default encoding of mnemonics with `Xbyak::PreferredEncoding` param.
|
||||
- `setDefaultEncoding(PreferredEncoding enc = EvexEncoding)`
|
||||
- Configure encoding for AVX512-VNNI or AVX-VNNI instructions.
|
||||
- `setDefaultEncodingAVX10(PreferredEncoding enc = PreAVX10v2Encoding)`
|
||||
- Configure encoding for pre-AVX10.2 and AVX10.2 instructions.
|
||||
|
||||
param|vnniEnc|avx10Enc
|
||||
`setDefaultEncoding`|EvexEncoding (default)|VexEncoding
|
||||
-|-|-
|
||||
EvexEncoding|AVX512-VNNI|AVX10.2
|
||||
VexEncoding|AVX-VNNI|AVX-VNNI-INT8
|
||||
default|EvexEncoding|VexEncoding
|
||||
mnemonic|vpdpbusd, vpdpbusds, vpdpwssd, vpdpwssds|vmpsadbw, vpdpbssd, vpdpbssds, vpdpbsud, vpdpbsuds, vpdpbuud, vpdpbuuds, vpdpwsud vpdpwsuds vpdpwusd vpdpwusds vpdpwuud, vpdpwuuds
|
||||
feature|AVX512-VNNI|AVX-VNNI
|
||||
|
||||
- Target functions: vpdpbusd, vpdpbusds, vpdpwssd, vpdpwssds
|
||||
|
||||
`setDefaultEncodingAVX10`|PreAVX10v2Encoding (default)|AVX10v2Encoding
|
||||
-|-|-
|
||||
feature|AVX-VNNI-INT8, AVX512-FP16|AVX10.2
|
||||
|
||||
- Target functions: vmpsadbw, vpdpbssd, vpdpbssds, vpdpbsud, vpdpbsuds, vpdpbuud, vpdpbuuds, vpdpwsud, vpdpwsuds, vpdpwusd, vpdpwusds, vpdpwuud, vpdpwuuds, vmovd, vmovw
|
||||
|
||||
- Remark: vmovd and vmovw have several kinds of encoding, such as AVX/AVX512F/AVX512-FP16/AVX10.2.
|
||||
|
||||
### Remark
|
||||
* `k1`, ..., `k7` are opmask registers.
|
||||
|
@ -179,7 +190,7 @@ mnemonic|vpdpbusd, vpdpbusds, vpdpwssd, vpdpwssds|vmpsadbw, vpdpbssd, vpdpbssds,
|
|||
Two kinds of Label are supported. (String literal and Label class).
|
||||
|
||||
### String literal
|
||||
```
|
||||
```cpp
|
||||
L("L1");
|
||||
jmp("L1");
|
||||
|
||||
|
@ -201,7 +212,7 @@ L("L3");
|
|||
|
||||
### Support `@@`, `@f`, `@b` like MASM
|
||||
|
||||
```
|
||||
```cpp
|
||||
L("@@"); // <A>
|
||||
jmp("@b"); // jmp to <A>
|
||||
jmp("@f"); // jmp to <B>
|
||||
|
@ -217,7 +228,7 @@ Label symbols beginning with a period between `inLocalLabel()` and `outLocalLabe
|
|||
are treated as a local label.
|
||||
`inLocalLabel()` and `outLocalLabel()` can be nested.
|
||||
|
||||
```
|
||||
```cpp
|
||||
void func1()
|
||||
{
|
||||
inLocalLabel();
|
||||
|
@ -240,7 +251,7 @@ void func1()
|
|||
Xbyak deals with jump mnemonics of an undefined label as short jump if no type is specified.
|
||||
So if the distance between jmp and the label is larger than 127 bytes, then xbyak will cause an error.
|
||||
|
||||
```
|
||||
```cpp
|
||||
jmp("short-jmp"); // short jmp
|
||||
// small code
|
||||
L("short-jmp");
|
||||
|
@ -249,14 +260,16 @@ jmp("long-jmp");
|
|||
// long code
|
||||
L("long-jmp"); // throw exception
|
||||
```
|
||||
|
||||
Then specify T_NEAR for jmp.
|
||||
```
|
||||
```cpp
|
||||
jmp("long-jmp", T_NEAR); // long jmp
|
||||
// long code
|
||||
L("long-jmp");
|
||||
```
|
||||
|
||||
Or call `setDefaultJmpNEAR(true);` once, then the default type is set to T_NEAR.
|
||||
```
|
||||
```cpp
|
||||
jmp("long-jmp"); // long jmp
|
||||
// long code
|
||||
L("long-jmp");
|
||||
|
@ -266,7 +279,7 @@ L("long-jmp");
|
|||
|
||||
`L()` and `jxx()` support Label class.
|
||||
|
||||
```
|
||||
```cpp
|
||||
Xbyak::Label label1, label2;
|
||||
L(label1);
|
||||
...
|
||||
|
@ -278,7 +291,7 @@ L(label2);
|
|||
```
|
||||
|
||||
Use `putL` for jmp table
|
||||
```
|
||||
```cpp
|
||||
Label labelTbl, L0, L1, L2;
|
||||
mov(rax, labelTbl);
|
||||
// rdx is an index of jump table
|
||||
|
@ -295,7 +308,7 @@ L(L1);
|
|||
|
||||
`assignL(dstLabel, srcLabel)` binds dstLabel with srcLabel.
|
||||
|
||||
```
|
||||
```cpp
|
||||
Label label2;
|
||||
Label label1 = L(); // make label1 ; same to Label label1; L(label1);
|
||||
...
|
||||
|
@ -310,7 +323,7 @@ The `jmp` in the above code jumps to label1 assigned by `assignL`.
|
|||
* dstLabel must not be used in `L()`.
|
||||
|
||||
`Label::getAddress()` returns the address specified by the label instance and 0 if not specified.
|
||||
```
|
||||
```cpp
|
||||
// not AutoGrow mode
|
||||
Label label;
|
||||
assert(label.getAddress() == 0);
|
||||
|
@ -319,7 +332,7 @@ assert(label.getAddress() == getCurr());
|
|||
```
|
||||
|
||||
### RIP-relative addressing
|
||||
```
|
||||
```cpp
|
||||
Label label;
|
||||
mov(eax, ptr [rip + label]); // eax = 4
|
||||
...
|
||||
|
@ -327,7 +340,7 @@ mov(eax, ptr [rip + label]); // eax = 4
|
|||
L(label);
|
||||
dd(4);
|
||||
```
|
||||
```
|
||||
```cpp
|
||||
int x;
|
||||
...
|
||||
mov(eax, ptr[rip + &x]); // throw exception if the difference between &x and current position is larger than 2GiB
|
||||
|
@ -338,13 +351,13 @@ int x;
|
|||
Use `word|dword|qword` instead of `ptr` to specify the address size.
|
||||
|
||||
### 32 bit mode
|
||||
```
|
||||
```cpp
|
||||
jmp(word[eax], T_FAR); // jmp m16:16(FF /5)
|
||||
jmp(dword[eax], T_FAR); // jmp m16:32(FF /5)
|
||||
```
|
||||
|
||||
### 64 bit mode
|
||||
```
|
||||
```cpp
|
||||
jmp(word[rax], T_FAR); // jmp m16:16(FF /5)
|
||||
jmp(dword[rax], T_FAR); // jmp m16:32(FF /5)
|
||||
jmp(qword[rax], T_FAR); // jmp m16:64(REX.W FF /5)
|
||||
|
@ -355,7 +368,7 @@ The same applies to `call`.
|
|||
The default max code size is 4096 bytes.
|
||||
Specify the size in constructor of `CodeGenerator()` if necessary.
|
||||
|
||||
```
|
||||
```cpp
|
||||
class Quantize : public Xbyak::CodeGenerator {
|
||||
public:
|
||||
Quantize()
|
||||
|
@ -372,7 +385,7 @@ You can make jit code on prepared memory.
|
|||
|
||||
Call `setProtectModeRE` yourself to change memory mode if using the prepared memory.
|
||||
|
||||
```
|
||||
```cpp
|
||||
alignas(4096) uint8_t buf[8192]; // C++11 or later
|
||||
|
||||
struct Code : Xbyak::CodeGenerator {
|
||||
|
@ -398,7 +411,7 @@ int main()
|
|||
The memory region for jit is automatically extended if necessary when `AutoGrow` is specified in a constructor of `CodeGenerator`.
|
||||
|
||||
Call `ready()` or `readyRE()` before calling `getCode()` to fix jump address.
|
||||
```
|
||||
```cpp
|
||||
struct Code : Xbyak::CodeGenerator {
|
||||
Code()
|
||||
: Xbyak::CodeGenerator(<default memory size>, Xbyak::AutoGrow)
|
||||
|
@ -419,7 +432,7 @@ Xbyak set Read/Write/Exec mode to memory to run jit code.
|
|||
If you want to use Read/Exec mode for security, then specify `DontSetProtectRWE` for `CodeGenerator` and
|
||||
call `setProtectModeRE()` after generating jit code.
|
||||
|
||||
```
|
||||
```cpp
|
||||
struct Code : Xbyak::CodeGenerator {
|
||||
Code()
|
||||
: Xbyak::CodeGenerator(4096, Xbyak::DontSetProtectRWE)
|
||||
|
|
|
@ -209,6 +209,30 @@ void putX_XM()
|
|||
{ 0x2E, "vucomxsd", T_MUST_EVEX | T_F3 | T_0F | T_EW1 | T_SAE_X | T_N8 },
|
||||
{ 0x2E, "vucomxsh", T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_SAE_X | T_N2 },
|
||||
{ 0x2E, "vucomxss", T_MUST_EVEX | T_F2 | T_0F | T_EW0 | T_SAE_X | T_N4 },
|
||||
|
||||
// 13.1
|
||||
{ 0x69, "vcvtnebf162ibs", T_MUST_EVEX | T_YMM | T_F2 | T_MAP5 | T_EW0 | T_B16 },
|
||||
{ 0x6B, "vcvtnebf162iubs", T_MUST_EVEX | T_YMM | T_F2 | T_MAP5 | T_EW0 | T_B16 },
|
||||
{ 0x68, "vcvttnebf162ibs", T_MUST_EVEX | T_YMM | T_F2 | T_MAP5 | T_EW0 | T_B16 },
|
||||
{ 0x6A, "vcvttnebf162iubs", T_MUST_EVEX | T_YMM | T_F2 | T_MAP5 | T_EW0 | T_B16 },
|
||||
// 13.3
|
||||
{ 0x6D, "vcvttpd2qqs", T_MUST_EVEX | T_YMM | T_66 | T_MAP5 | T_EW1 | T_B64 | T_SAE_Y | T_SAE_Z },
|
||||
// 13.5
|
||||
{ 0x6C, "vcvttpd2uqqs", T_MUST_EVEX | T_YMM | T_66 | T_MAP5 | T_EW1 | T_B64 | T_SAE_Y | T_SAE_Z },
|
||||
// 13.6
|
||||
{ 0x69, "vcvtph2ibs", T_MUST_EVEX | T_YMM | T_MAP5 | T_EW0 | T_B16 | T_ER_Y | T_ER_Z },
|
||||
{ 0x6B, "vcvtph2iubs", T_MUST_EVEX | T_YMM | T_MAP5 | T_EW0 | T_B16 | T_ER_Y | T_ER_Z },
|
||||
{ 0x68, "vcvttph2ibs", T_MUST_EVEX | T_YMM | T_MAP5 | T_EW0 | T_B16 | T_ER_Y | T_ER_Z },
|
||||
{ 0x6A, "vcvttph2iubs", T_MUST_EVEX | T_YMM | T_MAP5 | T_EW0 | T_B16 | T_ER_Y | T_ER_Z },
|
||||
// 13.7
|
||||
{ 0x6D, "vcvttps2dqs", T_MUST_EVEX | T_YMM | T_MAP5 | T_EW0 | T_B32 | T_SAE_Y | T_SAE_Z },
|
||||
// 13.8
|
||||
{ 0x69, "vcvtps2ibs", T_MUST_EVEX | T_YMM | T_66 | T_MAP5 | T_EW0 | T_B32 | T_ER_Y | T_ER_Z },
|
||||
{ 0x6B, "vcvtps2iubs", T_MUST_EVEX | T_YMM | T_66 | T_MAP5 | T_EW0 | T_B32 | T_ER_Y | T_ER_Z },
|
||||
{ 0x68, "vcvttps2ibs", T_MUST_EVEX | T_YMM | T_66 | T_MAP5 | T_EW0 | T_B32 | T_ER_Y | T_ER_Z },
|
||||
{ 0x6A, "vcvttps2iubs", T_MUST_EVEX | T_YMM | T_66 | T_MAP5 | T_EW0 | T_B32 | T_ER_Y | T_ER_Z },
|
||||
// 13.10
|
||||
{ 0x6C, "vcvttps2udqs", T_MUST_EVEX | T_YMM | T_MAP5 | T_EW0 | T_B32 | T_SAE_Y | T_SAE_Z },
|
||||
};
|
||||
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
|
||||
const Tbl *p = &tbl[i];
|
||||
|
@ -240,7 +264,6 @@ void putM_X()
|
|||
{ 0x7F, "vmovdqu32", T_F3 | T_0F | T_MUST_EVEX | T_YMM | T_EW0 | T_ER_X | T_ER_Y | T_ER_Z | T_M_K },
|
||||
{ 0x7F, "vmovdqu64", T_F3 | T_0F | T_MUST_EVEX | T_YMM | T_EW1 | T_ER_X | T_ER_Y | T_ER_Z | T_M_K },
|
||||
{ 0x11, "vmovsh", T_F3 | T_MAP5 | T_MUST_EVEX | T_EW0 | T_N2 | T_M_K },
|
||||
{ 0x7E, "vmovw", T_66 | T_MAP5 | T_MUST_EVEX | T_N2 },
|
||||
};
|
||||
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
|
||||
const Tbl *p = &tbl[i];
|
||||
|
@ -447,6 +470,13 @@ void putX_X_XM_IMM()
|
|||
{ 0x1B, "vcvtne2ph2hf8s", T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_YMM | T_B16 | T_N1, false },
|
||||
|
||||
{ 0x52, "vdpphps", T_MUST_EVEX | T_0F38 | T_EW0 | T_YMM | T_B32, false },
|
||||
{ 0x52, "vminmaxnepbf16", T_MUST_EVEX | T_F2 | T_0F3A | T_EW0 | T_YMM | T_B16, true },
|
||||
{ 0x52, "vminmaxpd", T_MUST_EVEX | T_66 | T_0F3A | T_EW1 | T_YMM | T_B64 | T_SAE_Y | T_SAE_Z, true },
|
||||
{ 0x52, "vminmaxph", T_MUST_EVEX | T_0F3A | T_EW0 | T_YMM | T_B16 | T_SAE_Y | T_SAE_Z, true },
|
||||
{ 0x52, "vminmaxps", T_MUST_EVEX | T_66 | T_0F3A | T_EW0 | T_YMM | T_B32 | T_SAE_Y | T_SAE_Z, true },
|
||||
{ 0x53, "vminmaxsd", T_MUST_EVEX | T_66 | T_0F3A | T_EW1 | T_SAE_X | T_N8, true },
|
||||
{ 0x53, "vminmaxsh", T_MUST_EVEX | T_0F3A | T_EW0 | T_SAE_X | T_N2, true },
|
||||
{ 0x53, "vminmaxss", T_MUST_EVEX | T_66 | T_0F3A | T_EW0 | T_SAE_X | T_N4, true },
|
||||
};
|
||||
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
|
||||
const Tbl *p = &tbl[i];
|
||||
|
@ -658,6 +688,22 @@ void putCvt()
|
|||
{ 0x7B, "vcvtusi2sh", T_F3 | T_MAP5 | T_MUST_EVEX | T_ER_R | T_M_K, 6 },
|
||||
|
||||
{ 0x72, "vcvtneps2bf16", T_MUST_EVEX | T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 2 },
|
||||
// 13.2
|
||||
{ 0x6D, "vcvttpd2dqs", T_MUST_EVEX | T_YMM | T_MAP5 | T_EW1 | T_B64 | T_SAE_Y | T_SAE_Z, 2 },
|
||||
// 13.4
|
||||
{ 0x6C, "vcvttpd2udqs", T_MUST_EVEX | T_YMM | T_MAP5 | T_EW1 | T_B64 | T_SAE_Y | T_SAE_Z, 2 },
|
||||
// 13.9
|
||||
{ 0x6D, "vcvttps2qqs", T_MUST_EVEX | T_YMM | T_66 | T_MAP5 | T_EW0 | T_B32 | T_SAE_X | T_SAE_Y | T_N8 | T_N_VL, 1 },
|
||||
// 13.11
|
||||
{ 0x6C, "vcvttps2uqqs", T_MUST_EVEX | T_YMM | T_66 | T_MAP5 | T_EW0 | T_B32 | T_SAE_X | T_SAE_Y | T_N8 | T_N_VL, 1 },
|
||||
// 13.12
|
||||
{ 0x6D, "vcvttsd2sis", T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_SAE_X | T_N8, 0 },
|
||||
// 13.13
|
||||
{ 0x6C, "vcvttsd2usis", T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_SAE_X | T_N8, 0 },
|
||||
// 13.14
|
||||
{ 0x6D, "vcvttss2sis", T_MUST_EVEX | T_F3 | T_MAP5 | T_EW0 | T_SAE_X | T_N4, 0 },
|
||||
// 13.15
|
||||
{ 0x6C, "vcvttss2usis", T_MUST_EVEX | T_F3 | T_MAP5 | T_EW0 | T_SAE_X | T_N4, 0 },
|
||||
};
|
||||
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
|
||||
const Tbl& p = tbl[i];
|
||||
|
@ -666,10 +712,10 @@ void putCvt()
|
|||
case 0:
|
||||
printf("void %s(const Reg32e& r, const Operand& op) { uint64_t type = (%s) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x%02X); }\n", p.name, s.c_str(), p.code);
|
||||
break;
|
||||
case 1:
|
||||
case 1: // (x, x/m), (y, x/m256), (z, y/m)
|
||||
printf("void %s(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, %s, 0x%02X); }\n", p.name, s.c_str(), p.code);
|
||||
break;
|
||||
case 2:
|
||||
case 2: // (x, x/m), (x, y/m256), (y, z/m)
|
||||
printf("void %s(const Xmm& x, const Operand& op) { opCvt2(x, op, %s, 0x%02X); }\n", p.name, s.c_str(), p.code);
|
||||
break;
|
||||
case 3:
|
||||
|
@ -1032,12 +1078,6 @@ void putFP16_2()
|
|||
printf("void vmovsh(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, xm0, addr, %s, 0x10); }\n", s.c_str());
|
||||
printf("void vmovsh(const Xmm& x1, const Xmm& x2, const Xmm& x3) { opAVX_X_X_XM(x1, x2, x3, %s, 0x10); }\n", s.c_str());
|
||||
}
|
||||
{
|
||||
uint64_t type = T_66 | T_MAP5 | T_MUST_EVEX | T_N2;
|
||||
std::string s = type2String(type);
|
||||
printf("void vmovw(const Xmm& x, const Operand& op) { if (!op.isREG(32|64) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_X_XM(x, xm0, op, %s, 0x6E); }\n", s.c_str());
|
||||
printf("void vmovw(const Reg32e& r, const Xmm& x) { opAVX_X_X_XM(x, xm0, r, %s, 0x7E); }\n", s.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
void putFP16()
|
||||
|
|
|
@ -1443,6 +1443,7 @@ void put()
|
|||
printf("void %s(const Xmm& x, const Operand& op) { opSSE_APX(x, op, T_0F38, 0x%02X, T_MUST_EVEX, 0x%02X); }\n", p->name, p->code, p->code2);
|
||||
}
|
||||
puts("void sha1rnds4(const Xmm& x, const Operand& op, uint8_t imm) { opSSE_APX(x, op, T_0F3A, 0xCC, T_MUST_EVEX, 0xD4, imm); }");
|
||||
puts("void sha1msg12(const Xmm& x, const Operand& op) { opROO(Reg(), op, x, T_MUST_EVEX, 0xD9); }");
|
||||
}
|
||||
// (m, x), (m, y)
|
||||
{
|
||||
|
@ -1733,9 +1734,6 @@ void put()
|
|||
}
|
||||
// mov
|
||||
{
|
||||
printf("void vmovd(const Xmm& x, const Operand& op) { if (!op.isREG(32) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_X_XM(x, xm0, op, T_0F | T_66 | T_W0 | T_EVEX | T_N4, 0x6E); }\n");
|
||||
printf("void vmovd(const Operand& op, const Xmm& x) { if (!op.isREG(32) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_X_XM(x, xm0, op, T_0F | T_66 | T_W0 | T_EVEX | T_N4, 0x7E); }\n");
|
||||
|
||||
printf("void vmovq(const Xmm& x, const Address& addr) { uint64_t type; uint8_t code; if (x.getIdx() < 16) { type = T_0F | T_F3; code = 0x7E; } else { type = T_0F | T_66 | T_EVEX | T_EW1 | T_N8; code = 0x6E; } opAVX_X_X_XM(x, xm0, addr, type, code); }\n");
|
||||
printf("void vmovq(const Address& addr, const Xmm& x) { opAVX_X_X_XM(x, xm0, addr, T_0F | T_66 | T_EVEX | T_EW1 | T_N8, x.getIdx() < 16 ? 0xD6 : 0x7E); }\n");
|
||||
printf("void vmovq(const Xmm& x1, const Xmm& x2) { opAVX_X_X_XM(x1, xm0, x2, T_0F | T_F3 | T_EVEX | T_EW1 | T_N8, 0x7E); }\n");
|
||||
|
@ -1899,36 +1897,6 @@ void put()
|
|||
printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, %s, 0x%02X, encoding); }\n", p->name, s.c_str(), p->code);
|
||||
}
|
||||
}
|
||||
// avx-vnni-int8
|
||||
// avx-vnni-int16
|
||||
#if 0
|
||||
{
|
||||
const struct Tbl {
|
||||
uint8_t code;
|
||||
const char *name;
|
||||
uint64_t type;
|
||||
} tbl[] = {
|
||||
// { 0x50, "vpdpbssd", T_F2 | T_0F38 | T_W0 | T_YMM },
|
||||
// { 0x51, "vpdpbssds", T_F2 | T_0F38 | T_W0 | T_YMM },
|
||||
// { 0x50, "vpdpbsud", T_F3 | T_0F38 | T_W0 | T_YMM },
|
||||
// { 0x51, "vpdpbsuds", T_F3 | T_0F38 | T_W0 | T_YMM },
|
||||
// { 0x50, "vpdpbuud", T_0F38 | T_W0 | T_YMM },
|
||||
// { 0x51, "vpdpbuuds", T_0F38 | T_W0 | T_YMM },
|
||||
|
||||
// { 0xD2, "vpdpwsud", T_F3 | T_0F38 | T_W0 | T_YMM },
|
||||
// { 0xD3, "vpdpwsuds", T_F3 | T_0F38 | T_W0 | T_YMM },
|
||||
// { 0xD2, "vpdpwusd", T_66 | T_0F38 | T_W0 | T_YMM },
|
||||
// { 0xD3, "vpdpwusds", T_66 | T_0F38 | T_W0 | T_YMM },
|
||||
// { 0xD2, "vpdpwuud", T_0F38 | T_W0 | T_YMM },
|
||||
// { 0xD3, "vpdpwuuds", T_0F38 | T_W0 | T_YMM },
|
||||
};
|
||||
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
|
||||
const Tbl *p = &tbl[i];
|
||||
std::string s = type2String(p->type);
|
||||
printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, %s, 0x%02X); }\n", p->name, s.c_str(), p->code);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
void put32()
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
project(
|
||||
'xbyak',
|
||||
'cpp',
|
||||
version: '7.10',
|
||||
version: '7.20',
|
||||
license: 'BSD-3-Clause',
|
||||
default_options: 'b_ndebug=if-release'
|
||||
)
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
|
||||
# Xbyak 7.10 [![Badge Build]][Build Status]
|
||||
# Xbyak 7.20 [![Badge Build]][Build Status]
|
||||
|
||||
*A C++ JIT assembler for x86 (IA32), x64 (AMD64, x86-64)*
|
||||
|
||||
|
@ -20,8 +20,7 @@ It is named from a Japanese word [開闢](https://translate.google.com/?hl=ja&sl
|
|||
|
||||
- header file only
|
||||
- Intel/MASM like syntax
|
||||
- fully support AVX-512
|
||||
- support APX/AVX10
|
||||
- Full support for AVX-512, APX, and AVX10.2
|
||||
|
||||
**Note**:
|
||||
Use `and_()`, `or_()`, ... instead of `and()`, `or()`.
|
||||
|
@ -33,6 +32,7 @@ If you want to use them, then specify `-fno-operator-names` option to gcc/clang.
|
|||
|
||||
### News
|
||||
|
||||
- support AVX10.2
|
||||
- support xresldtrk/xsusldtrk
|
||||
- support RAO-INT for APX
|
||||
- support AVX10 detection, AESKLE, WIDE_KL, KEYLOCKER, KEYLOCKER_WIDE
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
|
||||
C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 7.10
|
||||
C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 7.20
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
◎概要
|
||||
|
@ -14,7 +14,7 @@
|
|||
xbyak.hをインクルードするだけですぐ利用することができます。
|
||||
C++の枠組み内で閉じているため、外部アセンブラは不要です。
|
||||
32bit/64bit両対応です。
|
||||
対応ニーモニック:特権命令除くx86, MMX/MMX2/SSE/SSE2/SSE3/SSSE3/SSE4/FPU(一部)/AVX/AVX2/FMA/VEX-encoded GPR
|
||||
対応ニーモニック:特権命令除くx86, MMX/MMX2/SSE/SSE2/SSE3/SSSE3/SSE4/FPU(一部)/AVX/AVX2/FMA/AVX-512/APX/AVX10.2
|
||||
|
||||
・Windows Xp(32bit, 64bit), Windows 7/Linux(32bit, 64bit)/Intel Mac対応
|
||||
Windows Xp, Windows 7上ではVC2008, VC2010, VC2012
|
||||
|
@ -46,7 +46,7 @@ Linuxではmake installで/usr/local/include/xbyakにコピーされます。
|
|||
-----------------------------------------------------------------------------
|
||||
◎新機能
|
||||
|
||||
APX/AVX10対応
|
||||
APX/AVX10.2対応
|
||||
|
||||
例外なしモード追加
|
||||
XBYAK_NO_EXCEPTIONを定義してコンパイルするとgcc/clangで-fno-exceptionsオプションでコンパイルできます。
|
||||
|
@ -404,6 +404,9 @@ sample/{echo,hello}.bfは http://www.kmonos.net/alang/etc/brainfuck.php から
|
|||
-----------------------------------------------------------------------------
|
||||
◎履歴
|
||||
|
||||
2024/10/15 ver 7.20 setDefaultEncoding/setDefaultEncodingAVX10の仕様確定
|
||||
2024/10/15 ver 7.11 AVX10.2完全サポート
|
||||
2024/10/13 ver 7.10 AVX10 integer and fp16 vnni, mediaの新命令対応. setDefaultEncodingの拡張.
|
||||
2024/10/10 ver 7.09.1 vpcompressbとvpcompresswの名前修正
|
||||
2024/10/08 ver 7.09 AVX10.2のYMMレジスタの埋め込み丸め対応
|
||||
2024/10/07 ver 7.08 rdfabaseなどサポート
|
||||
|
|
|
@ -60,9 +60,12 @@ apx: apx.cpp $(XBYAK_INC)
|
|||
avx10_test: avx10_test.cpp $(XBYAK_INC)
|
||||
$(CXX) $(CFLAGS) avx10_test.cpp -o $@ -DXBYAK64
|
||||
|
||||
TEST_FILES=old.txt new-ymm.txt bf16.txt comp.txt convert.txt
|
||||
TEST_FILES=old.txt new-ymm.txt bf16.txt comp.txt misc.txt convert.txt minmax.txt saturation.txt
|
||||
xed_test:
|
||||
@for target in $(addprefix avx10/, $(TEST_FILES)); do ./test_by_xed.sh $$target; done
|
||||
@set -e; \
|
||||
for target in $(addprefix avx10/, $(TEST_FILES)); do \
|
||||
./test_by_xed.sh $$target || exit 1; \
|
||||
done
|
||||
|
||||
test_nm: normalize_prefix $(TARGET)
|
||||
$(MAKE) -C ../gen
|
||||
|
|
66
test/avx10/minmax.txt
Normal file
66
test/avx10/minmax.txt
Normal file
|
@ -0,0 +1,66 @@
|
|||
vminmaxnepbf16(xm1|k3|T_z, xm2, xm3, 5);
|
||||
vminmaxnepbf16(xm1|k3|T_z, xm2, ptr[rax+128], 5);
|
||||
vminmaxnepbf16(xm1|k3|T_z, xm2, ptr_b[rax+128], 5);
|
||||
|
||||
vminmaxnepbf16(ym1|k3|T_z, ym2, ym3, 5);
|
||||
vminmaxnepbf16(ym1|k3|T_z, ym2, ptr[rax+128], 5);
|
||||
vminmaxnepbf16(ym1|k3|T_z, ym2, ptr_b[rax+128], 5);
|
||||
|
||||
vminmaxnepbf16(zm1|k3|T_z, zm2, zm3, 5);
|
||||
vminmaxnepbf16(zm1|k3|T_z, zm2, ptr[rax+128], 5);
|
||||
vminmaxnepbf16(zm1|k3|T_z, zm2, ptr_b[rax+128], 5);
|
||||
//
|
||||
vminmaxpd(xm1|k3|T_z, xm2, xm3, 5);
|
||||
vminmaxpd(xm1|k3|T_z, xm2, ptr[rax+128], 5);
|
||||
vminmaxpd(xm1|k3|T_z, xm2, ptr_b[rax+128], 5);
|
||||
|
||||
vminmaxpd(ym1|k3|T_z, ym2, ym3, 5);
|
||||
vminmaxpd(ym1|k3|T_z, ym2, ym3|T_sae, 5);
|
||||
vminmaxpd(ym1|k3|T_z, ym2, ptr[rax+128], 5);
|
||||
vminmaxpd(ym1|k3|T_z, ym2, ptr_b[rax+128], 5);
|
||||
|
||||
vminmaxpd(zm1|k3|T_z, zm2, zm3, 5);
|
||||
vminmaxpd(zm1|k3|T_z, zm2, zm3|T_sae, 5);
|
||||
vminmaxpd(zm1|k3|T_z, zm2, ptr[rax+128], 5);
|
||||
vminmaxpd(zm1|k3|T_z, zm2, ptr_b[rax+128], 5);
|
||||
//
|
||||
vminmaxph(xm1|k3|T_z, xm2, xm3, 5);
|
||||
vminmaxph(xm1|k3|T_z, xm2, ptr[rax+128], 5);
|
||||
vminmaxph(xm1|k3|T_z, xm2, ptr[rax+128], 5);
|
||||
vminmaxph(xm1|k3|T_z, xm2, ptr_b[rax+128], 5);
|
||||
|
||||
vminmaxph(ym1|k3|T_z, ym2, ym3, 5);
|
||||
vminmaxph(ym1|k3|T_z, ym2, ym3|T_sae, 5);
|
||||
vminmaxph(ym1|k3|T_z, ym2, ptr[rax+128], 5);
|
||||
vminmaxph(ym1|k3|T_z, ym2, ptr_b[rax+128], 5);
|
||||
|
||||
vminmaxph(zm1|k3|T_z, zm2, zm3, 5);
|
||||
vminmaxph(zm1|k3|T_z, zm2, zm3|T_sae, 5);
|
||||
vminmaxph(zm1|k3|T_z, zm2, ptr[rax+128], 5);
|
||||
vminmaxph(zm1|k3|T_z, zm2, ptr_b[rax+128], 5);
|
||||
//
|
||||
vminmaxps(xm1|k3|T_z, xm2, xm3, 5);
|
||||
vminmaxps(xm1|k3|T_z, xm2, ptr[rax+128], 5);
|
||||
vminmaxps(xm1|k3|T_z, xm2, ptr_b[rax+128], 5);
|
||||
|
||||
vminmaxps(ym1|k3|T_z, ym2, ym3, 5);
|
||||
vminmaxps(ym1|k3|T_z, ym2, ym3|T_sae, 5);
|
||||
vminmaxps(ym1|k3|T_z, ym2, ptr[rax+128], 5);
|
||||
vminmaxps(ym1|k3|T_z, ym2, ptr_b[rax+128], 5);
|
||||
|
||||
vminmaxps(zm1|k3|T_z, zm2, zm3, 5);
|
||||
vminmaxps(zm1|k3|T_z, zm2, zm3|T_sae, 5);
|
||||
vminmaxps(zm1|k3|T_z, zm2, ptr[rax+128], 5);
|
||||
vminmaxps(zm1|k3|T_z, zm2, ptr_b[rax+128], 5);
|
||||
//
|
||||
vminmaxsd(xm1|k3|T_z, xm2, xm3, 5);
|
||||
vminmaxsd(xm1|k3|T_z, xm2, xm3|T_sae, 5);
|
||||
vminmaxsd(xm1|k3|T_z, xm2, ptr[rax+128], 5);
|
||||
//
|
||||
vminmaxsh(xm1|k3|T_z, xm2, xm3, 5);
|
||||
vminmaxsh(xm1|k3|T_z, xm2, xm3|T_sae, 5);
|
||||
vminmaxsh(xm1|k3|T_z, xm2, ptr[rax+128], 5);
|
||||
//
|
||||
vminmaxss(xm1|k3|T_z, xm2, xm3, 5);
|
||||
vminmaxss(xm1|k3|T_z, xm2, xm3|T_sae, 5);
|
||||
vminmaxss(xm1|k3|T_z, xm2, ptr[rax+128], 5);
|
|
@ -1,3 +1,4 @@
|
|||
// AVX10 integer and FP16 VNNI, media and zero-extending
|
||||
vdpphps(xm1, xm2, xm3);
|
||||
vdpphps(xm1, xm2, ptr[rax+128]);
|
||||
vdpphps(xm1, xm2, ptr_b[rax+128]);
|
||||
|
@ -165,3 +166,14 @@ vpdpwuuds(ym1, ym2, ptr_b[rax+128]);
|
|||
vpdpwuuds(zm1, zm2, zm3);
|
||||
vpdpwuuds(zm1, zm2, ptr[rax+128]);
|
||||
vpdpwuuds(zm1, zm2, ptr_b[rax+128]);
|
||||
|
||||
//
|
||||
vmovd(xm10, xm20);
|
||||
vmovd(xm1, xm2);
|
||||
vmovd(xm10, ptr[rax+128]);
|
||||
vmovd(ptr[rax+128], xm30);
|
||||
//
|
||||
vmovw(xm1, xm20);
|
||||
vmovw(xm1, xm2);
|
||||
vmovw(xm3, ptr [rax+0x40]);
|
||||
vmovw(ptr [rax+0x40], xm7);
|
||||
|
|
|
@ -355,10 +355,6 @@ vgetmantsh(xmm1|k1|T_z|T_sae, xmm3, xmm5, 0x6);
|
|||
vmovsh(xmm1|k1|T_z, ptr [rax+0x40]);
|
||||
vmovsh(ptr [rax+0x40]|k1, xmm1);
|
||||
vmovsh(xmm1|k2|T_z, xmm3, xmm5);
|
||||
vmovw(xmm1, r13d);
|
||||
vmovw(xmm3, ptr [rax+0x40]);
|
||||
vmovw(r9d, xmm1);
|
||||
vmovw(ptr [rax+0x40], xmm7);
|
||||
vcvtsd2sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3);
|
||||
vcvtsd2sh(xmm1, xmm2, ptr [rax+0x40]);
|
||||
vcvtsh2sd(xmm1|k1|T_z|T_sae, xmm2, xmm3);
|
||||
|
|
310
test/avx10/saturation.txt
Normal file
310
test/avx10/saturation.txt
Normal file
|
@ -0,0 +1,310 @@
|
|||
//
|
||||
vcvtnebf162ibs(xm1, xm2);
|
||||
vcvtnebf162ibs(xm1, ptr[rax+128]);
|
||||
vcvtnebf162ibs(xm1, ptr_b[rax+128]);
|
||||
|
||||
vcvtnebf162ibs(ym1, ym2);
|
||||
vcvtnebf162ibs(ym1, ptr[rax+128]);
|
||||
vcvtnebf162ibs(ym1, ptr_b[rax+128]);
|
||||
|
||||
vcvtnebf162ibs(zm1, zm2);
|
||||
vcvtnebf162ibs(zm1, ptr[rax+128]);
|
||||
vcvtnebf162ibs(zm1, ptr_b[rax+128]);
|
||||
//
|
||||
vcvtnebf162iubs(xm1, xm2);
|
||||
vcvtnebf162iubs(xm1, ptr[rax+128]);
|
||||
vcvtnebf162iubs(xm1, ptr_b[rax+128]);
|
||||
|
||||
vcvtnebf162iubs(ym1, ym2);
|
||||
vcvtnebf162iubs(ym1, ptr[rax+128]);
|
||||
vcvtnebf162iubs(ym1, ptr_b[rax+128]);
|
||||
|
||||
vcvtnebf162iubs(zm1, zm2);
|
||||
vcvtnebf162iubs(zm1, ptr[rax+128]);
|
||||
vcvtnebf162iubs(zm1, ptr_b[rax+128]);
|
||||
//
|
||||
vcvttnebf162ibs(xm1, xm2);
|
||||
vcvttnebf162ibs(xm1, ptr[rax+128]);
|
||||
vcvttnebf162ibs(xm1, ptr_b[rax+128]);
|
||||
|
||||
vcvttnebf162ibs(ym1, ym2);
|
||||
vcvttnebf162ibs(ym1, ptr[rax+128]);
|
||||
vcvttnebf162ibs(ym1, ptr_b[rax+128]);
|
||||
|
||||
vcvttnebf162ibs(zm1, zm2);
|
||||
vcvttnebf162ibs(zm1, ptr[rax+128]);
|
||||
vcvttnebf162ibs(zm1, ptr_b[rax+128]);
|
||||
//
|
||||
vcvttnebf162iubs(xm1, xm2);
|
||||
vcvttnebf162iubs(xm1, ptr[rax+128]);
|
||||
vcvttnebf162iubs(xm1, ptr_b[rax+128]);
|
||||
|
||||
vcvttnebf162iubs(ym1, ym2);
|
||||
vcvttnebf162iubs(ym1, ptr[rax+128]);
|
||||
vcvttnebf162iubs(ym1, ptr_b[rax+128]);
|
||||
|
||||
vcvttnebf162iubs(zm1, zm2);
|
||||
vcvttnebf162iubs(zm1, ptr[rax+128]);
|
||||
vcvttnebf162iubs(zm1, ptr_b[rax+128]);
|
||||
//
|
||||
vcvttpd2qqs(xm1, xm2);
|
||||
vcvttpd2qqs(xm1, ptr[rax+128]);
|
||||
vcvttpd2qqs(xm1, ptr_b[rax+128]);
|
||||
|
||||
vcvttpd2qqs(ym1, ym2);
|
||||
vcvttpd2qqs(ym1, ym2|T_sae);
|
||||
vcvttpd2qqs(ym1, ptr[rax+128]);
|
||||
vcvttpd2qqs(ym1, ptr_b[rax+128]);
|
||||
|
||||
vcvttpd2qqs(zm1, zm2);
|
||||
vcvttpd2qqs(zm1, zm2|T_sae);
|
||||
vcvttpd2qqs(zm1, ptr[rax+128]);
|
||||
vcvttpd2qqs(zm1, ptr_b[rax+128]);
|
||||
//
|
||||
vcvttpd2uqqs(xm1, xm2);
|
||||
vcvttpd2uqqs(xm1, ptr[rax+128]);
|
||||
vcvttpd2uqqs(xm1, ptr_b[rax+128]);
|
||||
|
||||
vcvttpd2uqqs(ym1, ym2);
|
||||
vcvttpd2uqqs(ym1, ym2|T_sae);
|
||||
vcvttpd2uqqs(ym1, ptr[rax+128]);
|
||||
vcvttpd2uqqs(ym1, ptr_b[rax+128]);
|
||||
|
||||
vcvttpd2uqqs(zm1, zm2);
|
||||
vcvttpd2uqqs(zm1, zm2|T_sae);
|
||||
vcvttpd2uqqs(zm1, ptr[rax+128]);
|
||||
vcvttpd2uqqs(zm1, ptr_b[rax+128]);
|
||||
//
|
||||
vcvtph2ibs(xm1, xm2);
|
||||
vcvtph2ibs(xm1, ptr[rax+128]);
|
||||
vcvtph2ibs(xm1, ptr_b[rax+128]);
|
||||
|
||||
vcvtph2ibs(ym1, ym2);
|
||||
vcvtph2ibs(ym1, ym2|T_rd_sae);
|
||||
vcvtph2ibs(ym1, ptr[rax+128]);
|
||||
vcvtph2ibs(ym1, ptr_b[rax+128]);
|
||||
|
||||
vcvtph2ibs(zm1, zm2);
|
||||
vcvtph2ibs(zm1, zm2|T_ru_sae);
|
||||
vcvtph2ibs(zm1, ptr[rax+128]);
|
||||
vcvtph2ibs(zm1, ptr_b[rax+128]);
|
||||
//
|
||||
vcvtph2iubs(xm1, xm2);
|
||||
vcvtph2iubs(xm1, ptr[rax+128]);
|
||||
vcvtph2iubs(xm1, ptr_b[rax+128]);
|
||||
|
||||
vcvtph2iubs(ym1, ym2);
|
||||
vcvtph2iubs(ym1, ym2|T_rd_sae);
|
||||
vcvtph2iubs(ym1, ptr[rax+128]);
|
||||
vcvtph2iubs(ym1, ptr_b[rax+128]);
|
||||
|
||||
vcvtph2iubs(zm1, zm2);
|
||||
vcvtph2iubs(zm1, zm2|T_ru_sae);
|
||||
vcvtph2iubs(zm1, ptr[rax+128]);
|
||||
vcvtph2iubs(zm1, ptr_b[rax+128]);
|
||||
//
|
||||
vcvttph2ibs(xm1, xm2);
|
||||
vcvttph2ibs(xm1, ptr[rax+128]);
|
||||
vcvttph2ibs(xm1, ptr_b[rax+128]);
|
||||
|
||||
vcvttph2ibs(ym1, ym2);
|
||||
vcvttph2ibs(ym1, ym2|T_rd_sae);
|
||||
vcvttph2ibs(ym1, ptr[rax+128]);
|
||||
vcvttph2ibs(ym1, ptr_b[rax+128]);
|
||||
|
||||
vcvttph2ibs(zm1, zm2);
|
||||
vcvttph2ibs(zm1, zm2|T_ru_sae);
|
||||
vcvttph2ibs(zm1, ptr[rax+128]);
|
||||
vcvttph2ibs(zm1, ptr_b[rax+128]);
|
||||
//
|
||||
vcvttph2iubs(xm1, xm2);
|
||||
vcvttph2iubs(xm1, ptr[rax+128]);
|
||||
vcvttph2iubs(xm1, ptr_b[rax+128]);
|
||||
|
||||
vcvttph2iubs(ym1, ym2);
|
||||
vcvttph2iubs(ym1, ym2|T_rd_sae);
|
||||
vcvttph2iubs(ym1, ptr[rax+128]);
|
||||
vcvttph2iubs(ym1, ptr_b[rax+128]);
|
||||
|
||||
vcvttph2iubs(zm1, zm2);
|
||||
vcvttph2iubs(zm1, zm2|T_ru_sae);
|
||||
vcvttph2iubs(zm1, ptr[rax+128]);
|
||||
vcvttph2iubs(zm1, ptr_b[rax+128]);
|
||||
//
|
||||
vcvttps2dqs(xm1, xm2);
|
||||
vcvttps2dqs(xm1, ptr[rax+128]);
|
||||
vcvttps2dqs(xm1, ptr_b[rax+128]);
|
||||
|
||||
vcvttps2dqs(ym1, ym2);
|
||||
vcvttps2dqs(ym1, ym2|T_sae);
|
||||
vcvttps2dqs(ym1, ptr[rax+128]);
|
||||
vcvttps2dqs(ym1, ptr_b[rax+128]);
|
||||
|
||||
vcvttps2dqs(zm1, zm2);
|
||||
vcvttps2dqs(zm1, zm2|T_sae);
|
||||
vcvttps2dqs(zm1, ptr[rax+128]);
|
||||
vcvttps2dqs(zm1, ptr_b[rax+128]);
|
||||
//
|
||||
vcvtps2ibs(xm1, xm2);
|
||||
vcvtps2ibs(xm1, ptr[rax+128]);
|
||||
vcvtps2ibs(xm1, ptr_b[rax+128]);
|
||||
|
||||
vcvtps2ibs(ym1, ym2);
|
||||
vcvtps2ibs(ym1, ym2|T_rd_sae);
|
||||
vcvtps2ibs(ym1, ptr[rax+128]);
|
||||
vcvtps2ibs(ym1, ptr_b[rax+128]);
|
||||
|
||||
vcvtps2ibs(zm1, zm2);
|
||||
vcvtps2ibs(zm1, zm2|T_ru_sae);
|
||||
vcvtps2ibs(zm1, ptr[rax+128]);
|
||||
vcvtps2ibs(zm1, ptr_b[rax+128]);
|
||||
//
|
||||
vcvtps2iubs(xm1, xm2);
|
||||
vcvtps2iubs(xm1, ptr[rax+128]);
|
||||
vcvtps2iubs(xm1, ptr_b[rax+128]);
|
||||
|
||||
vcvtps2iubs(ym1, ym2);
|
||||
vcvtps2iubs(ym1, ym2|T_rd_sae);
|
||||
vcvtps2iubs(ym1, ptr[rax+128]);
|
||||
vcvtps2iubs(ym1, ptr_b[rax+128]);
|
||||
|
||||
vcvtps2iubs(zm1, zm2);
|
||||
vcvtps2iubs(zm1, zm2|T_ru_sae);
|
||||
vcvtps2iubs(zm1, ptr[rax+128]);
|
||||
vcvtps2iubs(zm1, ptr_b[rax+128]);
|
||||
//
|
||||
vcvttps2ibs(xm1, xm2);
|
||||
vcvttps2ibs(xm1, ptr[rax+128]);
|
||||
vcvttps2ibs(xm1, ptr_b[rax+128]);
|
||||
|
||||
vcvttps2ibs(ym1, ym2);
|
||||
vcvttps2ibs(ym1, ym2|T_rd_sae);
|
||||
vcvttps2ibs(ym1, ptr[rax+128]);
|
||||
vcvttps2ibs(ym1, ptr_b[rax+128]);
|
||||
|
||||
vcvttps2ibs(zm1, zm2);
|
||||
vcvttps2ibs(zm1, zm2|T_ru_sae);
|
||||
vcvttps2ibs(zm1, ptr[rax+128]);
|
||||
vcvttps2ibs(zm1, ptr_b[rax+128]);
|
||||
//
|
||||
vcvttps2iubs(xm1, xm2);
|
||||
vcvttps2iubs(xm1, ptr[rax+128]);
|
||||
vcvttps2iubs(xm1, ptr_b[rax+128]);
|
||||
|
||||
vcvttps2iubs(ym1, ym2);
|
||||
vcvttps2iubs(ym1, ym2|T_rd_sae);
|
||||
vcvttps2iubs(ym1, ptr[rax+128]);
|
||||
vcvttps2iubs(ym1, ptr_b[rax+128]);
|
||||
|
||||
vcvttps2iubs(zm1, zm2);
|
||||
vcvttps2iubs(zm1, zm2|T_ru_sae);
|
||||
vcvttps2iubs(zm1, ptr[rax+128]);
|
||||
vcvttps2iubs(zm1, ptr_b[rax+128]);
|
||||
//
|
||||
vcvttps2udqs(xm1, xm2);
|
||||
vcvttps2udqs(xm1, ptr[rax+128]);
|
||||
vcvttps2udqs(xm1, ptr_b[rax+128]);
|
||||
|
||||
vcvttps2udqs(ym1, ym2);
|
||||
vcvttps2udqs(ym1, ym2|T_sae);
|
||||
vcvttps2udqs(ym1, ptr[rax+128]);
|
||||
vcvttps2udqs(ym1, ptr_b[rax+128]);
|
||||
|
||||
vcvttps2udqs(zm1, zm2);
|
||||
vcvttps2udqs(zm1, zm2|T_sae);
|
||||
vcvttps2udqs(zm1, ptr[rax+128]);
|
||||
vcvttps2udqs(zm1, ptr_b[rax+128]);
|
||||
|
||||
//
|
||||
vcvttpd2dqs(xm1|k1|T_z, xm2);
|
||||
vcvttpd2dqs(xm1|k1|T_z, xword [rax+128]);
|
||||
vcvttpd2dqs(xm1|k1|T_z, xword_b[rax+128]);
|
||||
|
||||
vcvttpd2dqs(xm1|k1|T_z, ym2);
|
||||
vcvttpd2dqs(xm1|k1|T_z, ym2|T_sae);
|
||||
vcvttpd2dqs(xm1|k1|T_z, yword [rax+128]);
|
||||
vcvttpd2dqs(xm1|k1|T_z, yword_b[rax+128]);
|
||||
|
||||
vcvttpd2dqs(ym1|k1|T_z, zm2);
|
||||
vcvttpd2dqs(ym1|k1|T_z, zm2|T_sae);
|
||||
vcvttpd2dqs(ym1|k1|T_z, zword [rax+128]);
|
||||
vcvttpd2dqs(ym1|k1|T_z, zword_b[rax+128]);
|
||||
|
||||
//
|
||||
vcvttpd2udqs(xm1|k1|T_z, xm2);
|
||||
vcvttpd2udqs(xm1|k1|T_z, xword [rax+128]);
|
||||
vcvttpd2udqs(xm1|k1|T_z, xword_b[rax+128]);
|
||||
|
||||
vcvttpd2udqs(xm1|k1|T_z, ym2);
|
||||
vcvttpd2udqs(xm1|k1|T_z, ym2|T_sae);
|
||||
vcvttpd2udqs(xm1|k1|T_z, yword [rax+128]);
|
||||
vcvttpd2udqs(xm1|k1|T_z, yword_b[rax+128]);
|
||||
|
||||
vcvttpd2udqs(ym1|k1|T_z, zm2);
|
||||
vcvttpd2udqs(ym1|k1|T_z, zm2|T_sae);
|
||||
vcvttpd2udqs(ym1|k1|T_z, zword [rax+128]);
|
||||
vcvttpd2udqs(ym1|k1|T_z, zword_b[rax+128]);
|
||||
//
|
||||
vcvttps2qqs(xm1|k1|T_z, xm2);
|
||||
vcvttps2qqs(xm1|k1|T_z, ptr [rax+128]);
|
||||
vcvttps2qqs(xm1|k1|T_z, ptr_b[rax+128]);
|
||||
|
||||
vcvttps2qqs(ym1|k1|T_z, xm2);
|
||||
vcvttps2qqs(ym1|k1|T_z, xm2|T_sae);
|
||||
vcvttps2qqs(ym1|k1|T_z, ptr [rax+128]);
|
||||
vcvttps2qqs(ym1|k1|T_z, ptr_b[rax+128]);
|
||||
|
||||
vcvttps2qqs(zm1, ym2);
|
||||
vcvttps2qqs(zm1|k1|T_z, ym2);
|
||||
vcvttps2qqs(zm1|k1|T_z|T_sae, ym2);
|
||||
vcvttps2qqs(zm1|k1|T_z, ptr [rax+128]);
|
||||
vcvttps2qqs(zm1|k1|T_z, ptr_b[rax+128]);
|
||||
|
||||
//
|
||||
vcvttps2uqqs(xm1|k1|T_z, xm2);
|
||||
vcvttps2uqqs(xm1|k1|T_z, ptr [rax+128]);
|
||||
vcvttps2uqqs(xm1|k1|T_z, ptr_b[rax+128]);
|
||||
|
||||
vcvttps2uqqs(ym1|k1|T_z, xm2);
|
||||
vcvttps2uqqs(ym1|k1|T_z, xm2|T_sae);
|
||||
vcvttps2uqqs(ym1|k1|T_z, ptr [rax+128]);
|
||||
vcvttps2uqqs(ym1|k1|T_z, ptr_b[rax+128]);
|
||||
|
||||
vcvttps2uqqs(zm1, ym2);
|
||||
vcvttps2uqqs(zm1|k1|T_z, ym2);
|
||||
vcvttps2uqqs(zm1|k1|T_z|T_sae, ym2);
|
||||
vcvttps2uqqs(zm1|k1|T_z, ptr [rax+128]);
|
||||
vcvttps2uqqs(zm1|k1|T_z, ptr_b[rax+128]);
|
||||
|
||||
//
|
||||
vcvttsd2sis(eax, xm1);
|
||||
vcvttsd2sis(eax, xm1|T_sae);
|
||||
vcvttsd2sis(eax, ptr[rax+128]);
|
||||
|
||||
vcvttsd2sis(r30, xm1);
|
||||
vcvttsd2sis(r30, xm1|T_sae);
|
||||
vcvttsd2sis(r30, ptr[rax+128]);
|
||||
//
|
||||
vcvttsd2usis(eax, xm1);
|
||||
vcvttsd2usis(eax, xm1|T_sae);
|
||||
vcvttsd2usis(eax, ptr[rax+128]);
|
||||
|
||||
vcvttsd2usis(r30, xm1);
|
||||
vcvttsd2usis(r30, xm1|T_sae);
|
||||
vcvttsd2usis(r30, ptr[rax+128]);
|
||||
//
|
||||
vcvttss2sis(eax, xm1);
|
||||
vcvttss2sis(eax, xm1|T_sae);
|
||||
vcvttss2sis(eax, ptr[rax+128]);
|
||||
|
||||
vcvttss2sis(r30, xm1);
|
||||
vcvttss2sis(r30, xm1|T_sae);
|
||||
vcvttss2sis(r30, ptr[rax+128]);
|
||||
//
|
||||
vcvttss2usis(eax, xm1);
|
||||
vcvttss2usis(eax, xm1|T_sae);
|
||||
vcvttss2usis(eax, ptr[rax+128]);
|
||||
|
||||
vcvttss2usis(r30, xm1);
|
||||
vcvttss2usis(r30, xm1|T_sae);
|
||||
vcvttss2usis(r30, ptr[rax+128]);
|
|
@ -234,10 +234,10 @@ CYBOZU_TEST_AUTO(vmpsadbw)
|
|||
struct Code : Xbyak::CodeGenerator {
|
||||
Code()
|
||||
{
|
||||
setDefaultEncoding();
|
||||
setDefaultEncodingAVX10();
|
||||
vmpsadbw(xm1, xm3, xm15, 3); // vex(avx)
|
||||
vmpsadbw(ym1, ym3, ptr[rax+128], 3); // vex(avx2)
|
||||
setDefaultEncoding(VexEncoding, EvexEncoding);
|
||||
setDefaultEncodingAVX10(AVX10v2Encoding);
|
||||
vmpsadbw(ym1, ym3, ym15, 3); // evex(avx10.2)
|
||||
vmpsadbw(ym1, ym3, ptr[rax+128], 3); // evex(avx10.2)
|
||||
}
|
||||
|
|
|
@ -7,7 +7,7 @@ struct Code : Xbyak::CodeGenerator {
|
|||
Code()
|
||||
: Xbyak::CodeGenerator(4096*8)
|
||||
{
|
||||
setDefaultEncoding(VexEncoding, EvexEncoding);
|
||||
setDefaultEncodingAVX10(AVX10v2Encoding);
|
||||
#include "tmp.cpp"
|
||||
}
|
||||
};
|
||||
|
|
|
@ -366,7 +366,7 @@ def parseNmemonicTest():
|
|||
('vpshldw(xmm9|k3|T_z, xmm2, ptr [rax + 0x40], 5);', Nmemonic('vpshldw', [xmm9, xmm2, Memory(0, rax, None, 0, 0x40), 5], [k3, T_z])),
|
||||
('vpshrdd(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40], 5);', Nmemonic('vpshrdd', [xmm5, xmm2, Memory(0, rax, None, 0, 0x40, True), 5], [k3, T_z])),
|
||||
('vpshrdd xmm5{k3}{z}, xmm2, dword ptr [rax+0x40]{1to4}, 0x5', Nmemonic('vpshrdd', [xmm5, xmm2, Memory(0, rax, None, 0, 0x40, True), 5], [k3, T_z])),
|
||||
('vcmpph(k1, xm15, ptr[rax+64], 1);', Nmemonic('vcmpph', [k1, xm15, Memory(0, rax, None, 0, 64), 1])),
|
||||
('vcmpph(k1, xmm15, ptr[rax+64], 1);', Nmemonic('vcmpph', [k1, xmm15, Memory(0, rax, None, 0, 64), 1])),
|
||||
]
|
||||
for (s, expected) in tbl:
|
||||
e = parseNmemonic(s)
|
||||
|
|
|
@ -155,7 +155,7 @@ namespace Xbyak {
|
|||
|
||||
enum {
|
||||
DEFAULT_MAX_CODE_SIZE = 4096,
|
||||
VERSION = 0x7100 /* 0xABCD = A.BC(.D) */
|
||||
VERSION = 0x7200 /* 0xABCD = A.BC(.D) */
|
||||
};
|
||||
|
||||
#ifndef MIE_INTEGER_TYPE_DEFINED
|
||||
|
@ -232,6 +232,7 @@ enum {
|
|||
ERR_CANT_USE_REX2,
|
||||
ERR_INVALID_DFV,
|
||||
ERR_INVALID_REG_IDX,
|
||||
ERR_BAD_ENCODING_MODE,
|
||||
ERR_INTERNAL // Put it at last.
|
||||
};
|
||||
|
||||
|
@ -290,6 +291,7 @@ inline const char *ConvertErrorToString(int err)
|
|||
"can't use rex2",
|
||||
"invalid dfv",
|
||||
"invalid reg index",
|
||||
"bad encoding mode",
|
||||
"internal error"
|
||||
};
|
||||
assert(ERR_INTERNAL + 1 == sizeof(errTbl) / sizeof(*errTbl));
|
||||
|
@ -1673,7 +1675,9 @@ inline const uint8_t* Label::getAddress() const
|
|||
typedef enum {
|
||||
DefaultEncoding,
|
||||
VexEncoding,
|
||||
EvexEncoding
|
||||
EvexEncoding,
|
||||
PreAVX10v2Encoding,
|
||||
AVX10v2Encoding
|
||||
} PreferredEncoding;
|
||||
|
||||
class CodeGenerator : public CodeArray {
|
||||
|
@ -2661,21 +2665,24 @@ private:
|
|||
if (addr.getRegExp().getIndex().getKind() != kind) XBYAK_THROW(ERR_BAD_VSIB_ADDRESSING)
|
||||
opVex(x, 0, addr, type, code);
|
||||
}
|
||||
void opEncoding(const Xmm& x1, const Xmm& x2, const Operand& op, uint64_t type, int code, PreferredEncoding encoding, int imm = NONE, uint64_t typeVex = 0, uint64_t typeEvex = 0, int sel = 0)
|
||||
void opEncoding(const Xmm& x1, const Xmm& x2, const Operand& op, uint64_t type, int code, PreferredEncoding enc, int imm = NONE, uint64_t typeVex = 0, uint64_t typeEvex = 0, int sel = 0)
|
||||
{
|
||||
opAVX_X_X_XM(x1, x2, op, type | orEvexIf(encoding, typeVex, typeEvex, sel), code, imm);
|
||||
opAVX_X_X_XM(x1, x2, op, type | orEvexIf(enc, typeVex, typeEvex, sel), code, imm);
|
||||
}
|
||||
int orEvexIf(PreferredEncoding encoding, uint64_t typeVex, uint64_t typeEvex, int sel) {
|
||||
if (encoding == DefaultEncoding) {
|
||||
encoding = defaultEncoding_[sel];
|
||||
// Resolve the encoding to use for one instruction.
// enc : encoding requested by the caller; DefaultEncoding means "use defaultEncoding_[sel]"
// sel : 0 accepts VexEncoding/EvexEncoding, 1 accepts PreAVX10v2Encoding/AVX10v2Encoding
// returns the resolved encoding; any other enc/sel combination raises ERR_BAD_ENCODING_MODE
// NOTE(review): this span comes from a diff view; the lines below that use `encoding`,
// `typeVex` and `typeEvex` look like removed lines of the old orEvexIf body and are kept as-is.
PreferredEncoding getEncoding(PreferredEncoding enc, int sel) const
|
||||
{
|
||||
if (enc == DefaultEncoding) {
|
||||
enc = defaultEncoding_[sel];
|
||||
}
|
||||
if (encoding == EvexEncoding) {
|
||||
if ((sel == 0 && enc != VexEncoding && enc != EvexEncoding) || (sel == 1 && enc != PreAVX10v2Encoding && enc != AVX10v2Encoding)) XBYAK_THROW_RET(ERR_BAD_ENCODING_MODE, VexEncoding)
|
||||
#ifdef XBYAK_DISABLE_AVX512
|
||||
XBYAK_THROW(ERR_EVEX_IS_INVALID)
|
||||
// getEncoding returns a value, so report the error with XBYAK_THROW_RET like the mode check above
|
||||
if (enc == EvexEncoding || enc == AVX10v2Encoding) XBYAK_THROW_RET(ERR_EVEX_IS_INVALID, VexEncoding)
|
||||
#endif
|
||||
return T_MUST_EVEX | typeEvex;
|
||||
}
|
||||
return typeVex;
|
||||
return enc;
|
||||
}
|
||||
// Pick the type flags for an instruction that has two alternative forms.
// enc is first resolved with getEncoding(enc, sel).
// returns typeVex when (sel == 0 and the result is VexEncoding) or (sel == 1 and the result is not AVX10v2Encoding),
// otherwise returns T_MUST_EVEX | typeEvex
uint64_t orEvexIf(PreferredEncoding enc, uint64_t typeVex, uint64_t typeEvex, int sel) {
|
||||
enc = getEncoding(enc, sel);
|
||||
return ((sel == 0 && enc == VexEncoding) || (sel == 1 && enc != AVX10v2Encoding)) ? typeVex : (T_MUST_EVEX | typeEvex);
|
||||
}
|
||||
void opInOut(const Reg& a, const Reg& d, uint8_t code)
|
||||
{
|
||||
|
@ -3132,8 +3139,8 @@ public:
|
|||
#endif
|
||||
, isDefaultJmpNEAR_(false)
|
||||
{
|
||||
// select avx512-vnni, vmpsadbw(avx)
|
||||
setDefaultEncoding();
|
||||
setDefaultEncodingAVX10();
|
||||
labelMgr_.set(this);
|
||||
}
|
||||
void reset()
|
||||
|
@ -3170,16 +3177,20 @@ public:
|
|||
#undef jnl
|
||||
#endif
|
||||
|
||||
// set default encoding
|
||||
// vnniEnc : control AVX512_VNNI (evex:default) or AVX-VNNI (vex)
|
||||
// avx10Enc : control mpsadbw, AVX-VNNI-INT8 (vex:default) or AVX10.2 (evex)
|
||||
void setDefaultEncoding(PreferredEncoding vnniEnc = EvexEncoding, PreferredEncoding avx10Enc = VexEncoding)
|
||||
{ defaultEncoding_[0] = vnniEnc; defaultEncoding_[1] = avx10Enc; }
|
||||
|
||||
void sha1msg12(const Xmm& x, const Operand& op)
|
||||
// set default encoding of VNNI
|
||||
// EvexEncoding : AVX512_VNNI, VexEncoding : AVX-VNNI
|
||||
void setDefaultEncoding(PreferredEncoding enc = EvexEncoding)
|
||||
{
|
||||
opROO(Reg(), op, x, T_MUST_EVEX, 0xD9);
|
||||
if (enc != VexEncoding && enc != EvexEncoding) XBYAK_THROW(ERR_BAD_ENCODING_MODE)
|
||||
defaultEncoding_[0] = enc;
|
||||
}
|
||||
// set default encoding of AVX10 (stored in defaultEncoding_[1])
// PreAVX10v2Encoding (default) : AVX-VNNI-INT8/AVX512-FP16, AVX10v2Encoding : AVX10.2
// any other value raises ERR_BAD_ENCODING_MODE
|
||||
void setDefaultEncodingAVX10(PreferredEncoding enc = PreAVX10v2Encoding)
|
||||
{
|
||||
if (enc != PreAVX10v2Encoding && enc != AVX10v2Encoding) XBYAK_THROW(ERR_BAD_ENCODING_MODE)
|
||||
defaultEncoding_[1] = enc;
|
||||
}
|
||||
|
||||
void bswap(const Reg32e& r)
|
||||
{
|
||||
int idx = r.getIdx();
|
||||
|
@ -3192,6 +3203,48 @@ public:
|
|||
}
|
||||
db(0xC8 + (idx & 7));
|
||||
}
|
||||
// AVX10 zero-extending for vmovd, vmovw
// op1, op2 : operands in the order given by the caller
// typeTbl, codeTbl : 4-entry tables indexed by the selector computed below
//   [0], [1] : used when the resolved encoding is not AVX10v2Encoding
//   [2], [3] : used when the resolved encoding is AVX10v2Encoding
//   the odd entry is chosen when rev is true
// enc : requested encoding, resolved with getEncoding(enc, 1)
// bit : passed to isREG() for the non-XMM register operand of the non-AVX10.2 forms
// raises ERR_BAD_COMBINATION when the operands do not match any form
|
||||
void opAVX10ZeroExt(const Operand& op1, const Operand& op2, const uint64_t typeTbl[4], const int codeTbl[4], PreferredEncoding enc, int bit)
|
||||
{
|
||||
const Operand *p1 = &op1;
|
||||
const Operand *p2 = &op2;
|
||||
bool rev = false;
|
||||
// a memory operand in the first position is moved to p2
|
||||
if (p1->isMEM()) {
|
||||
std::swap(p1, p2);
|
||||
rev = true;
|
||||
}
|
||||
// still memory after the swap means both operands are memory
|
||||
if (p1->isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION)
|
||||
// put the XMM register into p2; p1 becomes the other operand.
|
||||
// For the accepted combinations rev ends up true exactly when op1 was the XMM operand.
|
||||
if (p1->isXMM()) {
|
||||
std::swap(p1, p2);
|
||||
rev = !rev;
|
||||
}
|
||||
int sel = -1;
|
||||
if (getEncoding(enc, 1) == AVX10v2Encoding) {
|
||||
// AVX10.2 form : the other operand must be XMM or memory
|
||||
if ((p1->isXMM() || p1->isMEM()) && p2->isXMM()) sel = 2 + int(rev);
|
||||
} else {
|
||||
// other forms : the other operand must satisfy isREG(bit) or be memory
|
||||
if ((p1->isREG(bit) || p1->isMEM()) && p2->isXMM()) sel = int(rev);
|
||||
}
|
||||
if (sel == -1) XBYAK_THROW(ERR_BAD_COMBINATION)
|
||||
// p2 is known to be XMM here because every path that sets sel checks p2->isXMM()
|
||||
opAVX_X_X_XM(*static_cast<const Xmm*>(p2), xm0, *p1, typeTbl[sel], codeTbl[sel]);
|
||||
}
|
||||
// vmovd : operand checking and form selection are delegated to opAVX10ZeroExt (bit = 32).
// enc : DefaultEncoding uses the value set by setDefaultEncodingAVX10;
//       AVX10v2Encoding selects the avx10.2 rows of the tables below.
// typeTbl[i] and codeTbl[i] belong together; see opAVX10ZeroExt for how the index is chosen.
void vmovd(const Operand& op1, const Operand& op2, PreferredEncoding enc = DefaultEncoding)
|
||||
{
|
||||
const uint64_t typeTbl[] = {
|
||||
T_EVEX|T_66|T_0F|T_W0|T_N4, T_EVEX|T_66|T_0F|T_W0|T_N4, // legacy, avx, avx512
|
||||
T_MUST_EVEX|T_66|T_0F|T_EW0|T_N4, T_MUST_EVEX|T_F3|T_0F|T_EW0|T_N4, // avx10.2
|
||||
};
|
||||
// opcode bytes, same order as typeTbl
|
||||
const int codeTbl[] = { 0x7E, 0x6E, 0xD6, 0x7E };
|
||||
opAVX10ZeroExt(op1, op2, typeTbl, codeTbl, enc, 32);
|
||||
}
|
||||
// vmovw : operand checking and form selection are delegated to opAVX10ZeroExt (bit = 16|32|64).
// enc : DefaultEncoding uses the value set by setDefaultEncodingAVX10;
//       AVX10v2Encoding selects the avx10.2 rows of the tables below.
// typeTbl[i] and codeTbl[i] belong together; see opAVX10ZeroExt for how the index is chosen.
void vmovw(const Operand& op1, const Operand& op2, PreferredEncoding enc = DefaultEncoding)
|
||||
{
|
||||
const uint64_t typeTbl[] = {
|
||||
T_MUST_EVEX|T_66|T_MAP5|T_N2, T_MUST_EVEX|T_66|T_MAP5|T_N2, // avx512-fp16
|
||||
T_MUST_EVEX|T_F3|T_MAP5|T_EW0|T_N2, T_MUST_EVEX|T_F3|T_MAP5|T_EW0|T_N2, // avx10.2
|
||||
};
|
||||
// opcode bytes, same order as typeTbl
|
||||
const int codeTbl[] = { 0x7E, 0x6E, 0x7E, 0x6E };
|
||||
opAVX10ZeroExt(op1, op2, typeTbl, codeTbl, enc, 16|32|64);
|
||||
}
|
||||
/*
|
||||
use single byte nop if useMultiByteNop = false
|
||||
*/
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
const char *getVersionString() const { return "7.10"; }
|
||||
const char *getVersionString() const { return "7.20"; }
|
||||
void aadd(const Address& addr, const Reg32e ®) { opMR(addr, reg, T_0F38, 0x0FC, T_APX); }
|
||||
void aand(const Address& addr, const Reg32e ®) { opMR(addr, reg, T_0F38|T_66, 0x0FC, T_APX|T_66); }
|
||||
void adc(const Operand& op, uint32_t imm) { opOI(op, imm, 0x10, 2); }
|
||||
|
@ -988,6 +988,7 @@ void sets(const Operand& op) { opSetCC(op, 8); }//-V524
|
|||
void setz(const Operand& op) { opSetCC(op, 4); }//-V524
|
||||
void sfence() { db(0x0F); db(0xAE); db(0xF8); }
|
||||
void sha1msg1(const Xmm& x, const Operand& op) { opSSE_APX(x, op, T_0F38, 0xC9, T_MUST_EVEX, 0xD9); }
|
||||
void sha1msg12(const Xmm& x, const Operand& op) { opROO(Reg(), op, x, T_MUST_EVEX, 0xD9); }
|
||||
void sha1msg2(const Xmm& x, const Operand& op) { opSSE_APX(x, op, T_0F38, 0xCA, T_MUST_EVEX, 0xDA); }
|
||||
void sha1nexte(const Xmm& x, const Operand& op) { opSSE_APX(x, op, T_0F38, 0xC8, T_MUST_EVEX, 0xD8); }
|
||||
void sha1rnds4(const Xmm& x, const Operand& op, uint8_t imm) { opSSE_APX(x, op, T_0F3A, 0xCC, T_MUST_EVEX, 0xD4, imm); }
|
||||
|
@ -1331,8 +1332,6 @@ void vmovapd(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, T_
|
|||
void vmovapd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_66|T_0F|T_EW1|T_YMM|T_EVEX, 0x28); }
|
||||
void vmovaps(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, T_0F|T_EW0|T_YMM|T_EVEX|T_M_K, 0x29); }
|
||||
void vmovaps(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_0F|T_EW0|T_YMM|T_EVEX, 0x28); }
|
||||
void vmovd(const Operand& op, const Xmm& x) { if (!op.isREG(32) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_X_XM(x, xm0, op, T_0F | T_66 | T_W0 | T_EVEX | T_N4, 0x7E); }
|
||||
void vmovd(const Xmm& x, const Operand& op) { if (!op.isREG(32) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_X_XM(x, xm0, op, T_0F | T_66 | T_W0 | T_EVEX | T_N4, 0x6E); }
|
||||
void vmovddup(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_DUP|T_F2|T_0F|T_EW1|T_YMM|T_EVEX|T_ER_X|T_ER_Y|T_ER_Z, 0x12); }
|
||||
void vmovdqa(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, T_66|T_0F|T_YMM, 0x7F); }
|
||||
void vmovdqa(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_66|T_0F|T_YMM, 0x6F); }
|
||||
|
@ -2202,6 +2201,8 @@ void vcvtne2ph2bf8s(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X
|
|||
void vcvtne2ph2hf8(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N1|T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x18); }
|
||||
void vcvtne2ph2hf8s(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N1|T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x1B); }
|
||||
void vcvtne2ps2bf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x72); }
|
||||
void vcvtnebf162ibs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x69); }
|
||||
void vcvtnebf162iubs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x6B); }
|
||||
void vcvtneph2bf8(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3|T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x74); }
|
||||
void vcvtneph2bf8s(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x74); }
|
||||
void vcvtneph2hf8(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x18); }
|
||||
|
@ -2212,6 +2213,8 @@ void vcvtpd2qq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0
|
|||
void vcvtpd2udq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F|T_EW1|T_YMM|T_ER_Z|T_MUST_EVEX|T_B64, 0x79); }
|
||||
void vcvtpd2uqq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F|T_EW1|T_YMM|T_ER_Z|T_MUST_EVEX|T_B64, 0x79); }
|
||||
void vcvtph2dq(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_66|T_MAP5|T_EW0|T_YMM|T_ER_Y|T_MUST_EVEX|T_B16, 0x5B); }
|
||||
void vcvtph2ibs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B16, 0x69); }
|
||||
void vcvtph2iubs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B16, 0x6B); }
|
||||
void vcvtph2pd(const Xmm& x, const Operand& op) { if (!op.isXMM() && !op.isMEM()) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(x, 0, op, T_N4|T_N_VL|T_MAP5|T_EW0|T_YMM|T_SAE_X|T_MUST_EVEX|T_B16, 0x5A); }
|
||||
void vcvtph2psx(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_66|T_MAP6|T_EW0|T_YMM|T_SAE_Y|T_MUST_EVEX|T_B16, 0x13); }
|
||||
void vcvtph2qq(const Xmm& x, const Operand& op) { if (!op.isXMM() && !op.isMEM()) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(x, 0, op, T_N4|T_N_VL|T_66|T_MAP5|T_EW0|T_YMM|T_ER_X|T_MUST_EVEX|T_B16, 0x7B); }
|
||||
|
@ -2219,6 +2222,8 @@ void vcvtph2udq(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0,
|
|||
void vcvtph2uqq(const Xmm& x, const Operand& op) { if (!op.isXMM() && !op.isMEM()) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(x, 0, op, T_N4|T_N_VL|T_66|T_MAP5|T_EW0|T_YMM|T_ER_X|T_MUST_EVEX|T_B16, 0x79); }
|
||||
void vcvtph2uw(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x7D); }
|
||||
void vcvtph2w(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x7D); }
|
||||
void vcvtps2ibs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B32, 0x69); }
|
||||
void vcvtps2iubs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B32, 0x6B); }
|
||||
void vcvtps2phx(const Xmm& x, const Operand& op) { checkCvt4(x, op); opCvt(x, op, T_N16|T_N_VL|T_66|T_MAP5|T_EW0|T_ER_Z|T_MUST_EVEX|T_B32, 0x1D); }
|
||||
void vcvtps2qq(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_66|T_0F|T_EW0|T_YMM|T_ER_Y|T_MUST_EVEX|T_B32, 0x7B); }
|
||||
void vcvtps2udq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_0F|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B32, 0x79); }
|
||||
|
@ -2235,22 +2240,40 @@ void vcvtsh2usi(const Reg32e& r, const Operand& op) { uint64_t type = (T_N2|T_F3
|
|||
void vcvtsi2sh(const Xmm& x1, const Xmm& x2, const Operand& op) { if (!(x1.isXMM() && x2.isXMM() && op.isBit(32|64))) XBYAK_THROW(ERR_BAD_COMBINATION) uint64_t type = (T_F3|T_MAP5|T_ER_R|T_MUST_EVEX|T_M_K) | (op.isBit(32) ? (T_EW0 | T_N4) : (T_EW1 | T_N8)); opVex(x1, &x2, op, type, 0x2A); }
|
||||
void vcvtss2sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_MAP5|T_EW0|T_ER_X|T_MUST_EVEX, 0x1D); }
|
||||
void vcvtss2usi(const Reg32e& r, const Operand& op) { uint64_t type = (T_N4|T_F3|T_0F|T_ER_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x79); }
|
||||
void vcvttnebf162ibs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x68); }
|
||||
void vcvttnebf162iubs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x6A); }
|
||||
void vcvttpd2dqs(const Xmm& x, const Operand& op) { opCvt2(x, op, T_MAP5|T_EW1|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B64, 0x6D); }
|
||||
void vcvttpd2qq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F|T_EW1|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B64, 0x7A); }
|
||||
void vcvttpd2qqs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW1|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B64, 0x6D); }
|
||||
void vcvttpd2udq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F|T_EW1|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B64, 0x78); }
|
||||
void vcvttpd2udqs(const Xmm& x, const Operand& op) { opCvt2(x, op, T_MAP5|T_EW1|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B64, 0x6C); }
|
||||
void vcvttpd2uqq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F|T_EW1|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B64, 0x78); }
|
||||
void vcvttpd2uqqs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW1|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B64, 0x6C); }
|
||||
void vcvttph2dq(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_F3|T_MAP5|T_EW0|T_YMM|T_SAE_Y|T_MUST_EVEX|T_B16, 0x5B); }
|
||||
void vcvttph2ibs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B16, 0x68); }
|
||||
void vcvttph2iubs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B16, 0x6A); }
|
||||
void vcvttph2qq(const Xmm& x, const Operand& op) { if (!op.isXMM() && !op.isMEM()) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(x, 0, op, T_N4|T_N_VL|T_66|T_MAP5|T_EW0|T_YMM|T_SAE_X|T_MUST_EVEX|T_B16, 0x7A); }
|
||||
void vcvttph2udq(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_MAP5|T_EW0|T_YMM|T_SAE_Y|T_MUST_EVEX|T_B16, 0x78); }
|
||||
void vcvttph2uqq(const Xmm& x, const Operand& op) { if (!op.isXMM() && !op.isMEM()) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(x, 0, op, T_N4|T_N_VL|T_66|T_MAP5|T_EW0|T_YMM|T_SAE_X|T_MUST_EVEX|T_B16, 0x78); }
|
||||
void vcvttph2uw(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B16, 0x7C); }
|
||||
void vcvttph2w(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B16, 0x7C); }
|
||||
void vcvttps2dqs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B32, 0x6D); }
|
||||
void vcvttps2ibs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B32, 0x68); }
|
||||
void vcvttps2iubs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B32, 0x6A); }
|
||||
void vcvttps2qq(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_66|T_0F|T_EW0|T_YMM|T_SAE_Y|T_MUST_EVEX|T_B32, 0x7A); }
|
||||
void vcvttps2qqs(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_66|T_MAP5|T_EW0|T_YMM|T_SAE_X|T_SAE_Y|T_MUST_EVEX|T_B32, 0x6D); }
|
||||
void vcvttps2udq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_0F|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x78); }
|
||||
void vcvttps2udqs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B32, 0x6C); }
|
||||
void vcvttps2uqq(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_66|T_0F|T_EW0|T_YMM|T_SAE_Y|T_MUST_EVEX|T_B32, 0x78); }
|
||||
void vcvttps2uqqs(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_66|T_MAP5|T_EW0|T_YMM|T_SAE_X|T_SAE_Y|T_MUST_EVEX|T_B32, 0x6C); }
|
||||
void vcvttsd2sis(const Reg32e& r, const Operand& op) { uint64_t type = (T_N8|T_F2|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x6D); }
|
||||
void vcvttsd2usi(const Reg32e& r, const Operand& op) { uint64_t type = (T_N8|T_F2|T_0F|T_SAE_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x78); }
|
||||
void vcvttsd2usis(const Reg32e& r, const Operand& op) { uint64_t type = (T_N8|T_F2|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x6C); }
|
||||
void vcvttsh2si(const Reg32e& r, const Operand& op) { uint64_t type = (T_N2|T_F3|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x2C); }
|
||||
void vcvttsh2usi(const Reg32e& r, const Operand& op) { uint64_t type = (T_N2|T_F3|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x78); }
|
||||
void vcvttss2sis(const Reg32e& r, const Operand& op) { uint64_t type = (T_N4|T_F3|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x6D); }
|
||||
void vcvttss2usi(const Reg32e& r, const Operand& op) { uint64_t type = (T_N4|T_F3|T_0F|T_SAE_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x78); }
|
||||
void vcvttss2usis(const Reg32e& r, const Operand& op) { uint64_t type = (T_N4|T_F3|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x6C); }
|
||||
void vcvtudq2pd(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_F3|T_0F|T_EW0|T_YMM|T_MUST_EVEX|T_B32, 0x7A); }
|
||||
void vcvtudq2ph(const Xmm& x, const Operand& op) { checkCvt4(x, op); opCvt(x, op, T_N16|T_N_VL|T_F2|T_MAP5|T_EW0|T_ER_Z|T_MUST_EVEX|T_B32, 0x7A); }
|
||||
void vcvtudq2ps(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2|T_0F|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B32, 0x7A); }
|
||||
|
@ -2374,6 +2397,13 @@ void vinserti64x4(const Zmm& r1, const Zmm& r2, const Operand& op, uint8_t imm)
|
|||
void vmaxpbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x5F); }
|
||||
void vmaxph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_SAE_Z | T_B16, 0x5F); }
|
||||
void vmaxsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_SAE_X | T_N2, 0x5F); }
|
||||
void vminmaxnepbf16(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_F2|T_0F3A|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x52, imm); }
|
||||
void vminmaxpd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F3A|T_EW1|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B64, 0x52, imm); }
|
||||
void vminmaxph(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_0F3A|T_EW0|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B16, 0x52, imm); }
|
||||
void vminmaxps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F3A|T_EW0|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B32, 0x52, imm); }
|
||||
void vminmaxsd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F3A|T_EW1|T_SAE_X|T_MUST_EVEX, 0x53, imm); }
|
||||
void vminmaxsh(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_N2|T_0F3A|T_EW0|T_SAE_X|T_MUST_EVEX, 0x53, imm); }
|
||||
void vminmaxss(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F3A|T_EW0|T_SAE_X|T_MUST_EVEX, 0x53, imm); }
|
||||
void vminpbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x5D); }
|
||||
void vminph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_SAE_Z | T_B16, 0x5D); }
|
||||
void vminsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_SAE_X | T_N2, 0x5D); }
|
||||
|
@ -2392,9 +2422,6 @@ void vmovdqu8(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2|T_0F
|
|||
void vmovsh(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, T_N2|T_F3|T_MAP5|T_EW0|T_MUST_EVEX|T_M_K, 0x11); }
|
||||
void vmovsh(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, xm0, addr, T_N2|T_F3|T_MAP5|T_EW0|T_MUST_EVEX, 0x10); }
|
||||
void vmovsh(const Xmm& x1, const Xmm& x2, const Xmm& x3) { opAVX_X_X_XM(x1, x2, x3, T_N2|T_F3|T_MAP5|T_EW0|T_MUST_EVEX, 0x10); }
|
||||
void vmovw(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, T_N2|T_66|T_MAP5|T_MUST_EVEX, 0x7E); }
|
||||
void vmovw(const Reg32e& r, const Xmm& x) { opAVX_X_X_XM(x, xm0, r, T_N2|T_66|T_MAP5|T_MUST_EVEX, 0x7E); }
|
||||
void vmovw(const Xmm& x, const Operand& op) { if (!op.isREG(32|64) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_X_XM(x, xm0, op, T_N2|T_66|T_MAP5|T_MUST_EVEX, 0x6E); }
|
||||
void vmpsadbw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F3A|T_YMM, 0x42, encoding, imm, T_66|T_W0|T_YMM, T_F3|T_0F3A|T_EW0|T_B32, 1); }
|
||||
void vmulnepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x59); }
|
||||
void vmulph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_ER_Z | T_B16, 0x59); }
|
||||
|
|
Loading…
Reference in a new issue