mirror of
https://github.com/herumi/xbyak
synced 2024-11-21 16:09:11 -07:00
add vpermi2b
This commit is contained in:
parent
fbde291731
commit
9af35326a6
5 changed files with 18 additions and 3 deletions
|
@ -282,6 +282,7 @@ void putX_X_XM_IMM()
|
|||
{ 0x7F, "vpermt2ps", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_B32, false },
|
||||
{ 0x7F, "vpermt2pd", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_B64, false },
|
||||
|
||||
{ 0x75, "vpermi2b", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0, false },
|
||||
{ 0x75, "vpermi2w", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1, false },
|
||||
{ 0x76, "vpermi2d", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_B32, false },
|
||||
{ 0x76, "vpermi2q", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_B64, false },
|
||||
|
|
|
@ -143,18 +143,23 @@ vaddpd xmm1, xmm2, [rax+256]{1to2} --> vaddpd(xmm1, xmm2, ptr_b [rax+256]);
|
|||
vaddpd ymm1, ymm2, [rax+256]{1to4} --> vaddpd(ymm1, ymm2, ptr_b [rax+256]);
|
||||
vaddpd zmm1, zmm2, [rax+256]{1to8} --> vaddpd(zmm1, zmm2, ptr_b [rax+256]);
|
||||
vaddps zmm1, zmm2, [rax+rcx*8+8]{1to16} --> vaddps(zmm1, zmm2, ptr_b [rax+rcx*8+8]);
|
||||
vmovsd [rax]{k1}, xmm4 --> vmovsd (ptr [rax] | k1, xmm4);
|
||||
vmovsd [rax]{k1}, xmm4 --> vmovsd(ptr [rax] | k1, xmm4);
|
||||
|
||||
vcvtpd2dq xmm16, oword [eax+33] --> vcvtpd2dq(xmm16, ptr [eax+33]); // default oword(m128)
|
||||
vcvtpd2dq xmm21, [eax+32]{1to2} --> vcvtpd2dq(xmm21, ptr_b [eax+32]);
|
||||
vcvtpd2dq xmm0, yword [eax+33] --> vcvtpd2dq(xmm0, yword [eax+33]); // use yword for m256
|
||||
vcvtpd2dq xmm19, [eax+32]{1to4} --> vcvtpd2dq(xmm19, yword_b [eax+32]); // use yword_b to broadcast
|
||||
|
||||
vfpclassps k5{k3}, zword [rax+64], 5 --> vfpclassps(k5|k3, zword [rax+64], 5); // specify m512
|
||||
vfpclasspd k5{k3}, [rax+64]{1to2}, 5 --> vfpclasspd(k5|k3, xword_b [rax+64], 5); // broadcast 64-bit to 128-bit
|
||||
vfpclassps k5{k3}, [rax+64]{1to4}, 5 --> vfpclassps(k5|k3, xword_b [rax+64], 5); // broadcast 32-bit to 128-bit
|
||||
```
|
||||
Remark
|
||||
* k1, ..., k7 are new opmask registers.
|
||||
* use `| T_z`, `| T_sae`, `| T_rn_sae`, `| T_rd_sae`, `| T_ru_sae`, `| T_rz_sae` instead of `,{z}`, `,{sae}`, `,{rn-sae}`, `,{rd-sae}`, `,{ru-sae}`, `,{rz-sae}` respectively.
|
||||
* `k4 | k3` is different from `k3 | k4`.
|
||||
* use `ptr_b`, `yword_b' for broadcast `{1toX}`. X is automatically determined.
|
||||
* use `ptr_b` for broadcast `{1toX}`. X is automatically determined.
|
||||
* specify xword/yword/zword(_b) for m128/m256/m512.
|
||||
|
||||
### Label
|
||||
|
||||
|
|
|
@ -131,25 +131,31 @@ vaddpd zmm2, zmm5, zmm30 --> vaddpd(zmm2, zmm5, zmm30);
|
|||
vaddpd zmm2{k5}, zmm4, zmm2 --> vaddpd(zmm2 | k5, zmm4, zmm2);
|
||||
vaddpd zmm2{k5}{z}, zmm4, zmm2 --> vaddpd(zmm2 | k5 | T_z, zmm4, zmm2);
|
||||
vaddpd zmm2{k5}{z}, zmm4, zmm2,{rd-sae} --> vaddpd(zmm2 | k5 | T_z, zmm4, zmm2 | T_rd_sae);
|
||||
vaddpd(zmm2 | k5 | T_z | T_rd_sae, zmm4, zmm2); // the position of `|` is arbitrary.
|
||||
vcmppd k4{k3}, zmm1, zmm2, {sae}, 5 --> vcmppd(k4 | k3, zmm1, zmm2 | T_sae, 5);
|
||||
|
||||
vaddpd xmm1, xmm2, [rax+256]{1to2} --> vaddpd(xmm1, xmm2, ptr_b [rax+256]);
|
||||
vaddpd ymm1, ymm2, [rax+256]{1to4} --> vaddpd(ymm1, ymm2, ptr_b [rax+256]);
|
||||
vaddpd zmm1, zmm2, [rax+256]{1to8} --> vaddpd(zmm1, zmm2, ptr_b [rax+256]);
|
||||
vaddps zmm1, zmm2, [rax+rcx*8+8]{1to16} --> vaddps(zmm1, zmm2, ptr_b [rax+rcx*8+8]);
|
||||
vmovsd [rax]{k1}, xmm4 --> vmovsd (ptr [rax] | k1, xmm4);
|
||||
vmovsd [rax]{k1}, xmm4 --> vmovsd(ptr [rax] | k1, xmm4);
|
||||
|
||||
vcvtpd2dq xmm16, oword [eax+33] --> vcvtpd2dq(xmm16, ptr [eax+33]); // default oword(m128)
|
||||
vcvtpd2dq xmm21, [eax+32]{1to2} --> vcvtpd2dq(xmm21, ptr_b [eax+32]);
|
||||
vcvtpd2dq xmm0, yword [eax+33] --> vcvtpd2dq(xmm0, yword [eax+33]); // use yword for m256
|
||||
vcvtpd2dq xmm19, [eax+32]{1to4} --> vcvtpd2dq(xmm19, yword_b [eax+32]); // use yword_b to broadcast
|
||||
|
||||
vfpclassps k5{k3}, zword [rax+64], 5 --> vfpclassps(k5|k3, zword [rax+64], 5); // specify m512
|
||||
vfpclasspd k5{k3}, [rax+64]{1to2}, 5 --> vfpclasspd(k5|k3, xword_b [rax+64], 5); // broadcast 64-bit to 128-bit
|
||||
vfpclassps k5{k3}, [rax+64]{1to4}, 5 --> vfpclassps(k5|k3, xword_b [rax+64], 5); // broadcast 32-bit to 128-bit
|
||||
|
||||
|
||||
注意
|
||||
* k1, ..., k7 は新しいopmaskレジスタです。
|
||||
* z, sae, rn-sae, rd-sae, ru-sae, rz-saeの代わりにT_z, T_sae, T_rn_sae, T_rd_sae, T_ru_sae, T_rz_saeを使ってください。
|
||||
* `k4 | k3`と`k3 | k4`は意味が異なります。
|
||||
* {1toX}の代わりにptr_bを使ってください。Xは自動的に決まります。
|
||||
* 一部の命令はメモリサイズを指定するためにxword/yword/zword(_b)を使ってください。
|
||||
|
||||
・ラベル
|
||||
|
||||
|
|
|
@ -1755,6 +1755,7 @@ public:
|
|||
{ "vpermt2ps", M_1to4 },
|
||||
{ "vpermt2pd", M_1to2 },
|
||||
|
||||
{ "vpermi2b", 0 },
|
||||
{ "vpermi2w", 0 },
|
||||
{ "vpermi2d", M_1to4 },
|
||||
{ "vpermi2q", M_1to2 },
|
||||
|
@ -2085,6 +2086,7 @@ public:
|
|||
void putMin()
|
||||
{
|
||||
#ifdef XBYAK64
|
||||
putClass();
|
||||
#endif
|
||||
}
|
||||
void putAVX512()
|
||||
|
|
|
@ -164,6 +164,7 @@ void vpermt2d(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1
|
|||
void vpermt2q(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x7E); }
|
||||
void vpermt2ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0x7F); }
|
||||
void vpermt2pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x7F); }
|
||||
void vpermi2b(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x75); }
|
||||
void vpermi2w(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x75); }
|
||||
void vpermi2d(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0x76); }
|
||||
void vpermi2q(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x76); }
|
||||
|
|
Loading…
Reference in a new issue