From 44bc77d51b50ff0e798e44edcc8fae644322c180 Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Mon, 14 Oct 2024 04:19:38 +0900 Subject: [PATCH 01/15] fix a type of return --- xbyak/xbyak.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xbyak/xbyak.h b/xbyak/xbyak.h index 552e451..774f147 100644 --- a/xbyak/xbyak.h +++ b/xbyak/xbyak.h @@ -2665,7 +2665,7 @@ private: { opAVX_X_X_XM(x1, x2, op, type | orEvexIf(encoding, typeVex, typeEvex, sel), code, imm); } - int orEvexIf(PreferredEncoding encoding, uint64_t typeVex, uint64_t typeEvex, int sel) { + uint64_t orEvexIf(PreferredEncoding encoding, uint64_t typeVex, uint64_t typeEvex, int sel) { if (encoding == DefaultEncoding) { encoding = defaultEncoding_[sel]; } From aabf2abeb0c65fecffdb71eccad62a5eac8587c5 Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Mon, 14 Oct 2024 04:19:51 +0900 Subject: [PATCH 02/15] [doc] update Japanese doc --- readme.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/readme.txt b/readme.txt index deabcd8..417c50e 100644 --- a/readme.txt +++ b/readme.txt @@ -404,6 +404,7 @@ sample/{echo,hello}.bfは http://www.kmonos.net/alang/etc/brainfuck.php から ----------------------------------------------------------------------------- ◎履歴 +2024/10/13 ver 7.10 AVX10 integer and fp16 vnni, mediaの新命令対応. setDefaultEncodingの拡張. 
2024/10/10 ver 7.09.1 vpcompressbとvpcompresswの名前修正 2024/10/08 ver 7.09 AVX10.2のYMMレジスタの埋め込み丸め対応 2024/10/07 ver 7.08 rdfabaseなどサポート From 86e532fe1a1027f8b13e4833d1558697c6512f76 Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Mon, 14 Oct 2024 04:48:25 +0900 Subject: [PATCH 03/15] add avx10 minmax --- gen/gen_avx512.cpp | 7 +++++ test/avx10/minmax.txt | 66 ++++++++++++++++++++++++++++++++++++++++++ xbyak/xbyak_mnemonic.h | 7 +++++ 3 files changed, 80 insertions(+) create mode 100644 test/avx10/minmax.txt diff --git a/gen/gen_avx512.cpp b/gen/gen_avx512.cpp index 2b8a328..cfe0ac6 100644 --- a/gen/gen_avx512.cpp +++ b/gen/gen_avx512.cpp @@ -447,6 +447,13 @@ void putX_X_XM_IMM() { 0x1B, "vcvtne2ph2hf8s", T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_YMM | T_B16 | T_N1, false }, { 0x52, "vdpphps", T_MUST_EVEX | T_0F38 | T_EW0 | T_YMM | T_B32, false }, + { 0x52, "vminmaxnepbf16", T_MUST_EVEX | T_F2 | T_0F3A | T_EW0 | T_YMM | T_B16, true }, + { 0x52, "vminmaxpd", T_MUST_EVEX | T_66 | T_0F3A | T_EW1 | T_YMM | T_B64 | T_SAE_Y | T_SAE_Z, true }, + { 0x52, "vminmaxph", T_MUST_EVEX | T_0F3A | T_EW0 | T_YMM | T_B16 | T_SAE_Y | T_SAE_Z, true }, + { 0x52, "vminmaxps", T_MUST_EVEX | T_66 | T_0F3A | T_EW0 | T_YMM | T_B32 | T_SAE_Y | T_SAE_Z, true }, + { 0x53, "vminmaxsd", T_MUST_EVEX | T_66 | T_0F3A | T_EW1 | T_SAE_X | T_N8, true }, + { 0x53, "vminmaxsh", T_MUST_EVEX | T_0F3A | T_EW0 | T_SAE_X | T_N2, true }, + { 0x53, "vminmaxss", T_MUST_EVEX | T_66 | T_0F3A | T_EW0 | T_SAE_X | T_N4, true }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl *p = &tbl[i]; diff --git a/test/avx10/minmax.txt b/test/avx10/minmax.txt new file mode 100644 index 0000000..8b2c662 --- /dev/null +++ b/test/avx10/minmax.txt @@ -0,0 +1,66 @@ +vminmaxnepbf16(xm1|k3|T_z, xm2, xm3, 5); +vminmaxnepbf16(xm1|k3|T_z, xm2, ptr[rax+128], 5); +vminmaxnepbf16(xm1|k3|T_z, xm2, ptr_b[rax+128], 5); + +vminmaxnepbf16(ym1|k3|T_z, ym2, ym3, 5); +vminmaxnepbf16(ym1|k3|T_z, ym2, ptr[rax+128], 5); 
+vminmaxnepbf16(ym1|k3|T_z, ym2, ptr_b[rax+128], 5); + +vminmaxnepbf16(zm1|k3|T_z, zm2, zm3, 5); +vminmaxnepbf16(zm1|k3|T_z, zm2, ptr[rax+128], 5); +vminmaxnepbf16(zm1|k3|T_z, zm2, ptr_b[rax+128], 5); +// +vminmaxpd(xm1|k3|T_z, xm2, xm3, 5); +vminmaxpd(xm1|k3|T_z, xm2, ptr[rax+128], 5); +vminmaxpd(xm1|k3|T_z, xm2, ptr_b[rax+128], 5); + +vminmaxpd(ym1|k3|T_z, ym2, ym3, 5); +vminmaxpd(ym1|k3|T_z, ym2, ym3|T_sae, 5); +vminmaxpd(ym1|k3|T_z, ym2, ptr[rax+128], 5); +vminmaxpd(ym1|k3|T_z, ym2, ptr_b[rax+128], 5); + +vminmaxpd(zm1|k3|T_z, zm2, zm3, 5); +vminmaxpd(zm1|k3|T_z, zm2, zm3|T_sae, 5); +vminmaxpd(zm1|k3|T_z, zm2, ptr[rax+128], 5); +vminmaxpd(zm1|k3|T_z, zm2, ptr_b[rax+128], 5); +// +vminmaxph(xm1|k3|T_z, xm2, xm3, 5); +vminmaxph(xm1|k3|T_z, xm2, ptr[rax+128], 5); +vminmaxph(xm1|k3|T_z, xm2, ptr[rax+128], 5); +vminmaxph(xm1|k3|T_z, xm2, ptr_b[rax+128], 5); + +vminmaxph(ym1|k3|T_z, ym2, ym3, 5); +vminmaxph(ym1|k3|T_z, ym2, ym3|T_sae, 5); +vminmaxph(ym1|k3|T_z, ym2, ptr[rax+128], 5); +vminmaxph(ym1|k3|T_z, ym2, ptr_b[rax+128], 5); + +vminmaxph(zm1|k3|T_z, zm2, zm3, 5); +vminmaxph(zm1|k3|T_z, zm2, zm3|T_sae, 5); +vminmaxph(zm1|k3|T_z, zm2, ptr[rax+128], 5); +vminmaxph(zm1|k3|T_z, zm2, ptr_b[rax+128], 5); +// +vminmaxps(xm1|k3|T_z, xm2, xm3, 5); +vminmaxps(xm1|k3|T_z, xm2, ptr[rax+128], 5); +vminmaxps(xm1|k3|T_z, xm2, ptr_b[rax+128], 5); + +vminmaxps(ym1|k3|T_z, ym2, ym3, 5); +vminmaxps(ym1|k3|T_z, ym2, ym3|T_sae, 5); +vminmaxps(ym1|k3|T_z, ym2, ptr[rax+128], 5); +vminmaxps(ym1|k3|T_z, ym2, ptr_b[rax+128], 5); + +vminmaxps(zm1|k3|T_z, zm2, zm3, 5); +vminmaxps(zm1|k3|T_z, zm2, zm3|T_sae, 5); +vminmaxps(zm1|k3|T_z, zm2, ptr[rax+128], 5); +vminmaxps(zm1|k3|T_z, zm2, ptr_b[rax+128], 5); +// +vminmaxsd(xm1|k3|T_z, xm2, xm3, 5); +vminmaxsd(xm1|k3|T_z, xm2, xm3|T_sae, 5); +vminmaxsd(xm1|k3|T_z, xm2, ptr[rax+128], 5); +// +vminmaxsh(xm1|k3|T_z, xm2, xm3, 5); +vminmaxsh(xm1|k3|T_z, xm2, xm3|T_sae, 5); +vminmaxsh(xm1|k3|T_z, xm2, ptr[rax+128], 5); +// +vminmaxss(xm1|k3|T_z, xm2, 
xm3, 5); +vminmaxss(xm1|k3|T_z, xm2, xm3|T_sae, 5); +vminmaxss(xm1|k3|T_z, xm2, ptr[rax+128], 5); diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h index 0397ffd..b4cb11c 100644 --- a/xbyak/xbyak_mnemonic.h +++ b/xbyak/xbyak_mnemonic.h @@ -2374,6 +2374,13 @@ void vinserti64x4(const Zmm& r1, const Zmm& r2, const Operand& op, uint8_t imm) void vmaxpbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x5F); } void vmaxph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_SAE_Z | T_B16, 0x5F); } void vmaxsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_SAE_X | T_N2, 0x5F); } +void vminmaxnepbf16(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_F2|T_0F3A|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x52, imm); } +void vminmaxpd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F3A|T_EW1|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B64, 0x52, imm); } +void vminmaxph(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_0F3A|T_EW0|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B16, 0x52, imm); } +void vminmaxps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F3A|T_EW0|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B32, 0x52, imm); } +void vminmaxsd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F3A|T_EW1|T_SAE_X|T_MUST_EVEX, 0x53, imm); } +void vminmaxsh(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_N2|T_0F3A|T_EW0|T_SAE_X|T_MUST_EVEX, 0x53, imm); } +void vminmaxss(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, 
T_N4|T_66|T_0F3A|T_EW0|T_SAE_X|T_MUST_EVEX, 0x53, imm); } void vminpbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x5D); } void vminph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_SAE_Z | T_B16, 0x5D); } void vminsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_SAE_X | T_N2, 0x5D); } From 5f942b59145c66b514085216320ac0de96f841cf Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Mon, 14 Oct 2024 05:15:24 +0900 Subject: [PATCH 04/15] under developing saturation --- gen/gen_avx512.cpp | 22 +++++ test/Makefile | 2 +- test/avx10/saturation.txt | 202 ++++++++++++++++++++++++++++++++++++++ xbyak/xbyak_mnemonic.h | 15 +++ 4 files changed, 240 insertions(+), 1 deletion(-) create mode 100644 test/avx10/saturation.txt diff --git a/gen/gen_avx512.cpp b/gen/gen_avx512.cpp index cfe0ac6..46b00b5 100644 --- a/gen/gen_avx512.cpp +++ b/gen/gen_avx512.cpp @@ -209,6 +209,28 @@ void putX_XM() { 0x2E, "vucomxsd", T_MUST_EVEX | T_F3 | T_0F | T_EW1 | T_SAE_X | T_N8 }, { 0x2E, "vucomxsh", T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_SAE_X | T_N2 }, { 0x2E, "vucomxss", T_MUST_EVEX | T_F2 | T_0F | T_EW0 | T_SAE_X | T_N4 }, + + // 13.1 + { 0x69, "vcvtnebf162ibs", T_MUST_EVEX | T_YMM | T_F2 | T_MAP5 | T_EW0 | T_B16 }, + { 0x6B, "vcvtnebf162iubs", T_MUST_EVEX | T_YMM | T_F2 | T_MAP5 | T_EW0 | T_B16 }, + { 0x68, "vcvttnebf162ibs", T_MUST_EVEX | T_YMM | T_F2 | T_MAP5 | T_EW0 | T_B16 }, + { 0x6A, "vcvttnebf162iubs", T_MUST_EVEX | T_YMM | T_F2 | T_MAP5 | T_EW0 | T_B16 }, + // 13.3 + { 0x6D, "vcvttpd2qqs", T_MUST_EVEX | T_YMM | T_66 | T_MAP5 | T_EW1 | T_B64 | T_SAE_Y | T_SAE_Z }, + // 13.5 + { 0x6C, "vcvttpd2uqqs", T_MUST_EVEX | T_YMM | T_66 | T_MAP5 | T_EW1 | T_B64 | T_SAE_Y | T_SAE_Z }, + // 13.6 + { 0x69, "vcvtph2ibs", T_MUST_EVEX | 
T_YMM | T_MAP5 | T_EW0 | T_B16 | T_ER_Y | T_ER_Z }, + { 0x6B, "vcvtph2iubs", T_MUST_EVEX | T_YMM | T_MAP5 | T_EW0 | T_B16 | T_ER_Y | T_ER_Z }, + { 0x68, "vcvttph2ibs", T_MUST_EVEX | T_YMM | T_MAP5 | T_EW0 | T_B16 | T_ER_Y | T_ER_Z }, + { 0x6A, "vcvttph2iubs", T_MUST_EVEX | T_YMM | T_MAP5 | T_EW0 | T_B16 | T_ER_Y | T_ER_Z }, + // 13.7 + { 0x6D, "vcvttps2dqs", T_MUST_EVEX | T_YMM | T_MAP5 | T_EW0 | T_B32 | T_SAE_Y | T_SAE_Z }, + // 13.8 + { 0x69, "vcvtps2ibs", T_MUST_EVEX | T_YMM | T_66 | T_MAP5 | T_EW0 | T_B32 | T_ER_Y | T_ER_Z }, + { 0x6B, "vcvtps2iubs", T_MUST_EVEX | T_YMM | T_66 | T_MAP5 | T_EW0 | T_B32 | T_ER_Y | T_ER_Z }, + { 0x68, "vcvttps2ibs", T_MUST_EVEX | T_YMM | T_66 | T_MAP5 | T_EW0 | T_B32 | T_ER_Y | T_ER_Z }, + { 0x6A, "vcvttps2iubs", T_MUST_EVEX | T_YMM | T_66 | T_MAP5 | T_EW0 | T_B32 | T_ER_Y | T_ER_Z }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl *p = &tbl[i]; diff --git a/test/Makefile b/test/Makefile index 336dcaf..d5613e4 100644 --- a/test/Makefile +++ b/test/Makefile @@ -60,7 +60,7 @@ apx: apx.cpp $(XBYAK_INC) avx10_test: avx10_test.cpp $(XBYAK_INC) $(CXX) $(CFLAGS) avx10_test.cpp -o $@ -DXBYAK64 -TEST_FILES=old.txt new-ymm.txt bf16.txt comp.txt convert.txt +TEST_FILES=old.txt new-ymm.txt bf16.txt comp.txt convert.txt minmax.txt saturation.txt xed_test: @for target in $(addprefix avx10/, $(TEST_FILES)); do ./test_by_xed.sh $$target; done diff --git a/test/avx10/saturation.txt b/test/avx10/saturation.txt new file mode 100644 index 0000000..591960e --- /dev/null +++ b/test/avx10/saturation.txt @@ -0,0 +1,202 @@ +// +vcvtnebf162ibs(xm1, xm2); +vcvtnebf162ibs(xm1, ptr[rax+128]); +vcvtnebf162ibs(xm1, ptr_b[rax+128]); + +vcvtnebf162ibs(ym1, ym2); +vcvtnebf162ibs(ym1, ptr[rax+128]); +vcvtnebf162ibs(ym1, ptr_b[rax+128]); + +vcvtnebf162ibs(zm1, zm2); +vcvtnebf162ibs(zm1, ptr[rax+128]); +vcvtnebf162ibs(zm1, ptr_b[rax+128]); +// +vcvtnebf162iubs(xm1, xm2); +vcvtnebf162iubs(xm1, ptr[rax+128]); +vcvtnebf162iubs(xm1, ptr_b[rax+128]); + 
+vcvtnebf162iubs(ym1, ym2); +vcvtnebf162iubs(ym1, ptr[rax+128]); +vcvtnebf162iubs(ym1, ptr_b[rax+128]); + +vcvtnebf162iubs(zm1, zm2); +vcvtnebf162iubs(zm1, ptr[rax+128]); +vcvtnebf162iubs(zm1, ptr_b[rax+128]); +// +vcvttnebf162ibs(xm1, xm2); +vcvttnebf162ibs(xm1, ptr[rax+128]); +vcvttnebf162ibs(xm1, ptr_b[rax+128]); + +vcvttnebf162ibs(ym1, ym2); +vcvttnebf162ibs(ym1, ptr[rax+128]); +vcvttnebf162ibs(ym1, ptr_b[rax+128]); + +vcvttnebf162ibs(zm1, zm2); +vcvttnebf162ibs(zm1, ptr[rax+128]); +vcvttnebf162ibs(zm1, ptr_b[rax+128]); +// +vcvttnebf162iubs(xm1, xm2); +vcvttnebf162iubs(xm1, ptr[rax+128]); +vcvttnebf162iubs(xm1, ptr_b[rax+128]); + +vcvttnebf162iubs(ym1, ym2); +vcvttnebf162iubs(ym1, ptr[rax+128]); +vcvttnebf162iubs(ym1, ptr_b[rax+128]); + +vcvttnebf162iubs(zm1, zm2); +vcvttnebf162iubs(zm1, ptr[rax+128]); +vcvttnebf162iubs(zm1, ptr_b[rax+128]); +// +vcvttpd2qqs(xm1, xm2); +vcvttpd2qqs(xm1, ptr[rax+128]); +vcvttpd2qqs(xm1, ptr_b[rax+128]); + +vcvttpd2qqs(ym1, ym2); +vcvttpd2qqs(ym1, ym2|T_sae); +vcvttpd2qqs(ym1, ptr[rax+128]); +vcvttpd2qqs(ym1, ptr_b[rax+128]); + +vcvttpd2qqs(zm1, zm2); +vcvttpd2qqs(zm1, zm2|T_sae); +vcvttpd2qqs(zm1, ptr[rax+128]); +vcvttpd2qqs(zm1, ptr_b[rax+128]); +// +vcvttpd2uqqs(xm1, xm2); +vcvttpd2uqqs(xm1, ptr[rax+128]); +vcvttpd2uqqs(xm1, ptr_b[rax+128]); + +vcvttpd2uqqs(ym1, ym2); +vcvttpd2uqqs(ym1, ym2|T_sae); +vcvttpd2uqqs(ym1, ptr[rax+128]); +vcvttpd2uqqs(ym1, ptr_b[rax+128]); + +vcvttpd2uqqs(zm1, zm2); +vcvttpd2uqqs(zm1, zm2|T_sae); +vcvttpd2uqqs(zm1, ptr[rax+128]); +vcvttpd2uqqs(zm1, ptr_b[rax+128]); +// +vcvtph2ibs(xm1, xm2); +vcvtph2ibs(xm1, ptr[rax+128]); +vcvtph2ibs(xm1, ptr_b[rax+128]); + +vcvtph2ibs(ym1, ym2); +vcvtph2ibs(ym1, ym2|T_rd_sae); +vcvtph2ibs(ym1, ptr[rax+128]); +vcvtph2ibs(ym1, ptr_b[rax+128]); + +vcvtph2ibs(zm1, zm2); +vcvtph2ibs(zm1, zm2|T_ru_sae); +vcvtph2ibs(zm1, ptr[rax+128]); +vcvtph2ibs(zm1, ptr_b[rax+128]); +// +vcvtph2iubs(xm1, xm2); +vcvtph2iubs(xm1, ptr[rax+128]); +vcvtph2iubs(xm1, ptr_b[rax+128]); + 
+vcvtph2iubs(ym1, ym2); +vcvtph2iubs(ym1, ym2|T_rd_sae); +vcvtph2iubs(ym1, ptr[rax+128]); +vcvtph2iubs(ym1, ptr_b[rax+128]); + +vcvtph2iubs(zm1, zm2); +vcvtph2iubs(zm1, zm2|T_ru_sae); +vcvtph2iubs(zm1, ptr[rax+128]); +vcvtph2iubs(zm1, ptr_b[rax+128]); +// +vcvttph2ibs(xm1, xm2); +vcvttph2ibs(xm1, ptr[rax+128]); +vcvttph2ibs(xm1, ptr_b[rax+128]); + +vcvttph2ibs(ym1, ym2); +vcvttph2ibs(ym1, ym2|T_rd_sae); +vcvttph2ibs(ym1, ptr[rax+128]); +vcvttph2ibs(ym1, ptr_b[rax+128]); + +vcvttph2ibs(zm1, zm2); +vcvttph2ibs(zm1, zm2|T_ru_sae); +vcvttph2ibs(zm1, ptr[rax+128]); +vcvttph2ibs(zm1, ptr_b[rax+128]); +// +vcvttph2iubs(xm1, xm2); +vcvttph2iubs(xm1, ptr[rax+128]); +vcvttph2iubs(xm1, ptr_b[rax+128]); + +vcvttph2iubs(ym1, ym2); +vcvttph2iubs(ym1, ym2|T_rd_sae); +vcvttph2iubs(ym1, ptr[rax+128]); +vcvttph2iubs(ym1, ptr_b[rax+128]); + +vcvttph2iubs(zm1, zm2); +vcvttph2iubs(zm1, zm2|T_ru_sae); +vcvttph2iubs(zm1, ptr[rax+128]); +vcvttph2iubs(zm1, ptr_b[rax+128]); +// +vcvttps2dqs(xm1, xm2); +vcvttps2dqs(xm1, ptr[rax+128]); +vcvttps2dqs(xm1, ptr_b[rax+128]); + +vcvttps2dqs(ym1, ym2); +vcvttps2dqs(ym1, ym2|T_sae); +vcvttps2dqs(ym1, ptr[rax+128]); +vcvttps2dqs(ym1, ptr_b[rax+128]); + +vcvttps2dqs(zm1, zm2); +vcvttps2dqs(zm1, zm2|T_sae); +vcvttps2dqs(zm1, ptr[rax+128]); +vcvttps2dqs(zm1, ptr_b[rax+128]); +// +vcvtps2ibs(xm1, xm2); +vcvtps2ibs(xm1, ptr[rax+128]); +vcvtps2ibs(xm1, ptr_b[rax+128]); + +vcvtps2ibs(ym1, ym2); +vcvtps2ibs(ym1, ym2|T_rd_sae); +vcvtps2ibs(ym1, ptr[rax+128]); +vcvtps2ibs(ym1, ptr_b[rax+128]); + +vcvtps2ibs(zm1, zm2); +vcvtps2ibs(zm1, zm2|T_ru_sae); +vcvtps2ibs(zm1, ptr[rax+128]); +vcvtps2ibs(zm1, ptr_b[rax+128]); +// +vcvtps2iubs(xm1, xm2); +vcvtps2iubs(xm1, ptr[rax+128]); +vcvtps2iubs(xm1, ptr_b[rax+128]); + +vcvtps2iubs(ym1, ym2); +vcvtps2iubs(ym1, ym2|T_rd_sae); +vcvtps2iubs(ym1, ptr[rax+128]); +vcvtps2iubs(ym1, ptr_b[rax+128]); + +vcvtps2iubs(zm1, zm2); +vcvtps2iubs(zm1, zm2|T_ru_sae); +vcvtps2iubs(zm1, ptr[rax+128]); +vcvtps2iubs(zm1, ptr_b[rax+128]); +// 
+vcvttps2ibs(xm1, xm2); +vcvttps2ibs(xm1, ptr[rax+128]); +vcvttps2ibs(xm1, ptr_b[rax+128]); + +vcvttps2ibs(ym1, ym2); +vcvttps2ibs(ym1, ym2|T_rd_sae); +vcvttps2ibs(ym1, ptr[rax+128]); +vcvttps2ibs(ym1, ptr_b[rax+128]); + +vcvttps2ibs(zm1, zm2); +vcvttps2ibs(zm1, zm2|T_ru_sae); +vcvttps2ibs(zm1, ptr[rax+128]); +vcvttps2ibs(zm1, ptr_b[rax+128]); +// +vcvttps2iubs(xm1, xm2); +vcvttps2iubs(xm1, ptr[rax+128]); +vcvttps2iubs(xm1, ptr_b[rax+128]); + +vcvttps2iubs(ym1, ym2); +vcvttps2iubs(ym1, ym2|T_rd_sae); +vcvttps2iubs(ym1, ptr[rax+128]); +vcvttps2iubs(ym1, ptr_b[rax+128]); + +vcvttps2iubs(zm1, zm2); +vcvttps2iubs(zm1, zm2|T_ru_sae); +vcvttps2iubs(zm1, ptr[rax+128]); +vcvttps2iubs(zm1, ptr_b[rax+128]); diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h index b4cb11c..4db4f9e 100644 --- a/xbyak/xbyak_mnemonic.h +++ b/xbyak/xbyak_mnemonic.h @@ -2202,6 +2202,8 @@ void vcvtne2ph2bf8s(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X void vcvtne2ph2hf8(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N1|T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x18); } void vcvtne2ph2hf8s(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N1|T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x1B); } void vcvtne2ps2bf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x72); } +void vcvtnebf162ibs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x69); } +void vcvtnebf162iubs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x6B); } void vcvtneph2bf8(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3|T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x74); } void vcvtneph2bf8s(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x74); } void vcvtneph2hf8(const Xmm& x, const Operand& op) { opCvt2(x, op, 
T_F3|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x18); } @@ -2212,6 +2214,8 @@ void vcvtpd2qq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0 void vcvtpd2udq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F|T_EW1|T_YMM|T_ER_Z|T_MUST_EVEX|T_B64, 0x79); } void vcvtpd2uqq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F|T_EW1|T_YMM|T_ER_Z|T_MUST_EVEX|T_B64, 0x79); } void vcvtph2dq(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_66|T_MAP5|T_EW0|T_YMM|T_ER_Y|T_MUST_EVEX|T_B16, 0x5B); } +void vcvtph2ibs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B16, 0x69); } +void vcvtph2iubs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B16, 0x6B); } void vcvtph2pd(const Xmm& x, const Operand& op) { if (!op.isXMM() && !op.isMEM()) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(x, 0, op, T_N4|T_N_VL|T_MAP5|T_EW0|T_YMM|T_SAE_X|T_MUST_EVEX|T_B16, 0x5A); } void vcvtph2psx(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_66|T_MAP6|T_EW0|T_YMM|T_SAE_Y|T_MUST_EVEX|T_B16, 0x13); } void vcvtph2qq(const Xmm& x, const Operand& op) { if (!op.isXMM() && !op.isMEM()) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(x, 0, op, T_N4|T_N_VL|T_66|T_MAP5|T_EW0|T_YMM|T_ER_X|T_MUST_EVEX|T_B16, 0x7B); } @@ -2219,6 +2223,8 @@ void vcvtph2udq(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, void vcvtph2uqq(const Xmm& x, const Operand& op) { if (!op.isXMM() && !op.isMEM()) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(x, 0, op, T_N4|T_N_VL|T_66|T_MAP5|T_EW0|T_YMM|T_ER_X|T_MUST_EVEX|T_B16, 0x79); } void vcvtph2uw(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x7D); } void vcvtph2w(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x7D); } +void vcvtps2ibs(const Xmm& x, const Operand& op) { 
opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B32, 0x69); } +void vcvtps2iubs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B32, 0x6B); } void vcvtps2phx(const Xmm& x, const Operand& op) { checkCvt4(x, op); opCvt(x, op, T_N16|T_N_VL|T_66|T_MAP5|T_EW0|T_ER_Z|T_MUST_EVEX|T_B32, 0x1D); } void vcvtps2qq(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_66|T_0F|T_EW0|T_YMM|T_ER_Y|T_MUST_EVEX|T_B32, 0x7B); } void vcvtps2udq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_0F|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B32, 0x79); } @@ -2235,15 +2241,24 @@ void vcvtsh2usi(const Reg32e& r, const Operand& op) { uint64_t type = (T_N2|T_F3 void vcvtsi2sh(const Xmm& x1, const Xmm& x2, const Operand& op) { if (!(x1.isXMM() && x2.isXMM() && op.isBit(32|64))) XBYAK_THROW(ERR_BAD_COMBINATION) uint64_t type = (T_F3|T_MAP5|T_ER_R|T_MUST_EVEX|T_M_K) | (op.isBit(32) ? (T_EW0 | T_N4) : (T_EW1 | T_N8)); opVex(x1, &x2, op, type, 0x2A); } void vcvtss2sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_MAP5|T_EW0|T_ER_X|T_MUST_EVEX, 0x1D); } void vcvtss2usi(const Reg32e& r, const Operand& op) { uint64_t type = (T_N4|T_F3|T_0F|T_ER_X|T_MUST_EVEX) | (r.isREG(64) ? 
T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x79); } +void vcvttnebf162ibs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x68); } +void vcvttnebf162iubs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x6A); } void vcvttpd2qq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F|T_EW1|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B64, 0x7A); } +void vcvttpd2qqs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW1|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B64, 0x6D); } void vcvttpd2udq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F|T_EW1|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B64, 0x78); } void vcvttpd2uqq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F|T_EW1|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B64, 0x78); } +void vcvttpd2uqqs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW1|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B64, 0x6C); } void vcvttph2dq(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_F3|T_MAP5|T_EW0|T_YMM|T_SAE_Y|T_MUST_EVEX|T_B16, 0x5B); } +void vcvttph2ibs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B16, 0x68); } +void vcvttph2iubs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B16, 0x6A); } void vcvttph2qq(const Xmm& x, const Operand& op) { if (!op.isXMM() && !op.isMEM()) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(x, 0, op, T_N4|T_N_VL|T_66|T_MAP5|T_EW0|T_YMM|T_SAE_X|T_MUST_EVEX|T_B16, 0x7A); } void vcvttph2udq(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_MAP5|T_EW0|T_YMM|T_SAE_Y|T_MUST_EVEX|T_B16, 0x78); } void vcvttph2uqq(const Xmm& x, const Operand& op) { if (!op.isXMM() && !op.isMEM()) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(x, 0, op, T_N4|T_N_VL|T_66|T_MAP5|T_EW0|T_YMM|T_SAE_X|T_MUST_EVEX|T_B16, 0x78); } void vcvttph2uw(const Xmm& x, 
const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B16, 0x7C); } void vcvttph2w(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B16, 0x7C); } +void vcvttps2dqs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B32, 0x6D); } +void vcvttps2ibs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B32, 0x68); } +void vcvttps2iubs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B32, 0x6A); } void vcvttps2qq(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_66|T_0F|T_EW0|T_YMM|T_SAE_Y|T_MUST_EVEX|T_B32, 0x7A); } void vcvttps2udq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_0F|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x78); } void vcvttps2uqq(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_66|T_0F|T_EW0|T_YMM|T_SAE_Y|T_MUST_EVEX|T_B32, 0x78); } From de32c7e99f412f3935cadd69fb2e19fdb6929475 Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Mon, 14 Oct 2024 11:30:39 +0900 Subject: [PATCH 05/15] add avx10 saturating convert --- gen/gen_avx512.cpp | 22 +++++++- test/avx10/saturation.txt | 108 ++++++++++++++++++++++++++++++++++++++ xbyak/xbyak_mnemonic.h | 9 ++++ 3 files changed, 137 insertions(+), 2 deletions(-) diff --git a/gen/gen_avx512.cpp b/gen/gen_avx512.cpp index 46b00b5..07e68b4 100644 --- a/gen/gen_avx512.cpp +++ b/gen/gen_avx512.cpp @@ -231,6 +231,8 @@ void putX_XM() { 0x6B, "vcvtps2iubs", T_MUST_EVEX | T_YMM | T_66 | T_MAP5 | T_EW0 | T_B32 | T_ER_Y | T_ER_Z }, { 0x68, "vcvttps2ibs", T_MUST_EVEX | T_YMM | T_66 | T_MAP5 | T_EW0 | T_B32 | T_ER_Y | T_ER_Z }, { 0x6A, "vcvttps2iubs", T_MUST_EVEX | T_YMM | T_66 | T_MAP5 | T_EW0 | T_B32 | T_ER_Y | T_ER_Z }, + // 13.10 + { 0x6C, "vcvttps2udqs", T_MUST_EVEX | T_YMM | T_MAP5 | T_EW0 | 
T_B32 | T_SAE_Y | T_SAE_Z }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl *p = &tbl[i]; @@ -687,6 +689,22 @@ void putCvt() { 0x7B, "vcvtusi2sh", T_F3 | T_MAP5 | T_MUST_EVEX | T_ER_R | T_M_K, 6 }, { 0x72, "vcvtneps2bf16", T_MUST_EVEX | T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 2 }, + // 13.2 + { 0x6D, "vcvttpd2dqs", T_MUST_EVEX | T_YMM | T_MAP5 | T_EW1 | T_B64 | T_SAE_Y | T_SAE_Z, 2 }, + // 13.4 + { 0x6C, "vcvttpd2udqs", T_MUST_EVEX | T_YMM | T_MAP5 | T_EW1 | T_B64 | T_SAE_Y | T_SAE_Z, 2 }, + // 13.9 + { 0x6D, "vcvttps2qqs", T_MUST_EVEX | T_YMM | T_66 | T_MAP5 | T_EW0 | T_B32 | T_SAE_X | T_SAE_Y | T_N8 | T_N_VL, 1 }, + // 13.11 + { 0x6C, "vcvttps2uqqs", T_MUST_EVEX | T_YMM | T_66 | T_MAP5 | T_EW0 | T_B32 | T_SAE_X | T_SAE_Y | T_N8 | T_N_VL, 1 }, + // 13.12 + { 0x6D, "vcvttsd2sis", T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_SAE_X | T_N8, 0 }, + // 13.13 + { 0x6C, "vcvttsd2usis", T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_SAE_X | T_N8, 0 }, + // 13.14 + { 0x6D, "vcvttss2sis", T_MUST_EVEX | T_F3 | T_MAP5 | T_EW0 | T_SAE_X | T_N4, 0 }, + // 13.15 + { 0x6C, "vcvttss2usis", T_MUST_EVEX | T_F3 | T_MAP5 | T_EW0 | T_SAE_X | T_N4, 0 }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl& p = tbl[i]; @@ -695,10 +713,10 @@ void putCvt() case 0: printf("void %s(const Reg32e& r, const Operand& op) { uint64_t type = (%s) | (r.isREG(64) ? 
T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x%02X); }\n", p.name, s.c_str(), p.code); break; - case 1: + case 1: // (x, x/m), (y, x/m256), (z, y/m) printf("void %s(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, %s, 0x%02X); }\n", p.name, s.c_str(), p.code); break; - case 2: + case 2: // (x, x/m), (x, y/m256), (y, z/m) printf("void %s(const Xmm& x, const Operand& op) { opCvt2(x, op, %s, 0x%02X); }\n", p.name, s.c_str(), p.code); break; case 3: diff --git a/test/avx10/saturation.txt b/test/avx10/saturation.txt index 591960e..f3ebf3d 100644 --- a/test/avx10/saturation.txt +++ b/test/avx10/saturation.txt @@ -200,3 +200,111 @@ vcvttps2iubs(zm1, zm2); vcvttps2iubs(zm1, zm2|T_ru_sae); vcvttps2iubs(zm1, ptr[rax+128]); vcvttps2iubs(zm1, ptr_b[rax+128]); +// +vcvttps2udqs(xm1, xm2); +vcvttps2udqs(xm1, ptr[rax+128]); +vcvttps2udqs(xm1, ptr_b[rax+128]); + +vcvttps2udqs(ym1, ym2); +vcvttps2udqs(ym1, ym2|T_sae); +vcvttps2udqs(ym1, ptr[rax+128]); +vcvttps2udqs(ym1, ptr_b[rax+128]); + +vcvttps2udqs(zm1, zm2); +vcvttps2udqs(zm1, zm2|T_sae); +vcvttps2udqs(zm1, ptr[rax+128]); +vcvttps2udqs(zm1, ptr_b[rax+128]); + +// +vcvttpd2dqs(xm1|k1|T_z, xm2); +vcvttpd2dqs(xm1|k1|T_z, xword [rax+128]); +vcvttpd2dqs(xm1|k1|T_z, xword_b[rax+128]); + +vcvttpd2dqs(xm1|k1|T_z, ym2); +vcvttpd2dqs(xm1|k1|T_z, ym2|T_sae); +vcvttpd2dqs(xm1|k1|T_z, yword [rax+128]); +vcvttpd2dqs(xm1|k1|T_z, yword_b[rax+128]); + +vcvttpd2dqs(ym1|k1|T_z, zm2); +vcvttpd2dqs(ym1|k1|T_z, zm2|T_sae); +vcvttpd2dqs(ym1|k1|T_z, zword [rax+128]); +vcvttpd2dqs(ym1|k1|T_z, zword_b[rax+128]); + +// +vcvttpd2udqs(xm1|k1|T_z, xm2); +vcvttpd2udqs(xm1|k1|T_z, xword [rax+128]); +vcvttpd2udqs(xm1|k1|T_z, xword_b[rax+128]); + +vcvttpd2udqs(xm1|k1|T_z, ym2); +vcvttpd2udqs(xm1|k1|T_z, ym2|T_sae); +vcvttpd2udqs(xm1|k1|T_z, yword [rax+128]); +vcvttpd2udqs(xm1|k1|T_z, yword_b[rax+128]); + +vcvttpd2udqs(ym1|k1|T_z, zm2); +vcvttpd2udqs(ym1|k1|T_z, zm2|T_sae); +vcvttpd2udqs(ym1|k1|T_z, zword [rax+128]); +vcvttpd2udqs(ym1|k1|T_z, 
zword_b[rax+128]); +// +vcvttps2qqs(xm1|k1|T_z, xm2); +vcvttps2qqs(xm1|k1|T_z, ptr [rax+128]); +vcvttps2qqs(xm1|k1|T_z, ptr_b[rax+128]); + +vcvttps2qqs(ym1|k1|T_z, xm2); +vcvttps2qqs(ym1|k1|T_z, xm2|T_sae); +vcvttps2qqs(ym1|k1|T_z, ptr [rax+128]); +vcvttps2qqs(ym1|k1|T_z, ptr_b[rax+128]); + +vcvttps2qqs(zm1, ym2); +vcvttps2qqs(zm1|k1|T_z, ym2); +vcvttps2qqs(zm1|k1|T_z|T_sae, ym2); +vcvttps2qqs(zm1|k1|T_z, ptr [rax+128]); +vcvttps2qqs(zm1|k1|T_z, ptr_b[rax+128]); + +// +vcvttps2uqqs(xm1|k1|T_z, xm2); +vcvttps2uqqs(xm1|k1|T_z, ptr [rax+128]); +vcvttps2uqqs(xm1|k1|T_z, ptr_b[rax+128]); + +vcvttps2uqqs(ym1|k1|T_z, xm2); +vcvttps2uqqs(ym1|k1|T_z, xm2|T_sae); +vcvttps2uqqs(ym1|k1|T_z, ptr [rax+128]); +vcvttps2uqqs(ym1|k1|T_z, ptr_b[rax+128]); + +vcvttps2uqqs(zm1, ym2); +vcvttps2uqqs(zm1|k1|T_z, ym2); +vcvttps2uqqs(zm1|k1|T_z|T_sae, ym2); +vcvttps2uqqs(zm1|k1|T_z, ptr [rax+128]); +vcvttps2uqqs(zm1|k1|T_z, ptr_b[rax+128]); + +// +vcvttsd2sis(eax, xm1); +vcvttsd2sis(eax, xm1|T_sae); +vcvttsd2sis(eax, ptr[rax+128]); + +vcvttsd2sis(r30, xm1); +vcvttsd2sis(r30, xm1|T_sae); +vcvttsd2sis(r30, ptr[rax+128]); +// +vcvttsd2usis(eax, xm1); +vcvttsd2usis(eax, xm1|T_sae); +vcvttsd2usis(eax, ptr[rax+128]); + +vcvttsd2usis(r30, xm1); +vcvttsd2usis(r30, xm1|T_sae); +vcvttsd2usis(r30, ptr[rax+128]); +// +vcvttss2sis(eax, xm1); +vcvttss2sis(eax, xm1|T_sae); +vcvttss2sis(eax, ptr[rax+128]); + +vcvttss2sis(r30, xm1); +vcvttss2sis(r30, xm1|T_sae); +vcvttss2sis(r30, ptr[rax+128]); +// +vcvttss2usis(eax, xm1); +vcvttss2usis(eax, xm1|T_sae); +vcvttss2usis(eax, ptr[rax+128]); + +vcvttss2usis(r30, xm1); +vcvttss2usis(r30, xm1|T_sae); +vcvttss2usis(r30, ptr[rax+128]); diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h index 4db4f9e..f9a038a 100644 --- a/xbyak/xbyak_mnemonic.h +++ b/xbyak/xbyak_mnemonic.h @@ -2243,9 +2243,11 @@ void vcvtss2sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x void vcvtss2usi(const Reg32e& r, const Operand& op) { uint64_t type = 
(T_N4|T_F3|T_0F|T_ER_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x79); } void vcvttnebf162ibs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x68); } void vcvttnebf162iubs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x6A); } +void vcvttpd2dqs(const Xmm& x, const Operand& op) { opCvt2(x, op, T_MAP5|T_EW1|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B64, 0x6D); } void vcvttpd2qq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F|T_EW1|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B64, 0x7A); } void vcvttpd2qqs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW1|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B64, 0x6D); } void vcvttpd2udq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F|T_EW1|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B64, 0x78); } +void vcvttpd2udqs(const Xmm& x, const Operand& op) { opCvt2(x, op, T_MAP5|T_EW1|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B64, 0x6C); } void vcvttpd2uqq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F|T_EW1|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B64, 0x78); } void vcvttpd2uqqs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW1|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B64, 0x6C); } void vcvttph2dq(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_F3|T_MAP5|T_EW0|T_YMM|T_SAE_Y|T_MUST_EVEX|T_B16, 0x5B); } @@ -2260,12 +2262,19 @@ void vcvttps2dqs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5 void vcvttps2ibs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B32, 0x68); } void vcvttps2iubs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B32, 0x6A); } void vcvttps2qq(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_66|T_0F|T_EW0|T_YMM|T_SAE_Y|T_MUST_EVEX|T_B32, 0x7A); } +void 
vcvttps2qqs(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_66|T_MAP5|T_EW0|T_YMM|T_SAE_X|T_SAE_Y|T_MUST_EVEX|T_B32, 0x6D); } void vcvttps2udq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_0F|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x78); } +void vcvttps2udqs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B32, 0x6C); } void vcvttps2uqq(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_66|T_0F|T_EW0|T_YMM|T_SAE_Y|T_MUST_EVEX|T_B32, 0x78); } +void vcvttps2uqqs(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_66|T_MAP5|T_EW0|T_YMM|T_SAE_X|T_SAE_Y|T_MUST_EVEX|T_B32, 0x6C); } +void vcvttsd2sis(const Reg32e& r, const Operand& op) { uint64_t type = (T_N8|T_F2|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x6D); } void vcvttsd2usi(const Reg32e& r, const Operand& op) { uint64_t type = (T_N8|T_F2|T_0F|T_SAE_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x78); } +void vcvttsd2usis(const Reg32e& r, const Operand& op) { uint64_t type = (T_N8|T_F2|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x6C); } void vcvttsh2si(const Reg32e& r, const Operand& op) { uint64_t type = (T_N2|T_F3|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x2C); } void vcvttsh2usi(const Reg32e& r, const Operand& op) { uint64_t type = (T_N2|T_F3|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x78); } +void vcvttss2sis(const Reg32e& r, const Operand& op) { uint64_t type = (T_N4|T_F3|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x6D); } void vcvttss2usi(const Reg32e& r, const Operand& op) { uint64_t type = (T_N4|T_F3|T_0F|T_SAE_X|T_MUST_EVEX) | (r.isREG(64) ? 
T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x78); } +void vcvttss2usis(const Reg32e& r, const Operand& op) { uint64_t type = (T_N4|T_F3|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x6C); } void vcvtudq2pd(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_F3|T_0F|T_EW0|T_YMM|T_MUST_EVEX|T_B32, 0x7A); } void vcvtudq2ph(const Xmm& x, const Operand& op) { checkCvt4(x, op); opCvt(x, op, T_N16|T_N_VL|T_F2|T_MAP5|T_EW0|T_ER_Z|T_MUST_EVEX|T_B32, 0x7A); } void vcvtudq2ps(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2|T_0F|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B32, 0x7A); } From 5582082d7b925a29298881c2ba457cec5c0eb8f1 Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Mon, 14 Oct 2024 11:52:20 +0900 Subject: [PATCH 06/15] tweak --- gen/gen_code.cpp | 1 + xbyak/xbyak.h | 4 ---- xbyak/xbyak_mnemonic.h | 1 + 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/gen/gen_code.cpp b/gen/gen_code.cpp index a22c12b..e72df50 100644 --- a/gen/gen_code.cpp +++ b/gen/gen_code.cpp @@ -1443,6 +1443,7 @@ void put() printf("void %s(const Xmm& x, const Operand& op) { opSSE_APX(x, op, T_0F38, 0x%02X, T_MUST_EVEX, 0x%02X); }\n", p->name, p->code, p->code2); } puts("void sha1rnds4(const Xmm& x, const Operand& op, uint8_t imm) { opSSE_APX(x, op, T_0F3A, 0xCC, T_MUST_EVEX, 0xD4, imm); }"); + puts("void sha1msg12(const Xmm& x, const Operand& op) { opROO(Reg(), op, x, T_MUST_EVEX, 0xD9); }"); } // (m, x), (m, y) { diff --git a/xbyak/xbyak.h b/xbyak/xbyak.h index 774f147..17a9597 100644 --- a/xbyak/xbyak.h +++ b/xbyak/xbyak.h @@ -3176,10 +3176,6 @@ public: void setDefaultEncoding(PreferredEncoding vnniEnc = EvexEncoding, PreferredEncoding avx10Enc = VexEncoding) { defaultEncoding_[0] = vnniEnc; defaultEncoding_[1] = avx10Enc; } - void sha1msg12(const Xmm& x, const Operand& op) - { - opROO(Reg(), op, x, T_MUST_EVEX, 0xD9); - } void bswap(const Reg32e& r) { int idx = r.getIdx(); diff --git 
a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h index f9a038a..07ef43e 100644 --- a/xbyak/xbyak_mnemonic.h +++ b/xbyak/xbyak_mnemonic.h @@ -988,6 +988,7 @@ void sets(const Operand& op) { opSetCC(op, 8); }//-V524 void setz(const Operand& op) { opSetCC(op, 4); }//-V524 void sfence() { db(0x0F); db(0xAE); db(0xF8); } void sha1msg1(const Xmm& x, const Operand& op) { opSSE_APX(x, op, T_0F38, 0xC9, T_MUST_EVEX, 0xD9); } +void sha1msg12(const Xmm& x, const Operand& op) { opROO(Reg(), op, x, T_MUST_EVEX, 0xD9); } void sha1msg2(const Xmm& x, const Operand& op) { opSSE_APX(x, op, T_0F38, 0xCA, T_MUST_EVEX, 0xDA); } void sha1nexte(const Xmm& x, const Operand& op) { opSSE_APX(x, op, T_0F38, 0xC8, T_MUST_EVEX, 0xD8); } void sha1rnds4(const Xmm& x, const Operand& op, uint8_t imm) { opSSE_APX(x, op, T_0F3A, 0xCC, T_MUST_EVEX, 0xD4, imm); } From 21e80948eec08c3ccd0eef8654441cba55ae75f7 Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Mon, 14 Oct 2024 12:24:47 +0900 Subject: [PATCH 07/15] tweak vmovd --- gen/gen_code.cpp | 34 ++-------------------------------- xbyak/xbyak.h | 21 ++++++++++++++++++--- xbyak/xbyak_mnemonic.h | 4 ++-- 3 files changed, 22 insertions(+), 37 deletions(-) diff --git a/gen/gen_code.cpp b/gen/gen_code.cpp index e72df50..df4e5b9 100644 --- a/gen/gen_code.cpp +++ b/gen/gen_code.cpp @@ -1734,8 +1734,8 @@ void put() } // mov { - printf("void vmovd(const Xmm& x, const Operand& op) { if (!op.isREG(32) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_X_XM(x, xm0, op, T_0F | T_66 | T_W0 | T_EVEX | T_N4, 0x6E); }\n"); - printf("void vmovd(const Operand& op, const Xmm& x) { if (!op.isREG(32) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_X_XM(x, xm0, op, T_0F | T_66 | T_W0 | T_EVEX | T_N4, 0x7E); }\n"); + puts("void vmovd(const Xmm& x, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opVmovd(x, op, false, encoding); }"); + puts("void vmovd(const Operand& op, const Xmm& x, PreferredEncoding encoding = DefaultEncoding) { 
opVmovd(x, op, true, encoding); }"); printf("void vmovq(const Xmm& x, const Address& addr) { uint64_t type; uint8_t code; if (x.getIdx() < 16) { type = T_0F | T_F3; code = 0x7E; } else { type = T_0F | T_66 | T_EVEX | T_EW1 | T_N8; code = 0x6E; } opAVX_X_X_XM(x, xm0, addr, type, code); }\n"); printf("void vmovq(const Address& addr, const Xmm& x) { opAVX_X_X_XM(x, xm0, addr, T_0F | T_66 | T_EVEX | T_EW1 | T_N8, x.getIdx() < 16 ? 0xD6 : 0x7E); }\n"); @@ -1900,36 +1900,6 @@ void put() printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, %s, 0x%02X, encoding); }\n", p->name, s.c_str(), p->code); } } - // avx-vnni-int8 - // avx-vnni-int16 -#if 0 - { - const struct Tbl { - uint8_t code; - const char *name; - uint64_t type; - } tbl[] = { -// { 0x50, "vpdpbssd", T_F2 | T_0F38 | T_W0 | T_YMM }, -// { 0x51, "vpdpbssds", T_F2 | T_0F38 | T_W0 | T_YMM }, -// { 0x50, "vpdpbsud", T_F3 | T_0F38 | T_W0 | T_YMM }, -// { 0x51, "vpdpbsuds", T_F3 | T_0F38 | T_W0 | T_YMM }, -// { 0x50, "vpdpbuud", T_0F38 | T_W0 | T_YMM }, -// { 0x51, "vpdpbuuds", T_0F38 | T_W0 | T_YMM }, - -// { 0xD2, "vpdpwsud", T_F3 | T_0F38 | T_W0 | T_YMM }, -// { 0xD3, "vpdpwsuds", T_F3 | T_0F38 | T_W0 | T_YMM }, -// { 0xD2, "vpdpwusd", T_66 | T_0F38 | T_W0 | T_YMM }, -// { 0xD3, "vpdpwusds", T_66 | T_0F38 | T_W0 | T_YMM }, -// { 0xD2, "vpdpwuud", T_0F38 | T_W0 | T_YMM }, -// { 0xD3, "vpdpwuuds", T_0F38 | T_W0 | T_YMM }, - }; - for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { - const Tbl *p = &tbl[i]; - std::string s = type2String(p->type); - printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, %s, 0x%02X); }\n", p->name, s.c_str(), p->code); - } - } -#endif } void put32() diff --git a/xbyak/xbyak.h b/xbyak/xbyak.h index 17a9597..ed5c361 100644 --- a/xbyak/xbyak.h +++ b/xbyak/xbyak.h @@ -2665,7 +2665,8 @@ private: { opAVX_X_X_XM(x1, x2, op, type | orEvexIf(encoding, typeVex, typeEvex, sel), code, 
imm); } - uint64_t orEvexIf(PreferredEncoding encoding, uint64_t typeVex, uint64_t typeEvex, int sel) { + bool isVexEncoding(PreferredEncoding encoding, int sel) const + { if (encoding == DefaultEncoding) { encoding = defaultEncoding_[sel]; } @@ -2673,9 +2674,13 @@ private: #ifdef XBYAK_DISABLE_AVX512 XBYAK_THROW(ERR_EVEX_IS_INVALID) #endif - return T_MUST_EVEX | typeEvex; + return false; } - return typeVex; + return true; + } + uint64_t orEvexIf(PreferredEncoding encoding, uint64_t typeVex, uint64_t typeEvex, int sel) { + bool isVex = isVexEncoding(encoding, sel); + return isVex ? typeVex : T_MUST_EVEX | typeEvex; } void opInOut(const Reg& a, const Reg& d, uint8_t code) { @@ -3188,6 +3193,16 @@ public: } db(0xC8 + (idx & 7)); } + void opVmovd(const Xmm& x, const Operand& op, bool rev, PreferredEncoding encoding) + { + if (isVexEncoding(encoding, 1)) { + if (!op.isREG(32) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) + uint64_t type = T_0F | T_66 | T_W0 | T_EVEX | T_N4; + int code = rev ? 
0x7E : 0x6E; + opAVX_X_X_XM(x, xm0, op, type, code); + } else { + } + } /* use single byte nop if useMultiByteNop = false */ diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h index 07ef43e..efd207a 100644 --- a/xbyak/xbyak_mnemonic.h +++ b/xbyak/xbyak_mnemonic.h @@ -1332,8 +1332,8 @@ void vmovapd(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, T_ void vmovapd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_66|T_0F|T_EW1|T_YMM|T_EVEX, 0x28); } void vmovaps(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, T_0F|T_EW0|T_YMM|T_EVEX|T_M_K, 0x29); } void vmovaps(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_0F|T_EW0|T_YMM|T_EVEX, 0x28); } -void vmovd(const Operand& op, const Xmm& x) { if (!op.isREG(32) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_X_XM(x, xm0, op, T_0F | T_66 | T_W0 | T_EVEX | T_N4, 0x7E); } -void vmovd(const Xmm& x, const Operand& op) { if (!op.isREG(32) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_X_XM(x, xm0, op, T_0F | T_66 | T_W0 | T_EVEX | T_N4, 0x6E); } +void vmovd(const Operand& op, const Xmm& x, PreferredEncoding encoding = DefaultEncoding) { opVmovd(x, op, true, encoding); } +void vmovd(const Xmm& x, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opVmovd(x, op, false, encoding); } void vmovddup(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_DUP|T_F2|T_0F|T_EW1|T_YMM|T_EVEX|T_ER_X|T_ER_Y|T_ER_Z, 0x12); } void vmovdqa(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, T_66|T_0F|T_YMM, 0x7F); } void vmovdqa(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_66|T_0F|T_YMM, 0x6F); } From 64140448b45e21cbda6360834ec33c3020e93fb2 Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Mon, 14 Oct 2024 16:41:53 +0900 Subject: [PATCH 08/15] avoid my alias of register --- test/test_by_xed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_by_xed.py b/test/test_by_xed.py index 
afd77d8..1e84c6a 100644 --- a/test/test_by_xed.py +++ b/test/test_by_xed.py @@ -366,7 +366,7 @@ def parseNmemonicTest(): ('vpshldw(xmm9|k3|T_z, xmm2, ptr [rax + 0x40], 5);', Nmemonic('vpshldw', [xmm9, xmm2, Memory(0, rax, None, 0, 0x40), 5], [k3, T_z])), ('vpshrdd(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40], 5);', Nmemonic('vpshrdd', [xmm5, xmm2, Memory(0, rax, None, 0, 0x40, True), 5], [k3, T_z])), ('vpshrdd xmm5{k3}{z}, xmm2, dword ptr [rax+0x40]{1to4}, 0x5', Nmemonic('vpshrdd', [xmm5, xmm2, Memory(0, rax, None, 0, 0x40, True), 5], [k3, T_z])), - ('vcmpph(k1, xm15, ptr[rax+64], 1);', Nmemonic('vcmpph', [k1, xm15, Memory(0, rax, None, 0, 64), 1])), + ('vcmpph(k1, xmm15, ptr[rax+64], 1);', Nmemonic('vcmpph', [k1, xmm15, Memory(0, rax, None, 0, 64), 1])), ] for (s, expected) in tbl: e = parseNmemonic(s) From 8b0a1acf0e6a2d066086de001637ff4426b50e77 Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Mon, 14 Oct 2024 18:04:25 +0900 Subject: [PATCH 09/15] make fails if test_by_xed.sh fails --- test/Makefile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/test/Makefile b/test/Makefile index d5613e4..8313a6c 100644 --- a/test/Makefile +++ b/test/Makefile @@ -62,7 +62,10 @@ avx10_test: avx10_test.cpp $(XBYAK_INC) TEST_FILES=old.txt new-ymm.txt bf16.txt comp.txt convert.txt minmax.txt saturation.txt xed_test: - @for target in $(addprefix avx10/, $(TEST_FILES)); do ./test_by_xed.sh $$target; done + @set -e; \ + for target in $(addprefix avx10/, $(TEST_FILES)); do \ + ./test_by_xed.sh $$target || exit 1; \ + done test_nm: normalize_prefix $(TARGET) $(MAKE) -C ../gen From 220ca76f41f8c78bc32b9e00c50f42344fdc5792 Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Mon, 14 Oct 2024 19:40:52 +0900 Subject: [PATCH 10/15] vmovd supports avx10.2 --- gen/gen_code.cpp | 3 --- test/avx10/misc.txt | 5 +++++ test/test_by_xed.cpp | 2 +- xbyak/xbyak.h | 44 +++++++++++++++++++++++++++++------------- xbyak/xbyak_mnemonic.h | 2 -- 5 files changed, 37 insertions(+), 19 
deletions(-) diff --git a/gen/gen_code.cpp b/gen/gen_code.cpp index df4e5b9..c2db4ac 100644 --- a/gen/gen_code.cpp +++ b/gen/gen_code.cpp @@ -1734,9 +1734,6 @@ void put() } // mov { - puts("void vmovd(const Xmm& x, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opVmovd(x, op, false, encoding); }"); - puts("void vmovd(const Operand& op, const Xmm& x, PreferredEncoding encoding = DefaultEncoding) { opVmovd(x, op, true, encoding); }"); - printf("void vmovq(const Xmm& x, const Address& addr) { uint64_t type; uint8_t code; if (x.getIdx() < 16) { type = T_0F | T_F3; code = 0x7E; } else { type = T_0F | T_66 | T_EVEX | T_EW1 | T_N8; code = 0x6E; } opAVX_X_X_XM(x, xm0, addr, type, code); }\n"); printf("void vmovq(const Address& addr, const Xmm& x) { opAVX_X_X_XM(x, xm0, addr, T_0F | T_66 | T_EVEX | T_EW1 | T_N8, x.getIdx() < 16 ? 0xD6 : 0x7E); }\n"); printf("void vmovq(const Xmm& x1, const Xmm& x2) { opAVX_X_X_XM(x1, xm0, x2, T_0F | T_F3 | T_EVEX | T_EW1 | T_N8, 0x7E); }\n"); diff --git a/test/avx10/misc.txt b/test/avx10/misc.txt index 9464d03..7c969bf 100644 --- a/test/avx10/misc.txt +++ b/test/avx10/misc.txt @@ -165,3 +165,8 @@ vpdpwuuds(ym1, ym2, ptr_b[rax+128]); vpdpwuuds(zm1, zm2, zm3); vpdpwuuds(zm1, zm2, ptr[rax+128]); vpdpwuuds(zm1, zm2, ptr_b[rax+128]); + +// +vmovd(xm10, xm20); +vmovd(xm10, ptr[rax+128]); +vmovd(ptr[rax+128], xm30); diff --git a/test/test_by_xed.cpp b/test/test_by_xed.cpp index ddac779..71b5137 100644 --- a/test/test_by_xed.cpp +++ b/test/test_by_xed.cpp @@ -7,7 +7,7 @@ struct Code : Xbyak::CodeGenerator { Code() : Xbyak::CodeGenerator(4096*8) { - setDefaultEncoding(VexEncoding, EvexEncoding); + setDefaultEncoding(EvexEncoding, AVX10p2Encoding); #include "tmp.cpp" } }; diff --git a/xbyak/xbyak.h b/xbyak/xbyak.h index ed5c361..a3d1fca 100644 --- a/xbyak/xbyak.h +++ b/xbyak/xbyak.h @@ -1673,7 +1673,9 @@ inline const uint8_t* Label::getAddress() const typedef enum { DefaultEncoding, VexEncoding, - EvexEncoding + EvexEncoding, + 
AVX512Encoding = EvexEncoding, + AVX10p2Encoding } PreferredEncoding; class CodeGenerator : public CodeArray { @@ -2665,7 +2667,7 @@ private: { opAVX_X_X_XM(x1, x2, op, type | orEvexIf(encoding, typeVex, typeEvex, sel), code, imm); } - bool isVexEncoding(PreferredEncoding encoding, int sel) const + PreferredEncoding getEncoding(PreferredEncoding encoding, int sel) const { if (encoding == DefaultEncoding) { encoding = defaultEncoding_[sel]; @@ -2674,12 +2676,11 @@ private: #ifdef XBYAK_DISABLE_AVX512 XBYAK_THROW(ERR_EVEX_IS_INVALID) #endif - return false; } - return true; + return encoding; } uint64_t orEvexIf(PreferredEncoding encoding, uint64_t typeVex, uint64_t typeEvex, int sel) { - bool isVex = isVexEncoding(encoding, sel); + bool isVex = getEncoding(encoding, sel) == VexEncoding; return isVex ? typeVex : T_MUST_EVEX | typeEvex; } void opInOut(const Reg& a, const Reg& d, uint8_t code) @@ -3177,7 +3178,7 @@ public: // set default encoding // vnniEnc : control AVX512_VNNI (evex:default) or AVX-VNNI (vex) - // avx10Enc : control mpsadbw, AVX-VNNI-INT8 (vex:default) or AVX10.2 (evex) + // avx10Enc : control mpsadbw, AVX-VNNI-INT8 (vex:default) or AVX10.2 (AVX10p2Encoding) void setDefaultEncoding(PreferredEncoding vnniEnc = EvexEncoding, PreferredEncoding avx10Enc = VexEncoding) { defaultEncoding_[0] = vnniEnc; defaultEncoding_[1] = avx10Enc; } @@ -3193,15 +3194,32 @@ public: } db(0xC8 + (idx & 7)); } - void opVmovd(const Xmm& x, const Operand& op, bool rev, PreferredEncoding encoding) + void vmovd(const Operand& op1, const Operand& op2, PreferredEncoding encoding = DefaultEncoding) { - if (isVexEncoding(encoding, 1)) { - if (!op.isREG(32) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) - uint64_t type = T_0F | T_66 | T_W0 | T_EVEX | T_N4; - int code = rev ? 
0x7E : 0x6E; - opAVX_X_X_XM(x, xm0, op, type, code); - } else { + const Operand *p1 = &op1; + const Operand *p2 = &op2; + bool rev = false; + if (p1->isMEM()) { + std::swap(p1, p2); + rev = true; } + if (p1->isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) + if (p1->isXMM()) { + std::swap(p1, p2); + rev = !rev; + } + if (getEncoding(encoding, 1) == AVX10p2Encoding) { + if ((p1->isXMM() || p1->isMEM()) && p2->isXMM()) { + opAVX_X_X_XM(*static_cast(p2), xm0, *p1, T_EVEX|(rev ? T_F3 : T_66)|T_MUST_EVEX|T_0F|T_EW0|T_N4, rev ? 0x7E : 0xD6); + return; + } + } else { + if ((p1->isREG(32) || p1->isMEM()) && p2->isXMM()) { + opAVX_X_X_XM(*static_cast(p2), xm0, *p1, T_EVEX|T_66|T_0F|T_W0|T_N4, rev ? 0x6E : 0x7E); + return; + } + } + XBYAK_THROW(ERR_BAD_COMBINATION) } /* use single byte nop if useMultiByteNop = false diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h index efd207a..cea4e61 100644 --- a/xbyak/xbyak_mnemonic.h +++ b/xbyak/xbyak_mnemonic.h @@ -1332,8 +1332,6 @@ void vmovapd(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, T_ void vmovapd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_66|T_0F|T_EW1|T_YMM|T_EVEX, 0x28); } void vmovaps(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, T_0F|T_EW0|T_YMM|T_EVEX|T_M_K, 0x29); } void vmovaps(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_0F|T_EW0|T_YMM|T_EVEX, 0x28); } -void vmovd(const Operand& op, const Xmm& x, PreferredEncoding encoding = DefaultEncoding) { opVmovd(x, op, true, encoding); } -void vmovd(const Xmm& x, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opVmovd(x, op, false, encoding); } void vmovddup(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_DUP|T_F2|T_0F|T_EW1|T_YMM|T_EVEX|T_ER_X|T_ER_Y|T_ER_Z, 0x12); } void vmovdqa(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, T_66|T_0F|T_YMM, 0x7F); } void vmovdqa(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_66|T_0F|T_YMM, 0x6F); } From 
46238d9845ff1226b029152d7c787aa661324620 Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Mon, 14 Oct 2024 19:44:10 +0900 Subject: [PATCH 11/15] [doc] update setDefaultEncoding --- doc/usage.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/doc/usage.md b/doc/usage.md index 5b25513..9636613 100644 --- a/doc/usage.md +++ b/doc/usage.md @@ -109,7 +109,7 @@ vfpclassps k5{k3}, [rax+64]{1to4}, 5 --> vfpclassps(k5|k3, yword_b [rax+64], ``` ## Selecting AVX512-VNNI, AVX-VNNI, AVX-VNNI-INT8 etc. -Some mnemonics have two types of encodings: VEX and EVEX. +Some mnemonics have some types of encodings: VEX, EVEX, AVX10.2. The functions for these mnemonics include an optional parameter as the last argument to specify the encoding. The default behavior depends on the order in which the instruction was introduced (whether VEX or EVEX came first), and can be specified using setDefaultEncoding. @@ -124,8 +124,8 @@ vpdpbusd(xm0, xm1, xm2); // VEX vmpsadbw(xm1, xm3, xm15, 3); // default encoding: VEX (AVX-VNNI) vmpsadbw(xm1, xm3, xm15, 3, VexEncoding); // same as the above vmpsadbw(xm1, xm3, xm15, 3, EvexEncoding); // EVEX (AVX10.2) -setDefaultEncoding(VexEncoding, EvexEncoding); // use 2nd argument. -vmpsadbw(xm1, xm3, xm15, 3); // EVEX +setDefaultEncoding(VexEncoding, AVX10p2Encoding); // use 2nd argument. +vmpsadbw(xm1, xm3, xm15, 3); // EVEX (AVX10.2) ``` - `setDefaultEncoding(PreferredEncoding vnniEnc = EvexEncoding, PreferredEncoding avx10Enc = VexEncoding)` @@ -133,10 +133,11 @@ Control the default encoding of mnemonics with `Xbyak::PreferredEncoding` param. 
param|vnniEnc|avx10Enc -|-|- -EvexEncoding|AVX512-VNNI|AVX10.2 VexEncoding|AVX-VNNI|AVX-VNNI-INT8 +EvexEncoding|AVX512-VNNI|- +AVX10p2Encoding|-|AVX10.2 default|EvexEncoding|VexEncoding -mnemonic|vpdpbusd, vpdpbusds, vpdpwssd, vpdpwssds|vmpsadbw, vpdpbssd, vpdpbssds, vpdpbsud, vpdpbsuds, vpdpbuud, vpdpbuuds, vpdpwsud vpdpwsuds vpdpwusd vpdpwusds vpdpwuud, vpdpwuuds +mnemonic|vpdpbusd, vpdpbusds, vpdpwssd, vpdpwssds|vmpsadbw, vpdpbssd, vpdpbssds, vpdpbsud, vpdpbsuds, vpdpbuud, vpdpbuuds, vpdpwsud vpdpwsuds vpdpwusd vpdpwusds vpdpwuud, vpdpwuuds, vmovd, vmovw ### Remark * `k1`, ..., `k7` are opmask registers. From 0c2f7fc6dbd713b2d690a5859f562746b4dd568d Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Tue, 15 Oct 2024 03:50:27 +0900 Subject: [PATCH 12/15] vmovw supports avx10.2 --- doc/usage.md | 7 ++++--- gen/gen_avx512.cpp | 7 ------- test/Makefile | 2 +- test/avx10/misc.txt | 7 +++++++ test/avx10/old.txt | 4 ---- test/test_by_xed.cpp | 2 +- xbyak/xbyak.h | 47 ++++++++++++++++++++++++++++-------------- xbyak/xbyak_mnemonic.h | 3 --- 8 files changed, 44 insertions(+), 35 deletions(-) diff --git a/doc/usage.md b/doc/usage.md index 9636613..ef38d63 100644 --- a/doc/usage.md +++ b/doc/usage.md @@ -124,7 +124,7 @@ vpdpbusd(xm0, xm1, xm2); // VEX vmpsadbw(xm1, xm3, xm15, 3); // default encoding: VEX (AVX-VNNI) vmpsadbw(xm1, xm3, xm15, 3, VexEncoding); // same as the above vmpsadbw(xm1, xm3, xm15, 3, EvexEncoding); // EVEX (AVX10.2) -setDefaultEncoding(VexEncoding, AVX10p2Encoding); // use 2nd argument. +setDefaultEncoding(VexEncoding, AVX10v2Encoding); // use 2nd argument. vmpsadbw(xm1, xm3, xm15, 3); // EVEX (AVX10.2) ``` @@ -133,9 +133,10 @@ Control the default encoding of mnemonics with `Xbyak::PreferredEncoding` param. 
param|vnniEnc|avx10Enc -|-|- -VexEncoding|AVX-VNNI|AVX-VNNI-INT8 +VexEncoding|AVX-VNNI|- EvexEncoding|AVX512-VNNI|- -AVX10p2Encoding|-|AVX10.2 +PreAVX10v2Encoding|-|AVX-VNNI-INT8, AVX512-FP16 +AVX10v2Encoding|-|AVX10.2 default|EvexEncoding|VexEncoding mnemonic|vpdpbusd, vpdpbusds, vpdpwssd, vpdpwssds|vmpsadbw, vpdpbssd, vpdpbssds, vpdpbsud, vpdpbsuds, vpdpbuud, vpdpbuuds, vpdpwsud vpdpwsuds vpdpwusd vpdpwusds vpdpwuud, vpdpwuuds, vmovd, vmovw diff --git a/gen/gen_avx512.cpp b/gen/gen_avx512.cpp index 07e68b4..e4d319e 100644 --- a/gen/gen_avx512.cpp +++ b/gen/gen_avx512.cpp @@ -264,7 +264,6 @@ void putM_X() { 0x7F, "vmovdqu32", T_F3 | T_0F | T_MUST_EVEX | T_YMM | T_EW0 | T_ER_X | T_ER_Y | T_ER_Z | T_M_K }, { 0x7F, "vmovdqu64", T_F3 | T_0F | T_MUST_EVEX | T_YMM | T_EW1 | T_ER_X | T_ER_Y | T_ER_Z | T_M_K }, { 0x11, "vmovsh", T_F3 | T_MAP5 | T_MUST_EVEX | T_EW0 | T_N2 | T_M_K }, - { 0x7E, "vmovw", T_66 | T_MAP5 | T_MUST_EVEX | T_N2 }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl *p = &tbl[i]; @@ -1079,12 +1078,6 @@ void putFP16_2() printf("void vmovsh(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, xm0, addr, %s, 0x10); }\n", s.c_str()); printf("void vmovsh(const Xmm& x1, const Xmm& x2, const Xmm& x3) { opAVX_X_X_XM(x1, x2, x3, %s, 0x10); }\n", s.c_str()); } - { - uint64_t type = T_66 | T_MAP5 | T_MUST_EVEX | T_N2; - std::string s = type2String(type); - printf("void vmovw(const Xmm& x, const Operand& op) { if (!op.isREG(32|64) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_X_XM(x, xm0, op, %s, 0x6E); }\n", s.c_str()); - printf("void vmovw(const Reg32e& r, const Xmm& x) { opAVX_X_X_XM(x, xm0, r, %s, 0x7E); }\n", s.c_str()); - } } void putFP16() diff --git a/test/Makefile b/test/Makefile index 8313a6c..cf5c716 100644 --- a/test/Makefile +++ b/test/Makefile @@ -60,7 +60,7 @@ apx: apx.cpp $(XBYAK_INC) avx10_test: avx10_test.cpp $(XBYAK_INC) $(CXX) $(CFLAGS) avx10_test.cpp -o $@ -DXBYAK64 -TEST_FILES=old.txt new-ymm.txt bf16.txt comp.txt 
convert.txt minmax.txt saturation.txt +TEST_FILES=old.txt new-ymm.txt bf16.txt comp.txt misc.txt convert.txt minmax.txt saturation.txt xed_test: @set -e; \ for target in $(addprefix avx10/, $(TEST_FILES)); do \ diff --git a/test/avx10/misc.txt b/test/avx10/misc.txt index 7c969bf..6f5c156 100644 --- a/test/avx10/misc.txt +++ b/test/avx10/misc.txt @@ -1,3 +1,4 @@ +// AVX10 integer and FP16 VNNI, media and zero-extending vdpphps(xm1, xm2, xm3); vdpphps(xm1, xm2, ptr[rax+128]); vdpphps(xm1, xm2, ptr_b[rax+128]); @@ -168,5 +169,11 @@ vpdpwuuds(zm1, zm2, ptr_b[rax+128]); // vmovd(xm10, xm20); +vmovd(xm1, xm2); vmovd(xm10, ptr[rax+128]); vmovd(ptr[rax+128], xm30); +// +vmovw(xm1, xm20); +vmovw(xm1, xm2); +vmovw(xm3, ptr [rax+0x40]); +vmovw(ptr [rax+0x40], xm7); diff --git a/test/avx10/old.txt b/test/avx10/old.txt index 9e4f097..f5a143c 100644 --- a/test/avx10/old.txt +++ b/test/avx10/old.txt @@ -355,10 +355,6 @@ vgetmantsh(xmm1|k1|T_z|T_sae, xmm3, xmm5, 0x6); vmovsh(xmm1|k1|T_z, ptr [rax+0x40]); vmovsh(ptr [rax+0x40]|k1, xmm1); vmovsh(xmm1|k2|T_z, xmm3, xmm5); -vmovw(xmm1, r13d); -vmovw(xmm3, ptr [rax+0x40]); -vmovw(r9d, xmm1); -vmovw(ptr [rax+0x40], xmm7); vcvtsd2sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3); vcvtsd2sh(xmm1, xmm2, ptr [rax+0x40]); vcvtsh2sd(xmm1|k1|T_z|T_sae, xmm2, xmm3); diff --git a/test/test_by_xed.cpp b/test/test_by_xed.cpp index 71b5137..af39296 100644 --- a/test/test_by_xed.cpp +++ b/test/test_by_xed.cpp @@ -7,7 +7,7 @@ struct Code : Xbyak::CodeGenerator { Code() : Xbyak::CodeGenerator(4096*8) { - setDefaultEncoding(EvexEncoding, AVX10p2Encoding); + setDefaultEncoding(EvexEncoding, AVX10v2Encoding); #include "tmp.cpp" } }; diff --git a/xbyak/xbyak.h b/xbyak/xbyak.h index a3d1fca..5367d83 100644 --- a/xbyak/xbyak.h +++ b/xbyak/xbyak.h @@ -1674,8 +1674,8 @@ typedef enum { DefaultEncoding, VexEncoding, EvexEncoding, - AVX512Encoding = EvexEncoding, - AVX10p2Encoding + PreAVX10v2Encoding = EvexEncoding, + AVX10v2Encoding } PreferredEncoding; class CodeGenerator 
: public CodeArray { @@ -3177,9 +3177,9 @@ public: #endif // set default encoding - // vnniEnc : control AVX512_VNNI (evex:default) or AVX-VNNI (vex) - // avx10Enc : control mpsadbw, AVX-VNNI-INT8 (vex:default) or AVX10.2 (AVX10p2Encoding) - void setDefaultEncoding(PreferredEncoding vnniEnc = EvexEncoding, PreferredEncoding avx10Enc = VexEncoding) + // vnniEnc : AVX512_VNNI (default:EvexEncoding) or AVX-VNNI (VexEncoding) + // avx10Enc : mpsadbw etc., AVX-VNNI-INT8/AVX512-FP16 (default:PreAVX10v2Encoding) or AVX10.2 (AVX10v2Encoding) + void setDefaultEncoding(PreferredEncoding vnniEnc = EvexEncoding, PreferredEncoding avx10Enc = PreAVX10v2Encoding) { defaultEncoding_[0] = vnniEnc; defaultEncoding_[1] = avx10Enc; } void bswap(const Reg32e& r) @@ -3194,7 +3194,8 @@ public: } db(0xC8 + (idx & 7)); } - void vmovd(const Operand& op1, const Operand& op2, PreferredEncoding encoding = DefaultEncoding) + // AVX10 zero-extending for vmovd, vmovw + void opAVX10ZeroExt(const Operand& op1, const Operand& op2, const uint64_t typeTbl[4], const int codeTbl[4], PreferredEncoding encoding, int bit) { const Operand *p1 = &op1; const Operand *p2 = &op2; @@ -3208,18 +3209,32 @@ public: std::swap(p1, p2); rev = !rev; } - if (getEncoding(encoding, 1) == AVX10p2Encoding) { - if ((p1->isXMM() || p1->isMEM()) && p2->isXMM()) { - opAVX_X_X_XM(*static_cast(p2), xm0, *p1, T_EVEX|(rev ? T_F3 : T_66)|T_MUST_EVEX|T_0F|T_EW0|T_N4, rev ? 0x7E : 0xD6); - return; - } + int sel = -1; + if (getEncoding(encoding, 1) == AVX10v2Encoding) { + if ((p1->isXMM() || p1->isMEM()) && p2->isXMM()) sel = 2 + int(rev); } else { - if ((p1->isREG(32) || p1->isMEM()) && p2->isXMM()) { - opAVX_X_X_XM(*static_cast(p2), xm0, *p1, T_EVEX|T_66|T_0F|T_W0|T_N4, rev ? 
0x6E : 0x7E); - return; - } + if ((p1->isREG(bit) || p1->isMEM()) && p2->isXMM()) sel = int(rev); } - XBYAK_THROW(ERR_BAD_COMBINATION) + if (sel == -1) XBYAK_THROW(ERR_BAD_COMBINATION) + opAVX_X_X_XM(*static_cast(p2), xm0, *p1, typeTbl[sel], codeTbl[sel]); + } + void vmovd(const Operand& op1, const Operand& op2, PreferredEncoding encoding = DefaultEncoding) + { + const uint64_t typeTbl[] = { + T_EVEX|T_66|T_0F|T_W0|T_N4, T_EVEX|T_66|T_0F|T_W0|T_N4, // legacy, avx, avx512 + T_MUST_EVEX|T_66|T_0F|T_EW0|T_N4, T_MUST_EVEX|T_F3|T_0F|T_EW0|T_N4, // avx10.2 + }; + const int codeTbl[] = { 0x7E, 0x6E, 0xD6, 0x7E }; + opAVX10ZeroExt(op1, op2, typeTbl, codeTbl, encoding, 32); + } + void vmovw(const Operand& op1, const Operand& op2, PreferredEncoding encoding = DefaultEncoding) + { + const uint64_t typeTbl[] = { + T_MUST_EVEX|T_66|T_MAP5|T_N2, T_MUST_EVEX|T_66|T_MAP5|T_N2, // avx512-fp16 + T_MUST_EVEX|T_F3|T_MAP5|T_EW0|T_N2, T_MUST_EVEX|T_F3|T_MAP5|T_EW0|T_N2, // avx10.2 + }; + const int codeTbl[] = { 0x7E, 0x6E, 0x7E, 0x6E }; + opAVX10ZeroExt(op1, op2, typeTbl, codeTbl, encoding, 16|32|64); } /* use single byte nop if useMultiByteNop = false diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h index cea4e61..314bb13 100644 --- a/xbyak/xbyak_mnemonic.h +++ b/xbyak/xbyak_mnemonic.h @@ -2422,9 +2422,6 @@ void vmovdqu8(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2|T_0F void vmovsh(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, T_N2|T_F3|T_MAP5|T_EW0|T_MUST_EVEX|T_M_K, 0x11); } void vmovsh(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, xm0, addr, T_N2|T_F3|T_MAP5|T_EW0|T_MUST_EVEX, 0x10); } void vmovsh(const Xmm& x1, const Xmm& x2, const Xmm& x3) { opAVX_X_X_XM(x1, x2, x3, T_N2|T_F3|T_MAP5|T_EW0|T_MUST_EVEX, 0x10); } -void vmovw(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, T_N2|T_66|T_MAP5|T_MUST_EVEX, 0x7E); } -void vmovw(const Reg32e& r, const Xmm& x) { opAVX_X_X_XM(x, xm0, r, T_N2|T_66|T_MAP5|T_MUST_EVEX, 0x7E); } 
-void vmovw(const Xmm& x, const Operand& op) { if (!op.isREG(32|64) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_X_XM(x, xm0, op, T_N2|T_66|T_MAP5|T_MUST_EVEX, 0x6E); } void vmpsadbw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F3A|T_YMM, 0x42, encoding, imm, T_66|T_W0|T_YMM, T_F3|T_0F3A|T_EW0|T_B32, 1); } void vmulnepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x59); } void vmulph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_ER_Z | T_B16, 0x59); } From ae76be35ac33f1fc1b94b866c6d85549969682a7 Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Tue, 15 Oct 2024 08:51:01 +0900 Subject: [PATCH 13/15] setDefaultEncoding has changed. --- doc/changelog.md | 1 + doc/usage.md | 10 ++++---- readme.txt | 5 ++-- test/avx10_test.cpp | 4 +-- test/test_by_xed.cpp | 2 +- xbyak/xbyak.h | 59 +++++++++++++++++++++++++------------------- 6 files changed, 46 insertions(+), 35 deletions(-) diff --git a/doc/changelog.md b/doc/changelog.md index 5e25c2d..1d39ae6 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -1,5 +1,6 @@ # History +* 2024/Oct/15 ver 7.11 Added full support for AVX10.2 * 2024/Oct/13 ver 7.10 support AVX10 integer and fp16 vnni, media new instructions. setDefaultEncoding is extended. * 2024/Oct/10 ver 7.09.1 fix the names of vpcompressb and vpcompressw * 2024/Oct/08 ver 7.09 support YMM embedded rounding of AVX10.2 and fix some mnemonics with {sae}/{er}. diff --git a/doc/usage.md b/doc/usage.md index ef38d63..9015bff 100644 --- a/doc/usage.md +++ b/doc/usage.md @@ -111,13 +111,13 @@ vfpclassps k5{k3}, [rax+64]{1to4}, 5 --> vfpclassps(k5|k3, yword_b [rax+64], ## Selecting AVX512-VNNI, AVX-VNNI, AVX-VNNI-INT8 etc. Some mnemonics have some types of encodings: VEX, EVEX, AVX10.2. 
The functions for these mnemonics include an optional parameter as the last argument to specify the encoding. -The default behavior depends on the order in which the instruction was introduced (whether VEX or EVEX came first), +The default behavior depends on the order in which the instruction was introduced (whether VEX, EVEX or AVX10.2 came first), and can be specified using setDefaultEncoding. ``` vpdpbusd(xm0, xm1, xm2); // default encoding: EVEX (AVX512-VNNI) -vpdpbusd(xm0, xm1, xm2, EvexEncoding); // same as the above -vpdpbusd(xm0, xm1, xm2, VexEncoding); // VEX (AVX-VNNI) +vpdpbusd(xm0, xm1, xm2, AVX10v2Encoding); // same as the above +vpdpbusd(xm0, xm1, xm2, PreAVXv2Encoding); // VEX (AVX-VNNI) setDefaultEncoding(VexEncoding); // default encoding is VEX vpdpbusd(xm0, xm1, xm2); // VEX @@ -128,7 +128,7 @@ setDefaultEncoding(VexEncoding, AVX10v2Encoding); // use 2nd argument. vmpsadbw(xm1, xm3, xm15, 3); // EVEX (AVX10.2) ``` -- `setDefaultEncoding(PreferredEncoding vnniEnc = EvexEncoding, PreferredEncoding avx10Enc = VexEncoding)` +- `setDefaultEncoding(PreferredEncoding vnniEnc = EvexEncoding, PreferredEncoding avx10Enc = PreAVXv2Encoding)` Control the default encoding of mnemonics with `Xbyak::PreferredEncoding` param. 
param|vnniEnc|avx10Enc @@ -137,7 +137,7 @@ VexEncoding|AVX-VNNI|- EvexEncoding|AVX512-VNNI|- PreAVX10v2Encoding|-|AVX-VNNI-INT8, AVX512-FP16 AVX10v2Encoding|-|AVX10.2 -default|EvexEncoding|VexEncoding +default|EvexEncoding|PreAVXv2Encoding mnemonic|vpdpbusd, vpdpbusds, vpdpwssd, vpdpwssds|vmpsadbw, vpdpbssd, vpdpbssds, vpdpbsud, vpdpbsuds, vpdpbuud, vpdpbuuds, vpdpwsud vpdpwsuds vpdpwusd vpdpwusds vpdpwuud, vpdpwuuds, vmovd, vmovw ### Remark diff --git a/readme.txt b/readme.txt index 417c50e..2fb242b 100644 --- a/readme.txt +++ b/readme.txt @@ -14,7 +14,7 @@ xbyak.hをインクルードするだけですぐ利用することができます。 C++の枠組み内で閉じているため、外部アセンブラは不要です。 32bit/64bit両対応です。 - 対応ニーモニック:特権命令除くx86, MMX/MMX2/SSE/SSE2/SSE3/SSSE3/SSE4/FPU(一部)/AVX/AVX2/FMA/VEX-encoded GPR + 対応ニーモニック:特権命令除くx86, MMX/MMX2/SSE/SSE2/SSE3/SSSE3/SSE4/FPU(一部)/AVX/AVX2/FMA/AVX-512/APX/AVX10.2 ・Windows Xp(32bit, 64bit), Windows 7/Linux(32bit, 64bit)/Intel Mac対応 Windows Xp, Windows 7上ではVC2008, VC2010, VC2012 @@ -46,7 +46,7 @@ Linuxではmake installで/usr/local/include/xbyakにコピーされます。 ----------------------------------------------------------------------------- ◎新機能 -APX/AVX10対応 +APX/AVX10.2対応 例外なしモード追加 XBYAK_NO_EXCEPTIONを定義してコンパイルするとgcc/clangで-fno-exceptionsオプションでコンパイルできます。 @@ -404,6 +404,7 @@ sample/{echo,hello}.bfは http://www.kmonos.net/alang/etc/brainfuck.php から ----------------------------------------------------------------------------- ◎履歴 +2024/10/15 ver 7.11 AVX10.2完全サポート 2024/10/13 ver 7.10 AVX10 integer and fp16 vnni, mediaの新命令対応. setDefaultEncodingの拡張. 
2024/10/10 ver 7.09.1 vpcompressbとvpcompresswの名前修正 2024/10/08 ver 7.09 AVX10.2のYMMレジスタの埋め込み丸め対応 diff --git a/test/avx10_test.cpp b/test/avx10_test.cpp index 5f742fe..1ceb52a 100644 --- a/test/avx10_test.cpp +++ b/test/avx10_test.cpp @@ -234,10 +234,10 @@ CYBOZU_TEST_AUTO(vmpsadbw) struct Code : Xbyak::CodeGenerator { Code() { - setDefaultEncoding(); + setDefaultEncodingAVX10(); vmpsadbw(xm1, xm3, xm15, 3); // vex(avx) vmpsadbw(ym1, ym3, ptr[rax+128], 3); // vex(avx2) - setDefaultEncoding(VexEncoding, EvexEncoding); + setDefaultEncodingAVX10(AVX10v2Encoding); vmpsadbw(ym1, ym3, ym15, 3); // evex(avx10.2) vmpsadbw(ym1, ym3, ptr[rax+128], 3); // evex(avx10.2) } diff --git a/test/test_by_xed.cpp b/test/test_by_xed.cpp index af39296..9be9199 100644 --- a/test/test_by_xed.cpp +++ b/test/test_by_xed.cpp @@ -7,7 +7,7 @@ struct Code : Xbyak::CodeGenerator { Code() : Xbyak::CodeGenerator(4096*8) { - setDefaultEncoding(EvexEncoding, AVX10v2Encoding); + setDefaultEncodingAVX10(AVX10v2Encoding); #include "tmp.cpp" } }; diff --git a/xbyak/xbyak.h b/xbyak/xbyak.h index 5367d83..b56bfb4 100644 --- a/xbyak/xbyak.h +++ b/xbyak/xbyak.h @@ -232,6 +232,7 @@ enum { ERR_CANT_USE_REX2, ERR_INVALID_DFV, ERR_INVALID_REG_IDX, + ERR_BAD_ENCODING_MODE, ERR_INTERNAL // Put it at last. 
}; @@ -290,6 +291,7 @@ inline const char *ConvertErrorToString(int err) "can't use rex2", "invalid dfv", "invalid reg index", + "bad encoding mode", "internal error" }; assert(ERR_INTERNAL + 1 == sizeof(errTbl) / sizeof(*errTbl)); @@ -1674,7 +1676,7 @@ typedef enum { DefaultEncoding, VexEncoding, EvexEncoding, - PreAVX10v2Encoding = EvexEncoding, + PreAVX10v2Encoding, AVX10v2Encoding } PreferredEncoding; @@ -2663,25 +2665,24 @@ private: if (addr.getRegExp().getIndex().getKind() != kind) XBYAK_THROW(ERR_BAD_VSIB_ADDRESSING) opVex(x, 0, addr, type, code); } - void opEncoding(const Xmm& x1, const Xmm& x2, const Operand& op, uint64_t type, int code, PreferredEncoding encoding, int imm = NONE, uint64_t typeVex = 0, uint64_t typeEvex = 0, int sel = 0) + void opEncoding(const Xmm& x1, const Xmm& x2, const Operand& op, uint64_t type, int code, PreferredEncoding enc, int imm = NONE, uint64_t typeVex = 0, uint64_t typeEvex = 0, int sel = 0) { - opAVX_X_X_XM(x1, x2, op, type | orEvexIf(encoding, typeVex, typeEvex, sel), code, imm); + opAVX_X_X_XM(x1, x2, op, type | orEvexIf(enc, typeVex, typeEvex, sel), code, imm); } - PreferredEncoding getEncoding(PreferredEncoding encoding, int sel) const + PreferredEncoding getEncoding(PreferredEncoding enc, int sel) const { - if (encoding == DefaultEncoding) { - encoding = defaultEncoding_[sel]; + if (enc == DefaultEncoding) { + enc = defaultEncoding_[sel]; } - if (encoding == EvexEncoding) { + if ((sel == 0 && enc != VexEncoding && enc != EvexEncoding) || (sel == 1 && enc != PreAVX10v2Encoding && enc != AVX10v2Encoding)) XBYAK_THROW_RET(ERR_BAD_ENCODING_MODE, VexEncoding) #ifdef XBYAK_DISABLE_AVX512 - XBYAK_THROW(ERR_EVEX_IS_INVALID) + if (enc == EvexEncoding || enc == AVX10v2Encoding) XBYAK_THROW(ERR_EVEX_IS_INVALID) #endif - } - return encoding; + return enc; } - uint64_t orEvexIf(PreferredEncoding encoding, uint64_t typeVex, uint64_t typeEvex, int sel) { - bool isVex = getEncoding(encoding, sel) == VexEncoding; - return isVex ? 
typeVex : T_MUST_EVEX | typeEvex; + uint64_t orEvexIf(PreferredEncoding enc, uint64_t typeVex, uint64_t typeEvex, int sel) { + enc = getEncoding(enc, sel); + return ((sel == 0 && enc == VexEncoding) || (sel == 1 && enc != AVX10v2Encoding)) ? typeVex : (T_MUST_EVEX | typeEvex); } void opInOut(const Reg& a, const Reg& d, uint8_t code) { @@ -3138,8 +3139,8 @@ public: #endif , isDefaultJmpNEAR_(false) { - // select avx512-vnni, vmpsadbw(avx) setDefaultEncoding(); + setDefaultEncodingAVX10(); labelMgr_.set(this); } void reset() @@ -3176,11 +3177,19 @@ public: #undef jnl #endif - // set default encoding - // vnniEnc : AVX512_VNNI (default:EvexEncoding) or AVX-VNNI (VexEncoding) - // avx10Enc : mpsadbw etc., AVX-VNNI-INT8/AVX512-FP16 (default:PreAVX10v2Encoding) or AVX10.2 (AVX10v2Encoding) - void setDefaultEncoding(PreferredEncoding vnniEnc = EvexEncoding, PreferredEncoding avx10Enc = PreAVX10v2Encoding) - { defaultEncoding_[0] = vnniEnc; defaultEncoding_[1] = avx10Enc; } + // set default encoding of VNNI + // EvexEncoding : AVX512_VNNI, VexEncoding : AVX-VNNI + void setDefaultEncoding(PreferredEncoding enc = EvexEncoding) + { + if (enc != VexEncoding && enc != EvexEncoding) XBYAK_THROW(ERR_BAD_ENCODING_MODE) + defaultEncoding_[0] = enc; + } + // default : PreferredEncoding : AVX-VNNI-INT8/AVX512-FP16 + void setDefaultEncodingAVX10(PreferredEncoding enc = PreAVX10v2Encoding) + { + if (enc != PreAVX10v2Encoding && enc != AVX10v2Encoding) XBYAK_THROW(ERR_BAD_ENCODING_MODE) + defaultEncoding_[1] = enc; + } void bswap(const Reg32e& r) { @@ -3195,7 +3204,7 @@ public: db(0xC8 + (idx & 7)); } // AVX10 zero-extending for vmovd, vmovw - void opAVX10ZeroExt(const Operand& op1, const Operand& op2, const uint64_t typeTbl[4], const int codeTbl[4], PreferredEncoding encoding, int bit) + void opAVX10ZeroExt(const Operand& op1, const Operand& op2, const uint64_t typeTbl[4], const int codeTbl[4], PreferredEncoding enc, int bit) { const Operand *p1 = &op1; const Operand *p2 = &op2; @@ 
-3210,7 +3219,7 @@ public: rev = !rev; } int sel = -1; - if (getEncoding(encoding, 1) == AVX10v2Encoding) { + if (getEncoding(enc, 1) == AVX10v2Encoding) { if ((p1->isXMM() || p1->isMEM()) && p2->isXMM()) sel = 2 + int(rev); } else { if ((p1->isREG(bit) || p1->isMEM()) && p2->isXMM()) sel = int(rev); @@ -3218,23 +3227,23 @@ public: if (sel == -1) XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_X_XM(*static_cast(p2), xm0, *p1, typeTbl[sel], codeTbl[sel]); } - void vmovd(const Operand& op1, const Operand& op2, PreferredEncoding encoding = DefaultEncoding) + void vmovd(const Operand& op1, const Operand& op2, PreferredEncoding enc = DefaultEncoding) { const uint64_t typeTbl[] = { T_EVEX|T_66|T_0F|T_W0|T_N4, T_EVEX|T_66|T_0F|T_W0|T_N4, // legacy, avx, avx512 T_MUST_EVEX|T_66|T_0F|T_EW0|T_N4, T_MUST_EVEX|T_F3|T_0F|T_EW0|T_N4, // avx10.2 }; const int codeTbl[] = { 0x7E, 0x6E, 0xD6, 0x7E }; - opAVX10ZeroExt(op1, op2, typeTbl, codeTbl, encoding, 32); + opAVX10ZeroExt(op1, op2, typeTbl, codeTbl, enc, 32); } - void vmovw(const Operand& op1, const Operand& op2, PreferredEncoding encoding = DefaultEncoding) + void vmovw(const Operand& op1, const Operand& op2, PreferredEncoding enc = DefaultEncoding) { const uint64_t typeTbl[] = { T_MUST_EVEX|T_66|T_MAP5|T_N2, T_MUST_EVEX|T_66|T_MAP5|T_N2, // avx512-fp16 T_MUST_EVEX|T_F3|T_MAP5|T_EW0|T_N2, T_MUST_EVEX|T_F3|T_MAP5|T_EW0|T_N2, // avx10.2 }; const int codeTbl[] = { 0x7E, 0x6E, 0x7E, 0x6E }; - opAVX10ZeroExt(op1, op2, typeTbl, codeTbl, encoding, 16|32|64); + opAVX10ZeroExt(op1, op2, typeTbl, codeTbl, enc, 16|32|64); } /* use single byte nop if useMultiByteNop = false From c3a5c4ba3d47d8f16e13b21cf5c2353936de8a3f Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Tue, 15 Oct 2024 09:54:47 +0900 Subject: [PATCH 14/15] [doc] update doc --- doc/changelog.md | 1 + doc/usage.md | 103 ++++++++++++++++++++++++++--------------------- readme.txt | 1 + 3 files changed, 59 insertions(+), 46 deletions(-) diff --git a/doc/changelog.md 
b/doc/changelog.md index 1d39ae6..1461f6e 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -1,5 +1,6 @@ # History +* 2024/Oct/15 ver 7.20 Fixed the specification of setDefaultEncoding, setDefaultEncodingAVX10. * 2024/Oct/15 ver 7.11 Added full support for AVX10.2 * 2024/Oct/13 ver 7.10 support AVX10 integer and fp16 vnni, media new instructions. setDefaultEncoding is extended. * 2024/Oct/10 ver 7.09.1 fix the names of vpcompressb and vpcompressw diff --git a/doc/usage.md b/doc/usage.md index 9015bff..dcb3e10 100644 --- a/doc/usage.md +++ b/doc/usage.md @@ -1,7 +1,7 @@ # Usage Inherit `Xbyak::CodeGenerator` class and make the class method. -``` +```cpp #include struct Code : Xbyak::CodeGenerator { @@ -13,7 +13,7 @@ struct Code : Xbyak::CodeGenerator { }; ``` Or you can pass the instance of CodeGenerator without inheriting. -``` +```cpp void genCode(Xbyak::CodeGenerator& code, int x) { using namespace Xbyak::util; code.mov(eax, x); @@ -23,7 +23,7 @@ void genCode(Xbyak::CodeGenerator& code, int x) { Make an instance of the class and get the function pointer by calling `getCode()` and call it. -``` +```cpp Code c(5); int (*f)() = c.getCode(); printf("ret=%d\n", f()); // ret = 5 @@ -32,7 +32,7 @@ printf("ret=%d\n", f()); // ret = 5 ## Syntax Similar to MASM/NASM syntax with parentheses. -``` +```cpp NASM Xbyak mov eax, ebx --> mov(eax, ebx); inc ecx inc(ecx); @@ -43,7 +43,7 @@ ret --> ret(); Use `qword`, `dword`, `word` and `byte` if it is necessary to specify the size of memory, otherwise use `ptr`. -``` +```cpp (ptr|qword|dword|word|byte) [base + index * (1|2|4|8) + displacement] [rip + 32bit disp] ; x64 only @@ -53,19 +53,21 @@ mov al, [ebx+ecx] --> mov(al, ptr [ebx + ecx]); test byte [esp], 4 --> test(byte [esp], 4); inc qword [rax] --> inc(qword [rax]); ``` + **Note**: `qword`, ... are member variables, then don't use `dword` as unsigned int type. 
### How to use Selector (Segment Register) -``` +```cpp mov eax, [fs:eax] --> putSeg(fs); mov(eax, ptr [eax]); mov ax, cs --> mov(ax, cs); ``` + **Note**: Segment class is not derived from `Operand`. ## AVX -``` +```cpp vaddps(xmm1, xmm2, xmm3); // xmm1 <- xmm2 + xmm3 vaddps(xmm2, xmm3, ptr [rax]); // use ptr to access memory vgatherdpd(xmm1, ptr [ebp + 256 + xmm2*4], xmm3); @@ -74,13 +76,13 @@ vgatherdpd(xmm1, ptr [ebp + 256 + xmm2*4], xmm3); **Note**: If `XBYAK_ENABLE_OMITTED_OPERAND` is defined, then you can use two operand version for backward compatibility. But the newer version will not support it. -``` +```cpp vaddps(xmm2, xmm3); // xmm2 <- xmm2 + xmm3 ``` ## AVX-512 -``` +```cpp vaddpd zmm2, zmm5, zmm30 --> vaddpd(zmm2, zmm5, zmm30); vaddpd xmm30, xmm20, [rax] --> vaddpd(xmm30, xmm20, ptr [rax]); vaddps xmm30, xmm20, [rax] --> vaddps(xmm30, xmm20, ptr [rax]); @@ -108,37 +110,44 @@ vfpclasspd k5{k3}, [rax+64]{1to2}, 5 --> vfpclasspd(k5|k3, xword_b [rax+64], vfpclassps k5{k3}, [rax+64]{1to4}, 5 --> vfpclassps(k5|k3, yword_b [rax+64], 5); // broadcast 64-bit to 256-bit ``` -## Selecting AVX512-VNNI, AVX-VNNI, AVX-VNNI-INT8 etc. +## Selecting AVX512-VNNI, AVX-VNNI, AVX-VNNI-INT8, AVX10.2. Some mnemonics have some types of encodings: VEX, EVEX, AVX10.2. The functions for these mnemonics include an optional parameter as the last argument to specify the encoding. The default behavior depends on the order in which the instruction was introduced (whether VEX, EVEX or AVX10.2 came first), and can be specified using setDefaultEncoding. 
-``` +```cpp vpdpbusd(xm0, xm1, xm2); // default encoding: EVEX (AVX512-VNNI) -vpdpbusd(xm0, xm1, xm2, AVX10v2Encoding); // same as the above -vpdpbusd(xm0, xm1, xm2, PreAVXv2Encoding); // VEX (AVX-VNNI) -setDefaultEncoding(VexEncoding); // default encoding is VEX +vpdpbusd(xm0, xm1, xm2, EvexEncoding); // same as the above +vpdpbusd(xm0, xm1, xm2, VexEncoding); // VEX (AVX-VNNI) +setDefaultEncoding(VexEncoding); // change default encoding vpdpbusd(xm0, xm1, xm2); // VEX -vmpsadbw(xm1, xm3, xm15, 3); // default encoding: VEX (AVX-VNNI) -vmpsadbw(xm1, xm3, xm15, 3, VexEncoding); // same as the above -vmpsadbw(xm1, xm3, xm15, 3, EvexEncoding); // EVEX (AVX10.2) -setDefaultEncoding(VexEncoding, AVX10v2Encoding); // use 2nd argument. -vmpsadbw(xm1, xm3, xm15, 3); // EVEX (AVX10.2) +vmpsadbw(xm1, xm3, xm15, 3); // default encoding: AVX +vmpsadbw(xm1, xm3, xm15, 3, PreAVX10v2Encoding); // same as the above +vmpsadbw(xm1, xm3, xm15, 3, AVX10v2Encoding); // AVX10.2 +setDefaultEncodingAVX10(AVX10v2Encoding); // change default encoding +vmpsadbw(xm1, xm3, xm15, 3); // AVX10.2 ``` -- `setDefaultEncoding(PreferredEncoding vnniEnc = EvexEncoding, PreferredEncoding avx10Enc = PreAVXv2Encoding)` -Control the default encoding of mnemonics with `Xbyak::PreferredEncoding` param. +- `setDefaultEncoding(PreferredEncoding enc = EvexEncoding)` + - Configure encoding for AVX512-VNNI or AVX-VNNI instructions. +- `setDefaultEncodingAVX10(PreferredEncoding enc = PreAVX10v2Encoding)` + - Configure encoding for pre-AVX10.2 and AVX10.2 instructions. 
-param|vnniEnc|avx10Enc +`setDefaultEncoding`|EvexEncoding (default)|VexEncoding -|-|- -VexEncoding|AVX-VNNI|- -EvexEncoding|AVX512-VNNI|- -PreAVX10v2Encoding|-|AVX-VNNI-INT8, AVX512-FP16 -AVX10v2Encoding|-|AVX10.2 -default|EvexEncoding|PreAVXv2Encoding -mnemonic|vpdpbusd, vpdpbusds, vpdpwssd, vpdpwssds|vmpsadbw, vpdpbssd, vpdpbssds, vpdpbsud, vpdpbsuds, vpdpbuud, vpdpbuuds, vpdpwsud vpdpwsuds vpdpwusd vpdpwusds vpdpwuud, vpdpwuuds, vmovd, vmovw +feature|AVX512-VNNI|AVX-VNNI + +- Target functions: vpdpbusd, vpdpbusds, vpdpwssd, vpdpwssds + +`setDefaultEncodingAVX10`|PreAVX10v2Encoding (default)|AVX10v2Encoding +-|-|- +feature|AVX-VNNI-INT8, AVX512-FP16|AVX10.2 + +- Target functions: vmpsadbw, vpdpbssd, vpdpbssds, vpdpbsud, vpdpbsuds, vpdpbuud, vpdpbuuds, vpdpwsud, vpdpwsuds, vpdpwusd, vpdpwusds, vpdpwuud, vpdpwuuds, vmovd, vmovw + +- Remark: vmovd and vmovw have several kinds of encodings such as AVX/AVX512F/AVX512-FP16/AVX10.2. ### Remark * `k1`, ..., `k7` are opmask registers. @@ -181,7 +190,7 @@ mnemonic|vpdpbusd, vpdpbusds, vpdpwssd, vpdpwssds|vmpsadbw, vpdpbssd, vpdpbssds, Two kinds of Label are supported. (String literal and Label class). ### String literal -``` +```cpp L("L1"); jmp("L1"); @@ -203,7 +212,7 @@ L("L3"); ### Support `@@`, `@f`, `@b` like MASM -``` +```cpp L("@@"); // jmp("@b"); // jmp to jmp("@f"); // jmp to @@ -219,7 +228,7 @@ Label symbols beginning with a period between `inLocalLabel()` and `outLocalLabe are treated as a local label. `inLocalLabel()` and `outLocalLabel()` can be nested. -``` +```cpp void func1() { inLocalLabel(); @@ -242,7 +251,7 @@ void func1() Xbyak deals with jump mnemonics of an undefined label as short jump if no type is specified. So if the size between jmp and label is larger than 127 byte, then xbyak will cause an error. -``` +```cpp jmp("short-jmp"); // short jmp // small code L("short-jmp"); @@ -251,14 +260,16 @@ jmp("long-jmp"); // long code L("long-jmp"); // throw exception ``` + Then specify T_NEAR for jmp. 
-``` +```cpp jmp("long-jmp", T_NEAR); // long jmp // long code L("long-jmp"); ``` + Or call `setDefaultJmpNEAR(true);` once, then the default type is set to T_NEAR. -``` +```cpp jmp("long-jmp"); // long jmp // long code L("long-jmp"); @@ -268,7 +279,7 @@ L("long-jmp"); `L()` and `jxx()` support Label class. -``` +```cpp Xbyak::Label label1, label2; L(label1); ... @@ -280,7 +291,7 @@ L(label2); ``` Use `putL` for jmp table -``` +```cpp Label labelTbl, L0, L1, L2; mov(rax, labelTbl); // rdx is an index of jump table @@ -297,7 +308,7 @@ L(L1); `assignL(dstLabel, srcLabel)` binds dstLabel with srcLabel. -``` +```cpp Label label2; Label label1 = L(); // make label1 ; same to Label label1; L(label1); ... @@ -312,7 +323,7 @@ The `jmp` in the above code jumps to label1 assigned by `assignL`. * dstLabel must not be used in `L()`. `Label::getAddress()` returns the address specified by the label instance and 0 if not specified. -``` +```cpp // not AutoGrow mode Label label; assert(label.getAddress() == 0); @@ -321,7 +332,7 @@ assert(label.getAddress() == getCurr()); ``` ### Rip ; relative addressing -``` +```cpp Label label; mov(eax, ptr [rip + label]); // eax = 4 ... @@ -329,7 +340,7 @@ mov(eax, ptr [rip + label]); // eax = 4 L(label); dd(4); ``` -``` +```cpp int x; ... mov(eax, ptr[rip + &x]); // throw exception if the difference between &x and current position is larger than 2GiB @@ -340,13 +351,13 @@ int x; Use `word|dword|qword` instead of `ptr` to specify the address size. ### 32 bit mode -``` +```cpp jmp(word[eax], T_FAR); // jmp m16:16(FF /5) jmp(dword[eax], T_FAR); // jmp m16:32(FF /5) ``` ### 64 bit mode -``` +```cpp jmp(word[rax], T_FAR); // jmp m16:16(FF /5) jmp(dword[rax], T_FAR); // jmp m16:32(FF /5) jmp(qword[rax], T_FAR); // jmp m16:64(REX.W FF /5) @@ -357,7 +368,7 @@ The same applies to `call`. The default max code size is 4096 bytes. Specify the size in constructor of `CodeGenerator()` if necessary. 
-``` +```cpp class Quantize : public Xbyak::CodeGenerator { public: Quantize() @@ -374,7 +385,7 @@ You can make jit code on prepared memory. Call `setProtectModeRE` yourself to change memory mode if using the prepared memory. -``` +```cpp uint8_t alignas(4096) buf[8192]; // C++11 or later struct Code : Xbyak::CodeGenerator { @@ -400,7 +411,7 @@ int main() The memory region for jit is automatically extended if necessary when `AutoGrow` is specified in a constructor of `CodeGenerator`. Call `ready()` or `readyRE()` before calling `getCode()` to fix jump address. -``` +```cpp struct Code : Xbyak::CodeGenerator { Code() : Xbyak::CodeGenerator(, Xbyak::AutoGrow) @@ -421,7 +432,7 @@ Xbyak set Read/Write/Exec mode to memory to run jit code. If you want to use Read/Exec mode for security, then specify `DontSetProtectRWE` for `CodeGenerator` and call `setProtectModeRE()` after generating jit code. -``` +```cpp struct Code : Xbyak::CodeGenerator { Code() : Xbyak::CodeGenerator(4096, Xbyak::DontSetProtectRWE) diff --git a/readme.txt b/readme.txt index 2fb242b..44a7937 100644 --- a/readme.txt +++ b/readme.txt @@ -404,6 +404,7 @@ sample/{echo,hello}.bfは http://www.kmonos.net/alang/etc/brainfuck.php から ----------------------------------------------------------------------------- ◎履歴 +2024/10/15 ver 7.20 setDefaultEncoding/setDefaultEncodingAVX10の仕様確定 2024/10/15 ver 7.11 AVX10.2完全サポート 2024/10/13 ver 7.10 AVX10 integer and fp16 vnni, mediaの新命令対応. setDefaultEncodingの拡張. 
2024/10/10 ver 7.09.1 vpcompressbとvpcompresswの名前修正 From 2d70c949056ef78d0ffe9b7231544fdab6c3fdc0 Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Tue, 15 Oct 2024 09:57:28 +0900 Subject: [PATCH 15/15] v7.20 --- CMakeLists.txt | 2 +- meson.build | 2 +- readme.md | 6 +++--- readme.txt | 2 +- xbyak/xbyak.h | 2 +- xbyak/xbyak_mnemonic.h | 2 +- 6 files changed, 8 insertions(+), 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 72dad78..3ded27c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,6 @@ cmake_minimum_required(VERSION 3.5) -project(xbyak LANGUAGES CXX VERSION 7.10) +project(xbyak LANGUAGES CXX VERSION 7.20) file(GLOB headers xbyak/*.h) diff --git a/meson.build b/meson.build index 3fb5e51..b69a379 100644 --- a/meson.build +++ b/meson.build @@ -5,7 +5,7 @@ project( 'xbyak', 'cpp', - version: '7.10', + version: '7.20', license: 'BSD-3-Clause', default_options: 'b_ndebug=if-release' ) diff --git a/readme.md b/readme.md index 49f0a9d..90d2934 100644 --- a/readme.md +++ b/readme.md @@ -1,5 +1,5 @@ -# Xbyak 7.10 [![Badge Build]][Build Status] +# Xbyak 7.20 [![Badge Build]][Build Status] *A C++ JIT assembler for x86 (IA32), x64 (AMD64, x86-64)* @@ -20,8 +20,7 @@ It is named from a Japanese word [開闢](https://translate.google.com/?hl=ja&sl - header file only - Intel/MASM like syntax -- fully support AVX-512 -- support APX/AVX10 +- Full support for AVX-512, APX, and AVX10.2 **Note**: Use `and_()`, `or_()`, ... instead of `and()`, `or()`. @@ -33,6 +32,7 @@ If you want to use them, then specify `-fno-operator-names` option to gcc/clang. 
### News +- support AVX10.2 - support xresldtrk/xsusldtrk - support RAO-INT for APX - support AVX10 detection, AESKLE, WIDE_KL, KEYLOCKER, KEYLOCKER_WIDE diff --git a/readme.txt b/readme.txt index 44a7937..65527f3 100644 --- a/readme.txt +++ b/readme.txt @@ -1,5 +1,5 @@ - C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 7.10 + C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 7.20 ----------------------------------------------------------------------------- ◎概要 diff --git a/xbyak/xbyak.h b/xbyak/xbyak.h index b56bfb4..c0bd83e 100644 --- a/xbyak/xbyak.h +++ b/xbyak/xbyak.h @@ -155,7 +155,7 @@ namespace Xbyak { enum { DEFAULT_MAX_CODE_SIZE = 4096, - VERSION = 0x7100 /* 0xABCD = A.BC(.D) */ + VERSION = 0x7200 /* 0xABCD = A.BC(.D) */ }; #ifndef MIE_INTEGER_TYPE_DEFINED diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h index 314bb13..087db03 100644 --- a/xbyak/xbyak_mnemonic.h +++ b/xbyak/xbyak_mnemonic.h @@ -1,4 +1,4 @@ -const char *getVersionString() const { return "7.10"; } +const char *getVersionString() const { return "7.20"; } void aadd(const Address& addr, const Reg32e &reg) { opMR(addr, reg, T_0F38, 0x0FC, T_APX); } void aand(const Address& addr, const Reg32e &reg) { opMR(addr, reg, T_0F38|T_66, 0x0FC, T_APX|T_66); } void adc(const Operand& op, uint32_t imm) { opOI(op, imm, 0x10, 2); }