diff --git a/crates/core_arch/src/x86/avx2.rs b/crates/core_arch/src/x86/avx2.rs
index e0c7fcfab6..081609ecea 100644
--- a/crates/core_arch/src/x86/avx2.rs
+++ b/crates/core_arch/src/x86/avx2.rs
@@ -2585,44 +2585,52 @@ pub unsafe fn _mm256_slli_si256<const IMM8: i32>(a: __m256i) -> __m256i {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_bslli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
     static_assert_imm8!(IMM8);
+    const fn mask(shift: i32, i: u32) -> u32 {
+        let shift = shift as u32 & 0xff;
+        if shift > 15 || i % 16 < shift {
+            0
+        } else {
+            32 + (i - shift)
+        }
+    }
     let a = a.as_i8x32();
     let zero = _mm256_setzero_si256().as_i8x32();
     let r: i8x32 = simd_shuffle32!(
         zero,
         a,
         [
-            32 - (IMM8 as u32 & 0xff),
-            33 - (IMM8 as u32 & 0xff),
-            34 - (IMM8 as u32 & 0xff),
-            35 - (IMM8 as u32 & 0xff),
-            36 - (IMM8 as u32 & 0xff),
-            37 - (IMM8 as u32 & 0xff),
-            38 - (IMM8 as u32 & 0xff),
-            39 - (IMM8 as u32 & 0xff),
-            40 - (IMM8 as u32 & 0xff),
-            41 - (IMM8 as u32 & 0xff),
-            42 - (IMM8 as u32 & 0xff),
-            43 - (IMM8 as u32 & 0xff),
-            44 - (IMM8 as u32 & 0xff),
-            45 - (IMM8 as u32 & 0xff),
-            46 - (IMM8 as u32 & 0xff),
-            47 - (IMM8 as u32 & 0xff),
-            48 - (IMM8 as u32 & 0xff) - 16,
-            49 - (IMM8 as u32 & 0xff) - 16,
-            50 - (IMM8 as u32 & 0xff) - 16,
-            51 - (IMM8 as u32 & 0xff) - 16,
-            52 - (IMM8 as u32 & 0xff) - 16,
-            53 - (IMM8 as u32 & 0xff) - 16,
-            54 - (IMM8 as u32 & 0xff) - 16,
-            55 - (IMM8 as u32 & 0xff) - 16,
-            56 - (IMM8 as u32 & 0xff) - 16,
-            57 - (IMM8 as u32 & 0xff) - 16,
-            58 - (IMM8 as u32 & 0xff) - 16,
-            59 - (IMM8 as u32 & 0xff) - 16,
-            60 - (IMM8 as u32 & 0xff) - 16,
-            61 - (IMM8 as u32 & 0xff) - 16,
-            62 - (IMM8 as u32 & 0xff) - 16,
-            63 - (IMM8 as u32 & 0xff) - 16,
+            mask(IMM8, 0),
+            mask(IMM8, 1),
+            mask(IMM8, 2),
+            mask(IMM8, 3),
+            mask(IMM8, 4),
+            mask(IMM8, 5),
+            mask(IMM8, 6),
+            mask(IMM8, 7),
+            mask(IMM8, 8),
+            mask(IMM8, 9),
+            mask(IMM8, 10),
+            mask(IMM8, 11),
+            mask(IMM8, 12),
+            mask(IMM8, 13),
+            mask(IMM8, 14),
+            mask(IMM8, 15),
+            mask(IMM8, 16),
+            mask(IMM8, 17),
+            mask(IMM8, 18),
+            mask(IMM8, 19),
+            mask(IMM8, 20),
+            mask(IMM8, 21),
+            mask(IMM8, 22),
+            mask(IMM8, 23),
+            mask(IMM8, 24),
+            mask(IMM8, 25),
+            mask(IMM8, 26),
+            mask(IMM8, 27),
+            mask(IMM8, 28),
+            mask(IMM8, 29),
+            mask(IMM8, 30),
+            mask(IMM8, 31),
         ],
     );
     transmute(r)
diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs
index 8c2c9a2058..4834f19edd 100644
--- a/crates/core_arch/src/x86/avx512bw.rs
+++ b/crates/core_arch/src/x86/avx512bw.rs
@@ -8873,76 +8873,84 @@ pub unsafe fn _mm_maskz_cvtepu8_epi16(k: __mmask8, a: __m128i) -> __m128i {
 #[rustc_legacy_const_generics(1)]
 pub unsafe fn _mm512_bslli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
     static_assert_imm8!(IMM8);
+    const fn mask(shift: i32, i: u32) -> u32 {
+        let shift = shift as u32 & 0xff;
+        if shift > 15 || i % 16 < shift {
+            0
+        } else {
+            64 + (i - shift)
+        }
+    }
     let a = a.as_i8x64();
     let zero = _mm512_setzero_si512().as_i8x64();
     let r: i8x64 = simd_shuffle64!(
         zero,
         a,
         [
-            64 - (IMM8 as u32 & 0xff),
-            65 - (IMM8 as u32 & 0xff),
-            66 - (IMM8 as u32 & 0xff),
-            67 - (IMM8 as u32 & 0xff),
-            68 - (IMM8 as u32 & 0xff),
-            69 - (IMM8 as u32 & 0xff),
-            70 - (IMM8 as u32 & 0xff),
-            71 - (IMM8 as u32 & 0xff),
-            72 - (IMM8 as u32 & 0xff),
-            73 - (IMM8 as u32 & 0xff),
-            74 - (IMM8 as u32 & 0xff),
-            75 - (IMM8 as u32 & 0xff),
-            76 - (IMM8 as u32 & 0xff),
-            77 - (IMM8 as u32 & 0xff),
-            78 - (IMM8 as u32 & 0xff),
-            79 - (IMM8 as u32 & 0xff),
-            80 - (IMM8 as u32 & 0xff) - 16,
-            81 - (IMM8 as u32 & 0xff) - 16,
-            82 - (IMM8 as u32 & 0xff) - 16,
-            83 - (IMM8 as u32 & 0xff) - 16,
-            84 - (IMM8 as u32 & 0xff) - 16,
-            85 - (IMM8 as u32 & 0xff) - 16,
-            86 - (IMM8 as u32 & 0xff) - 16,
-            87 - (IMM8 as u32 & 0xff) - 16,
-            88 - (IMM8 as u32 & 0xff) - 16,
-            89 - (IMM8 as u32 & 0xff) - 16,
-            90 - (IMM8 as u32 & 0xff) - 16,
-            91 - (IMM8 as u32 & 0xff) - 16,
-            92 - (IMM8 as u32 & 0xff) - 16,
-            93 - (IMM8 as u32 & 0xff) - 16,
-            94 - (IMM8 as u32 & 0xff) - 16,
-            95 - (IMM8 as u32 & 0xff) - 16,
-            96 - (IMM8 as u32 & 0xff) - 32,
-            97 - (IMM8 as u32 & 0xff) - 32,
-            98 - (IMM8 as u32 & 0xff) - 32,
-            99 - (IMM8 as u32 & 0xff) - 32,
-            100 - (IMM8 as u32 & 0xff) - 32,
-            101 - (IMM8 as u32 & 0xff) - 32,
-            102 - (IMM8 as u32 & 0xff) - 32,
-            103 - (IMM8 as u32 & 0xff) - 32,
-            104 - (IMM8 as u32 & 0xff) - 32,
-            105 - (IMM8 as u32 & 0xff) - 32,
-            106 - (IMM8 as u32 & 0xff) - 32,
-            107 - (IMM8 as u32 & 0xff) - 32,
-            108 - (IMM8 as u32 & 0xff) - 32,
-            109 - (IMM8 as u32 & 0xff) - 32,
-            110 - (IMM8 as u32 & 0xff) - 32,
-            111 - (IMM8 as u32 & 0xff) - 32,
-            112 - (IMM8 as u32 & 0xff) - 48,
-            113 - (IMM8 as u32 & 0xff) - 48,
-            114 - (IMM8 as u32 & 0xff) - 48,
-            115 - (IMM8 as u32 & 0xff) - 48,
-            116 - (IMM8 as u32 & 0xff) - 48,
-            117 - (IMM8 as u32 & 0xff) - 48,
-            118 - (IMM8 as u32 & 0xff) - 48,
-            119 - (IMM8 as u32 & 0xff) - 48,
-            120 - (IMM8 as u32 & 0xff) - 48,
-            121 - (IMM8 as u32 & 0xff) - 48,
-            122 - (IMM8 as u32 & 0xff) - 48,
-            123 - (IMM8 as u32 & 0xff) - 48,
-            124 - (IMM8 as u32 & 0xff) - 48,
-            125 - (IMM8 as u32 & 0xff) - 48,
-            126 - (IMM8 as u32 & 0xff) - 48,
-            127 - (IMM8 as u32 & 0xff) - 48,
+            mask(IMM8, 0),
+            mask(IMM8, 1),
+            mask(IMM8, 2),
+            mask(IMM8, 3),
+            mask(IMM8, 4),
+            mask(IMM8, 5),
+            mask(IMM8, 6),
+            mask(IMM8, 7),
+            mask(IMM8, 8),
+            mask(IMM8, 9),
+            mask(IMM8, 10),
+            mask(IMM8, 11),
+            mask(IMM8, 12),
+            mask(IMM8, 13),
+            mask(IMM8, 14),
+            mask(IMM8, 15),
+            mask(IMM8, 16),
+            mask(IMM8, 17),
+            mask(IMM8, 18),
+            mask(IMM8, 19),
+            mask(IMM8, 20),
+            mask(IMM8, 21),
+            mask(IMM8, 22),
+            mask(IMM8, 23),
+            mask(IMM8, 24),
+            mask(IMM8, 25),
+            mask(IMM8, 26),
+            mask(IMM8, 27),
+            mask(IMM8, 28),
+            mask(IMM8, 29),
+            mask(IMM8, 30),
+            mask(IMM8, 31),
+            mask(IMM8, 32),
+            mask(IMM8, 33),
+            mask(IMM8, 34),
+            mask(IMM8, 35),
+            mask(IMM8, 36),
+            mask(IMM8, 37),
+            mask(IMM8, 38),
+            mask(IMM8, 39),
+            mask(IMM8, 40),
+            mask(IMM8, 41),
+            mask(IMM8, 42),
+            mask(IMM8, 43),
+            mask(IMM8, 44),
+            mask(IMM8, 45),
+            mask(IMM8, 46),
+            mask(IMM8, 47),
+            mask(IMM8, 48),
+            mask(IMM8, 49),
+            mask(IMM8, 50),
+            mask(IMM8, 51),
+            mask(IMM8, 52),
+            mask(IMM8, 53),
+            mask(IMM8, 54),
+            mask(IMM8, 55),
+            mask(IMM8, 56),
+            mask(IMM8, 57),
+            mask(IMM8, 58),
+            mask(IMM8, 59),
+            mask(IMM8, 60),
+            mask(IMM8, 61),
+            mask(IMM8, 62),
+            mask(IMM8, 63),
         ],
     );
     transmute(r)
diff --git a/crates/core_arch/src/x86/sse2.rs b/crates/core_arch/src/x86/sse2.rs
index 2f3e719668..307dae77d2 100644
--- a/crates/core_arch/src/x86/sse2.rs
+++ b/crates/core_arch/src/x86/sse2.rs
@@ -425,10 +425,11 @@ pub unsafe fn _mm_slli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
 #[target_feature(enable = "sse2")]
 unsafe fn _mm_slli_si128_impl<const IMM8: i32>(a: __m128i) -> __m128i {
     const fn mask(shift: i32, i: u32) -> u32 {
-        if (shift as u32) > 15 {
+        let shift = shift as u32 & 0xff;
+        if shift > 15 {
             i
         } else {
-            16 - (shift as u32) + i
+            16 - shift + i
         }
     }
     let zero = _mm_set1_epi8(0).as_i8x16();
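
For reference, here is a minimal standalone sketch of the new `mask` helper from the avx2 hunk above (the `mask` and `main` items below are copied/invented for this sketch only, not part of the patch). Each output byte `i` of the 256-bit result is filled from lane `mask(IMM8, i)` of the 64-lane concatenation [zero: lanes 0..=31, a: lanes 32..=63], so returning 0 selects a zero byte and `32 + (i - shift)` selects byte `i - shift` of `a` within the same 128-bit half:

    // Standalone copy of the avx2 `mask` helper, for experimenting with
    // the shuffle-index table it generates (no SIMD needed).
    const fn mask(shift: i32, i: u32) -> u32 {
        let shift = shift as u32 & 0xff;
        if shift > 15 || i % 16 < shift {
            0 // shifted-in byte (or out-of-range shift): pick a zero lane
        } else {
            32 + (i - shift) // pick byte `i - shift` of `a` in this half
        }
    }

    fn main() {
        // IMM8 = 3: the low 3 bytes of each 128-bit half are zero, the rest
        // are `a`'s bytes shifted up by 3 within that half.
        let idx: Vec<u32> = (0..32).map(|i| mask(3, i)).collect();
        assert_eq!(idx[..6], [0, 0, 0, 32, 33, 34]);
        assert_eq!(idx[16..20], [0, 0, 0, 48]); // second half restarts at a[16]

        // IMM8 >= 16 now cleanly zeroes the whole vector; the old expressions
        // such as `32 - (IMM8 as u32 & 0xff)` could underflow for large IMM8
        // and yield out-of-range shuffle indices.
        assert!((0..32).map(|i| mask(16, i)).all(|m| m == 0));
    }

The avx512bw and sse2 hunks follow the same pattern, differing only in the lane offset of `a` (64 and 16 lanes respectively) and, for sse2, in returning `i` (a zero lane) rather than 0 because that shuffle's zero vector occupies the low lanes.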