
Commit 2c4d880

gwenn authored and alexcrichton committed
sse2: implements last remaining intrinsics (#244)
* sse2: __m64 related intrinsics: _mm_add_si64, _mm_mul_su32, _mm_sub_si64, _mm_cvtpi32_pd, _mm_set_epi64, _mm_set1_epi64, _mm_setr_epi64
* sse2: _mm_load_sd, _mm_loadh_pd, _mm_loadl_pd
* sse2: _mm_store_sd, _mm_storeh_pd, _mm_storel_pd
* sse2: _mm_shuffle_pd, _mm_move_sd
* sse2: _mm_cast* intrinsics: _mm_castpd_ps, _mm_castpd_si128, _mm_castps_pd, _mm_castps_si128, _mm_castsi128_pd, _mm_castsi128_ps
* sse2: add some tests
* Try to fix AppVeyor build
* sse2: add more tests
* sse2: fix assert_instr for _mm_shuffle_pd
* Try to fix Travis build
* sse2: try to fix AppVeyor build
* sse2: try to fix AppVeyor build
1 parent 8550a9c commit 2c4d880

File tree

2 files changed: +380 -1 lines changed

coresimd/src/x86/i586/sse2.rs

Lines changed: 245 additions & 0 deletions
@@ -1865,6 +1865,35 @@ pub unsafe fn _mm_load_pd(mem_addr: *const f64) -> f64x2 {
     *(mem_addr as *const f64x2)
 }

+/// Loads a 64-bit double-precision value to the low element of a
+/// 128-bit vector of [2 x double] and clears the upper element.
+#[inline(always)]
+#[target_feature = "+sse2"]
+#[cfg_attr(test, assert_instr(movsd))]
+pub unsafe fn _mm_load_sd(mem_addr: *const f64) -> f64x2 {
+    f64x2::new(*mem_addr, 0.)
+}
+
+/// Loads a double-precision value into the high-order bits of a 128-bit
+/// vector of [2 x double]. The low-order bits are copied from the low-order
+/// bits of the first operand.
+#[inline(always)]
+#[target_feature = "+sse2"]
+#[cfg_attr(test, assert_instr(movhpd))]
+pub unsafe fn _mm_loadh_pd(a: f64x2, mem_addr: *const f64) -> f64x2 {
+    f64x2::new(a.extract(0), *mem_addr)
+}
+
+/// Loads a double-precision value into the low-order bits of a 128-bit
+/// vector of [2 x double]. The high-order bits are copied from the
+/// high-order bits of the first operand.
+#[inline(always)]
+#[target_feature = "+sse2"]
+#[cfg_attr(test, assert_instr(movlpd))]
+pub unsafe fn _mm_loadl_pd(a: f64x2, mem_addr: *const f64) -> f64x2 {
+    f64x2::new(*mem_addr, a.extract(1))
+}
+
 /// Stores a 128-bit floating point vector of [2 x double] to a 128-bit
 /// aligned memory location.
 /// To minimize caching, the data is flagged as non-temporal (unlikely to be
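For readers skimming the diff, here is an illustrative test-style sketch (not part of the commit) that exercises the three load intrinsics added in this hunk together. It assumes the same test module and helpers (sse2::, f64x2, #[simd_test]) used by the tests added further down; the function name is hypothetical.

// Illustrative only, not in the commit; assumes the tests module of this file.
#[simd_test = "sse2"]
unsafe fn load_sd_loadh_loadl_sketch() {
    let x = 7.;
    let v = f64x2::new(1., 2.);
    // movsd: low lane from memory, high lane cleared.
    assert_eq!(sse2::_mm_load_sd(&x), f64x2::new(7., 0.));
    // movhpd: keep the low lane of `v`, load the high lane from memory.
    assert_eq!(sse2::_mm_loadh_pd(v, &x), f64x2::new(1., 7.));
    // movlpd: load the low lane from memory, keep the high lane of `v`.
    assert_eq!(sse2::_mm_loadl_pd(v, &x), f64x2::new(7., 2.));
}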
@@ -1876,6 +1905,15 @@ pub unsafe fn _mm_stream_pd(mem_addr: *mut f64, a: f64x2) {
     ::core::intrinsics::nontemporal_store(mem::transmute(mem_addr), a);
 }

+/// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
+/// memory location.
+#[inline(always)]
+#[target_feature = "+sse2"]
+#[cfg_attr(all(test, not(windows)), assert_instr(movlps))] // FIXME movsd only on windows
+pub unsafe fn _mm_store_sd(mem_addr: *mut f64, a: f64x2) {
+    *mem_addr = a.extract(0)
+}
+
 /// Store 128-bits (composed of 2 packed double-precision (64-bit)
 /// floating-point elements) from `a` into memory. `mem_addr` must be aligned
 /// on a 16-byte boundary or a general-protection exception may be generated.
@@ -1931,6 +1969,24 @@ pub unsafe fn _mm_storer_pd(mem_addr: *mut f64, a: f64x2) {
     *(mem_addr as *mut f64x2) = b;
 }

+/// Stores the upper 64 bits of a 128-bit vector of [2 x double] to a
+/// memory location.
+#[inline(always)]
+#[target_feature = "+sse2"]
+#[cfg_attr(test, assert_instr(movhpd))]
+pub unsafe fn _mm_storeh_pd(mem_addr: *mut f64, a: f64x2) {
+    *mem_addr = a.extract(1)
+}
+
+/// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
+/// memory location.
+#[inline(always)]
+#[target_feature = "+sse2"]
+#[cfg_attr(all(test, not(windows)), assert_instr(movlps))] // FIXME movlpd (movsd on windows)
+pub unsafe fn _mm_storel_pd(mem_addr: *mut f64, a: f64x2) {
+    *mem_addr = a.extract(0)
+}
+
 /// Load a double-precision (64-bit) floating-point element from memory
 /// into both elements of returned vector.
 #[inline(always)]
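Similarly, a hedged test-style sketch (not part of the commit) that combines the store intrinsics from this hunk and the previous one, in the same test-module style; the function name is hypothetical.

// Illustrative only, not in the commit; assumes the tests module of this file.
#[simd_test = "sse2"]
unsafe fn store_sd_storeh_storel_sketch() {
    let a = f64x2::new(1., 2.);
    let (mut sd, mut hi, mut lo) = (0., 0., 0.);
    sse2::_mm_store_sd(&mut sd, a);  // lower lane to memory
    sse2::_mm_storeh_pd(&mut hi, a); // upper lane to memory (movhpd)
    sse2::_mm_storel_pd(&mut lo, a); // lower lane to memory
    assert_eq!((sd, hi, lo), (1., 2., 1.));
}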
@@ -1976,6 +2032,79 @@ pub unsafe fn _mm_loadu_pd(mem_addr: *const f64) -> f64x2 {
     dst
 }

+/// Constructs a 128-bit floating-point vector of [2 x double] from two
+/// 128-bit vector parameters of [2 x double], using the immediate-value
+/// parameter as a specifier.
+#[inline(always)]
+#[target_feature = "+sse2"]
+#[cfg_attr(test, assert_instr(shufpd, imm8 = 1))]
+pub unsafe fn _mm_shuffle_pd(a: f64x2, b: f64x2, imm8: i32) -> f64x2 {
+    match imm8 & 0b11 {
+        0b00 => simd_shuffle2(a, b, [0, 2]),
+        0b01 => simd_shuffle2(a, b, [1, 2]),
+        0b10 => simd_shuffle2(a, b, [0, 3]),
+        _ => simd_shuffle2(a, b, [1, 3]),
+    }
+}
+
+/// Constructs a 128-bit floating-point vector of [2 x double]. The lower
+/// 64 bits are set to the lower 64 bits of the second parameter. The upper
+/// 64 bits are set to the upper 64 bits of the first parameter.
+#[inline(always)]
+#[target_feature = "+sse2"]
+#[cfg_attr(test, assert_instr(movsd))]
+pub unsafe fn _mm_move_sd(a: f64x2, b: f64x2) -> f64x2 {
+    f64x2::new(b.extract(0), a.extract(1))
+}
+
+/// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
+/// floating-point vector of [4 x float].
+#[inline(always)]
+#[target_feature = "+sse2"]
+pub unsafe fn _mm_castpd_ps(a: f64x2) -> f32x4 {
+    mem::transmute(a)
+}
+
+/// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
+/// integer vector.
+#[inline(always)]
+#[target_feature = "+sse2"]
+pub unsafe fn _mm_castpd_si128(a: f64x2) -> __m128i {
+    simd_cast(a)
+}
+
+/// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
+/// floating-point vector of [2 x double].
+#[inline(always)]
+#[target_feature = "+sse2"]
+pub unsafe fn _mm_castps_pd(a: f32x4) -> f64x2 {
+    mem::transmute(a)
+}
+
+/// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
+/// integer vector.
+#[inline(always)]
+#[target_feature = "+sse2"]
+pub unsafe fn _mm_castps_si128(a: f32x4) -> __m128i {
+    mem::transmute(a)
+}
+
+/// Casts a 128-bit integer vector into a 128-bit floating-point vector
+/// of [2 x double].
+#[inline(always)]
+#[target_feature = "+sse2"]
+pub unsafe fn _mm_castsi128_pd(a: __m128i) -> f64x2 {
+    simd_cast(a)
+}
+
+/// Casts a 128-bit integer vector into a 128-bit floating-point vector
+/// of [4 x float].
+#[inline(always)]
+#[target_feature = "+sse2"]
+pub unsafe fn _mm_castsi128_ps(a: __m128i) -> f32x4 {
+    mem::transmute(a)
+}
+
 /// Return vector of type __m128d with undefined elements.
 #[inline(always)]
 #[target_feature = "+sse2"]
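To make the imm8 encoding of _mm_shuffle_pd and the bit-reinterpreting nature of the cast intrinsics explicit: bit 0 of imm8 selects which lane of a becomes the low result lane, and bit 1 selects which lane of b becomes the high result lane, while the casts reinterpret the 128 bits without converting lane values. Below is another hedged test-style sketch (not part of the commit; the function name is hypothetical), again assuming the tests module of this file.

// Illustrative only, not in the commit; assumes the tests module of this file.
#[simd_test = "sse2"]
unsafe fn shuffle_pd_and_cast_sketch() {
    let a = f64x2::new(1., 2.);
    let b = f64x2::new(3., 4.);
    // imm8 bit 0 picks the lane of `a`, bit 1 picks the lane of `b`.
    assert_eq!(sse2::_mm_shuffle_pd(a, b, 0b00), f64x2::new(1., 3.));
    assert_eq!(sse2::_mm_shuffle_pd(a, b, 0b01), f64x2::new(2., 3.));
    assert_eq!(sse2::_mm_shuffle_pd(a, b, 0b10), f64x2::new(1., 4.));
    assert_eq!(sse2::_mm_shuffle_pd(a, b, 0b11), f64x2::new(2., 4.));

    // _mm_castpd_ps only reinterprets bits: the f64 value 1.0 does not
    // become 1.0 in each f32 lane, so the result differs from splat(1.).
    let c = sse2::_mm_castpd_ps(f64x2::splat(1.));
    assert_ne!(c, f32x4::splat(1.));
}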
@@ -3760,6 +3889,32 @@ mod tests {
         assert_eq!(r, f64x2::new(1.0, 2.0));
     }

+    #[simd_test = "sse2"]
+    unsafe fn _mm_load_sd() {
+        let a = 1.;
+        let expected = f64x2::new(a, 0.);
+        let r = sse2::_mm_load_sd(&a);
+        assert_eq!(r, expected);
+    }
+
+    #[simd_test = "sse2"]
+    unsafe fn _mm_loadh_pd() {
+        let a = f64x2::new(1., 2.);
+        let b = 3.;
+        let expected = f64x2::new(a.extract(0), 3.);
+        let r = sse2::_mm_loadh_pd(a, &b);
+        assert_eq!(r, expected);
+    }
+
+    #[simd_test = "sse2"]
+    unsafe fn _mm_loadl_pd() {
+        let a = f64x2::new(1., 2.);
+        let b = 3.;
+        let expected = f64x2::new(3., a.extract(1));
+        let r = sse2::_mm_loadl_pd(a, &b);
+        assert_eq!(r, expected);
+    }
+
     #[simd_test = "sse2"]
     unsafe fn _mm_stream_pd() {
         #[repr(align(128))]
@@ -3775,6 +3930,14 @@ mod tests {
         }
     }

+    #[simd_test = "sse2"]
+    unsafe fn _mm_store_sd() {
+        let mut dest = 0.;
+        let a = f64x2::new(1., 2.);
+        sse2::_mm_store_sd(&mut dest, a);
+        assert_eq!(dest, a.extract(0));
+    }
+
     #[simd_test = "sse2"]
     unsafe fn _mm_store_pd() {
         let mut mem = Memory { data: [0.0f64; 4] };
@@ -3847,6 +4010,22 @@ mod tests {
         assert_eq!(vals[1], 1.0);
     }

+    #[simd_test = "sse2"]
+    unsafe fn _mm_storeh_pd() {
+        let mut dest = 0.;
+        let a = f64x2::new(1., 2.);
+        sse2::_mm_storeh_pd(&mut dest, a);
+        assert_eq!(dest, a.extract(1));
+    }
+
+    #[simd_test = "sse2"]
+    unsafe fn _mm_storel_pd() {
+        let mut dest = 0.;
+        let a = f64x2::new(1., 2.);
+        sse2::_mm_storel_pd(&mut dest, a);
+        assert_eq!(dest, a.extract(0));
+    }
+
     #[simd_test = "sse2"]
     unsafe fn _mm_loadr_pd() {
         let mut mem = Memory {
@@ -4105,4 +4284,70 @@ mod tests {
         let r = sse2::_mm_unpacklo_pd(a, b);
         assert_eq!(r, f64x2::new(1.0, 3.0));
     }
+
+    #[simd_test = "sse2"]
+    unsafe fn _mm_shuffle_pd() {
+        let a = f64x2::new(1., 2.);
+        let b = f64x2::new(3., 4.);
+        let expected = f64x2::new(1., 3.);
+        let r = sse2::_mm_shuffle_pd(a, b, 0);
+        assert_eq!(r, expected);
+    }
+
+    #[simd_test = "sse2"]
+    unsafe fn _mm_move_sd() {
+        let a = f64x2::new(1., 2.);
+        let b = f64x2::new(3., 4.);
+        let expected = f64x2::new(3., 2.);
+        let r = sse2::_mm_move_sd(a, b);
+        assert_eq!(r, expected);
+    }
+
+    #[simd_test = "sse2"]
+    unsafe fn _mm_castpd_ps() {
+        let a = f64x2::splat(0.);
+        let expected = f32x4::splat(0.);
+        let r = sse2::_mm_castpd_ps(a);
+        assert_eq!(r, expected);
+    }
+
+    #[simd_test = "sse2"]
+    unsafe fn _mm_castpd_si128() {
+        let a = f64x2::splat(0.);
+        let expected = i64x2::splat(0);
+        let r = sse2::_mm_castpd_si128(a);
+        assert_eq!(r, __m128i::from(expected));
+    }
+
+    #[simd_test = "sse2"]
+    unsafe fn _mm_castps_pd() {
+        let a = f32x4::splat(0.);
+        let expected = f64x2::splat(0.);
+        let r = sse2::_mm_castps_pd(a);
+        assert_eq!(r, expected);
+    }
+
+    #[simd_test = "sse2"]
+    unsafe fn _mm_castps_si128() {
+        let a = f32x4::splat(0.);
+        let expected = i32x4::splat(0);
+        let r = sse2::_mm_castps_si128(a);
+        assert_eq!(r, __m128i::from(expected));
+    }
+
+    #[simd_test = "sse2"]
+    unsafe fn _mm_castsi128_pd() {
+        let a = __m128i::from(i64x2::splat(0));
+        let expected = f64x2::splat(0.);
+        let r = sse2::_mm_castsi128_pd(a);
+        assert_eq!(r, expected);
+    }
+
+    #[simd_test = "sse2"]
+    unsafe fn _mm_castsi128_ps() {
+        let a = __m128i::from(i32x4::splat(0));
+        let expected = f32x4::splat(0.);
+        let r = sse2::_mm_castsi128_ps(a);
+        assert_eq!(r, expected);
+    }
 }
