[naga] Vectorize [un]pack4x{I, U}8[Clamp] on spv

robamler · robamler · commit 1de2c4f0b970 · 2025-05-03T16:29:08.000+02:00
Emits vectorized SPIR-V code for the WGSL functions `unpack4xI8`, `unpack4xU8`, `pack4xI8`, `pack4xU8`, `pack4xI8Clamp`, `pack4xU8Clamp`.

Exploits the following facts about SPIR-V ops:
- `SClamp`, `UClamp`, and `OpUConvert` accept vector arguments, in which case results are computed per component; and
- `OpBitcast` can cast between vectors and scalars, with a well-defined bit order that matches that required by the WGSL spec, see below.

WGSL spec for `pack4xI8` [1]:
> Component e[i] of the input is mapped to bits 8 x i through 8 x i + 7 of the result.

SPIR-V spec for `OpBitcast` [2]:
> Within this mapping, any single component of `S` [remark: the type with fewer but wider components] (mapping to multiple components of `L` [remark: the type with more but narrower components]) maps its lower-ordered bits to the lower-numbered components of `L`.

[1] https://www.w3.org/TR/WGSL/#pack4xI8-builtin
[2] https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#OpBitcast
diff --git a/naga/src/back/spv/block.rs b/naga/src/back/spv/block.rs
@@ -1557,162 +1557,120 @@ impl BlockContext<'_> {
                             Mf::Pack4xU8 | Mf::Pack4xU8Clamp => (crate::ScalarKind::Uint, false),
                             _ => unreachable!(),
                         };
+
                         let should_clamp = matches!(fun, Mf::Pack4xI8Clamp | Mf::Pack4xU8Clamp);
-                        let uint_type_id =
-                            self.get_numeric_type_id(NumericType::Scalar(crate::Scalar::U32));
 
-                        let int_type_id =
-                            self.get_numeric_type_id(NumericType::Scalar(crate::Scalar {
+                        let wide_vector_type_id = self.get_numeric_type_id(NumericType::Vector {
+                            size: crate::VectorSize::Quad,
+                            scalar: crate::Scalar {
                                 kind: int_type,
                                 width: 4,
-                            }));
-
-                        let mut last_instruction = Instruction::new(spirv::Op::Nop);
-
-                        let zero = self.writer.get_constant_scalar(crate::Literal::U32(0));
-                        let mut preresult = zero;
-                        block
-                            .body
-                            .reserve(usize::from(VEC_LENGTH) * (2 + usize::from(is_signed)));
-
-                        let eight = self.writer.get_constant_scalar(crate::Literal::U32(8));
-                        const VEC_LENGTH: u8 = 4;
-                        for i in 0..u32::from(VEC_LENGTH) {
-                            let offset =
-                                self.writer.get_constant_scalar(crate::Literal::U32(i * 8));
-                            let mut extracted = self.gen_id();
-                            block.body.push(Instruction::binary(
-                                spirv::Op::CompositeExtract,
-                                int_type_id,
-                                extracted,
-                                arg0_id,
-                                i,
-                            ));
-                            if is_signed {
-                                let casted = self.gen_id();
-                                block.body.push(Instruction::unary(
-                                    spirv::Op::Bitcast,
-                                    uint_type_id,
-                                    casted,
-                                    extracted,
-                                ));
-                                extracted = casted;
-                            }
-                            if should_clamp {
-                                let (min, max, clamp_op) = if is_signed {
-                                    (
-                                        crate::Literal::I32(-128),
-                                        crate::Literal::I32(127),
-                                        spirv::GLOp::SClamp,
-                                    )
-                                } else {
-                                    (
-                                        crate::Literal::U32(0),
-                                        crate::Literal::U32(255),
-                                        spirv::GLOp::UClamp,
-                                    )
-                                };
-                                let [min, max] =
-                                    [min, max].map(|lit| self.writer.get_constant_scalar(lit));
-
-                                let clamp_id = self.gen_id();
-                                block.body.push(Instruction::ext_inst(
-                                    self.writer.gl450_ext_inst_id,
-                                    clamp_op,
-                                    result_type_id,
-                                    clamp_id,
-                                    &[extracted, min, max],
-                                ));
+                            },
+                        });
+                        let packed_vector_type_id = self.get_numeric_type_id(NumericType::Vector {
+                            size: crate::VectorSize::Quad,
+                            scalar: crate::Scalar {
+                                kind: crate::ScalarKind::Uint,
+                                width: 1,
+                            },
+                        });
 
-                                extracted = clamp_id;
-                            }
-                            let is_last = i == u32::from(VEC_LENGTH - 1);
-                            if is_last {
-                                last_instruction = Instruction::quaternary(
-                                    spirv::Op::BitFieldInsert,
-                                    result_type_id,
-                                    id,
-                                    preresult,
-                                    extracted,
-                                    offset,
-                                    eight,
+                        let mut wide_vector = arg0_id;
+                        if should_clamp {
+                            let (min, max, clamp_op) = if is_signed {
+                                (
+                                    crate::Literal::I32(-128),
+                                    crate::Literal::I32(127),
+                                    spirv::GLOp::SClamp,
                                 )
                             } else {
-                                let new_preresult = self.gen_id();
-                                block.body.push(Instruction::quaternary(
-                                    spirv::Op::BitFieldInsert,
-                                    result_type_id,
-                                    new_preresult,
-                                    preresult,
-                                    extracted,
-                                    offset,
-                                    eight,
+                                (
+                                    crate::Literal::U32(0),
+                                    crate::Literal::U32(255),
+                                    spirv::GLOp::UClamp,
+                                )
+                            };
+                            let [min, max] = [min, max].map(|lit| {
+                                let scalar = self.writer.get_constant_scalar(lit);
+                                // TODO: can we cache these constant vectors somehow?
+                                let id = self.gen_id();
+                                block.body.push(Instruction::composite_construct(
+                                    wide_vector_type_id,
+                                    id,
+                                    &[scalar; 4],
                                 ));
-                                preresult = new_preresult;
-                            }
+                                id
+                            });
+
+                            let clamp_id = self.gen_id();
+                            block.body.push(Instruction::ext_inst(
+                                self.writer.gl450_ext_inst_id,
+                                clamp_op,
+                                wide_vector_type_id,
+                                clamp_id,
+                                &[wide_vector, min, max],
+                            ));
+
+                            wide_vector = clamp_id;
                         }
 
-                        MathOp::Custom(last_instruction)
+                        let packed_vector = self.gen_id();
+                        block.body.push(Instruction::unary(
+                            spirv::Op::UConvert, // We truncate, so `UConvert` and `SConvert` behave identically.
+                            packed_vector_type_id,
+                            packed_vector,
+                            wide_vector,
+                        ));
+
+                        // The SPIR-V spec [1] defines the bit order for bit casting between a vector
+                        // and a scalar precisely as required by the WGSL spec [2].
+                        // [1]: https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#OpBitcast
+                        // [2]: https://www.w3.org/TR/WGSL/#pack4xI8-builtin
+                        MathOp::Custom(Instruction::unary(
+                            spirv::Op::Bitcast,
+                            result_type_id,
+                            id,
+                            packed_vector,
+                        ))
                     }
                     Mf::Unpack4x8unorm => MathOp::Ext(spirv::GLOp::UnpackUnorm4x8),
                     Mf::Unpack4x8snorm => MathOp::Ext(spirv::GLOp::UnpackSnorm4x8),
                     Mf::Unpack2x16float => MathOp::Ext(spirv::GLOp::UnpackHalf2x16),
                     Mf::Unpack2x16unorm => MathOp::Ext(spirv::GLOp::UnpackUnorm2x16),
                     Mf::Unpack2x16snorm => MathOp::Ext(spirv::GLOp::UnpackSnorm2x16),
                     fun @ (Mf::Unpack4xI8 | Mf::Unpack4xU8) => {
-                        let (int_type, extract_op, is_signed) = match fun {
-                            Mf::Unpack4xI8 => {
-                                (crate::ScalarKind::Sint, spirv::Op::BitFieldSExtract, true)
-                            }
-                            Mf::Unpack4xU8 => {
-                                (crate::ScalarKind::Uint, spirv::Op::BitFieldUExtract, false)
-                            }
+                        let (int_type, convert_op) = match fun {
+                            Mf::Unpack4xI8 => (crate::ScalarKind::Sint, spirv::Op::SConvert),
+                            Mf::Unpack4xU8 => (crate::ScalarKind::Uint, spirv::Op::UConvert),
                             _ => unreachable!(),
                         };
 
-                        let sint_type_id =
-                            self.get_numeric_type_id(NumericType::Scalar(crate::Scalar::I32));
-
-                        let eight = self.writer.get_constant_scalar(crate::Literal::U32(8));
-                        let int_type_id =
-                            self.get_numeric_type_id(NumericType::Scalar(crate::Scalar {
+                        let packed_vector_type_id = self.get_numeric_type_id(NumericType::Vector {
+                            size: crate::VectorSize::Quad,
+                            scalar: crate::Scalar {
                                 kind: int_type,
-                                width: 4,
-                            }));
-                        block
-                            .body
-                            .reserve(usize::from(VEC_LENGTH) * 2 + usize::from(is_signed));
-                        let arg_id = if is_signed {
-                            let new_arg_id = self.gen_id();
-                            block.body.push(Instruction::unary(
-                                spirv::Op::Bitcast,
-                                sint_type_id,
-                                new_arg_id,
-                                arg0_id,
-                            ));
-                            new_arg_id
-                        } else {
-                            arg0_id
-                        };
-
-                        const VEC_LENGTH: u8 = 4;
-                        let parts: [_; VEC_LENGTH as usize] =
-                            core::array::from_fn(|_| self.gen_id());
-                        for (i, part_id) in parts.into_iter().enumerate() {
-                            let index = self
-                                .writer
-                                .get_constant_scalar(crate::Literal::U32(i as u32 * 8));
-                            block.body.push(Instruction::ternary(
-                                extract_op,
-                                int_type_id,
-                                part_id,
-                                arg_id,
-                                index,
-                                eight,
-                            ));
-                        }
+                                width: 1,
+                            },
+                        });
+
+                        // The SPIR-V spec [1] defines the bit order for bit casting between a vector
+                        // and a scalar precisely as required by the WGSL spec [2].
+                        // [1]: https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#OpBitcast
+                        // [2]: https://www.w3.org/TR/WGSL/#pack4xI8-builtin
+                        let packed_vector = self.gen_id();
+                        block.body.push(Instruction::unary(
+                            spirv::Op::Bitcast,
+                            packed_vector_type_id,
+                            packed_vector,
+                            arg0_id,
+                        ));
 
-                        MathOp::Custom(Instruction::composite_construct(result_type_id, id, &parts))
+                        MathOp::Custom(Instruction::unary(
+                            convert_op,
+                            result_type_id,
+                            id,
+                            packed_vector,
+                        ))
                     }
                 };
 
diff --git a/naga/tests/out/spv/wgsl-6772-unpack-expr-accesses.spvasm b/naga/tests/out/spv/wgsl-6772-unpack-expr-accesses.spvasm
@@ -1,8 +1,9 @@
 ; SPIR-V
 ; Version: 1.1
 ; Generator: rspirv
-; Bound: 30
+; Bound: 23
 OpCapability Shader
+OpCapability Int8
 %1 = OpExtInstImport "GLSL.std.450"
 OpMemoryModel Logical GLSL450
 OpEntryPoint GLCompute %4 "main"
@@ -14,27 +15,20 @@ OpExecutionMode %4 LocalSize 1 1 1
 %8 = OpTypeInt 32 0
 %9 = OpConstant  %8  12
 %11 = OpTypeVector %6 4
-%13 = OpConstant  %8  8
-%19 = OpConstant  %8  0
-%20 = OpConstant  %8  16
-%21 = OpConstant  %8  24
-%23 = OpTypeVector %8 4
+%14 = OpTypeInt 8 1
+%13 = OpTypeVector %14 4
+%17 = OpTypeVector %8 4
+%20 = OpTypeInt 8 0
+%19 = OpTypeVector %20 4
 %4 = OpFunction  %2  None %5
 %3 = OpLabel
 OpBranch %10
 %10 = OpLabel
-%14 = OpBitcast  %6  %9
-%15 = OpBitFieldSExtract  %6  %14 %19 %13
-%16 = OpBitFieldSExtract  %6  %14 %13 %13
-%17 = OpBitFieldSExtract  %6  %14 %20 %13
-%18 = OpBitFieldSExtract  %6  %14 %21 %13
-%12 = OpCompositeConstruct  %11  %15 %16 %17 %18
-%22 = OpCompositeExtract  %6  %12 2
-%25 = OpBitFieldUExtract  %8  %9 %19 %13
-%26 = OpBitFieldUExtract  %8  %9 %13 %13
-%27 = OpBitFieldUExtract  %8  %9 %20 %13
-%28 = OpBitFieldUExtract  %8  %9 %21 %13
-%24 = OpCompositeConstruct  %23  %25 %26 %27 %28
-%29 = OpCompositeExtract  %8  %24 1
+%15 = OpBitcast  %13  %9
+%12 = OpSConvert  %11  %15
+%16 = OpCompositeExtract  %6  %12 2
+%21 = OpBitcast  %19  %9
+%18 = OpUConvert  %17  %21
+%22 = OpCompositeExtract  %8  %18 1
 OpReturn
 OpFunctionEnd
diff --git a/naga/tests/out/spv/wgsl-bits.spvasm b/naga/tests/out/spv/wgsl-bits.spvasm