
Commit 0c29ccf

[ONNX] Add per-channel quantization support for QuantizeLinear op (#4092)
1 parent 21e6e12 commit 0c29ccf
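
For context (not part of the commit): the ONNX QuantizeLinear op computes y = saturate(round(x / y_scale) + y_zero_point). In per-axis (per-channel) mode, y_scale and y_zero_point are 1-D tensors whose length matches the input size along the `axis` attribute, so each channel gets its own scale and zero point. The C++ sketch below is a minimal standalone illustration of that formula for an si8 output; the function name, flattened-index layout, and the assumptions of round-to-nearest-even and int8 saturation are illustrative, not code from this patch.

// Minimal standalone sketch (not from the commit) of per-channel
// QuantizeLinear semantics for an si8 output: each element is divided by the
// scale of its channel along `axis`, rounded, shifted by that channel's
// zero point, and saturated to the int8 range.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

std::vector<int8_t> quantizePerChannel(const std::vector<float> &input,
                                       const std::vector<int64_t> &shape,
                                       const std::vector<float> &scale,
                                       const std::vector<int8_t> &zeroPoint,
                                       int64_t axis) {
  // Number of elements covered by one step along `axis` (row-major layout).
  int64_t inner = 1;
  for (size_t d = static_cast<size_t>(axis) + 1; d < shape.size(); ++d)
    inner *= shape[d];
  std::vector<int8_t> out(input.size());
  for (size_t i = 0; i < input.size(); ++i) {
    // Coordinate of element i along the quantization axis.
    int64_t channel = (i / inner) % shape[axis];
    // round(x / scale[c]) + zero_point[c], saturated to [-128, 127].
    float q = std::nearbyint(input[i] / scale[channel]) + zeroPoint[channel];
    out[i] = static_cast<int8_t>(std::clamp(q, -128.0f, 127.0f));
  }
  return out;
}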

File tree

2 files changed: +108 -33 lines changed

lib/Conversion/TorchOnnxToTorch/DefaultDomainQtoZ.cpp

Lines changed: 60 additions & 33 deletions
@@ -269,13 +269,11 @@ void mlir::torch::onnx_c::populateDefaultDomainQtoZ(
 
         auto resultETy = resultType.getDtype();
 
-        bool rank0 = scaleTy.getSizes().size() == 0;
-        bool length1 =
-            scaleTy.getSizes().size() == 1 && scaleTy.getSizes()[0] == 1;
-
-        if (!rank0 && !length1)
-          return rewriter.notifyMatchFailure(binder.op,
-                                             "unimplemented: non-scalar scale");
+        int64_t scaleRank = scaleTy.getSizes().size();
+        if (scaleRank > 1)
+          return rewriter.notifyMatchFailure(
+              binder.op, "unimplemented: only per-tensor or per-axis "
+                         "quantization supported");
 
         auto qTensorTy = getQTorchTypeFromTorchIntType(resultType);
         if (!qTensorTy) {
@@ -290,37 +288,66 @@ void mlir::torch::onnx_c::populateDefaultDomainQtoZ(
             rewriter.getIntegerAttr(rewriter.getIntegerType(64),
                                     static_cast<int64_t>(torchqTy)));
 
-        scale = rewriter.create<Torch::AtenItemOp>(
-            loc, rewriter.getType<Torch::FloatType>(), scale);
-
         bool fpResult = isa<mlir::FloatType>(resultETy);
-        Type zeropointTy = rewriter.getType<Torch::IntType>();
-        if (fpResult)
-          zeropointTy = rewriter.getType<Torch::FloatType>();
-        zeropoint =
-            rewriter.create<Torch::AtenItemOp>(loc, zeropointTy, zeropoint);
-
-        if (fpResult) {
-          Value none = rewriter.create<Torch::ConstantNoneOp>(loc);
-          Value cstFalse = rewriter.create<Torch::ConstantBoolOp>(loc, false);
-          Value one = rewriter.create<Torch::ConstantFloatOp>(
-              loc, rewriter.getF64FloatAttr(1.0));
-          Value div = rewriter.create<Torch::AtenDivScalarOp>(
-              loc, operand.getType(), operand, scale);
-          Value add = rewriter.create<Torch::AtenAddScalarOp>(
-              loc, operand.getType(), div, zeropoint, one);
+        bool isPerTensorQuantization = false;
+        if (scaleRank == 0 ||
+            llvm::all_of(scaleTy.getSizes(), [](int64_t s) { return s == 1; }))
+          isPerTensorQuantization = true;
 
-          rewriter.replaceOpWithNewOp<Torch::AtenToDtypeOp>(
-              binder.op, resultType, add, tyConst,
-              /*non_blocking=*/cstFalse, /*copy=*/cstFalse,
-              /*memory_format=*/none);
+        // (TODO) Case: Per-Channel Quantization for floating point output.
+        if (scaleRank == 1 && fpResult)
+          return rewriter.notifyMatchFailure(
+              binder.op, "unimplemented: support for per-Channel Quantization "
+                         "for floating point output.");
+
+        if (isPerTensorQuantization) {
+          scale = rewriter.create<Torch::AtenItemOp>(
+              loc, rewriter.getType<Torch::FloatType>(), scale);
+
+          Type zeropointTy = rewriter.getType<Torch::IntType>();
+          if (fpResult)
+            zeropointTy = rewriter.getType<Torch::FloatType>();
+          zeropoint =
+              rewriter.create<Torch::AtenItemOp>(loc, zeropointTy, zeropoint);
+        }
+
+        if (!fpResult) {
+          Value quantize;
+          // Case 1: Per-Tensor Quantization for non-floating point input.
+          if (isPerTensorQuantization) {
+            quantize = rewriter.create<Torch::AtenQuantizePerTensorOp>(
+                loc, qTensorTy, operand, scale, zeropoint, tyConst);
+          } else {
+            // Case 2: Per-Channel Quantization for non-floating point input.
+            int64_t axis;
+            if (binder.s64IntegerAttr(axis, "axis", 1))
+              return failure();
+
+            Value cstAxis = rewriter.create<Torch::ConstantIntOp>(
+                loc, rewriter.getI64IntegerAttr(axis));
+            quantize = rewriter.create<Torch::AtenQuantizePerChannelOp>(
+                loc, qTensorTy, operand, scale, zeropoint, cstAxis, tyConst);
+          }
+          rewriter.replaceOpWithNewOp<Torch::AtenIntReprOp>(
+              binder.op, resultType, quantize);
           return success();
         }
 
-        auto quantize = rewriter.create<Torch::AtenQuantizePerTensorOp>(
-            loc, qTensorTy, operand, scale, zeropoint, tyConst);
-        rewriter.replaceOpWithNewOp<Torch::AtenIntReprOp>(binder.op, resultType,
-                                                          quantize);
+        // Case 3: Per-Tensor Quantization for floating point input.
+        Value none = rewriter.create<Torch::ConstantNoneOp>(loc);
+        Value cstFalse = rewriter.create<Torch::ConstantBoolOp>(loc, false);
+        Value one = rewriter.create<Torch::ConstantFloatOp>(
+            loc, rewriter.getF64FloatAttr(1.0));
+        Value div = rewriter.create<Torch::AtenDivScalarOp>(
+            loc, operand.getType(), operand, scale);
+        Value add = rewriter.create<Torch::AtenAddScalarOp>(
+            loc, operand.getType(), div, zeropoint, one);
+
+        rewriter.replaceOpWithNewOp<Torch::AtenToDtypeOp>(
+            binder.op, resultType, add, tyConst,
+            /*non_blocking=*/cstFalse, /*copy=*/cstFalse,
+            /*memory_format=*/none);
+
         return success();
       });
   patterns.onOp(
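
Read as control flow, the updated conversion now distinguishes: per-tensor quantization when the scale tensor is rank 0 or every dimension of its shape is 1, per-channel quantization driven by the ONNX `axis` attribute (default 1) for integer outputs, and the existing divide/add/to-dtype path for floating-point outputs; per-channel quantization with a floating-point output is still rejected as a TODO. The standalone C++ sketch below only summarizes that dispatch; the enum and helper function are hypothetical and are not torch-mlir API.

// Hypothetical summary of the dispatch added above. `scaleSizes` stands in
// for scaleTy.getSizes() and `fpResult` for a floating-point result dtype.
#include <algorithm>
#include <cstdint>
#include <vector>

enum class Lowering { PerTensorInt, PerChannelInt, PerTensorFloat, NotSupported };

Lowering chooseLowering(const std::vector<int64_t> &scaleSizes, bool fpResult) {
  int64_t scaleRank = scaleSizes.size();
  if (scaleRank > 1)
    return Lowering::NotSupported; // only per-tensor or per-axis scales
  bool perTensor =
      scaleRank == 0 || std::all_of(scaleSizes.begin(), scaleSizes.end(),
                                    [](int64_t s) { return s == 1; });
  if (scaleRank == 1 && fpResult)
    return Lowering::NotSupported; // per-channel + float output: TODO
  if (!fpResult)
    return perTensor ? Lowering::PerTensorInt    // -> aten.quantize_per_tensor
                     : Lowering::PerChannelInt;  // -> aten.quantize_per_channel
  return Lowering::PerTensorFloat;               // -> div + add + to_dtype
}

In the integer per-channel case, the lowering materializes the axis as a torch.constant.int and emits torch.aten.quantize_per_channel followed by torch.aten.int_repr, which is exactly what the new tests below check for.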

test/Conversion/TorchOnnxToTorch/simple_ops_q_to_z.mlir

Lines changed: 48 additions & 0 deletions
@@ -64,6 +64,54 @@ func.func @test_quantizelinear_f8(%arg0: !torch.vtensor<[6],f32>, %arg1: !torch.
 
 // -----
 
+// CHECK-LABEL: @test_quantizelinear_per_channel_si8
+func.func @test_quantizelinear_per_channel_si8(%arg0: !torch.vtensor<[4,3,7,7],f32>, %arg1: !torch.vtensor<[4],f32>, %arg2: !torch.vtensor<[4],si8>) -> !torch.vtensor<[4,3,7,7],si8> attributes {torch.onnx_meta.ir_version = 10 : si64, torch.onnx_meta.opset_version = 19 : si64} {
+  // CHECK: %[[DTYPE:.+]] = torch.constant.int 12
+  // CHECK: %[[AXIS:.+]] = torch.constant.int 1
+  // CHECK: %[[QUANT:.+]] = torch.aten.quantize_per_channel %arg0, %arg1, %arg2, %[[AXIS]], %[[DTYPE]]
+  // CHECK: %[[REPR:.+]] = torch.aten.int_repr %[[QUANT]]
+  %0 = torch.operator "onnx.QuantizeLinear"(%arg0, %arg1, %arg2) {torch.onnx.axis = 1 : si64} : (!torch.vtensor<[4,3,7,7],f32>, !torch.vtensor<[4],f32>, !torch.vtensor<[4],si8>) -> !torch.vtensor<[4,3,7,7],si8>
+  return %0: !torch.vtensor<[4,3,7,7],si8>
+}
+
+// -----
+
+// CHECK-LABEL: @test_quantizelinear_per_channel_ui8
+func.func @test_quantizelinear_per_channel_ui8(%arg0: !torch.vtensor<[4,3,7,7],f32>, %arg1: !torch.vtensor<[4],f32>, %arg2: !torch.vtensor<[4],ui8>) -> !torch.vtensor<[4,3,7,7],ui8> attributes {torch.onnx_meta.ir_version = 10 : si64, torch.onnx_meta.opset_version = 19 : si64} {
+  // CHECK: %[[DTYPE:.+]] = torch.constant.int 13
+  // CHECK: %[[AXIS:.+]] = torch.constant.int 1
+  // CHECK: %[[QUANT:.+]] = torch.aten.quantize_per_channel %arg0, %arg1, %arg2, %[[AXIS]], %[[DTYPE]]
+  // CHECK: %[[REPR:.+]] = torch.aten.int_repr %[[QUANT]]
+  %0 = torch.operator "onnx.QuantizeLinear"(%arg0, %arg1, %arg2) {torch.onnx.axis = 1 : si64} : (!torch.vtensor<[4,3,7,7],f32>, !torch.vtensor<[4],f32>, !torch.vtensor<[4],ui8>) -> !torch.vtensor<[4,3,7,7],ui8>
+  return %0: !torch.vtensor<[4,3,7,7],ui8>
+}
+
+// -----
+
+// CHECK-LABEL: @test_quantizelinear_per_channel_si16
+func.func @test_quantizelinear_per_channel_si16(%arg0: !torch.vtensor<[4,3,7,7],f32>, %arg1: !torch.vtensor<[4],f32>, %arg2: !torch.vtensor<[4],si16>) -> !torch.vtensor<[4,3,7,7],si16> attributes {torch.onnx_meta.ir_version = 10 : si64, torch.onnx_meta.opset_version = 19 : si64} {
+  // CHECK: %[[DTYPE:.+]] = torch.constant.int 27
+  // CHECK: %[[AXIS:.+]] = torch.constant.int 1
+  // CHECK: %[[QUANT:.+]] = torch.aten.quantize_per_channel %arg0, %arg1, %arg2, %[[AXIS]], %[[DTYPE]]
+  // CHECK: %[[REPR:.+]] = torch.aten.int_repr %[[QUANT]]
+  %0 = torch.operator "onnx.QuantizeLinear"(%arg0, %arg1, %arg2) {torch.onnx.axis = 1 : si64} : (!torch.vtensor<[4,3,7,7],f32>, !torch.vtensor<[4],f32>, !torch.vtensor<[4],si16>) -> !torch.vtensor<[4,3,7,7],si16>
+  return %0: !torch.vtensor<[4,3,7,7],si16>
+}
+
+// -----
+
+// CHECK-LABEL: @test_quantizelinear_per_channel_si32
+func.func @test_quantizelinear_per_channel_si32(%arg0: !torch.vtensor<[4,3,7,7],f32>, %arg1: !torch.vtensor<[4],f32>, %arg2: !torch.vtensor<[4],si32>) -> !torch.vtensor<[4,3,7,7],si32> attributes {torch.onnx_meta.ir_version = 10 : si64, torch.onnx_meta.opset_version = 19 : si64} {
+  // CHECK: %[[DTYPE:.+]] = torch.constant.int 14
+  // CHECK: %[[AXIS:.+]] = torch.constant.int 1
+  // CHECK: %[[QUANT:.+]] = torch.aten.quantize_per_channel %arg0, %arg1, %arg2, %[[AXIS]], %[[DTYPE]]
+  // CHECK: %[[REPR:.+]] = torch.aten.int_repr %[[QUANT]]
+  %0 = torch.operator "onnx.QuantizeLinear"(%arg0, %arg1, %arg2) {torch.onnx.axis = 1 : si64} : (!torch.vtensor<[4,3,7,7],f32>, !torch.vtensor<[4],f32>, !torch.vtensor<[4],si32>) -> !torch.vtensor<[4,3,7,7],si32>
+  return %0: !torch.vtensor<[4,3,7,7],si32>
+}
+
+// -----
+
 // CHECK-LABEL: @test_qlinearconv_nobias
 func.func @test_qlinearconv_nobias(%arg0: !torch.vtensor<[1,1,7,7],ui8>, %arg1: !torch.vtensor<[],f32>, %arg2: !torch.vtensor<[],ui8>, %arg3: !torch.vtensor<[1,1,1,1],ui8>, %arg4: !torch.vtensor<[1],f32>, %arg5: !torch.vtensor<[1],ui8>, %arg6: !torch.vtensor<[],f32>, %arg7: !torch.vtensor<[],ui8>) -> !torch.vtensor<[1,1,7,7],ui8> attributes {torch.onnx_meta.ir_version = 5 : si64, torch.onnx_meta.opset_version = 10 : si64, torch.onnx_meta.producer_name = "backend-test", torch.onnx_meta.producer_version = ""} {
   %0 = torch.operator "onnx.QLinearConv"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7) : (!torch.vtensor<[1,1,7,7],ui8>, !torch.vtensor<[],f32>, !torch.vtensor<[],ui8>, !torch.vtensor<[1,1,1,1],ui8>, !torch.vtensor<[1],f32>, !torch.vtensor<[1],ui8>, !torch.vtensor<[],f32>, !torch.vtensor<[],ui8>) -> !torch.vtensor<[1,1,7,7],ui8>
