Commit be92fb4

Add im2row per tensor overload

Authored Apr 1, 2025
Differential Revision: D70938715
Pull Request resolved: #9121
1 parent 2972388 · commit be92fb4

6 files changed · +293 −18 lines changed
 

backends/cadence/aot/functions.yaml

+10

@@ -234,6 +234,11 @@
     - arg_meta: null
       kernel_name: impl::reference::im2row_out
 
+- func: cadence::im2row.per_tensor_out(Tensor input, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, int in_zero_point, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: impl::reference::im2row_per_tensor_out
+
 - func: cadence::quantized_conv.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!)
   kernels:
     - arg_meta: null
@@ -253,3 +258,8 @@
   kernels:
     - arg_meta: null
       kernel_name: impl::reference::requantize_out
+
+- func: cadence::requantize.per_tensor_out(Tensor input, float in_scale, int in_zero_point, float out_scale, int out_zero_point, ScalarType out_dtype, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: impl::reference::requantize_per_tensor_out

backends/cadence/aot/ops_registrations.py

+55 −18

@@ -10,11 +10,14 @@
 from typing import Optional, Tuple
 
 import torch
+from executorch.backends.cadence.aot.utils import (
+    get_conv1d_output_size,
+    get_conv2d_output_size,
+    get_im2row_output_size,
+)
 from executorch.exir.scalar_type import ScalarType
 from torch.library import Library, register_fake
 
-from .utils import get_conv1d_output_size, get_conv2d_output_size
-
 lib = Library("cadence", "DEF")
 
 lib.define(
@@ -131,6 +134,10 @@
     "im2row(Tensor input, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, "
     "Tensor in_zero_point, bool channel_last=False) -> (Tensor out)"
 )
+lib.define(
+    "im2row.per_tensor(Tensor input, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, "
+    "int in_zero_point, bool channel_last=False) -> (Tensor out)"
+)
 lib.define("linalg_vector_norm(Tensor X) -> (Tensor Y)")
 lib.define(
     "transposed_im2row(Tensor input, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, "
@@ -140,6 +147,10 @@
     "requantize(Tensor input, Tensor in_scale, Tensor in_zero_point, Tensor out_scale, "
     "Tensor out_zero_point, ScalarType out_dtype) -> (Tensor Y)"
 )
+lib.define(
+    "requantize.per_tensor(Tensor input, float in_scale, int in_zero_point, float out_scale, "
+    "int out_zero_point, ScalarType out_dtype) -> (Tensor Y)"
+)
 lib.define(
     "fully_connected(Tensor input, Tensor weight, Tensor? bias=None) -> (Tensor out)"
 )
@@ -223,6 +234,10 @@
     "im2row.out(Tensor input, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, "
     "Tensor in_zero_point, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!)"
 )
+lib.define(
+    "im2row.per_tensor_out(Tensor input, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, "
+    "int in_zero_point, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!)"
+)
 lib.define(
     "transposed_im2row.out(Tensor input, int[2] kernel_size, int[2] dilation, int[2] padding, "
     "int[2] stride, int[2] output_padding, Tensor in_zero_point, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!)"
@@ -231,7 +246,10 @@
     "requantize.out(Tensor input, Tensor in_scale, Tensor in_zero_point, Tensor out_scale, "
     "Tensor out_zero_point, ScalarType out_dtype, *, Tensor(a!) out) -> Tensor(a!)"
 )
-
+lib.define(
+    "requantize.per_tensor_out(Tensor input, float in_scale, int in_zero_point, float out_scale, "
+    "int out_zero_point, ScalarType out_dtype, *, Tensor(a!) out) -> Tensor(a!)"
+)
 
 # Custom ops with aten namespace. Need to specify the lib var as FRAGMENT type as aten library is already defined
 aten_lib = Library("aten", "FRAGMENT")
@@ -562,22 +580,25 @@ def im2row_meta(
     in_zero_point: torch.Tensor,
     channel_last: bool = False,
 ) -> torch.Tensor:
-    if len(input.shape) == 3:
-        height_dim = 1 if channel_last else 2
-        input = input.unsqueeze(height_dim)
+    output_size = get_im2row_output_size(
+        input, kernel_size, dilation, padding, stride, channel_last
+    )
+    return input.new_empty(output_size, dtype=input.dtype)
 
-    batch_size = input.shape[0]
-    n_input_plane = input.shape[3] if channel_last else input.shape[1]
-    input_height = input.shape[1] if channel_last else input.shape[2]
-    input_width = input.shape[2] if channel_last else input.shape[3]
-    output_height = (
-        input_height + 2 * padding[0] - (dilation[0] * (kernel_size[0] - 1) + 1)
-    ) // stride[0] + 1
-    output_width = (
-        input_width + 2 * padding[1] - (dilation[1] * (kernel_size[1] - 1) + 1)
-    ) // stride[1] + 1
-    n_output_plane = n_input_plane * kernel_size[0] * kernel_size[1]
-    output_size = torch.Size((batch_size, output_height * output_width, n_output_plane))
+
+@register_fake("cadence::im2row.per_tensor")
+def im2row_per_tensor_meta(
+    input: torch.Tensor,
+    kernel_size: Tuple[int],
+    dilation: Tuple[int],
+    padding: Tuple[int],
+    stride: Tuple[int],
+    in_zero_point: int,
+    channel_last: bool = False,
+) -> torch.Tensor:
+    output_size = get_im2row_output_size(
+        input, kernel_size, dilation, padding, stride, channel_last
+    )
     return input.new_empty(output_size, dtype=input.dtype)
 
 
@@ -606,6 +627,22 @@ def requantize_meta(
     )
 
 
+@register_fake("cadence::requantize.per_tensor")
+def requantize_per_tensor_meta(
+    input: torch.Tensor,
+    in_scale: float,
+    in_zero_point: int,
+    out_scale: float,
+    out_zero_point: int,
+    dtype: ScalarType,
+) -> torch.Tensor:
+    return input.new_empty(
+        input.size(),
+        # pyre-ignore[6]: Incompatible type
+        dtype=dtype,
+    )
+
+
 @register_fake("cadence::quantized_relu.per_tensor")
 def quantized_relu_per_tensor_meta(
     input: torch.Tensor,
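With these schemas and fake kernels registered, the per_tensor overloads can be traced and shape-propagated without a real CPU kernel. A minimal sketch of the resulting shape inference, assuming `ops_registrations` is importable so that `torch.ops.cadence.*` exists (the concrete sizes below are illustrative, not part of the commit):

```python
import torch
from torch._subclasses.fake_tensor import FakeTensorMode

# Importing the module registers the cadence ops and their fake kernels.
from executorch.backends.cadence.aot import ops_registrations  # noqa: F401

with FakeTensorMode():
    x = torch.empty(1, 3, 32, 32, dtype=torch.int8)  # NCHW, channel_last=False
    out = torch.ops.cadence.im2row.per_tensor(
        x,
        [3, 3],  # kernel_size
        [1, 1],  # dilation
        [1, 1],  # padding
        [2, 2],  # stride
        0,       # in_zero_point: now a plain int rather than a 1-element tensor
        False,   # channel_last
    )
    # im2row_per_tensor_meta returns (N, out_h * out_w, C * kh * kw).
    assert out.shape == (1, 16 * 16, 3 * 3 * 3)
```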

backends/cadence/aot/replace_ops.py

+11

@@ -1864,6 +1864,14 @@ class ReplaceSingleElementTensorArgumentsFromFullOpWithScalarPass(ExportPass):
             exir_ops.edge.cadence.quantized_relu.per_tensor,
             [1, 3, 4],
         ),
+        exir_ops.edge.cadence.im2row: (
+            exir_ops.edge.cadence.im2row.per_tensor,
+            [5],
+        ),
+        exir_ops.edge.cadence.requantize: (
+            exir_ops.edge.cadence.requantize.per_tensor,
+            [1, 2, 3, 4],
+        ),
     }
 
     def call_operator(self, op, args, kwargs, meta):
@@ -1884,6 +1892,9 @@ def call_operator(self, op, args, kwargs, meta):
             if not arg.is_tensor():
                 return super().call_operator(op, args, kwargs, meta)
 
+            if not isinstance(arg.node.target, EdgeOpOverload):
+                return super().call_operator(op, args, kwargs, meta)
+
             if get_edge_overload_packet(arg.node.target) != exir_ops.edge.aten.full:
                 # Only replace if arg generated by a full op.
                 return super().call_operator(op, args, kwargs, meta)
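The two new table entries tell `ReplaceSingleElementTensorArgumentsFromFullOpWithScalarPass` which argument positions may be demoted from single-element tensors produced by `aten.full` to scalars: index 5 is `im2row`'s `in_zero_point`, and indices 1–4 are `requantize`'s scales and zero points; the added `isinstance` guard skips arguments whose producer is not an edge op. Below is a rough, self-contained sketch of the rewrite rule using plain tuples in place of the real EXIR `ProxyValue`/`EdgeOpOverload` machinery (all names here are hypothetical):

```python
# Hypothetical, simplified illustration of the replacement rule; the real pass
# operates on EXIR graph nodes, not on plain strings and tuples.
REPLACEABLE = {
    "cadence.im2row": ("cadence.im2row.per_tensor", [5]),
    "cadence.requantize": ("cadence.requantize.per_tensor", [1, 2, 3, 4]),
}


def replace_full_args_with_scalars(op, args):
    """If every listed arg is a single-element 'full' literal, switch to the
    per_tensor overload and pass the fill values through as plain scalars."""
    if op not in REPLACEABLE:
        return op, args
    new_op, indices = REPLACEABLE[op]
    new_args = list(args)
    for idx in indices:
        arg = args[idx]
        # Stand-in for "tensor produced by aten.full with a single element".
        if not (isinstance(arg, tuple) and arg[0] == "full"):
            return op, args  # keep the tensor overload untouched
        new_args[idx] = arg[1]  # the scalar fill value
    return new_op, tuple(new_args)


# im2row(input, kernel_size, dilation, padding, stride, in_zero_point, channel_last)
op, args = replace_full_args_with_scalars(
    "cadence.im2row",
    ("x", [3, 3], [1, 1], [1, 1], [2, 2], ("full", 0), False),
)
assert op == "cadence.im2row.per_tensor" and args[5] == 0
```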

backends/cadence/aot/utils.py

+27

@@ -75,6 +75,33 @@ def get_conv2d_output_size(
     return torch.Size((in_size[0], out_channels, hout, wout))
 
 
+def get_im2row_output_size(
+    input: torch.Tensor,
+    kernel_size: Tuple[int],
+    dilation: Tuple[int],
+    padding: Tuple[int],
+    stride: Tuple[int],
+    channel_last: bool,
+) -> torch.Size:
+    if len(input.shape) == 3:
+        height_dim = 1 if channel_last else 2
+        input = input.unsqueeze(height_dim)
+
+    batch_size = input.shape[0]
+    n_input_plane = input.shape[3] if channel_last else input.shape[1]
+    input_height = input.shape[1] if channel_last else input.shape[2]
+    input_width = input.shape[2] if channel_last else input.shape[3]
+    output_height = (
+        input_height + 2 * padding[0] - (dilation[0] * (kernel_size[0] - 1) + 1)
+    ) // stride[0] + 1
+    output_width = (
+        input_width + 2 * padding[1] - (dilation[1] * (kernel_size[1] - 1) + 1)
+    ) // stride[1] + 1
+    n_output_plane = n_input_plane * kernel_size[0] * kernel_size[1]
+    output_size = torch.Size((batch_size, output_height * output_width, n_output_plane))
+    return torch.Size(output_size)
+
+
 # Return the overload packet for the edge op
 def get_edge_overload_packet(edge_op: EdgeOpOverload) -> EdgeOpOverloadPacket:
     edge_op_namespace, edge_op_name = (
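`get_im2row_output_size` returns `(batch, out_h * out_w, C * kh * kw)`, where `out_h`/`out_w` follow the usual convolution output formula and a 3-D input is first given a unit height dimension. A quick worked example, assuming the import path above (the concrete sizes are illustrative):

```python
import torch

from executorch.backends.cadence.aot.utils import get_im2row_output_size

x = torch.zeros(1, 3, 32, 32)  # NCHW input
size = get_im2row_output_size(
    x,
    kernel_size=(3, 3),
    dilation=(1, 1),
    padding=(1, 1),
    stride=(2, 2),
    channel_last=False,
)
# out_h = out_w = (32 + 2*1 - (1*(3-1) + 1)) // 2 + 1 = 16
# patch length  = 3 * 3 * 3 = 27
assert size == torch.Size([1, 16 * 16, 27])
```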

backends/cadence/reference/operators/im2row_out.cpp

+86

@@ -207,6 +207,92 @@ void im2row_out(
 #undef typed_im2row
 }
 
+void im2row_per_tensor_out(
+    __ET_UNUSED KernelRuntimeContext& ctx,
+    const Tensor& input,
+    IntArrayRef kernel_size,
+    IntArrayRef dilation,
+    IntArrayRef padding,
+    IntArrayRef stride,
+    int64_t in_zero_point,
+    bool channel_last,
+    Tensor& out) {
+  // Compute the input tensor's dims
+  bool unit_height = input.dim() == 3;
+  const int32_t batch_size = input.size(0);
+  const int32_t in_c =
+      channel_last ? input.size(3 - unit_height) : input.size(1);
+  const int32_t in_h =
+      unit_height ? 1 : (channel_last ? input.size(1) : input.size(2));
+  const int32_t in_w =
+      channel_last ? input.size(2 - unit_height) : input.size(3 - unit_height);
+
+  // Get the kernel parameters
+  int32_t kernel_h = kernel_size[0];
+  int32_t kernel_w = kernel_size[1];
+  int32_t dilation_h = dilation[0];
+  int32_t dilation_w = dilation[1];
+  int32_t pad_h = padding[0];
+  int32_t pad_w = padding[1];
+  int32_t stride_h = stride[0];
+  int32_t stride_w = stride[1];
+
+  // If we were to apply a convolution on the input tensor, compute the output
+  // height and width.
+  int32_t out_h =
+      (in_h + 2 * pad_h - dilation_h * (kernel_h - 1) - 1) / stride_h + 1;
+  int32_t out_w =
+      (in_w + 2 * pad_w - dilation_w * (kernel_w - 1) - 1) / stride_w + 1;
+
+  ET_DCHECK_MSG(
+      (out_h * out_w) == out.size(1), "dimension mismatch for output");
+  ET_DCHECK_MSG(
+      (kernel_h * kernel_w * in_c) == out.size(2),
+      "dimension mismatch for output");
+
+#define typed_im2row_per_tensor(dtype, ctype) \
+  case ScalarType::dtype: { \
+    const ctype* __restrict__ in_data = input.const_data_ptr<ctype>(); \
+    ctype* __restrict__ out_data = out.mutable_data_ptr<ctype>(); \
+    int32_t in_plane = in_c * in_h * in_w; \
+    int32_t out_plane = kernel_h * kernel_w * in_c * out_h * out_w; \
+    for (size_t n = 0; n < batch_size; ++n) { \
+      im2row_<ctype>( \
+          &in_data[n * in_plane], \
+          in_zero_point, \
+          in_c, \
+          in_h, \
+          in_w, \
+          out_h, \
+          out_w, \
+          kernel_h, \
+          kernel_w, \
+          pad_h, \
+          pad_w, \
+          stride_h, \
+          stride_w, \
+          dilation_h, \
+          dilation_w, \
+          &out_data[n * out_plane], \
+          channel_last); \
+    } \
+    break; \
+  }
+
+  ScalarType dtype = input.scalar_type();
+  switch (dtype) {
+    typed_im2row_per_tensor(Float, float);
+    typed_im2row_per_tensor(Byte, uint8_t);
+    typed_im2row_per_tensor(Char, int8_t);
+    default:
+      ET_DCHECK_MSG(
+          false,
+          "im2row.per_tensor not implemented for dtype %s",
+          torch::executor::toString(dtype));
+  }
+#undef typed_im2row_per_tensor
+}
+
 } // namespace native
 } // namespace reference
 } // namespace impl
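For intuition, the reference kernel extracts the same sliding patches that `torch.nn.functional.unfold` produces, laid out row-wise as `(N, out_h * out_w, C * kh * kw)` rather than unfold's `(N, C * kh * kw, L)`; with `in_zero_point = 0` the padded border values also agree. A rough cross-check of the geometry only (the exact ordering within the patch dimension depends on `im2row_`'s layout, which is not shown in this diff):

```python
import torch
import torch.nn.functional as F

x = torch.randn(1, 3, 32, 32)
# unfold: (N, C * kh * kw, L) with L = out_h * out_w
cols = F.unfold(x, kernel_size=3, dilation=1, padding=1, stride=2)
# im2row lays the same patches out row-wise: (N, L, C * kh * kw)
im2row_like = cols.transpose(1, 2)
assert im2row_like.shape == (1, 16 * 16, 3 * 3 * 3)
```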

backends/cadence/reference/operators/requantize_out.cpp

+104

@@ -157,6 +157,110 @@ Tensor& requantize_out(
   return out;
 }
 
+// Requantize the int8_t/uint8_t input tensor to a uint8_t/int8_t out tensor.
+// The scale and zero_point for requantization are in the args.
+Tensor& requantize_per_tensor_out(
+    KernelRuntimeContext& ctx,
+    const Tensor& input,
+    double in_scale,
+    int64_t in_zero_point,
+    double out_scale,
+    int64_t out_zero_point,
+    const ScalarType out_dtype,
+    Tensor& out) {
+  ET_KERNEL_CHECK_MSG(
+      ctx,
+      out.scalar_type() == out_dtype,
+      InvalidArgument,
+      out,
+      "Out tensor dtype (%s) does not match the passed in out dtype (%s)",
+      torch::executor::toString(out.scalar_type()),
+      torch::executor::toString(out_dtype));
+
+  const size_t numel = out.numel();
+  ScalarType in_dtype = input.scalar_type();
+
+  // Assert that the output tensor's dtype is same as out_dtype.
+  ET_KERNEL_CHECK_MSG(
+      ctx,
+      out_dtype == out.scalar_type(),
+      InvalidArgument,
+      out,
+      "Out dtype %s does not match requant dtype %s",
+      torch::executor::toString(out.scalar_type()),
+      torch::executor::toString(out_dtype));
+
+#define typed_requantize(ctype, dtype) \
+  const ctype* input_data = input.const_data_ptr<ctype>(); \
+  dtype* out_data = out.mutable_data_ptr<dtype>(); \
+  kernels::requantize<ctype, dtype>( \
+      out_data, \
+      input_data, \
+      static_cast<float>(in_scale), \
+      static_cast<int32_t>(in_zero_point), \
+      1.0 / static_cast<float>(out_scale), \
+      static_cast<int32_t>(out_zero_point), \
+      numel);
+
+#define typed_requantize_in(ctype) \
+  switch (out_dtype) { \
+    case ScalarType::Byte: { \
+      typed_requantize(ctype, uint8_t); \
+      break; \
+    } \
+    case ScalarType::Char: { \
+      typed_requantize(ctype, int8_t); \
+      break; \
+    } \
+    case ScalarType::UInt16: { \
+      typed_requantize(ctype, uint16_t); \
+      break; \
+    } \
+    case ScalarType::Short: { \
+      typed_requantize(ctype, int16_t); \
+      break; \
+    } \
+    default: \
+      ET_KERNEL_CHECK_MSG( \
+          ctx, \
+          false, \
+          InvalidArgument, \
+          out, \
+          "Unhandled output dtype %s", \
+          torch::executor::toString(out_dtype)); \
+  }
+
+  switch (in_dtype) {
+    case ScalarType::Byte: {
+      typed_requantize_in(uint8_t);
+      break;
+    }
+    case ScalarType::Char: {
+      typed_requantize_in(int8_t);
+      break;
+    }
+    case ScalarType::UInt16: {
+      typed_requantize_in(uint16_t);
+      break;
+    }
+    case ScalarType::Short: {
+      typed_requantize_in(int16_t);
+      break;
+    }
+    default:
+      ET_KERNEL_CHECK_MSG(
+          ctx,
+          false,
+          InvalidArgument,
+          out,
+          "Unhandled input dtype %s",
+          torch::executor::toString(in_dtype));
+  }
+#undef typed_requantize_in
+#undef typed_requantize
+  return out;
+}
+
 }; // namespace native
 }; // namespace reference
 }; // namespace impl
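The per-tensor variant performs the same requantization as the existing tensor-overload kernel, just with the scales and zero points passed as plain scalars; note that it hands `1.0 / out_scale` to `kernels::requantize`, so the kernel multiplies by the inverse output scale. `kernels::requantize` itself is not part of this diff; the sketch below is a plausible reference for the standard per-tensor requantization math it corresponds to, not the library's actual implementation:

```python
import torch


def requantize_per_tensor_ref(
    x_q: torch.Tensor,   # already-quantized input (int8/uint8/int16/uint16)
    in_scale: float,
    in_zero_point: int,
    out_scale: float,
    out_zero_point: int,
    out_dtype: torch.dtype,
) -> torch.Tensor:
    # Dequantize with the input parameters, then re-quantize with the output ones.
    real = (x_q.to(torch.float32) - in_zero_point) * in_scale
    q = torch.round(real * (1.0 / out_scale)) + out_zero_point
    info = torch.iinfo(out_dtype)
    return q.clamp(info.min, info.max).to(out_dtype)


x = torch.tensor([0, 64, 127], dtype=torch.int8)
y = requantize_per_tensor_ref(x, 0.1, 0, 0.05, 10, torch.uint8)
assert y.tolist() == [10, 138, 255]  # 127 maps to 264 and is clamped to 255
```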
