
Commit e8935b6

Author: pytorchbot
Commit message: 2025-04-12 nightly release (6c3e421)
1 parent: 58923b2

39 files changed: +512 -204 lines

.github/workflows/android-release-artifacts.yml (+11 -3)

@@ -11,6 +11,8 @@ on:
         description: Upload the AAR to maven staging repository
         required: false
         type: boolean
+  schedule:
+    - cron: 0 10 * * *
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}
@@ -26,6 +28,10 @@ jobs:
       shell: bash
       run: |
         VERSION="${{ inputs.version }}"
+        if [ -z "$VERSION" ]; then
+          echo "No version name specified. Will create a snapshot AAR"
+          exit 0
+        fi
         if curl -I "https://ossci-android.s3.amazonaws.com/executorch/release/${VERSION}/executorch.aar" | grep "200 OK"; then
           echo "AAR already exists at https://ossci-android.s3.amazonaws.com/executorch/release/${VERSION}/executorch.aar"
           echo "Will skip build/upload"
@@ -107,6 +113,8 @@ jobs:
         pip install awscli==1.32.18
         AWS_CMD="aws s3 cp"
         VERSION="${{ inputs.version }}"
-        VERSION_NAME="${VERSION:-temp_snapshot}"
-        ${AWS_CMD} executorch.aar s3://ossci-android/executorch/release/${VERSION_NAME}/executorch.aar --acl public-read
-        ${AWS_CMD} executorch.aar.sha256sums s3://ossci-android/executorch/release/${VERSION_NAME}/executorch.aar.sha256sums --acl public-read
+        if [ -z "$VERSION" ]; then
+          VERSION="snapshot-$(date +"%Y%m%d")"
+        fi
+        ${AWS_CMD} executorch.aar s3://ossci-android/executorch/release/${VERSION}/executorch.aar --acl public-read
+        ${AWS_CMD} executorch.aar.sha256sums s3://ossci-android/executorch/release/${VERSION}/executorch.aar.sha256sums --acl public-read
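The net effect of these hunks is one versioning rule: an explicit `inputs.version` wins, while a scheduled (cron) run with no version falls back to a dated snapshot name that also becomes the S3 key. A minimal Python sketch of that rule, not part of the commit; the helper name `aar_s3_key` is purely illustrative:

```python
from datetime import date

def aar_s3_key(version: str | None) -> str:
    # Mirrors the workflow's fallback: no explicit version means a dated snapshot.
    if not version:
        version = f"snapshot-{date.today():%Y%m%d}"
    return f"s3://ossci-android/executorch/release/{version}/executorch.aar"

print(aar_s3_key("0.6.0"))  # .../release/0.6.0/executorch.aar
print(aar_s3_key(None))     # .../release/snapshot-YYYYMMDD/executorch.aar
```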

backends/apple/coreml/README.md (+1 -106)

@@ -1,8 +1,7 @@
 # ExecuTorch Core ML Delegate
 
-
 This subtree contains the Core ML Delegate implementation for ExecuTorch.
-Core ML is an optimized framework for running machine learning models on Apple devices. The delegate is the mechanism for leveraging the Core ML framework to accelerate operators when running on Apple devices.
+Core ML is an optimized framework for running machine learning models on Apple devices. The delegate is the mechanism for leveraging the Core ML framework to accelerate operators when running on Apple devices. To learn how to use the CoreML delegate, see the [documentation](https://github.com/pytorch/executorch/blob/main/docs/source/backends-coreml.md).
 
 ## Layout
 - `compiler/` : Lowers a module to Core ML backend.
@@ -19,110 +18,6 @@ Core ML is an optimized framework for running machine learning models on Apple d
 - `workspace` : Xcode workspace for the runtime.
 - `third-party/`: External dependencies.
 
-## Partition and Delegation
-
-To delegate a Program to the **Core ML** backend, the client must call `to_backend` with the **CoreMLPartitioner**.
-
-```python
-import torch
-import executorch.exir
-
-from executorch.backends.apple.coreml.compiler import CoreMLBackend
-from executorch.backends.apple.coreml.partition import CoreMLPartitioner
-
-class Model(torch.nn.Module):
-    def __init__(self):
-        super().__init__()
-
-    def forward(self, x):
-        return torch.sin(x)
-
-source_model = Model()
-example_inputs = (torch.ones(1), )
-
-# Export the source model to Edge IR representation
-aten_program = torch.export.export(source_model, example_inputs)
-edge_program_manager = executorch.exir.to_edge(aten_program)
-
-# Delegate to Core ML backend
-delegated_program_manager = edge_program_manager.to_backend(CoreMLPartitioner())
-
-# Serialize delegated program
-executorch_program = delegated_program_manager.to_executorch()
-with open("model.pte", "wb") as f:
-    f.write(executorch_program.buffer)
-```
-
-The module will be fully or partially delegated to **Core ML**, depending on whether all or part of ops are supported by the **Core ML** backend. User may force skip certain ops by `CoreMLPartitioner(skip_ops_for_coreml_delegation=...)`
-
-The `to_backend` implementation is a thin wrapper over [coremltools](https://apple.github.io/coremltools/docs-guides/), `coremltools` is responsible for converting an **ExportedProgram** to a **MLModel**. The converted **MLModel** data is saved, flattened, and returned as bytes to **ExecuTorch**.
-
-## Quantization
-
-To quantize a Program in a Core ML favored way, the client may utilize **CoreMLQuantizer**.
-
-```python
-import torch
-import executorch.exir
-
-from torch.export import export_for_training
-from torch.ao.quantization.quantize_pt2e import (
-    convert_pt2e,
-    prepare_pt2e,
-    prepare_qat_pt2e,
-)
-
-from executorch.backends.apple.coreml.quantizer import CoreMLQuantizer
-from coremltools.optimize.torch.quantization.quantization_config import (
-    LinearQuantizerConfig,
-    QuantizationScheme,
-)
-
-class Model(torch.nn.Module):
-    def __init__(self) -> None:
-        super().__init__()
-        self.conv = torch.nn.Conv2d(
-            in_channels=3, out_channels=16, kernel_size=3, padding=1
-        )
-        self.relu = torch.nn.ReLU()
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        a = self.conv(x)
-        return self.relu(a)
-
-source_model = Model()
-example_inputs = (torch.randn((1, 3, 256, 256)), )
-
-pre_autograd_aten_dialect = export_for_training(source_model, example_inputs).module()
-
-quantization_config = LinearQuantizerConfig.from_dict(
-    {
-        "global_config": {
-            "quantization_scheme": QuantizationScheme.symmetric,
-            "activation_dtype": torch.quint8,
-            "weight_dtype": torch.qint8,
-            "weight_per_channel": True,
-        }
-    }
-)
-quantizer = CoreMLQuantizer(quantization_config)
-
-# For post-training quantization, use `prepare_pt2e`
-# For quantization-aware trainin,g use `prepare_qat_pt2e`
-prepared_graph = prepare_pt2e(pre_autograd_aten_dialect, quantizer)
-
-prepared_graph(*example_inputs)
-converted_graph = convert_pt2e(prepared_graph)
-```
-
-The `converted_graph` is the quantized torch model, and can be delegated to **Core ML** similarly through **CoreMLPartitioner**
-
-## Runtime
-
-To execute a Core ML delegated program, the application must link to the `coremldelegate` library. Once linked there are no additional steps required, ExecuTorch when running the program would call the Core ML runtime to execute the Core ML delegated part of the program.
-
-Please follow the instructions described in the [Core ML setup](/backends/apple/coreml/setup.md) to link the `coremldelegate` library.
-
 ## Help & Improvements
 If you have problems or questions or have suggestions for ways to make
 implementation and testing better, please create an issue on [github](https://www.github.com/pytorch/executorch/issues).

backends/arm/_passes/__init__.py (+1)

@@ -20,6 +20,7 @@
 from .convert_to_clamp import ConvertToClampPass  # noqa
 from .decompose_batchnorm_pass import DecomposeBatchNormPass  # noqa
 from .decompose_div_pass import DecomposeDivPass  # noqa
+from .decompose_gelu_pass import DecomposeGeluPass  # noqa
 from .decompose_layernorm_pass import DecomposeLayerNormPass  # noqa
 from .decompose_leaky_relu_pass import DecomposeLeakyReLUPass  # noqa
 from .decompose_linear_pass import DecomposeLinearPass  # noqa

backends/arm/_passes/arm_pass_manager.py (+2)

@@ -25,6 +25,7 @@
     ConvertToClampPass,
     DecomposeBatchNormPass,
     DecomposeDivPass,
+    DecomposeGeluPass,
     DecomposeLayerNormPass,
     DecomposeLeakyReLUPass,
     DecomposeLinearPass,
@@ -132,6 +133,7 @@ def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
         self.add_pass(ConvertMeanDimToAveragePoolPass())
         self.add_pass(DecomposeDivPass())
         self.add_pass(DecomposeSoftmaxPass())
+        self.add_pass(DecomposeGeluPass())
         self.add_pass(ConvertFullLikeToFullPass())
         self.add_pass(ConvertToClampPass())
         self.add_pass(ConvertMinMaxPass())

backends/arm/_passes/cast_int64_pass.py (-1)

@@ -12,7 +12,6 @@
 from torch._export.utils import is_buffer
 
 logger = logging.getLogger(__name__)
-logger.setLevel(logging.WARNING)
 
 
 class CastInt64BuffersToInt32Pass(ExportPass):

backends/arm/_passes/decompose_gelu_pass.py (new file, +149)

@@ -0,0 +1,149 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from executorch.backends.arm._passes.arm_pass_utils import get_node_arg
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import ExportPass
+
+torch_gelu = (torch.ops.aten.gelu.default,)
+
+edge_gelu = (exir_ops.edge.aten.gelu.default,)
+
+
+def _get_gelu_ops(op) -> tuple:
+    """
+    Returns the operators needed to decompose GELU
+    """
+
+    if op in edge_gelu:
+        return (
+            exir_ops.edge.aten.full.default,
+            exir_ops.edge.aten.add.Tensor,
+            exir_ops.edge.aten.mul.Tensor,
+            exir_ops.edge.aten.tanh.default,
+            exir_ops.edge.aten.erf.default,
+        )
+    if op in torch_gelu:
+        return (
+            torch.ops.aten.full.default,
+            torch.ops.aten.add.Tensor,
+            torch.ops.aten.mul.Tensor,
+            torch.ops.aten.tanh.default,
+            torch.ops.aten.erf.default,
+        )
+    raise RuntimeError(f"Can't get GeLU decomposition ops for op {op}")
+
+
+class DecomposeGeluPass(ExportPass):
+    """
+    This pass decomposes the GELU operator into primitive ops.
+    Aiming to adhere closely to the reference implementations built into
+    ExecuTorch. Including using the same pre-calculated constants.
+
+    This operator has two formulae depending on the value of the
+    approximate argument. Examples below include the added full
+    operators necessary for the initialization for constants used in
+    each respective formula.
+
+    aten.gelu(x, approximate="none") becomes:
+        %FULL_0_5 = full()
+        %FULL_1 = full()
+        %FULL_SQRT1_2 = full()
+        %op1 = mul(x, %FULL_SQRT1_2)
+        %op2 = erf(%op1)
+        %op3 = add(%op2, %FULL_1)
+        %op4 = mul(%op3, %FULL_0_5)
+        %op5 = mul(%x, %op4)
+
+    aten.gelu(x, approximate="tanh") becomes:
+        %FULL_0_5 = full()
+        %FULL_1 = full()
+        %FULL_SQRT2 = full()
+        %FULL_2_SQRTPI = full()
+        %FULL_CUBE_COEFF = full()
+        %SQRT_MUL = mul(%FULL_SQRT2, %FULL_2_SQRTPI)
+        %SQRT_2_PI = mul(%SQRT_MUL, %FULL_0_5)
+        %sqr_x = mul(x, x)
+        %cube_x = mul(sqr_x, x)
+        %op1 = mul(%cube_x, %FULL_CUBE_COEFF)
+        %op2 = add(%x, %op1)
+        %op3 = mul(%op2, %SQRT_2_PI)
+        %op4 = tanh(%op3)
+        %op5 = add(%op4, %FULL_1)
+        %op6 = mul(%x, %op5)
+        %op7 = mul(%op6, %FULL_0_5)
+    """
+
+    def call_operator(self, op, args, kwargs, meta):
+        if op not in torch_gelu + edge_gelu:
+            return super().call_operator(op, args, kwargs, meta)
+
+        full_op, add_op, mul_op, tanh_op, erf_op = _get_gelu_ops(op)
+
+        input = get_node_arg(args, 0)
+        # If approximate is default (none) it does not appear in kwargs
+        approximate = get_node_arg(kwargs, "approximate", "none")
+
+        shape = meta["val"].size()
+        dtype = meta["val"].dtype
+
+        FULL_0_5 = super().call_operator(
+            full_op, ([1] * len(shape), 0.5), {"dtype": dtype}, meta
+        )
+        FULL_1 = super().call_operator(
+            full_op, ([1] * len(shape), 1), {"dtype": dtype}, meta
+        )
+
+        if approximate == "none":
+            # Constant mirrors ExecuTorch implementation for parity.
+            FULL_SQRT1_2 = super().call_operator(
+                full_op, ([1] * len(shape), 0.70710678118654752440), {}, meta
+            )
+
+            op1 = super().call_operator(mul_op, (input, FULL_SQRT1_2), {}, meta)
+            op2 = super().call_operator(erf_op, (op1,), {}, meta)
+            op3 = super().call_operator(add_op, (op2, FULL_1), {}, meta)
+            op4 = super().call_operator(mul_op, (op3, FULL_0_5), {}, meta)
+            return super().call_operator(mul_op, (input, op4), {}, meta)
+
+        elif approximate == "tanh":
+            # Constants mirror ExecuTorch implementation for parity.
+            FULL_SQRT2 = super().call_operator(
+                full_op,
+                ([1] * len(shape), 1.41421356237309504880),
+                {"dtype": dtype},
+                meta,
+            )
+            FULL_2_SQRTPI = super().call_operator(
+                full_op,
+                ([1] * len(shape), 1.12837916709551257390),
+                {"dtype": dtype},
+                meta,
+            )
+            FULL_CUBE_COEFF = super().call_operator(
+                full_op, ([1] * len(shape), 0.044715), {"dtype": dtype}, meta
+            )
+
+            # Mirrors ExecuTorch implementations for calculating this value
+            SQRT_MUL = super().call_operator(
+                mul_op, (FULL_SQRT2, FULL_2_SQRTPI), {}, meta
+            )
+            SQRT_2_PI = super().call_operator(mul_op, (SQRT_MUL, FULL_0_5), {}, meta)
+
+            # Avoiding using POW in order to reduce pass order reliance.
+            sqr_x = super().call_operator(mul_op, (input, input), {}, meta)
+            cube_x = super().call_operator(mul_op, (sqr_x, input), {}, meta)
+            op1 = super().call_operator(mul_op, (cube_x, FULL_CUBE_COEFF), {}, meta)
+            op2 = super().call_operator(add_op, (input, op1), {}, meta)
+            op3 = super().call_operator(mul_op, (op2, SQRT_2_PI), {}, meta)
+            op4 = super().call_operator(tanh_op, (op3,), {}, meta)
+            op5 = super().call_operator(add_op, (op4, FULL_1), {}, meta)
+            op6 = super().call_operator(mul_op, (input, op5), {}, meta)
+            return super().call_operator(mul_op, (op6, FULL_0_5), {}, meta)
+        else:
+            raise RuntimeError(
+                f"approximate argument expected 'none' or 'tanh' but got {approximate}"
+            )
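The constants used by the new pass are the standard GELU ones: `0.7071...` is 1/sqrt(2) for the exact erf form, and `1.4142... * 1.1284... * 0.5` collapses to sqrt(2/pi) for the tanh approximation. A short sanity check, separate from the pass itself, that the two formulae spelled out in the docstring agree with `torch.nn.functional.gelu`:

```python
import torch

x = torch.linspace(-3.0, 3.0, steps=61)

# Exact form: 0.5 * x * (1 + erf(x / sqrt(2))), using the same constant as the pass.
exact = 0.5 * x * (1 + torch.erf(x * 0.70710678118654752440))
assert torch.allclose(exact, torch.nn.functional.gelu(x, approximate="none"), atol=1e-6)

# tanh form: 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))),
# with sqrt(2/pi) built the same way the pass does: sqrt(2) * (2/sqrt(pi)) * 0.5.
sqrt_2_over_pi = 1.41421356237309504880 * 1.12837916709551257390 * 0.5
inner = sqrt_2_over_pi * (x + 0.044715 * x * x * x)
approx = 0.5 * x * (1 + torch.tanh(inner))
assert torch.allclose(approx, torch.nn.functional.gelu(x, approximate="tanh"), atol=1e-6)
```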

backends/arm/_passes/insert_table_ops.py (+14)

@@ -56,6 +56,7 @@ class TableOps:
     # Targets that must be treated explicitly
     special_table_ops: Set[EdgeOpOverload] = {
         exir_ops.edge.aten.pow.Tensor_Scalar,
+        exir_ops.edge.aten.gelu.default,
     }
 
     def __init__(self, exported_program: ExportedProgram):
@@ -76,6 +77,19 @@ def __getitem__(self, node: Node):
                 # Exponent is a constant. Embed it into a lambda.
                 exp = cast(int, node.args[1])
                 return lambda x: torch.pow(x, exp).flatten()
+            case exir_ops.edge.aten.gelu.default:
+                # If kwargs not present it is default "none"
+                approximate = cast(
+                    str,
+                    (
+                        node.kwargs["approximate"]
+                        if "approximate" in node.kwargs
+                        else "none"
+                    ),
+                )
+                return lambda x: torch.nn.functional.gelu(
+                    x, approximate=approximate
+                ).flatten()
             case _:
                 # Op must be handled if it's inside self.special_ops
                 raise AssertionError("Unhandled table operation")
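The lambda returned for `gelu` is the per-element reference function that the table-insertion machinery in this file can evaluate; how it is actually sampled is outside this hunk, so the sweep below is only an illustration (the input range and the 256-entry table size are assumptions, not taken from the pass):

```python
import torch
import torch.nn.functional as F

approximate = "none"  # the default used when the node carries no "approximate" kwarg
table_fn = lambda x: F.gelu(x, approximate=approximate).flatten()

# Sweep a hypothetical dequantized int8 input range and tabulate the outputs.
candidate_inputs = torch.linspace(-4.0, 4.0, steps=256)
table = table_fn(candidate_inputs)
print(table.shape)  # torch.Size([256])
```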

backends/arm/arm_backend.py (-6)

@@ -11,19 +11,13 @@
 # JIT compiler flows.
 #
 
-import logging
-
 from typing import List, Optional
 
 from executorch.backends.arm.tosa_specification import TosaSpecification
 
 from executorch.exir.backend.compile_spec_schema import CompileSpec
 
 
-logger = logging.getLogger(__name__)
-logger.setLevel(logging.WARNING)
-
-
 class ArmCompileSpecBuilder:
     def __init__(self):
         self.compile_spec: List[CompileSpec] = []

backends/arm/operator_support/right_shift_support.py (-1)

@@ -17,7 +17,6 @@
 from executorch.exir.dialects._ops import ops as exir_ops
 
 logger = logging.getLogger(__name__)
-logger.setLevel(logging.WARNING)
 
 
 @register_tosa_support_check

backends/arm/operator_support/slice_copy_support.py (-1)

@@ -16,7 +16,6 @@
 from executorch.exir.dialects._ops import ops as exir_ops
 
 logger = logging.getLogger(__name__)
-logger.setLevel(logging.WARNING)
 
 
 @register_tosa_support_check
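Several of the hunks above delete the same line, `logger.setLevel(logging.WARNING)`, leaving only `logging.getLogger(__name__)`. The sketch below (module and function names are illustrative, not from the commit) shows the resulting behaviour: without a hard-coded level, the library logger inherits whatever level the application configures.

```python
import logging

# Library-style module logger: no setLevel() call here, so the effective
# level is inherited from the root logger that the application configures.
logger = logging.getLogger("executorch.example_pass")  # illustrative name

def run_pass() -> None:
    logger.debug("detailed pass tracing")       # shown only if the app enables DEBUG
    logger.warning("something worth flagging")  # shown under the default WARNING level

if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)  # the application decides verbosity
    run_pass()
```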
