Skip to content

Commit 5ca7283

Browse files
committed
Run CI using Triton CPU backend
stack-info: PR: #174, branch: oulgen/stack/8
1 parent b460e5f commit 5ca7283

File tree

3 files changed

+63
-4
lines changed

3 files changed

+63
-4
lines changed

.github/scripts/install_triton.sh

Lines changed: 33 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,37 @@
11
#!/bin/bash
22
set -ex
3+
4+
# Parse command line arguments
5+
USE_CPU_BACKEND=false
6+
while [[ $# -gt 0 ]]; do
7+
case $1 in
8+
--cpu)
9+
USE_CPU_BACKEND=true
10+
shift
11+
;;
12+
*)
13+
echo "Unknown option: $1"
14+
exit 1
15+
;;
16+
esac
17+
done
18+
319
(
420
mkdir -p /tmp/$USER
521
pushd /tmp/$USER
622
pip uninstall -y triton pytorch-triton || true
723
rm -rf triton/ || true
8-
git clone https://github.com/triton-lang/triton.git # install triton latest main
24+
25+
# Clone the appropriate repository based on backend
26+
if [ "$USE_CPU_BACKEND" = true ]; then
27+
# Install triton-cpu from triton-cpu repository
28+
git clone --recursive https://github.com/triton-lang/triton-cpu.git triton
29+
else
30+
# Install triton from main repository for GPU backend
31+
git clone https://github.com/triton-lang/triton.git triton
32+
fi
33+
34+
# Shared build process for both backends
935
(
1036
pushd triton/
1137
conda config --set channel_priority strict
@@ -14,10 +40,14 @@ set -ex
1440
conda install -y -c conda-forge gcc_linux-64=13 gxx_linux-64=13 gcc=13 gxx=13
1541
pip install -r python/requirements.txt
1642
# Use TRITON_PARALLEL_LINK_JOBS=2 to avoid OOM on CPU CI machines
17-
MAX_JOBS=$(nproc) TRITON_PARALLEL_LINK_JOBS=2 pip install . # install to conda site-packages/ folder
43+
if [ "$USE_CPU_BACKEND" = true ]; then
44+
MAX_JOBS=$(nproc) TRITON_PARALLEL_LINK_JOBS=2 pip install -e python # install to conda site-packages/ folder
45+
else
46+
MAX_JOBS=$(nproc) TRITON_PARALLEL_LINK_JOBS=2 pip install . # install to conda site-packages/ folder
47+
fi
1848
popd
1949
)
20-
rm -rf triton/
50+
#rm -rf triton/
2151
popd
2252
)
2353
exit 0

.github/workflows/test.yml

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,3 +39,31 @@ jobs:
3939
./.github/scripts/install_triton.sh
4040
pip install -r requirements.txt
4141
python -m unittest discover -s test/ -p "*.py" -v -t .
42+
43+
test_cpu_triton:
44+
name: test-cpu-py${{ matrix.python-version }}-triton-cpu
45+
strategy:
46+
fail-fast: true
47+
matrix:
48+
python-version: ["3.12"]
49+
include:
50+
- name: A10G
51+
runs-on: linux.g5.4xlarge.nvidia.gpu
52+
torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu126'
53+
gpu-arch-type: "cuda"
54+
gpu-arch-version: "12.6"
55+
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
56+
with:
57+
timeout: 120
58+
runner: ${{ matrix.runs-on }}
59+
gpu-arch-type: ${{ matrix.gpu-arch-type }}
60+
gpu-arch-version: ${{ matrix.gpu-arch-version }}
61+
submodules: recursive
62+
script: |
63+
conda create -n venv python=${{ matrix.python-version }} -y
64+
conda activate venv
65+
python -m pip install --upgrade pip
66+
pip install ${{ matrix.torch-spec }}
67+
./.github/scripts/install_triton.sh --cpu
68+
pip install -r requirements.txt
69+
TRITON_CPU_BACKEND=1 python -m unittest discover -s test/ -p "*.py" -v -t .

helion/_testing.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from __future__ import annotations
22

33
import importlib
4+
import os
45
import sys
56
from typing import TYPE_CHECKING
67

@@ -15,7 +16,7 @@
1516
from .runtime.kernel import Kernel
1617

1718

18-
DEVICE = torch.device("cuda")
19+
DEVICE = torch.device("cuda" if os.environ.get("TRITON_CPU_BACKEND") != "1" else "cpu")
1920

2021

2122
def import_path(filename: Path) -> types.ModuleType:

0 commit comments

Comments (0)