diff --git a/.github/scripts/fbgemm_gpu_build.bash b/.github/scripts/fbgemm_gpu_build.bash index 7c558c72c6..27d79c515f 100644 --- a/.github/scripts/fbgemm_gpu_build.bash +++ b/.github/scripts/fbgemm_gpu_build.bash @@ -141,7 +141,7 @@ __configure_fbgemm_gpu_build_cpu () { # Update the package name and build args depending on if CUDA is specified echo "[BUILD] Setting CPU-only build args ..." build_args=( - --package_variant=cpu + --build-variant=cpu ) } @@ -149,7 +149,7 @@ __configure_fbgemm_gpu_build_docs () { # Update the package name and build args depending on if CUDA is specified echo "[BUILD] Setting CPU-only (docs) build args ..." build_args=( - --package_variant=docs + --build-variant=docs ) } @@ -206,7 +206,7 @@ __configure_fbgemm_gpu_build_rocm () { # https://rocm.docs.amd.com/en/docs-6.1.1/reference/rocmcc.html echo "[BUILD] Setting ROCm build args ..." build_args=( - --package_variant=rocm + --build-variant=rocm # HIP_ROOT_DIR now required for HIP to be correctly detected by CMake -DHIP_ROOT_DIR=/opt/rocm # ROCm CMake complains about missing AMDGPU_TARGETS, so we explicitly set this @@ -284,7 +284,7 @@ __configure_fbgemm_gpu_build_cuda () { echo "[BUILD] Setting CUDA build args ..." 
build_args=( - --package_variant=cuda + --build-variant=cuda --nvml_lib_path="${nvml_lib_path}" --nccl_lib_path="${nccl_lib_path}" # Pass to PyTorch CMake @@ -303,10 +303,9 @@ __configure_fbgemm_gpu_build_genai () { __configure_fbgemm_gpu_build_cuda "$fbgemm_variant_targets" || return 1 - # Replace the package_variant flag, since GenAI is also a CUDA-type build - for i in "${!build_args[@]}"; do - build_args[i]="${build_args[i]/--package_variant=cuda/--package_variant=genai}" - done + build_args+=( + --build-target=genai + ) } # shellcheck disable=SC2120 diff --git a/.github/scripts/fbgemm_gpu_install.bash b/.github/scripts/fbgemm_gpu_install.bash index 139c6cdc75..2eabfe635c 100644 --- a/.github/scripts/fbgemm_gpu_install.bash +++ b/.github/scripts/fbgemm_gpu_install.bash @@ -39,12 +39,15 @@ __install_fetch_version_and_variant_info () { echo "[CHECK] Printing out the FBGEMM-GPU version ..." # shellcheck disable=SC2086,SC2155 - installed_fbgemm_gpu_version=$(conda run ${env_prefix} python -c "import fbgemm_gpu; print(fbgemm_gpu.__version__)") + installed_fbgemm_target=$(conda run ${env_prefix} python -c "import fbgemm_gpu; print(fbgemm_gpu.__target__)") # shellcheck disable=SC2086,SC2155 - installed_fbgemm_gpu_variant=$(conda run ${env_prefix} python -c "import fbgemm_gpu; print(fbgemm_gpu.__variant__)") + installed_fbgemm_variant=$(conda run ${env_prefix} python -c "import fbgemm_gpu; print(fbgemm_gpu.__variant__)") + # shellcheck disable=SC2086,SC2155 + installed_fbgemm_version=$(conda run ${env_prefix} python -c "import fbgemm_gpu; print(fbgemm_gpu.__version__)") echo "################################################################################" - echo "[CHECK] The installed VERSION of FBGEMM_GPU is: ${installed_fbgemm_gpu_version}" - echo "[CHECK] The installed VARIANT of FBGEMM_GPU is: ${installed_fbgemm_gpu_variant}" + echo "[CHECK] The installed FBGEMM TARGET is: ${installed_fbgemm_target}" + echo "[CHECK] The installed FBGEMM VARIANT is: 
${installed_fbgemm_variant}" + echo "[CHECK] The installed FBGEMM VERSION is: ${installed_fbgemm_version}" echo "################################################################################" echo "" } @@ -53,7 +56,7 @@ __install_check_subpackages () { # shellcheck disable=SC2086,SC2155 local fbgemm_gpu_packages=$(conda run ${env_prefix} python -c "import fbgemm_gpu; print(dir(fbgemm_gpu))") - if [ "$installed_fbgemm_gpu_variant" == "cuda" ] || [ "$installed_fbgemm_gpu_variant" == "genai" ]; then + if [ "$installed_fbgemm_target" == "genai" ]; then # shellcheck disable=SC2086,SC2155 local experimental_packages=$(conda run ${env_prefix} python -c "import fbgemm_gpu.experimental; print(dir(fbgemm_gpu.experimental))") fi @@ -74,7 +77,7 @@ __install_check_subpackages () { "fbgemm_gpu.tbe.cache" ) - if [ "$installed_fbgemm_gpu_variant" != "genai" ]; then + if [ "$installed_fbgemm_target" != "genai" ]; then subpackages+=( "fbgemm_gpu.split_embedding_codegen_lookup_invokers" "fbgemm_gpu.tbe.ssd" @@ -89,7 +92,7 @@ __install_check_subpackages () { __install_check_operator_registrations () { echo "[INSTALL] Check for operator registrations ..." - if [ "$installed_fbgemm_gpu_variant" == "genai" ]; then + if [ "$installed_fbgemm_target" == "genai" ]; then local test_operators=( "torch.ops.fbgemm.nccl_init" "torch.ops.fbgemm.gqa_attn_splitk" diff --git a/.github/scripts/fbgemm_gpu_test.bash b/.github/scripts/fbgemm_gpu_test.bash index 59ef177745..e2ce5c1ea1 100644 --- a/.github/scripts/fbgemm_gpu_test.bash +++ b/.github/scripts/fbgemm_gpu_test.bash @@ -154,16 +154,16 @@ __setup_fbgemm_gpu_test () { # Configure the environment for ignored test suites for each FBGEMM_GPU # variant - if [ "$fbgemm_gpu_variant" == "cpu" ]; then + if [ "$fbgemm_build_variant" == "cpu" ]; then echo "[TEST] Configuring for CPU-based testing ..." 
__configure_fbgemm_gpu_test_cpu - elif [ "$fbgemm_gpu_variant" == "rocm" ]; then + elif [ "$fbgemm_build_variant" == "rocm" ]; then echo "[TEST] Configuring for ROCm-based testing ..." __configure_fbgemm_gpu_test_rocm else - echo "[TEST] FBGEMM_GPU variant is ${fbgemm_gpu_variant}; configuring for CUDA-based testing ..." + echo "[TEST] FBGEMM_GPU variant is ${fbgemm_build_variant}; configuring for CUDA-based testing ..." __configure_fbgemm_gpu_test_cuda fi @@ -189,9 +189,6 @@ __setup_fbgemm_gpu_test () { echo "[TEST] Checking imports ..." (test_python_import_package "${env_name}" fbgemm_gpu) || return 1 - if [ "$fbgemm_gpu_variant" != "genai" ]; then - (test_python_import_package "${env_name}" fbgemm_gpu.split_embedding_codegen_lookup_invokers) || return 1 - fi # Set the feature flags to enable experimental features as needed __set_feature_flags @@ -251,21 +248,19 @@ __run_fbgemm_gpu_tests_in_directory () { __determine_test_directories () { target_directories=() - if [ "$fbgemm_gpu_variant" != "genai" ]; then - target_directories+=( - fbgemm_gpu/test - ) - fi - - if [ "$fbgemm_gpu_variant" == "genai" ]; then + if [ "$fbgemm_build_target" == "genai" ]; then target_directories+=( fbgemm_gpu/experimental/example/test fbgemm_gpu/experimental/gemm/test fbgemm_gpu/experimental/gen_ai/test ) + else + target_directories+=( + fbgemm_gpu/test + ) fi - echo "[TEST] Determined the testing directories:" + echo "[TEST] Determined the test directories:" for test_dir in "${target_directories[@]}"; do echo "$test_dir" done @@ -274,14 +269,10 @@ __determine_test_directories () { test_all_fbgemm_gpu_modules () { env_name="$1" - fbgemm_gpu_variant="$2" if [ "$env_name" == "" ]; then - echo "Usage: ${FUNCNAME[0]} ENV_NAME [FBGEMM_GPU_VARIANT]" + echo "Usage: ${FUNCNAME[0]} ENV_NAME" echo "Example(s):" echo " ${FUNCNAME[0]} build_env # Test all FBGEMM_GPU modules applicable to to the installed variant" - echo " ${FUNCNAME[0]} build_env cpu # Test all FBGEMM_GPU modules applicable to 
CPU" - echo " ${FUNCNAME[0]} build_env cuda # Test all FBGEMM_GPU modules applicable to CUDA" - echo " ${FUNCNAME[0]} build_env rocm # Test all FBGEMM_GPU modules applicable to ROCm" return 1 else echo "################################################################################" @@ -295,14 +286,13 @@ test_all_fbgemm_gpu_modules () { # shellcheck disable=SC2155 local env_prefix=$(env_name_or_prefix "${env_name}") - # Determine the FBGEMM_GPU varaiant if needed - if [ "$fbgemm_gpu_variant" == "" ]; then - echo "[TEST] FBGEMM_GPU variant not explicitly provided by user; will automatically determine from the FBGEMM_GPU installation ..." - # shellcheck disable=SC2086 - fbgemm_gpu_variant=$(conda run ${env_prefix} python -c "import fbgemm_gpu; print(fbgemm_gpu.__variant__)") - echo "[TEST] Determined FBGEMM_GPU variant from installation: ${fbgemm_gpu_variant}" - echo "[TEST] Will be running tests specific to this variant ..." - fi + # Determine the FBGEMM build target and variant + # shellcheck disable=SC2086 + fbgemm_build_target=$(conda run ${env_prefix} python -c "import fbgemm_gpu; print(fbgemm_gpu.__target__)") + # shellcheck disable=SC2086 + fbgemm_build_variant=$(conda run ${env_prefix} python -c "import fbgemm_gpu; print(fbgemm_gpu.__variant__)") + echo "[TEST] Determined FBGEMM_GPU (target : variant) from installation: (${fbgemm_build_target} : ${fbgemm_build_variant})" + echo "[TEST] Will be running tests specific to this target and variant ..." 
# Determine the test directories to include for testing __determine_test_directories @@ -312,9 +302,9 @@ test_all_fbgemm_gpu_modules () { # Iterate through the test directories and run bulk tests for test_dir in "${target_directories[@]}"; do - cd "${test_dir}" || return 1 - __run_fbgemm_gpu_tests_in_directory "${env_name}" "${fbgemm_gpu_variant}" || return 1 - cd - || return 1 + cd "${test_dir}" || return 1 + __run_fbgemm_gpu_tests_in_directory "${env_name}" || return 1 + cd - || return 1 done } diff --git a/.github/scripts/nova_postscript.bash b/.github/scripts/nova_postscript.bash index 40448b2f55..92827ec355 100644 --- a/.github/scripts/nova_postscript.bash +++ b/.github/scripts/nova_postscript.bash @@ -66,7 +66,7 @@ fi $CONDA_RUN python3 -c "import torch; print('cuda.is_available() ', torch.cuda.is_available()); print ('device_count() ',torch.cuda.device_count());" cd "${FBGEMM_REPO}" || { echo "[NOVA] Failed to cd to ${FBGEMM_REPO} from $(pwd)"; }; -test_all_fbgemm_gpu_modules "${BUILD_ENV_NAME}" "${fbgemm_variant}" +test_all_fbgemm_gpu_modules "${BUILD_ENV_NAME}" end_time=$(date +%s) runtime=$((end_time-start_time)) start_time=${end_time} diff --git a/fbgemm_gpu/docs/src/fbgemm_genai/development/BuildInstructions.rst b/fbgemm_gpu/docs/src/fbgemm_genai/development/BuildInstructions.rst index 64a51a0f92..3999273951 100644 --- a/fbgemm_gpu/docs/src/fbgemm_genai/development/BuildInstructions.rst +++ b/fbgemm_gpu/docs/src/fbgemm_genai/development/BuildInstructions.rst @@ -147,7 +147,8 @@ toolchains have been properly installed. # Build the wheel artifact only python setup.py bdist_wheel \ - --package_variant=genai \ + --build-target=genai \ + --build-variant=cuda \ --python-tag="${python_tag}" \ --plat-name="${python_plat_name}" \ --nvml_lib_path=${NVML_LIB_PATH} \ @@ -156,11 +157,57 @@ toolchains have been properly installed. 
# Build and install the library into the Conda environment python setup.py install \ - --package_variant=genai \ + --build-target=genai \ + --build-variant=cuda \ --nvml_lib_path=${NVML_LIB_PATH} \ --nccl_lib_path=${NCCL_LIB_PATH} \ -DTORCH_CUDA_ARCH_LIST="${cuda_arch_list}" +.. _fbgemm-gpu.build.process.rocm: + +ROCm Build +---------- + +For ROCm builds, ``ROCM_PATH`` and ``PYTORCH_ROCM_ARCH`` need to be specified. +The presence of a ROCm device, however, is not required for building +the package. + +Similar to CUDA builds, building with Clang + ``libstdc++`` can be enabled by +appending ``--cxxprefix=$CONDA_PREFIX`` to the build command, presuming the +toolchains have been properly installed. + +.. code:: sh + + # !! Run in fbgemm_gpu/ directory inside the Conda environment !! + + export ROCM_PATH=/path/to/rocm + + # [OPTIONAL] Enable verbose HIPCC logs + export HIPCC_VERBOSE=1 + + # Build for the target architecture of the ROCm device installed on the machine (e.g. 'gfx908,gfx90a,gfx942') + # See https://rocm.docs.amd.com/en/latest/reference/gpu-arch-specs.html for list + export PYTORCH_ROCM_ARCH=$(${ROCM_PATH}/bin/rocminfo | grep -o -m 1 'gfx.*') + + # Build the wheel artifact only + python setup.py bdist_wheel \ + --build-target=genai \ + --build-variant=rocm \ + --python-tag="${python_tag}" \ + --plat-name="${python_plat_name}" \ + -DAMDGPU_TARGETS="${PYTORCH_ROCM_ARCH}" \ + -DHIP_ROOT_DIR="${ROCM_PATH}" \ + -DCMAKE_C_FLAGS="-DTORCH_USE_HIP_DSA" \ + -DCMAKE_CXX_FLAGS="-DTORCH_USE_HIP_DSA" + + # Build and install the library into the Conda environment + python setup.py install \ + --build-target=genai \ + --build-variant=rocm \ + -DAMDGPU_TARGETS="${PYTORCH_ROCM_ARCH}" \ + -DHIP_ROOT_DIR="${ROCM_PATH}" \ + -DCMAKE_C_FLAGS="-DTORCH_USE_HIP_DSA" \ + -DCMAKE_CXX_FLAGS="-DTORCH_USE_HIP_DSA" Post-Build Checks (For Developers) ---------------------------------- diff --git a/fbgemm_gpu/docs/src/fbgemm_gpu/development/BuildInstructions.rst 
b/fbgemm_gpu/docs/src/fbgemm_gpu/development/BuildInstructions.rst index 2a8332ea2f..1147107e23 100644 --- a/fbgemm_gpu/docs/src/fbgemm_gpu/development/BuildInstructions.rst +++ b/fbgemm_gpu/docs/src/fbgemm_gpu/development/BuildInstructions.rst @@ -542,16 +542,16 @@ For CPU-only builds, the ``--cpu_only`` flag needs to be specified. # Build the wheel artifact only python setup.py bdist_wheel \ - --package_variant=cpu \ + --build-variant=cpu \ --python-tag="${python_tag}" \ --plat-name="${python_plat_name}" # Build and install the library into the Conda environment (GCC) python setup.py install \ - --package_variant=cpu + --build-variant=cpu # NOTE: To build the package as part of generating the documentation, use - # `--package_variant=docs` flag instead! + # `--build-variant=docs` flag instead! To build using Clang + ``libstdc++`` instead of GCC, simply append the ``--cxxprefix`` flag: @@ -562,14 +562,14 @@ To build using Clang + ``libstdc++`` instead of GCC, simply append the # Build the wheel artifact only python setup.py bdist_wheel \ - --package_variant=cpu \ + --build-variant=cpu \ --python-tag="${python_tag}" \ --plat-name="${python_plat_name}" \ --cxxprefix=$CONDA_PREFIX # Build and install the library into the Conda environment (Clang) python setup.py install \ - --package_variant=cpu + --build-variant=cpu --cxxprefix=$CONDA_PREFIX Note that this presumes the Clang toolchain is properly installed along with the @@ -642,7 +642,7 @@ toolchains have been properly installed. # Build the wheel artifact only python setup.py bdist_wheel \ - --package_variant=cuda \ + --build-variant=cuda \ --python-tag="${python_tag}" \ --plat-name="${python_plat_name}" \ --nvml_lib_path=${NVML_LIB_PATH} \ @@ -651,7 +651,7 @@ toolchains have been properly installed. 
# Build and install the library into the Conda environment python setup.py install \ - --package_variant=cuda \ + --build-variant=cuda \ --nvml_lib_path=${NVML_LIB_PATH} \ --nccl_lib_path=${NCCL_LIB_PATH} \ -DTORCH_CUDA_ARCH_LIST="${cuda_arch_list}" @@ -684,7 +684,7 @@ presuming the toolchains have been properly installed. # Build the wheel artifact only python setup.py bdist_wheel \ - --package_variant=rocm \ + --build-variant=rocm \ --python-tag="${python_tag}" \ --plat-name="${python_plat_name}" \ -DAMDGPU_TARGETS="${PYTORCH_ROCM_ARCH}" \ @@ -694,7 +694,7 @@ presuming the toolchains have been properly installed. # Build and install the library into the Conda environment python setup.py install \ - --package_variant=rocm \ + --build-variant=rocm \ -DAMDGPU_TARGETS="${PYTORCH_ROCM_ARCH}" \ -DHIP_ROOT_DIR="${ROCM_PATH}" \ -DCMAKE_C_FLAGS="-DTORCH_USE_HIP_DSA" \ diff --git a/fbgemm_gpu/fbgemm_gpu/__init__.py b/fbgemm_gpu/fbgemm_gpu/__init__.py index 2422daa296..8b57d68019 100644 --- a/fbgemm_gpu/fbgemm_gpu/__init__.py +++ b/fbgemm_gpu/fbgemm_gpu/__init__.py @@ -31,10 +31,15 @@ def _load_library(filename: str, no_throw: bool = False) -> None: try: # Export the version string from the version file auto-generated by setup.py - from fbgemm_gpu.docs.version import __variant__, __version__ # noqa: F401, E402 + from fbgemm_gpu.docs.version import ( # noqa: F401, E402 + __target__, + __variant__, + __version__, + ) except Exception: __variant__: str = "INTERNAL" __version__: str = "INTERNAL" + __target__: str = "INTERNAL" fbgemm_gpu_libraries = [ "fbgemm_gpu_config", @@ -52,7 +57,7 @@ def _load_library(filename: str, no_throw: bool = False) -> None: "fbgemm_gpu_py", ] -fbgemm_gpu_genai_libraries = [ +fbgemm_genai_libraries = [ "experimental/gen_ai/fbgemm_gpu_experimental_gen_ai", ] @@ -64,17 +69,14 @@ def _load_library(filename: str, no_throw: bool = False) -> None: # .SO file for the ROCm case, so that clients can import # fbgemm_gpu.experimental.gemm without triggering an 
error. if torch.cuda.is_available() and torch.version.hip: - fbgemm_gpu_genai_libraries = [] + fbgemm_genai_libraries = [] libraries_to_load = { - "cpu": fbgemm_gpu_libraries, - "docs": fbgemm_gpu_libraries, - "cuda": fbgemm_gpu_libraries, - "genai": fbgemm_gpu_genai_libraries, - "rocm": fbgemm_gpu_libraries, + "default": fbgemm_gpu_libraries, + "genai": fbgemm_genai_libraries, } -for library in libraries_to_load.get(__variant__, []): +for library in libraries_to_load.get(__target__, []): # NOTE: In all cases, we want to throw an error if we cannot load the # library. However, this appears to break the OSS documentation build, # where the Python documentation doesn't show up in the generated docs. diff --git a/fbgemm_gpu/setup.py b/fbgemm_gpu/setup.py index 5b3aa80099..cb03df3986 100644 --- a/fbgemm_gpu/setup.py +++ b/fbgemm_gpu/setup.py @@ -7,6 +7,7 @@ # @licenselint-loose-mode import argparse +import logging import os import re import subprocess @@ -23,6 +24,8 @@ from skbuild import setup from tabulate import tabulate +logging.basicConfig(level=logging.INFO) + @dataclass(frozen=True) class FbgemmGpuBuild: @@ -50,9 +53,16 @@ def from_args(cls, argv: List[str]): help="Print build information only.", ) parser.add_argument( - "--package_variant", + "--build-target", + type=str, + choices=["default", "genai"], + default="default", + help="The FBGEMM build target to build.", + ) + parser.add_argument( + "--build-variant", type=str, - choices=["docs", "cpu", "cuda", "rocm", "genai"], + choices=["docs", "cpu", "cuda", "rocm"], default="cuda", help="The FBGEMM_GPU variant to build.", ) @@ -102,37 +112,30 @@ def nova_flag(self) -> Optional[int]: else: return None - def package_name(self) -> str: - pkg_name: str = "fbgemm_gpu" + def nova_non_prebuild_step(self) -> bool: + # When running in Nova workflow context, the actual package build is run + # in the Nova CI's "pre-script" step, as denoted by the `BUILD_FROM_NOVA` + # flag. 
As such, we skip building in the clean and build wheel steps. + return self.nova_flag() == 1 - if self.nova_flag() == 1: - # When running in Nova workflow context, the actual package build is - # run in the Nova CI's "pre-script" step, as denoted by the - # `BUILD_FROM_NOVA` flag. As such, we skip building in the clean - # and build wheel steps. - print( - "[SETUP.PY] Running under Nova workflow context (clean or build wheel step) ... exiting" - ) - sys.exit(0) + def target(self) -> str: + return self.args.build_target - elif self.nova_flag() == 0: - # In Nova, we are publishing genai packages separately from the main - # fbgemm_gpu package, so if the package variant is genai, we need to - # update the package name accordingly. Otherwise, the package name - # is the same for all other build variants in Nova - if self.args.package_variant == "genai": - pkg_name = "fbgemm_gpu_genai" + def variant(self) -> str: + return self.args.build_variant - else: + def package_name(self) -> str: + pkg_name: str = "fbgemm_gpu_genai" if self.target() == "genai" else "fbgemm_gpu" + + if self.nova_flag() is None: # If running outside of Nova workflow context, append the channel # and variant to the package name as needed if self.args.package_channel != "release": pkg_name += f"_{self.args.package_channel}" - if self.args.package_variant != "cuda": - pkg_name += f"-{self.args.package_variant}" + if self.variant() != "cuda": + pkg_name += f"-{self.variant()}" - print(f"[SETUP.PY] Determined the Python package name: '{pkg_name}'") return pkg_name def variant_version(self) -> str: @@ -143,25 +146,24 @@ def variant_version(self) -> str: # `python setup.py`, this script is invoked twice, once as # `setup.py egg_info`, and once as `setup.py bdist_wheel`. # Ignore determining the variant_version for the first case. 
- print( + logging.debug( "[SETUP.PY] Script was invoked as `setup.py egg_info`, ignoring variant_version" ) - return pkg_vver + return "" elif self.nova_flag() is None: - # If not running in a Nova workflow, then use the - # `fbgemm_gpu-` naming convention for the package, since - # PyPI does not accept version+xx in the naming convention. - print( + # If not running in a Nova workflow, ignore the variant version and + # use the `fbgemm_gpu-` package naming convention instead, + # since PyPI does not accept version+xx in the naming convention. + logging.debug( "[SETUP.PY] Not running under Nova workflow context; ignoring variant_version" ) - return pkg_vver + return "" # NOTE: This is a workaround for the fact that we currently overload # package target (e.g. GPU, GenAI), and variant (e.g. CPU, CUDA, ROCm) - # into the same `package_variant` variable, and should be fixed soon. - if self.args.package_variant == "cuda" or self.args.package_variant == "genai": - CudaUtils.set_cuda_environment_variables() + # into the same `build_variant` variable, and should be fixed soon. + if self.variant() == "cuda": if torch.version.cuda is not None: cuda_version = torch.version.cuda.split(".") pkg_vver = f"+cu{cuda_version[0]}{cuda_version[1]}" @@ -170,7 +172,7 @@ def variant_version(self) -> str: "[SETUP.PY] The installed PyTorch variant is not CUDA; cannot determine the CUDA version!" 
) - elif self.args.package_variant == "rocm": + elif self.variant() == "rocm": if torch.version.hip is not None: rocm_version = torch.version.hip.split(".") # NOTE: Unlike CUDA-based releases, which ignores the minor patch version, @@ -190,20 +192,19 @@ def variant_version(self) -> str: else: pkg_vver = "+cpu" - print(f"[SETUP.PY] Extracted the package variant+version: '{pkg_vver}'") return pkg_vver def package_version(self): pkg_vver = self.variant_version() - print("[SETUP.PY] Extracting the package version ...") - print( + logging.debug("[SETUP.PY] Extracting the package version ...") + logging.debug( f"[SETUP.PY] TAG: {gitversion.get_tag()}, BRANCH: {gitversion.get_branch()}, SHA: {gitversion.get_sha()}" ) if self.args.package_channel == "nightly": # Use date stamp for nightly versions - print( + logging.debug( "[SETUP.PY] Package is for NIGHTLY; using timestamp for the versioning" ) today = date.today() @@ -241,12 +242,12 @@ def package_version(self): ) full_version_string = f"{pkg_version}{pkg_vver}" - print( + logging.debug( f"[SETUP.PY] Setting the full package version string: {full_version_string}" ) return full_version_string - def cmake_args(self) -> None: + def cmake_args(self) -> List[str]: def _get_cxx11_abi(): try: value = int(torch._C._GLIBCXX_USE_CXX11_ABI) @@ -257,7 +258,7 @@ def _get_cxx11_abi(): return f"-D_GLIBCXX_USE_CXX11_ABI={value}" torch_root = os.path.dirname(torch.__file__) - os.environ["CMAKE_BUILD_PARALLEL_LEVEL"] = str(os.cpu_count() // 2) + os.environ["CMAKE_BUILD_PARALLEL_LEVEL"] = str((os.cpu_count() or 4) // 2) cmake_args = [ f"-DCMAKE_PREFIX_PATH={torch_root}", @@ -267,36 +268,31 @@ def _get_cxx11_abi(): cxx_flags = [] if self.args.verbose: - print("[SETUP.PY] Building in VERBOSE mode ...") + # Enable verbose logging in CMake cmake_args.extend( ["-DCMAKE_VERBOSE_MAKEFILE=ON", "-DCMAKE_EXPORT_COMPILE_COMMANDS=TRUE"] ) if self.args.debug: - # Enable device-side assertions in CUDA and HIP + # Enable torch device-side assertions for 
CUDA and HIP # https://stackoverflow.com/questions/44284275/passing-compiler-options-in-cmake-command-line cxx_flags.extend(["-DTORCH_USE_CUDA_DSA", "-DTORCH_USE_HIP_DSA"]) - if self.args.package_variant in ["docs", "cpu"]: - # NOTE: The docs variant is a fake variant that is effectively the - # cpu variant, but marks __VARIANT__ as "docs" instead of "cpu". - # - # This minor change lets the library loader know not throw - # exceptions on failed load, which is the workaround for a bug in - # the Sphinx documentation generation process, see: - # - # https://github.com/pytorch/FBGEMM/pull/3477 - # https://github.com/pytorch/FBGEMM/pull/3717 - print("[SETUP.PY] Building the CPU ...") - cmake_args.append("-DFBGEMM_BUILD_VARIANT=cpu") - - if self.args.package_variant == "rocm": - print("[SETUP.PY] Building the ROCm variant ...") - cmake_args.append("-DFBGEMM_BUILD_VARIANT=rocm") - - if self.args.package_variant == "genai": - print("[SETUP.PY] Building the GENAI-ONLY variant of FBGEMM_GPU ...") - cmake_args.append("-DFBGEMM_BUILD_TARGET=genai") + print(f"[SETUP.PY] Setting the FBGEMM build target: {self.target()} ...") + cmake_args.append(f"-DFBGEMM_BUILD_TARGET={self.target()}") + + # NOTE: The docs variant is a fake variant that is effectively the + # cpu variant, but marks __VARIANT__ as "docs" instead of "cpu". 
+ # + # This minor change lets the library loader know not to throw + # exceptions on failed load, which is the workaround for a bug in + # the Sphinx documentation generation process, see: + # + # https://github.com/pytorch/FBGEMM/pull/3477 + # https://github.com/pytorch/FBGEMM/pull/3717 + cmake_bvariant = "cpu" if self.variant() == "docs" else self.variant() + print(f"[SETUP.PY] Setting the FBGEMM build variant: {cmake_bvariant} ...") + cmake_args.append(f"-DFBGEMM_BUILD_VARIANT={cmake_bvariant}") if self.args.nvml_lib_path: cmake_args.append(f"-DNVML_LIB_PATH={self.args.nvml_lib_path}") @@ -316,7 +312,7 @@ def _get_cxx11_abi(): cmake_args.append("-DUSE_FB_ONLY=ON") if self.args.cxxprefix: - print("[SETUP.PY] Setting CMake flags ...") + logging.debug("[SETUP.PY] Setting CMake flags ...") path = self.args.cxxprefix cxx_flags.extend( @@ -353,7 +349,7 @@ class CudaUtils: """CUDA Utilities""" @classmethod - def nvcc_ok(cls, cuda_home: str, major: int, minor: int) -> bool: + def nvcc_ok(cls, cuda_home: Optional[str], major: int, minor: int) -> bool: if not cuda_home: return False @@ -455,7 +451,8 @@ def generate_version_file(cls, build: FbgemmGpuBuild) -> None: # LICENSE file in the root directory of this source tree. __version__: str = "{package_version}" - __variant__: str = "{build.args.package_variant}" + __target__: str = "{build.target()}" + __variant__: str = "{build.variant()}" """ ) file.write(text) @@ -521,9 +518,20 @@ def run(self): def main(argv: List[str]) -> None: # Handle command line args before passing to main setup() method. build = FbgemmGpuBuild.from_args(argv) - # Repair command line args for setup. + # Repair command line args for setup() method. sys.argv = [sys.argv[0]] + build.other_args + # Skip the build step if running under Nova non-prebuild step + if build.nova_non_prebuild_step(): + print( + "[SETUP.PY] Running under Nova workflow context (clean or build wheel step) ... 
exiting" + ) + sys.exit(0) + + # Set the CUDA environment variables if needed + if build.variant() == "cuda": + CudaUtils.set_cuda_environment_variables() + # Extract the package name package_name = build.package_name() @@ -532,9 +540,8 @@ def main(argv: List[str]) -> None: if build.args.dryrun: print( - f"[SETUP.PY] Extracted package name and version: ({package_name} : {package_version})" + f"[SETUP.PY] Determined the package name and variant+version: ({package_name} : {package_version})\n" ) - print("") sys.exit(0) # Generate the version file