From 14582eebabcf1ca33323e6b6d6757a49d2306dbf Mon Sep 17 00:00:00 2001
From: Daniel Meister <daniel.meister@amd.com>
Date: Mon, 16 Dec 2024 08:25:25 +0900
Subject: [PATCH 1/3] Add HIP-RT support

Co-authored-by: Aaryaman Vasishta <jem456.vasishta@gmail.com>
---
 .gitignore                        |    1 +
 0001-MSVC-HIP.patch               |   24 +
 CMakeLists.txt                    |  676 ++++++++-----
 LICENSE.txt                       |   29 +
 README.md                         |   30 +
 src/ext/flip/flip.cpp             |    4 +
 src/ext/openvdb                   |    2 +-
 src/pbrt/base/medium.h            |    1 -
 src/pbrt/cmd/imgtool.cpp          |   12 +-
 src/pbrt/cmd/pbrt_test.cpp        |    2 +-
 src/pbrt/cmd/pspec_gpu.cpp        |    4 +
 src/pbrt/gpu/common.h             |  111 +++
 src/pbrt/gpu/cudagl.h             |   13 +
 src/pbrt/gpu/hiprt/aggregate.cpp  | 1499 +++++++++++++++++++++++++++++
 src/pbrt/gpu/hiprt/aggregate.h    |  191 ++++
 src/pbrt/gpu/hiprt/hiprt.cu       |  609 ++++++++++++
 src/pbrt/gpu/memory.cpp           |    4 +
 src/pbrt/gpu/util.cpp             |    6 +
 src/pbrt/gpu/util.h               |   15 +-
 src/pbrt/media.h                  |    3 -
 src/pbrt/options.h                |    4 +-
 src/pbrt/pbrt.h                   |    4 +-
 src/pbrt/shapes.h                 |    4 +-
 src/pbrt/textures.cpp             |  187 +++-
 src/pbrt/textures.h               |   22 +-
 src/pbrt/util/check.h             |   15 +-
 src/pbrt/util/color.h             |    6 +-
 src/pbrt/util/colorspace.cpp      |   14 +-
 src/pbrt/util/colorspace.h        |   11 +-
 src/pbrt/util/float.h             |   48 +-
 src/pbrt/util/hash.h              |    4 +-
 src/pbrt/util/hip_aliases.h       |  105 ++
 src/pbrt/util/image.h             |    2 +-
 src/pbrt/util/log.cpp             |    6 +-
 src/pbrt/util/log.h               |   44 +-
 src/pbrt/util/math.h              |    2 +-
 src/pbrt/util/noise.cpp           |   15 +-
 src/pbrt/util/parallel.h          |    4 +-
 src/pbrt/util/progressreporter.h  |    4 +
 src/pbrt/util/pstd.h              |    8 +-
 src/pbrt/util/sampling.h          |    2 +-
 src/pbrt/util/scattering.cpp      |    4 +-
 src/pbrt/util/taggedptr.h         |   56 +-
 src/pbrt/util/vecmath_test.cpp    |    2 +
 src/pbrt/wavefront/integrator.cpp |   31 +-
 src/pbrt/wavefront/intersect.h    |    2 +-
 src/pbrt/wavefront/wavefront.cpp  |    7 +
 src/pbrt/wavefront/workqueue.h    |    3 +
 48 files changed, 3431 insertions(+), 421 deletions(-)
 create mode 100644 0001-MSVC-HIP.patch
 create mode 100644 src/pbrt/gpu/common.h
 create mode 100644 src/pbrt/gpu/hiprt/aggregate.cpp
 create mode 100644 src/pbrt/gpu/hiprt/aggregate.h
 create mode 100644 src/pbrt/gpu/hiprt/hiprt.cu
 create mode 100644 src/pbrt/util/hip_aliases.h

diff --git a/.gitignore b/.gitignore
index 21b5c9b1f..4d3e0f77d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,7 @@
 .#*
 #*#
 src/build
+src/ext/hiprtSdk/hiprt
 .DS_Store
 .ipynb_checkpoints/
 build/
diff --git a/0001-MSVC-HIP.patch b/0001-MSVC-HIP.patch
new file mode 100644
index 000000000..75dfb6092
--- /dev/null
+++ b/0001-MSVC-HIP.patch
@@ -0,0 +1,24 @@
+From 56933471af36147e1032fbbc7912ca0088797b78 Mon Sep 17 00:00:00 2001
+From: Aaryaman Vasishta <aaryaman.vasishta@amd.com>
+Date: Sun, 10 Dec 2023 18:40:22 +0900
+Subject: [PATCH] Fix BitScanreverse by including intrin.h for MSVC
+
+---
+ common/compiler_msc.h | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/common/compiler_msc.h b/common/compiler_msc.h
+index 18cfa12..2eafd55 100644
+--- a/common/compiler_msc.h
++++ b/common/compiler_msc.h
+@@ -4,6 +4,7 @@
+ 
+ #include <stdint.h>
+ #include <stdlib.h> /* for _byteswap_*() */
++#include <intrin.h>
+ 
+ #define LIBEXPORT	__declspec(dllexport)
+ 
+-- 
+2.33.0.windows.2
+
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6c6a50166..cf4120218 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -11,11 +11,17 @@ if (NOT DEFINED PBRT_SOURCE_CODE)
   set (PBRT_SOURCE_CODE ${CMAKE_SOURCE_DIR})
 endif ()
 
+# HIP
+list (APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
+set (AMDGPU_TARGETS "gfx1100" CACHE STRING "List of specific machine types for library to target") # User variable
+
 # For sanitizers
 list (INSERT CMAKE_MODULE_PATH 0 "${PBRT_SOURCE_CODE}/cmake")
 
 # Configuration options
 
+option (PBRT_HIPRT "Use HIPRT as backend" OFF)
+option (PBRT_HIPRT_SDK "Use HIPRT SDK instead of HIPRT in HIP SDK" OFF)
 option (PBRT_FLOAT_AS_DOUBLE "Use 64-bit floats" OFF)
 option (PBRT_BUILD_NATIVE_EXECUTABLE "Build executable optimized for CPU architecture of system pbrt was built on" ON)
 option (PBRT_DBG_LOGGING "Enable (very verbose!) debug logging" OFF)
@@ -23,6 +29,7 @@ option (PBRT_NVTX "Insert NVTX annotations for NVIDIA Profiling and Debugging To
 option (PBRT_NVML "Use NVML for GPU performance measurement" OFF)
 option (PBRT_USE_PREGENERATED_RGB_TO_SPECTRUM_TABLES "Use pregenerated rgbspectrum_*.cpp files rather than running rgb2spec_opt to generate them at build time" OFF)
 set (PBRT_OPTIX7_PATH $ENV{PBRT_OPTIX7_PATH} CACHE PATH "Path to OptiX 7 SDK")
+set (PBRT_HIPRT_PATH $ENV{PBRT_HIPRT_PATH} CACHE PATH "Path to HIPRT SDK")
 set (PBRT_GPU_SHADER_MODEL "" CACHE STRING "")
 
 
@@ -64,7 +71,7 @@ function (CHECK_EXT NAME DIR HASH)
 endfunction ()
 
 check_ext ("OpenEXR" "openexr/OpenEXR" ca757f97033ae7cfcc6130f23c0e9ee86b108b07)
-check_ext ("OpenVDB" "openvdb/nanovdb" 414bed84c2fc22e188eac7b611aa85c7edd7a5a9)
+check_ext ("OpenVDB" "openvdb/nanovdb" 385516321b272c423e2a514b41bd21ce6ffdda72)
 check_ext ("Ptex" "ptex/src" 054047d02b9e06e690420b407114d2872435b953)
 check_ext ("double-conversion" "double-conversion/cmake" cc1f75a114aca8d2af69f73a5a959aecbab0e87a)
 check_ext ("filesystem" "filesystem/filesystem" c5f9de30142453eb3c6fe991e82dfc2583373116)
@@ -86,7 +93,7 @@ find_package(OpenGL REQUIRED)
 
 set_property (GLOBAL PROPERTY USE_FOLDERS ON)
 
-if (MSVC)
+if (WIN32)
   list (APPEND PBRT_DEFINITIONS "PBRT_IS_MSVC" "_CRT_SECURE_NO_WARNINGS")
   list (APPEND PBRT_DEFINITIONS "PBRT_IS_MSVC" "_ENABLE_EXTENDED_ALIGNED_STORAGE")
 endif ()
@@ -155,207 +162,268 @@ target_compile_options (
 add_library (pbrt_opt INTERFACE)
 
 #########################################
-## CUDA / OptiX
+## CUDA / OptiX / HIP / HIPRT
 
 add_library (cuda_build_configuration INTERFACE)
 
 include (CheckLanguage)
 
-check_language(CUDA)
-
-if (CMAKE_CUDA_COMPILER)
-  if (CUDA_VERSION_MAJOR LESS 11)
-    message (WARNING "pbrt-v4 requires CUDA version 11.0 or later but version ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR}\
-was found. GPU support is therefore disabled. If you have multiple versions\
-of CUDA installed, please update your PATH.")
+# Function to get the short path
+function(get_short_path input_path output_var)
+    # Create a command to get the short path using cmd.exe
+    execute_process(
+        COMMAND cmd /c for %A in ("${input_path}") do @echo %~sA
+        OUTPUT_VARIABLE short_path
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+    )
+    # Set the result to the output variable
+    set(${output_var} "${short_path}" PARENT_SCOPE)
+endfunction()
+
+if (PBRT_HIPRT)
+  find_package (hip REQUIRED)
+  set (PBRT_HIP_PATH $ENV{HIP_PATH})
+  get_short_path("${PBRT_HIP_PATH}" PBRT_HIP_PATH)
+  string(REGEX REPLACE "[/\\]$" "" PBRT_HIP_PATH "${PBRT_HIP_PATH}")
+
+  set (CUDA_COMPILE_LANGUAGE CXX)
+
+  if (WIN32)
+    if (PBRT_HIPRT_SDK)
+      set (PBRT_HIPRT_BINARY_PATH ${PBRT_HIPRT_PATH}/hiprt/win)
+    else ()
+      set (PBRT_HIPRT_PATH ${PBRT_HIP_PATH}/include)
+      set (PBRT_HIPRT_BINARY_PATH ${PBRT_HIP_PATH}/bin)
+    endif ()
+  else ()
+    set (PBRT_HIPRT_BINARY_PATH ${PBRT_HIPRT_PATH}/hiprt/linux64)
+  endif ()
+  link_directories (${PBRT_HIPRT_BINARY_PATH})
+  include_directories (${PBRT_HIPRT_PATH})
+  list (APPEND PBRT_DEFINITIONS "PBRT_BUILD_GPU_RENDERER")
+  list (APPEND PBRT_DEFINITIONS "GTEST_HAS_SEH=0")
+  list (APPEND PBRT_CXX_FLAGS "-std=c++17")
+  list (APPEND PBRT_CXX_FLAGS "-fgpu-rdc")
+  list (APPEND PBRT_LINK_FLAGS "-fgpu-rdc")
+  if (WIN32)
+	  list (APPEND PBRT_LINK_FLAGS "-fuse-ld=lld")
+	endif()
+  list (APPEND PBRT_LINK_FLAGS "--hip-link")
+  list (APPEND PBRT_LINK_FLAGS "-Xoffload-linker")
+  list (APPEND PBRT_LINK_FLAGS "--whole-archive")
+  set (PBRT_HIP_ENABLED ON)
+  file (GLOB PBRT_HIPRT_BITCODES ${PBRT_HIPRT_BINARY_PATH}/*.bc)
+  file (GLOB PBRT_HIPRT_FATBINS ${PBRT_HIPRT_BINARY_PATH}/*.hipfb)
+  file (COPY ${PBRT_HIPRT_BITCODES} DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
+  file (COPY ${PBRT_HIPRT_FATBINS} DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
+  get_filename_component(PBRT_HIPRT_BITCODE_FILENAME ${PBRT_HIPRT_BITCODES} NAME)
+  if (WIN32)
+    file (GLOB PBRT_HIPRT_DLLS ${PBRT_HIPRT_BINARY_PATH}/*.dll)
+    file (GLOB PBRT_HIPRT_LIB_NAME ${PBRT_HIPRT_BINARY_PATH}/*.lib)
+    file (COPY ${PBRT_HIPRT_DLLS} DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
+    get_filename_component(PBRT_HIPRT_LIB_NAME ${PBRT_HIPRT_LIB_NAME} NAME_WE)
   else ()
-    find_package (CUDA REQUIRED)
+    file (GLOB PBRT_HIPRT_LIB_NAME ${PBRT_HIPRT_BINARY_PATH}/*.so)
+    get_filename_component(PBRT_HIPRT_LIB_NAME ${PBRT_HIPRT_LIB_NAME} NAME_WE)
+    string(REGEX REPLACE "^lib" "" PBRT_HIPRT_LIB_NAME ${PBRT_HIPRT_LIB_NAME})
+  endif ()
+else ()
+  check_language(CUDA)
 
-    # This seems to be necessary starting with 3.17.1, but gives an error
-    # about 17 being an unsupported version earlier...
-    if (${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.17.0")
-        set (CMAKE_CUDA_STANDARD 17)
-    endif ()
+  if (CMAKE_CUDA_COMPILER)
+    if (CUDA_VERSION_MAJOR LESS 11)
+      message (WARNING "pbrt-v4 requires CUDA version 11.0 or later but version ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR}\
+  was found. GPU support is therefore disabled. If you have multiple versions\
+  of CUDA installed, please update your PATH.")
+    else ()
+      find_package (CUDA REQUIRED)
 
-    # https://github.com/VIAME/VIAME/blob/aa6c5f56a898b08e4da102c400b453e23952199c/CMakeLists.txt#L291
-    if (NOT CUDA_VERSION_PATCH)
-      if (CUDA_NVCC_EXECUTABLE AND
-          CUDA_NVCC_EXECUTABLE STREQUAL CMAKE_CUDA_COMPILER AND
-          CMAKE_CUDA_COMPILER_VERSION MATCHES [=[([0-9]+)\.([0-9]+)\.([0-9]+)]=])
-        set (CUDA_VERSION_PATCH "${CMAKE_MATCH_3}")
-      elseif (CUDA_NVCC_EXECUTABLE)
-        execute_process (COMMAND ${CUDA_NVCC_EXECUTABLE} "--version" OUTPUT_VARIABLE NOUT)
-        if (NOUT MATCHES [=[ V([0-9]+)\.([0-9]+)\.([0-9]+)]=])
-          set (CUDA_VERSION_PATCH "${CMAKE_MATCH_3}")
-        endif ()
+      # This seems to be necessary starting with 3.17.1, but gives an error
+      # about 17 being an unsupported version earlier...
+      if (${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.17.0")
+          set (CMAKE_CUDA_STANDARD 17)
       endif ()
-    endif ()
-
-    message (STATUS "Found CUDA: ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR}.${CUDA_VERSION_PATCH}")
-    if (CUDA_VERSION_MAJOR EQUAL 11 AND CUDA_VERSION_MINOR EQUAL 3 AND CUDA_VERSION_PATCH LESS 109)
-      message (SEND_ERROR "Unfortunately, pbrt-v4 triggers an internal compiler error in CUDA 11.3.0. Please either use CUDA 11.0-11.2 or 11.3.1 or later.")
-    endif ()
 
-    if ("${PBRT_OPTIX7_PATH}" STREQUAL "")
-        message (WARNING "Found CUDA but PBRT_OPTIX7_PATH is not set. Disabling GPU compilation.")
-    else ()
-        enable_language (CUDA)
-        list (APPEND PBRT_DEFINITIONS "PBRT_BUILD_GPU_RENDERER")
-        if (PBRT_NVTX)
-            list (APPEND PBRT_DEFINITIONS "NVTX")
-        endif ()
-        if (PBRT_NVML)
-            list (APPEND PBRT_DEFINITIONS "PBRT_USE_NVML")
+      # https://github.com/VIAME/VIAME/blob/aa6c5f56a898b08e4da102c400b453e23952199c/CMakeLists.txt#L291
+      if (NOT CUDA_VERSION_PATCH)
+        if (CUDA_NVCC_EXECUTABLE AND
+            CUDA_NVCC_EXECUTABLE STREQUAL CMAKE_CUDA_COMPILER AND
+            CMAKE_CUDA_COMPILER_VERSION MATCHES [=[([0-9]+)\.([0-9]+)\.([0-9]+)]=])
+          set (CUDA_VERSION_PATCH "${CMAKE_MATCH_3}")
+        elseif (CUDA_NVCC_EXECUTABLE)
+          execute_process (COMMAND ${CUDA_NVCC_EXECUTABLE} "--version" OUTPUT_VARIABLE NOUT)
+          if (NOUT MATCHES [=[ V([0-9]+)\.([0-9]+)\.([0-9]+)]=])
+            set (CUDA_VERSION_PATCH "${CMAKE_MATCH_3}")
+          endif ()
         endif ()
-        set (PBRT_CUDA_ENABLED ON)
-
-        # FIXME
-        include_directories (${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})  # for regular c++ compiles
-
-        # http://www.ssl.berkeley.edu/~jimm/grizzly_docs/SSL/opt/intel/cc/9.0/lib/locale/en_US/mcpcom.msg
-        target_compile_options (
-            pbrt_warnings
-            INTERFACE
-                #"$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xptxas --warn-on-double-precision-use>"
-                "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcudafe --diag_suppress=partial_override>"
-                "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcudafe --diag_suppress=virtual_function_decl_hidden>"
-                "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcudafe --diag_suppress=integer_sign_change>"
-                "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcudafe --diag_suppress=declared_but_not_referenced>"
-                # WAR invalid warnings about this with "if constexpr"
-                "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcudafe --diag_suppress=implicit_return_from_non_void_function>"
-        )
-
-        # Willie hears yeh..
-        string (APPEND CMAKE_CUDA_FLAGS " -Xnvlink -suppress-stack-size-warning")
-
-        target_compile_options (
-            cuda_build_configuration
-            INTERFACE
-                "$<$<COMPILE_LANGUAGE:CUDA>:--std=c++17;--use_fast_math;--expt-relaxed-constexpr;--extended-lambda;--forward-unknown-to-host-compiler>"
-                # The "$<NOT:$<BOOL:$<TARGET_PROPERTY:CUDA_PTX_COMPILATION>>>" part is to not add debugging symbols when generating PTX files for OptiX; see https://github.com/mmp/pbrt-v4/issues/69#issuecomment-715499748.
-                "$<$<COMPILE_LANGUAGE:CUDA>:$<IF:$<AND:$<CONFIG:Debug>,$<NOT:$<BOOL:$<TARGET_PROPERTY:CUDA_PTX_COMPILATION>>>>,-G;-g,-lineinfo;-maxrregcount;128>>"
-        )
-
-        if (PBRT_GPU_SHADER_MODEL STREQUAL "")
-            # https://wagonhelm.github.io/articles/2018-03/detecting-cuda-capability-with-cmake
-            # Get CUDA compute capability
-            set (CHECK_CUDA_OUTPUT_EXE ${CMAKE_BINARY_DIR}/checkcuda)
-            if (MSVC)
-                execute_process (COMMAND ${CMAKE_CUDA_COMPILER} -lcuda ${PBRT_SOURCE_CODE}/cmake/checkcuda.cu -ccbin ${CMAKE_CXX_COMPILER} -o ${CHECK_CUDA_OUTPUT_EXE}
-                                 RESULT_VARIABLE BUILD_CHECK_CUDA_RETURN_CODE)
-            else  ()
-                execute_process (COMMAND ${CMAKE_CUDA_COMPILER} -lcuda ${PBRT_SOURCE_CODE}/cmake/checkcuda.cu -o ${CHECK_CUDA_OUTPUT_EXE}
-                                 RESULT_VARIABLE BUILD_CHECK_CUDA_RETURN_CODE)
-            endif ()
+      endif ()
 
-            if (NOT ${BUILD_CHECK_CUDA_RETURN_CODE} EQUAL 0)
-                message (SEND_ERROR "Was unable to build checkcuda, consider manually setting PBRT_GPU_SHADER_MODEL")
-            endif ()
+      message (STATUS "Found CUDA: ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR}.${CUDA_VERSION_PATCH}")
+      if (CUDA_VERSION_MAJOR EQUAL 11 AND CUDA_VERSION_MINOR EQUAL 3 AND CUDA_VERSION_PATCH LESS 109)
+        message (SEND_ERROR "Unfortunately, pbrt-v4 triggers an internal compiler error in CUDA 11.3.0. Please either use CUDA 11.0-11.2 or 11.3.1 or later.")
+      endif ()
 
-            execute_process (COMMAND ${CHECK_CUDA_OUTPUT_EXE}
-                             RESULT_VARIABLE CUDA_RETURN_CODE
-                             OUTPUT_VARIABLE CHECK_CUDA_OUTPUT)
+      if ("${PBRT_OPTIX7_PATH}" STREQUAL "")
+          message (WARNING "Found CUDA but PBRT_OPTIX7_PATH is not set. Disabling GPU compilation.")
+      else ()
+          enable_language (CUDA)
+          list (APPEND PBRT_DEFINITIONS "PBRT_BUILD_GPU_RENDERER")
+          if (PBRT_NVTX)
+              list (APPEND PBRT_DEFINITIONS "NVTX")
+          endif ()
+          if (PBRT_NVML)
+              list (APPEND PBRT_DEFINITIONS "PBRT_USE_NVML")
+          endif ()
+          set (PBRT_CUDA_ENABLED ON)
+
+          # FIXME
+          include_directories (${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})  # for regular c++ compiles
+
+          # http://www.ssl.berkeley.edu/~jimm/grizzly_docs/SSL/opt/intel/cc/9.0/lib/locale/en_US/mcpcom.msg
+          target_compile_options (
+              pbrt_warnings
+              INTERFACE
+                  #"$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xptxas --warn-on-double-precision-use>"
+                  "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcudafe --diag_suppress=partial_override>"
+                  "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcudafe --diag_suppress=virtual_function_decl_hidden>"
+                  "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcudafe --diag_suppress=integer_sign_change>"
+                  "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcudafe --diag_suppress=declared_but_not_referenced>"
+                  # WAR invalid warnings about this with "if constexpr"
+                  "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcudafe --diag_suppress=implicit_return_from_non_void_function>"
+          )
 
-            if (NOT ${CUDA_RETURN_CODE} EQUAL 0)
-                message (SEND_ERROR ${CHECK_CUDA_OUTPUT})
-              else ()
-                set(ARCH "${CHECK_CUDA_OUTPUT}")
-                message (STATUS "Detected CUDA Architecture: ${ARCH}")
-                string (APPEND CMAKE_CUDA_FLAGS " --gpu-architecture=${ARCH}")
-            endif ()
-        else ()
-            set(ARCH "${PBRT_GPU_SHADER_MODEL}")
-            message (STATUS "Specified CUDA Architecture: ${ARCH}")
-            string (APPEND CMAKE_CUDA_FLAGS " --gpu-architecture=${ARCH}")
-        endif ()
+          # Willie hears yeh..
+          string (APPEND CMAKE_CUDA_FLAGS " -Xnvlink -suppress-stack-size-warning")
 
-        set (PBRT_CUDA_LIB cuda)
-        # optix
-        # FIXME
-        include_directories (${PBRT_OPTIX7_PATH}/include)
-
-        # find CUDA's bin2c executable
-        get_filename_component (cuda_compiler_bin "${CMAKE_CUDA_COMPILER}" DIRECTORY)
-        find_program (BIN2C
-                      NAMES bin2c
-                      PATHS ${cuda_compiler_bin}
-                      DOC "Path to the CUDA SDK bin2c executable."
-                      NO_DEFAULT_PATH)
-        if (NOT BIN2C)
-            message (FATAL_ERROR
-                     "bin2c not found:\n"
-                     "  CMAKE_CUDA_COMPILER='${CMAKE_CUDA_COMPILER}'\n"
-                     "  cuda_compiler_bin='${cuda_compiler_bin}'\n"
-            )
-        endif ()
+          target_compile_options (
+              cuda_build_configuration
+              INTERFACE
+                  "$<$<COMPILE_LANGUAGE:CUDA>:--std=c++17;--use_fast_math;--expt-relaxed-constexpr;--extended-lambda;--forward-unknown-to-host-compiler>"
+                  # The "$<NOT:$<BOOL:$<TARGET_PROPERTY:CUDA_PTX_COMPILATION>>>" part is to not add debugging symbols when generating PTX files for OptiX; see https://github.com/mmp/pbrt-v4/issues/69#issuecomment-715499748.
+                  "$<$<COMPILE_LANGUAGE:CUDA>:$<IF:$<AND:$<CONFIG:Debug>,$<NOT:$<BOOL:$<TARGET_PROPERTY:CUDA_PTX_COMPILATION>>>>,-G;-g,-lineinfo;-maxrregcount;128>>"
+          )
 
-        # this macro defines cmake rules that execute the following four steps:
-        # 1) compile the given cuda file ${cuda_file} to an intermediary PTX file
-        # 2) use the 'bin2c' tool (that comes with CUDA) to
-        #    create a second intermediary (.c-)file which defines a const string variable
-        #    (named '${c_var_name}') whose (constant) value is the PTX output
-        #    from the previous step.
-        # 3) compile the given .c file to an intermediary object file (why thus has
-        #    that PTX string 'embedded' as a global constant.
-        # 4) assign the name of the intermediary .o file to the cmake variable
-        #    'output_var', which can then be added to cmake targets.
-        macro (cuda_compile_and_embed output_var cuda_file lib_name)
-          add_library ("${lib_name}" OBJECT "${cuda_file}")
-          set_property (TARGET "${lib_name}" PROPERTY CUDA_PTX_COMPILATION ON)
-
-          # disable "extern declaration... is treated as a static definition" warning
-          if (CUDA_VERSION_MAJOR EQUAL 11 AND CUDA_VERSION_MINOR LESS 2)
-              target_compile_options ("${lib_name}" PRIVATE
-                                      -Xcudafe=--display_error_number -Xcudafe=--diag_suppress=3089)
+          if (PBRT_GPU_SHADER_MODEL STREQUAL "")
+              # https://wagonhelm.github.io/articles/2018-03/detecting-cuda-capability-with-cmake
+              # Get CUDA compute capability
+              set (CHECK_CUDA_OUTPUT_EXE ${CMAKE_BINARY_DIR}/checkcuda)
+              if (MSVC)
+                  execute_process (COMMAND ${CMAKE_CUDA_COMPILER} -lcuda ${PBRT_SOURCE_CODE}/cmake/checkcuda.cu -ccbin ${CMAKE_CXX_COMPILER} -o ${CHECK_CUDA_OUTPUT_EXE}
+                                  RESULT_VARIABLE BUILD_CHECK_CUDA_RETURN_CODE)
+              else  ()
+                  execute_process (COMMAND ${CMAKE_CUDA_COMPILER} -lcuda ${PBRT_SOURCE_CODE}/cmake/checkcuda.cu -o ${CHECK_CUDA_OUTPUT_EXE}
+                                  RESULT_VARIABLE BUILD_CHECK_CUDA_RETURN_CODE)
+              endif ()
+
+              if (NOT ${BUILD_CHECK_CUDA_RETURN_CODE} EQUAL 0)
+                  message (SEND_ERROR "Was unable to build checkcuda, consider manually setting PBRT_GPU_SHADER_MODEL")
+              endif ()
+
+              execute_process (COMMAND ${CHECK_CUDA_OUTPUT_EXE}
+                              RESULT_VARIABLE CUDA_RETURN_CODE
+                              OUTPUT_VARIABLE CHECK_CUDA_OUTPUT)
+
+              if (NOT ${CUDA_RETURN_CODE} EQUAL 0)
+                  message (SEND_ERROR ${CHECK_CUDA_OUTPUT})
+                else ()
+                  set(ARCH "${CHECK_CUDA_OUTPUT}")
+                  message (STATUS "Detected CUDA Architecture: ${ARCH}")
+                  string (APPEND CMAKE_CUDA_FLAGS " --gpu-architecture=${ARCH}")
+              endif ()
           else ()
-              target_compile_options ("${lib_name}" PRIVATE
-                                      -Xcudafe=--display_error_number -Xcudafe=--diag_suppress=20044)
+              set(ARCH "${PBRT_GPU_SHADER_MODEL}")
+              message (STATUS "Specified CUDA Architecture: ${ARCH}")
+              string (APPEND CMAKE_CUDA_FLAGS " --gpu-architecture=${ARCH}")
           endif ()
 
-          # CUDA integration in Visual Studio seems broken as even if "Use
-          # Host Preprocessor Definitions" is checked, the host preprocessor
-          # definitions are still not used when compiling device code.
-          # To work around that, define the macros using --define-macro to
-          # avoid CMake identifying those as macros and using the proper (but
-          # broken) way of specifying them.
-          if (${CMAKE_GENERATOR} MATCHES "^Visual Studio")
-            # As PBRT_DEBUG_BUILD is specified globally as a definition, we need to
-            # manually add it due to the bug mentioned earlier and due to it
-            # not being found in PBRT_DEFINITIONS.
-            if (CMAKE_BUILD_TYPE MATCHES Debug)
-                set (cuda_definitions "--define-macro=PBRT_DEBUG_BUILD")
-            endif ()
-            foreach (arg ${PBRT_DEFINITIONS})
-              list (APPEND cuda_definitions "--define-macro=${arg}")
-            endforeach ()
-            target_compile_options ("${lib_name}" PRIVATE ${cuda_definitions})
-          else ()
-            target_compile_definitions ("${lib_name}" PRIVATE ${PBRT_DEFINITIONS})
+          set (PBRT_CUDA_LIB cuda)
+          # optix
+          # FIXME
+          include_directories (${PBRT_OPTIX7_PATH}/include)
+
+          # find CUDA's bin2c executable
+          get_filename_component (cuda_compiler_bin "${CMAKE_CUDA_COMPILER}" DIRECTORY)
+          find_program (BIN2C
+                        NAMES bin2c
+                        PATHS ${cuda_compiler_bin}
+                        DOC "Path to the CUDA SDK bin2c executable."
+                        NO_DEFAULT_PATH)
+          if (NOT BIN2C)
+              message (FATAL_ERROR
+                      "bin2c not found:\n"
+                      "  CMAKE_CUDA_COMPILER='${CMAKE_CUDA_COMPILER}'\n"
+                      "  cuda_compiler_bin='${cuda_compiler_bin}'\n"
+              )
           endif ()
-          target_include_directories ("${lib_name}" PRIVATE src ${CMAKE_BINARY_DIR})
-          target_include_directories ("${lib_name}" SYSTEM PRIVATE ${NANOVDB_INCLUDE})
-          target_link_libraries ("${lib_name}" PRIVATE cuda_build_configuration pbrt_opt pbrt_warnings)
-          add_dependencies ("${lib_name}" pbrt_soa_generated)
-          set (c_var_name ${output_var})
-          set (embedded_file ${cuda_file}.ptx_embedded.c)
-          add_custom_command (
-            OUTPUT "${embedded_file}"
-            COMMAND ${CMAKE_COMMAND}
-              "-DBIN_TO_C_COMMAND=${BIN2C}"
-              "-DOBJECTS=$<TARGET_OBJECTS:${lib_name}>"
-              "-DVAR_NAME=${c_var_name}"
-              "-DOUTPUT=${embedded_file}"
-              -P ${CMAKE_CURRENT_SOURCE_DIR}/cmake/bin2c_wrapper.cmake
-            VERBATIM
-            DEPENDS "${lib_name}" $<TARGET_OBJECTS:${lib_name}>
-            COMMENT "Embedding PTX generated from ${cuda_file}"
-          )
-          set (${output_var} ${embedded_file})
-        endmacro ()
-    endif ()
-  endif()
-else ()
-    message (STATUS "CUDA not found")
+
+          # this macro defines cmake rules that execute the following four steps:
+          # 1) compile the given cuda file ${cuda_file} to an intermediary PTX file
+          # 2) use the 'bin2c' tool (that comes with CUDA) to
+          #    create a second intermediary (.c-)file which defines a const string variable
+          #    (named '${c_var_name}') whose (constant) value is the PTX output
+          #    from the previous step.
+          # 3) compile the given .c file to an intermediary object file (why thus has
+          #    that PTX string 'embedded' as a global constant.
+          # 4) assign the name of the intermediary .o file to the cmake variable
+          #    'output_var', which can then be added to cmake targets.
+          macro (cuda_compile_and_embed output_var cuda_file lib_name)
+            add_library ("${lib_name}" OBJECT "${cuda_file}")
+            set_property (TARGET "${lib_name}" PROPERTY CUDA_PTX_COMPILATION ON)
+
+            # disable "extern declaration... is treated as a static definition" warning
+            if (CUDA_VERSION_MAJOR EQUAL 11 AND CUDA_VERSION_MINOR LESS 2)
+                target_compile_options ("${lib_name}" PRIVATE
+                                        -Xcudafe=--display_error_number -Xcudafe=--diag_suppress=3089)
+            else ()
+                target_compile_options ("${lib_name}" PRIVATE
+                                        -Xcudafe=--display_error_number -Xcudafe=--diag_suppress=20044)
+            endif ()
+
+            # CUDA integration in Visual Studio seems broken as even if "Use
+            # Host Preprocessor Definitions" is checked, the host preprocessor
+            # definitions are still not used when compiling device code.
+            # To work around that, define the macros using --define-macro to
+            # avoid CMake identifying those as macros and using the proper (but
+            # broken) way of specifying them.
+            if (${CMAKE_GENERATOR} MATCHES "^Visual Studio")
+              # As PBRT_DEBUG_BUILD is specified globally as a definition, we need to
+              # manually add it due to the bug mentioned earlier and due to it
+              # not being found in PBRT_DEFINITIONS.
+              if (CMAKE_BUILD_TYPE MATCHES Debug)
+                  set (cuda_definitions "--define-macro=PBRT_DEBUG_BUILD")
+              endif ()
+              foreach (arg ${PBRT_DEFINITIONS})
+                list (APPEND cuda_definitions "--define-macro=${arg}")
+              endforeach ()
+              target_compile_options ("${lib_name}" PRIVATE ${cuda_definitions})
+            else ()
+              target_compile_definitions ("${lib_name}" PRIVATE ${PBRT_DEFINITIONS})
+            endif ()
+            target_include_directories ("${lib_name}" PRIVATE src ${CMAKE_BINARY_DIR})
+            target_include_directories ("${lib_name}" SYSTEM PRIVATE ${NANOVDB_INCLUDE})
+            target_link_libraries ("${lib_name}" PRIVATE cuda_build_configuration pbrt_opt pbrt_warnings)
+            add_dependencies ("${lib_name}" pbrt_soa_generated)
+            set (c_var_name ${output_var})
+            set (embedded_file ${cuda_file}.ptx_embedded.c)
+            add_custom_command (
+              OUTPUT "${embedded_file}"
+              COMMAND ${CMAKE_COMMAND}
+                "-DBIN_TO_C_COMMAND=${BIN2C}"
+                "-DOBJECTS=$<TARGET_OBJECTS:${lib_name}>"
+                "-DVAR_NAME=${c_var_name}"
+                "-DOUTPUT=${embedded_file}"
+                -P ${CMAKE_CURRENT_SOURCE_DIR}/cmake/bin2c_wrapper.cmake
+              VERBATIM
+              DEPENDS "${lib_name}" $<TARGET_OBJECTS:${lib_name}>
+              COMMENT "Embedding PTX generated from ${cuda_file}"
+            )
+            set (${output_var} ${embedded_file})
+          endmacro ()
+      endif ()
+    endif()
+  else ()
+      message (STATUS "CUDA not found")
+  endif ()
 endif ()
 
 ###########################################################################
@@ -363,10 +431,12 @@ endif ()
 
 include (CheckCXXCompilerFlag)
 
-check_cxx_compiler_flag ("-march=native" COMPILER_SUPPORTS_MARCH_NATIVE)
-if (COMPILER_SUPPORTS_MARCH_NATIVE AND PBRT_BUILD_NATIVE_EXECUTABLE)
-    target_compile_options (pbrt_opt INTERFACE
-          "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcompiler >-march=native")
+if (NOT PBRT_HIP_ENABLED)
+  check_cxx_compiler_flag ("-march=native" COMPILER_SUPPORTS_MARCH_NATIVE)
+  if (COMPILER_SUPPORTS_MARCH_NATIVE AND PBRT_BUILD_NATIVE_EXECUTABLE)
+      target_compile_options (pbrt_opt INTERFACE
+            "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcompiler >-march=native")
+  endif ()
 endif ()
 
 if (CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
@@ -692,85 +762,72 @@ SET (PBRT_UTIL_SOURCE_HEADERS
   src/pbrt/util/vecmath.h
   )
 
-if (PBRT_CUDA_ENABLED)
-  set (PBRT_GPU_SOURCE
-    src/pbrt/gpu/optix/aggregate.cpp
-    src/pbrt/gpu/optix/denoiser.cpp
-    src/pbrt/gpu/memory.cpp
-    src/pbrt/gpu/util.cpp
-  )
-  set (PBRT_GPU_SOURCE_HEADERS
-    src/pbrt/gpu/optix/aggregate.h
-    src/pbrt/gpu/cudagl.h
-    src/pbrt/gpu/optix/denoiser.h
-    src/pbrt/gpu/memory.h
-    src/pbrt/gpu/optix/optix.h
-    src/pbrt/gpu/util.h
-  )
-
-  set_source_files_properties (
-   src/pbrt/bsdf.cpp
-   src/pbrt/bssrdf.cpp
-   src/pbrt/bxdfs.cpp
-   src/pbrt/cameras.cpp
-   src/pbrt/film.cpp
-   src/pbrt/filters.cpp
-#   src/pbrt/genscene.cpp
-   src/pbrt/interaction.cpp
-   src/pbrt/lights.cpp
-   src/pbrt/lightsamplers.cpp
-   src/pbrt/materials.cpp
-#   src/pbrt/media.cpp
-   src/pbrt/options.cpp
-#   src/pbrt/paramdict.cpp
-#   src/pbrt/parser.cpp
-   src/pbrt/pbrt.cpp
-   src/pbrt/samplers.cpp
-   src/pbrt/shapes.cpp
-   src/pbrt/textures.cpp
-
-   src/pbrt/util/bluenoise.cpp
-   src/pbrt/util/check.cpp
-   src/pbrt/util/color.cpp
-   src/pbrt/util/colorspace.cpp
-   src/pbrt/util/error.cpp
-#   src/pbrt/util/file.cpp
-#   src/pbrt/util/float.cpp
-#   src/pbrt/util/image.cpp
-   src/pbrt/util/log.cpp
-#   src/pbrt/util/loopsubdiv.cpp
-   src/pbrt/util/lowdiscrepancy.cpp
-   src/pbrt/util/math.cpp
-#   src/pbrt/util/memory.cpp
-   src/pbrt/util/mesh.cpp
-#   src/pbrt/util/mipmap.cpp
-   src/pbrt/util/noise.cpp
-#   src/pbrt/util/parallel.cpp
-   src/pbrt/util/pmj02tables.cpp
-   src/pbrt/util/primes.cpp
-#   src/pbrt/util/print.cpp
-#   src/pbrt/util/progressreporter.cpp
-   src/pbrt/util/pstd.cpp
-   src/pbrt/util/rng.cpp
-   src/pbrt/util/sampling.cpp
-   src/pbrt/util/scattering.cpp
-   src/pbrt/util/sobolmatrices.cpp
-   src/pbrt/util/spectrum.cpp
-   src/pbrt/util/stats.cpp
-#   src/pbrt/util/stbimage.cpp
-#   src/pbrt/util/string.cpp
-   src/pbrt/util/transform.cpp
-   src/pbrt/util/vecmath.cpp
+if (PBRT_CUDA_ENABLED OR PBRT_HIP_ENABLED)
+  if (PBRT_HIP_ENABLED)
+    set (PBRT_GPU_SOURCE src/pbrt/gpu/hiprt/aggregate.cpp src/pbrt/gpu/memory.cpp src/pbrt/gpu/util.cpp)
+    set (PBRT_GPU_SOURCE_HEADERS src/pbrt/gpu/hiprt/aggregate.h src/pbrt/gpu/common.h src/pbrt/gpu/cudagl.h src/pbrt/gpu/memory.h src/pbrt/gpu/util.h)
+  else ()
+    set (PBRT_GPU_SOURCE src/pbrt/gpu/optix/aggregate.cpp src/pbrt/gpu/optix/denoiser.cpp src/pbrt/gpu/memory.cpp src/pbrt/gpu/util.cpp)
+    set (PBRT_GPU_SOURCE_HEADERS src/pbrt/gpu/optix/aggregate.h src/pbrt/gpu/optix/denoiser.h src/pbrt/gpu/common.h src/pbrt/gpu/cudagl.h src/pbrt/gpu/memory.h src/pbrt/gpu/util.h)
+  endif ()
 
+set(PBRT_GPU_SOURCE_ALL
+    src/pbrt/bsdf.cpp
+    src/pbrt/bssrdf.cpp
+    src/pbrt/bxdfs.cpp
+    src/pbrt/cameras.cpp
+    src/pbrt/film.cpp
+    src/pbrt/filters.cpp
+    # src/pbrt/genscene.cpp
+    src/pbrt/interaction.cpp
+    src/pbrt/lights.cpp
+    src/pbrt/lightsamplers.cpp
+    src/pbrt/materials.cpp
+    # src/pbrt/media.cpp
+    src/pbrt/options.cpp
+    # src/pbrt/paramdict.cpp src/pbrt/parser.cpp
+    src/pbrt/pbrt.cpp
+    src/pbrt/samplers.cpp
+    src/pbrt/shapes.cpp
+    src/pbrt/textures.cpp
+    src/pbrt/util/bluenoise.cpp
+    src/pbrt/util/check.cpp
+    src/pbrt/util/color.cpp
+    src/pbrt/util/colorspace.cpp
+    src/pbrt/util/error.cpp
+    # src/pbrt/util/file.cpp src/pbrt/util/float.cpp src/pbrt/util/image.cpp
+    src/pbrt/util/log.cpp
+    # src/pbrt/util/loopsubdiv.cpp
+    src/pbrt/util/lowdiscrepancy.cpp
+    src/pbrt/util/math.cpp
+    # src/pbrt/util/memory.cpp
+    src/pbrt/util/mesh.cpp
+    # src/pbrt/util/mipmap.cpp
+    src/pbrt/util/noise.cpp
+    # src/pbrt/util/parallel.cpp
+    src/pbrt/util/pmj02tables.cpp
+    src/pbrt/util/primes.cpp
+    # src/pbrt/util/print.cpp src/pbrt/util/progressreporter.cpp
+    src/pbrt/util/pstd.cpp
+    src/pbrt/util/rng.cpp
+    src/pbrt/util/sampling.cpp
+    src/pbrt/util/scattering.cpp
+    src/pbrt/util/sobolmatrices.cpp
+    src/pbrt/util/spectrum.cpp
+    src/pbrt/util/stats.cpp
+    # src/pbrt/util/stbimage.cpp src/pbrt/util/string.cpp
+    src/pbrt/util/transform.cpp
+    src/pbrt/util/vecmath.cpp
     ${PBRT_WAVEFRONT_SOURCE}
     ${PBRT_GPU_SOURCE}
+    src/pbrt/cmd/pspec_gpu.cpp)
 
-    src/pbrt/cmd/pspec_gpu.cpp
-
-    PROPERTIES LANGUAGE CUDA
-  )
-
-  cuda_compile_and_embed (PBRT_EMBEDDED_PTX src/pbrt/gpu/optix/optix.cu optix.cu)
+  if(PBRT_HIP_ENABLED)
+    set_source_files_properties(${PBRT_GPU_SOURCE_ALL} PROPERTIES LANGUAGE CXX)
+  else()
+    set_source_files_properties(${PBRT_GPU_SOURCE_ALL} PROPERTIES LANGUAGE CUDA)
+    cuda_compile_and_embed(PBRT_EMBEDDED_PTX src/pbrt/gpu/optix/optix.cu optix.cu)
+  endif()
 endif ()
 
 source_group ("Source Files" FILES ${PBRT_SOURCE})
@@ -781,7 +838,7 @@ source_group ("Source Files/util" FILES ${PBRT_UTIL_SOURCE})
 source_group ("Header Files/util" FILES ${PBRT_UTIL_SOURCE_HEADERS})
 source_group ("Source Files/wavefront" FILES ${PBRT_WAVEFRONT_SOURCE})
 source_group ("Header Files/wavefront" FILES ${PBRT_WAVEFRONT_SOURCE_HEADERS})
-if (PBRT_CUDA_ENABLED)
+if (PBRT_CUDA_ENABLED OR PBRT_HIP_ENABLED)
   source_group ("Source Files/gpu" FILES ${PBRT_GPU_SOURCE})
   source_group ("Header Files/gpu" FILES ${PBRT_GPU_SOURCE_HEADERS})
 endif ()
@@ -875,7 +932,12 @@ endif ()
 
 target_compile_options (pbrt_lib PUBLIC ${PBRT_CXX_FLAGS})
 
-target_link_libraries (pbrt_lib PRIVATE pbrt_warnings pbrt_opt $<$<BOOL:PBRT_CUDA_ENABLED>:cuda_build_configuration>)
+if (PBRT_HIP_ENABLED)
+  target_link_libraries (pbrt_lib PUBLIC hip::device)
+  target_link_libraries (pbrt_lib PRIVATE pbrt_warnings pbrt_opt)
+else ()
+  target_link_libraries (pbrt_lib PRIVATE pbrt_warnings pbrt_opt $<$<BOOL:PBRT_CUDA_ENABLED>:cuda_build_configuration>)
+endif()
 
 add_sanitizers (pbrt_lib)
 
@@ -910,6 +972,84 @@ if (PBRT_CUDA_ENABLED)
   endif ()
 endif ()
 
+if (PBRT_HIP_ENABLED)
+  list (APPEND ALL_PBRT_LIBS ${PBRT_HIPRT_LIB_NAME})
+  if (CMAKE_SYSTEM_NAME STREQUAL Windows)
+    set(HIPRT_BUILD_CMD "call ${PBRT_HIP_PATH}/bin/hipcc.bat")
+    string (APPEND HIPRT_BUILD_CMD " -DPBRT_IS_WINDOWS")
+    string (APPEND HIPRT_BUILD_CMD " -DNOMINMAX")
+    string (APPEND HIPRT_BUILD_CMD " -DPBRT_IS_MSVC")
+    string (APPEND HIPRT_BUILD_CMD " -D_CRT_SECURE_NO_WARNINGS")
+    string (APPEND HIPRT_BUILD_CMD " -D_ENABLE_EXTENDED_ALIGNED_STORAGE")
+    set(HIPRT_LINK_CMD "call ${PBRT_HIP_PATH}/bin/clang++.exe")
+  elseif (CMAKE_SYSTEM_NAME STREQUAL Linux)
+    set(HIPRT_BUILD_CMD "hipcc")
+    string (APPEND HIPRT_BUILD_CMD " -DPBRT_IS_LINUX")
+    set(HIPRT_LINK_CMD "/opt/rocm/bin/amdclang++" )
+  endif ()
+  foreach (TARGET IN LISTS AMDGPU_TARGETS)
+    message (STATUS "Current item: ${item}")
+    string (APPEND HIPRT_BUILD_CMD " --offload-arch=")
+    string (APPEND HIPRT_BUILD_CMD ${TARGET})
+    string (APPEND HIPRT_LINK_CMD " --offload-arch=")
+    string (APPEND HIPRT_LINK_CMD ${TARGET})
+  endforeach ()
+  string (APPEND HIPRT_BUILD_CMD " -I.")
+  string (APPEND HIPRT_BUILD_CMD " -I../src")
+  string (APPEND HIPRT_BUILD_CMD " -I${PBRT_HIPRT_PATH}")
+  string (APPEND HIPRT_BUILD_CMD " -I../src/ext/openvdb/nanovdb")
+  string (APPEND HIPRT_BUILD_CMD " -x hip")
+  string (APPEND HIPRT_BUILD_CMD " ../src/pbrt/gpu/hiprt/hiprt.cu")
+  string (APPEND HIPRT_BUILD_CMD " ../src/pbrt/util/sobolmatrices.cpp")
+  string (APPEND HIPRT_BUILD_CMD " ../src/pbrt/util/primes.cpp")
+  string (APPEND HIPRT_BUILD_CMD " ../src/pbrt/options.cpp")
+  string (APPEND HIPRT_BUILD_CMD " ../src/pbrt/shapes.cpp")
+  string (APPEND HIPRT_BUILD_CMD " -O3")
+  string (APPEND HIPRT_BUILD_CMD " -ffast-math")
+  string (APPEND HIPRT_BUILD_CMD " -std=c++17")
+  string (APPEND HIPRT_BUILD_CMD " --cuda-device-only")
+  string (APPEND HIPRT_BUILD_CMD " -fgpu-rdc -c")
+  string (APPEND HIPRT_BUILD_CMD " --gpu-bundle-output -c -emit-llvm")
+  string (APPEND HIPRT_BUILD_CMD " -DWARP_THREADS=32 ")
+  string (APPEND HIPRT_BUILD_CMD " -DPBRT_BUILD_GPU_RENDERER")
+  string (APPEND HIPRT_BUILD_CMD " -DPBRT_RESTRICT=__restrict__ ")
+  string (APPEND HIPRT_BUILD_CMD " -DBLOCK_SIZE=64")
+  string (APPEND HIPRT_BUILD_CMD " -DSHARED_STACK_SIZE=16")
+  string (APPEND HIPRT_LINK_CMD " -o hiprt.hipfb")
+  string (APPEND HIPRT_LINK_CMD " -fgpu-rdc")
+  string (APPEND HIPRT_LINK_CMD " --hip-link")
+  string (APPEND HIPRT_LINK_CMD " -Xoffload-linker")
+  string (APPEND HIPRT_LINK_CMD " --whole-archive")
+  string (APPEND HIPRT_LINK_CMD " --cuda-device-only")
+  string (APPEND HIPRT_LINK_CMD " ${PBRT_HIPRT_BITCODE_FILENAME}")
+  string (APPEND HIPRT_LINK_CMD " hiprt-hip-amdgcn-amd-amdhsa.bc")
+  string (APPEND HIPRT_LINK_CMD " primes-hip-amdgcn-amd-amdhsa.bc")
+  string (APPEND HIPRT_LINK_CMD " sobolmatrices-hip-amdgcn-amd-amdhsa.bc")
+  string (APPEND HIPRT_LINK_CMD " options-hip-amdgcn-amd-amdhsa.bc")
+  string (APPEND HIPRT_LINK_CMD " shapes-hip-amdgcn-amd-amdhsa.bc")
+  if (CMAKE_SYSTEM_NAME STREQUAL Windows)
+    add_custom_target (
+        libdeflate_msvc_patch
+        COMMAND cd ${CMAKE_CURRENT_SOURCE_DIR}/src/ext/libdeflate && git apply ${CMAKE_CURRENT_SOURCE_DIR}/0001-MSVC-HIP.patch > nul 2> nul & exit 0
+    )
+    add_dependencies (pbrt_lib libdeflate_msvc_patch)
+	  message (STATUS "Applying patch to libdeflate to support _BitScanReverse for Windows HIP build.")
+    file (WRITE "${CMAKE_CURRENT_BINARY_DIR}/hiprt.bat" "${HIPRT_BUILD_CMD}\n${HIPRT_LINK_CMD}\n")
+    add_custom_target (
+      hiprt_kernels ALL
+      DEPENDS pbrt_lib
+      COMMAND ${CMAKE_CURRENT_BINARY_DIR}/hiprt.bat
+      COMMENT "Compiling HIPRT kernels...")
+  elseif (CMAKE_SYSTEM_NAME STREQUAL Linux)
+    file (WRITE "${CMAKE_CURRENT_BINARY_DIR}/hiprt.sh" "${HIPRT_BUILD_CMD}\n${HIPRT_LINK_CMD}\n")
+    add_custom_target (
+      hiprt_kernels ALL
+      DEPENDS pbrt_lib
+      COMMAND chmod 777 ${CMAKE_CURRENT_BINARY_DIR}/hiprt.sh && ${CMAKE_CURRENT_BINARY_DIR}/hiprt.sh
+      COMMENT "Compiling HIPRT kernels...")
+  endif ()
+endif ()
+
 if (WIN32)
   list (APPEND ALL_PBRT_LIBS "dbghelp" "wsock32" "ws2_32")
 endif ()
@@ -956,6 +1096,7 @@ add_executable (pbrt::pbrt_exe ALIAS pbrt_exe)
 
 target_compile_definitions (pbrt_exe PRIVATE ${PBRT_DEFINITIONS})
 target_compile_options (pbrt_exe PRIVATE ${PBRT_CXX_FLAGS})
+target_link_options (pbrt_exe PRIVATE ${PBRT_LINK_FLAGS})
 target_include_directories (pbrt_exe PRIVATE src src/ext)
 target_link_libraries (pbrt_exe PRIVATE ${ALL_PBRT_LIBS} pbrt_opt pbrt_warnings)
 
@@ -976,6 +1117,7 @@ set_property (TARGET sky_lib PROPERTY FOLDER "ext")
 
 target_compile_definitions (imgtool PRIVATE ${PBRT_DEFINITIONS})
 target_compile_options (imgtool PRIVATE ${PBRT_CXX_FLAGS})
+target_link_options (imgtool PRIVATE ${PBRT_LINK_FLAGS})
 target_include_directories (imgtool PRIVATE src src/ext ${FLIP_INCLUDE})
 target_link_libraries (imgtool PRIVATE ${ALL_PBRT_LIBS} pbrt_opt pbrt_warnings sky_lib flip_lib)
 
@@ -991,6 +1133,7 @@ add_executable (pbrt::pspec ALIAS pspec)
 
 target_compile_definitions (pspec PRIVATE ${PBRT_DEFINITIONS})
 target_compile_options (pspec PRIVATE ${PBRT_CXX_FLAGS})
+target_link_options (pspec PRIVATE ${PBRT_LINK_FLAGS})
 target_include_directories (pspec PRIVATE src src/ext)
 target_link_libraries (pspec PRIVATE ${ALL_PBRT_LIBS} pbrt_warnings)
 
@@ -1006,6 +1149,7 @@ add_executable (pbrt::plytool ALIAS plytool)
 
 target_compile_definitions (plytool PRIVATE ${PBRT_DEFINITIONS})
 target_compile_options (plytool PUBLIC ${PBRT_CXX_FLAGS})
+target_link_options (plytool PRIVATE ${PBRT_LINK_FLAGS})
 target_include_directories (plytool PUBLIC src src/ext)
 target_link_libraries (plytool PRIVATE ${ALL_PBRT_LIBS} pbrt_warnings pbrt_opt)
 
@@ -1025,6 +1169,7 @@ target_compile_definitions (nanovdb2pbrt PRIVATE ${PBRT_DEFINITIONS})
 target_compile_options (nanovdb2pbrt PUBLIC ${PBRT_CXX_FLAGS})
 target_include_directories (nanovdb2pbrt PUBLIC src src/ext)
 target_link_libraries (nanovdb2pbrt PRIVATE ${ALL_PBRT_LIBS} pbrt_warnings pbrt_opt)
+target_link_options (nanovdb2pbrt PRIVATE ${PBRT_LINK_FLAGS})
 
 set_target_properties (nanovdb2pbrt PROPERTIES OUTPUT_NAME nanovdb2pbrt)
 
@@ -1087,7 +1232,8 @@ add_executable (pbrt_test src/pbrt/cmd/pbrt_test.cpp ${PBRT_TEST_SOURCE})
 target_link_libraries (pbrt_test PRIVATE ${ALL_PBRT_LIBS} pbrt_opt pbrt_warnings)
 target_compile_definitions (pbrt_test PRIVATE ${PBRT_DEFINITIONS})
 target_include_directories (pbrt_test PRIVATE src src/ext ${DOUBLE_CONVERSION_INCLUDE})
-target_compile_options(pbrt_test PUBLIC ${PBRT_CXX_FLAGS})
+target_compile_options (pbrt_test PUBLIC ${PBRT_CXX_FLAGS})
+target_link_options (pbrt_test PRIVATE ${PBRT_LINK_FLAGS})
 
 add_sanitizers (pbrt_test)
 
diff --git a/LICENSE.txt b/LICENSE.txt
index d64569567..acd97298b 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -1,3 +1,7 @@
+All of the modifications made in commit ID dfd293c76b33ec03df55284283622868ea10aea9 are covered by the MIT license (below).
+The rest would be covered by the existing Apache license.
+
+---------------------------------------------------------
 
                                  Apache License
                            Version 2.0, January 2004
@@ -200,3 +204,28 @@
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
+
+
+---------------------------------------------------------
+
+The MIT License (MIT)
+
+Copyright (c) 2022-2024 Advanced Micro Devices, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
\ No newline at end of file
diff --git a/README.md b/README.md
index 0454dda15..d1d822297 100644
--- a/README.md
+++ b/README.md
@@ -228,3 +228,33 @@ is straightforward:
 ```bash
 $ imgtool denoise-optix noisy.exr --outfile denoised.exr
 ```
+
+Instructions to build the HIP port
+--------
+
+Linux:
+* Install [ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/native-install/ubuntu.html)
+* Download and extract [HIPRT](https://gpuopen.com/hiprt/)
+  * Note that HIP is very sensitive to the difference between versions of HIPRT and the application on Linux
+  * If you encounter linking error, we recommend to compile HIPRT by yourself
+* `cmake -DCMAKE_C_COMPILER=/opt/rocm/llvm/bin/clang -DCMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++ -DPBRT_HIPRT=ON -DPBRT_HIPRT_PATH=~/hiprtSdk ..`
+* `make` or `make pbrt_exe`
+
+Windows:
+* Download and install [HIP SDK](https://www.amd.com/en/developer/resources/rocm-hub/hip-sdk.html)
+  * Make sure that you checked HIPRT to be installed
+* Download and install [Strawberry Perl](https://strawberryperl.com/)
+* Run `x64 Native Tools Command Prompt for VS 2022` as administrator
+* `set CC=clang`
+* `set CXX=clang++`
+* `mkdir build` and `cd build`
+* `cmake -G "Ninja" -DCMAKE_BUILD_TYPE=Release -DPBRT_HIPRT=ON ..`
+* `cmake --build . --config=Release`
+
+Example: `./pbrt --gpu ~/pbrt-v4-scenes/killeroos/killeroo-simple.pbrt`
+Add `--interactive` for the interactive mode.
+
+GPU architecture and ROCm version
+--------
+
+The instructions above assume the `gfx1100` architecture. You can specify other architectures via the CMake `AMDGPU_TARGETS` variable. Note that PBRT should be compiled with the same version as HIPRT binaries (e.g., using ROCm 6.0 and `hiprt02004_6.0_amd_lib_linux.bc`).
\ No newline at end of file
diff --git a/src/ext/flip/flip.cpp b/src/ext/flip/flip.cpp
index 0eedeed44..ba0f18541 100644
--- a/src/ext/flip/flip.cpp
+++ b/src/ext/flip/flip.cpp
@@ -50,6 +50,10 @@
 #include <fstream>
 #include <cassert>
 
+#ifndef M_PI
+#define M_PI 3.14159265358979323846f
+#endif
+
 namespace flip_detail {
 
 class histogram
diff --git a/src/ext/openvdb b/src/ext/openvdb
index 414bed84c..385516321 160000
--- a/src/ext/openvdb
+++ b/src/ext/openvdb
@@ -1 +1 @@
-Subproject commit 414bed84c2fc22e188eac7b611aa85c7edd7a5a9
+Subproject commit 385516321b272c423e2a514b41bd21ce6ffdda72
diff --git a/src/pbrt/base/medium.h b/src/pbrt/base/medium.h
index 698b04816..30500d006 100644
--- a/src/pbrt/base/medium.h
+++ b/src/pbrt/base/medium.h
@@ -88,7 +88,6 @@ class Medium
 
     std::string ToString() const;
 
-    PBRT_CPU_GPU
     bool IsEmissive() const;
 
     PBRT_CPU_GPU
diff --git a/src/pbrt/cmd/imgtool.cpp b/src/pbrt/cmd/imgtool.cpp
index a812fc7bb..e2c14dbf0 100644
--- a/src/pbrt/cmd/imgtool.cpp
+++ b/src/pbrt/cmd/imgtool.cpp
@@ -7,9 +7,9 @@
 #include <pbrt/filters.h>
 #include <pbrt/options.h>
 #ifdef PBRT_BUILD_GPU_RENDERER
-#ifndef __HIP_PLATFORM_AMD__
+#ifdef __NVCC__
 #include <pbrt/gpu/optix/denoiser.h>
-#endif // __HIP_PLATFORM_AMD__
+#endif // __NVCC__
 #include <pbrt/gpu/util.h>
 #endif  // PBRT_BUILD_GPU_RENDERER
 #include <pbrt/util/args.h>
@@ -2220,7 +2220,7 @@ int makeequiarea(std::vector<std::string> args) {
     return 0;
 }
 
-#ifdef PBRT_BUILD_GPU_RENDERER
+#ifdef __NVCC__
 int denoise_optix(std::vector<std::string> args) {
     std::string inFilename, outFilename;
 
@@ -2335,7 +2335,7 @@ int denoise_optix(std::vector<std::string> args) {
 
     return 0;
 }
-#endif  // PBRT_BUILD_GPU_RENDERER
+#endif  // __NVCC__
 
 int main(int argc, char *argv[]) {
     PBRTOptions opt;
@@ -2362,10 +2362,10 @@ int main(int argc, char *argv[]) {
         return convert(args);
     else if (cmd == "diff")
         return diff(args);
-#ifdef PBRT_BUILD_GPU_RENDERER
+#ifdef __NVCC__
     else if (cmd == "denoise-optix")
         return denoise_optix(args);
-#endif  // PBRT_BUILD_GPU_RENDERER
+#endif  // __NVCC__
     else if (cmd == "error")
         return error(args);
     else if (cmd == "falsecolor")
diff --git a/src/pbrt/cmd/pbrt_test.cpp b/src/pbrt/cmd/pbrt_test.cpp
index 4bd7b23d3..3789dae8b 100644
--- a/src/pbrt/cmd/pbrt_test.cpp
+++ b/src/pbrt/cmd/pbrt_test.cpp
@@ -2,6 +2,7 @@
 // The pbrt source code is licensed under the Apache License, Version 2.0.
 // SPDX: Apache-2.0
 
+#include <gtest/gtest.h>
 #include <pbrt/pbrt.h>
 
 #include <pbrt/options.h>
@@ -9,7 +10,6 @@
 #include <pbrt/util/error.h>
 #include <pbrt/util/print.h>
 
-#include <gtest/gtest.h>
 #include <string>
 
 using namespace pbrt;
diff --git a/src/pbrt/cmd/pspec_gpu.cpp b/src/pbrt/cmd/pspec_gpu.cpp
index ce5b1a1f3..5cc0ffa00 100644
--- a/src/pbrt/cmd/pspec_gpu.cpp
+++ b/src/pbrt/cmd/pspec_gpu.cpp
@@ -16,8 +16,12 @@
 #include <pbrt/util/image.h>
 #include <pbrt/util/vecmath.h>
 
+#if defined(__HIPCC__)
+#include <pbrt/util/hip_aliases.h>
+#else
 #include <cuda.h>
 #include <cuda_runtime_api.h>
+#endif
 
 #include <vector>
 
diff --git a/src/pbrt/gpu/common.h b/src/pbrt/gpu/common.h
new file mode 100644
index 000000000..7451d958d
--- /dev/null
+++ b/src/pbrt/gpu/common.h
@@ -0,0 +1,111 @@
+#ifndef PBRT_GPU_COMMON_H
+#define PBRT_GPU_COMMON_H
+
+#include <pbrt/pbrt.h>
+
+#include <pbrt/base/light.h>
+#include <pbrt/base/material.h>
+#include <pbrt/base/medium.h>
+#include <pbrt/base/shape.h>
+#include <pbrt/base/texture.h>
+#include <pbrt/util/pstd.h>
+#include <pbrt/wavefront/workitems.h>
+#include <pbrt/wavefront/workqueue.h>
+
+#if defined(__HIPCC__)
+#include <hiprt/hiprt.h>
+#include <hiprt/hiprt_vec.h>
+#else
+#include <optix.h>
+#endif
+
+namespace pbrt {
+
+class TriangleMesh;
+class BilinearPatchMesh;
+
+struct TriangleMeshRecord {
+    const TriangleMesh *mesh;
+    Material material;
+    FloatTexture alphaTexture;
+    pstd::span<Light> areaLights;
+    MediumInterface *mediumInterface;
+};
+
+struct BilinearMeshRecord {
+    const BilinearPatchMesh *mesh;
+    Material material;
+    FloatTexture alphaTexture;
+    pstd::span<Light> areaLights;
+    MediumInterface *mediumInterface;
+};
+
+struct QuadricRecord {
+    Shape shape;
+    Material material;
+    FloatTexture alphaTexture;
+    Light areaLight;
+    MediumInterface *mediumInterface;
+};
+
+#if defined(__HIP_PLATFORM_AMD__)
+static constexpr size_t HitgroupAlignment = 16u;
+
+struct alignas(HitgroupAlignment) HitgroupRecord {
+    PBRT_CPU_GPU HitgroupRecord() {}
+    PBRT_CPU_GPU HitgroupRecord(const HitgroupRecord &r) {
+        memcpy(this, &r, sizeof(HitgroupRecord));
+    }
+    PBRT_CPU_GPU HitgroupRecord &operator=(const HitgroupRecord &r) {
+        if (this != &r)
+            memcpy(this, &r, sizeof(HitgroupRecord));
+        return *this;
+    }
+
+    union {
+        TriangleMeshRecord triRec;
+        BilinearMeshRecord blpRec;
+        QuadricRecord quadricRec;
+    };
+    enum { TriangleMesh, BilinearMesh, Quadric } type;
+};
+#endif
+
+struct RayIntersectParameters {
+#if defined(__HIPCC__)
+    hiprtScene traversable;
+#else
+    OptixTraversableHandle traversable;
+#endif
+
+    const RayQueue *rayQueue;
+
+    // Closest hit
+    RayQueue *nextRayQueue;
+    EscapedRayQueue *escapedRayQueue;
+    HitAreaLightQueue *hitAreaLightQueue;
+    MaterialEvalQueue *basicEvalMaterialQueue, *universalEvalMaterialQueue;
+    MediumSampleQueue *mediumSampleQueue;
+
+    // Shadow rays
+    ShadowRayQueue *shadowRayQueue;
+    SOA<PixelSampleState> pixelSampleState;
+
+    // Subsurface scattering...
+    SubsurfaceScatterQueue *subsurfaceScatterQueue;
+
+#if defined(__HIPCC__)
+    // Stack buffers
+    hiprtGlobalStackBuffer globalStackBuffer;
+    hiprtGlobalStackBuffer globalInstanceStackBuffer;
+    // Custom function table
+    hiprtFuncTable funcTable;
+    // Hitgroup records
+    HitgroupRecord *hgRecords;
+    // Offsets for hitgroup records
+    uint32_t *offsets;
+#endif
+};
+}  // namespace pbrt
+
+#endif  // PBRT_GPU_COMMON_H
diff --git a/src/pbrt/gpu/cudagl.h b/src/pbrt/gpu/cudagl.h
index 2695844f2..d04dd6351 100644
--- a/src/pbrt/gpu/cudagl.h
+++ b/src/pbrt/gpu/cudagl.h
@@ -34,9 +34,13 @@
 
 #include <glad/glad.h>
 
+#if defined(__HIPCC__)
+#include <pbrt/util/hip_aliases.h>
+#else
 #include <cuda.h>
 #include <cuda_gl_interop.h>
 #include <cuda_runtime.h>
+#endif
 
 #define GL_CHECK(call)                                                   \
     do {                                                                 \
@@ -370,6 +374,15 @@ CUDAOutputBuffer<PIXEL_FORMAT>::CUDAOutputBuffer(int32_t width, int32_t height)
                           nullptr, GL_STREAM_DRAW));
     GL_CHECK(glBindBuffer(GL_ARRAY_BUFFER, 0u));
 
+#ifdef __HIPCC__
+    uint32_t num_gl_devices = 0;
+
+    int glDevice;
+    cudaGLGetDevices(&num_gl_devices, &glDevice, 1, cudaGLDeviceListAll);
+
+    if (glDevice != current_device)
+        LOG_FATAL("Multi-GPU not supported with GL interop yet");
+#endif
     CUDA_CHECK(cudaGraphicsGLRegisterBuffer(&m_cuda_gfx_resource, m_pbo,
                                             cudaGraphicsMapFlagsWriteDiscard));
 
diff --git a/src/pbrt/gpu/hiprt/aggregate.cpp b/src/pbrt/gpu/hiprt/aggregate.cpp
new file mode 100644
index 000000000..d99f2fa96
--- /dev/null
+++ b/src/pbrt/gpu/hiprt/aggregate.cpp
@@ -0,0 +1,1499 @@
+// pbrt is Copyright(c) 1998-2020 Matt Pharr, Wenzel Jakob, and Greg Humphreys.
+// The pbrt source code is licensed under the Apache License, Version 2.0.
+// SPDX: Apache-2.0
+
+#include <pbrt/gpu/hiprt/aggregate.h>
+
+#include <pbrt/gpu/common.h>
+#include <pbrt/gpu/util.h>
+#include <pbrt/lights.h>
+#include <pbrt/materials.h>
+#include <pbrt/scene.h>
+#include <pbrt/textures.h>
+#include <pbrt/util/error.h>
+#include <pbrt/util/file.h>
+#include <pbrt/util/log.h>
+#include <pbrt/util/loopsubdiv.h>
+#include <pbrt/util/mesh.h>
+#include <pbrt/util/parallel.h>
+#include <pbrt/util/pstd.h>
+#include <pbrt/util/splines.h>
+#include <pbrt/util/stats.h>
+#include <pbrt/wavefront/intersect.h>
+
+#include <atomic>
+#include <fstream>
+#include <mutex>
+#include <unordered_map>
+
+#include <hiprt/hiprt.h>
+
+#define HIPRT_CHECK(EXPR)                                                    \
+    do {                                                                     \
+        hiprtError res = EXPR;                                               \
+        if (res != hiprtSuccess)                                             \
+            LOG_FATAL("HIPRT call " #EXPR " failed with code %d", int(res)); \
+    } while (false) /* eat semicolon */
+
+namespace pbrt {
+
+STAT_MEMORY_COUNTER("Memory/Acceleration structures", gpuBVHBytes);
+STAT_COUNTER("Geometry/Triangles added from displacement mapping", displacedTrisDelta);
+STAT_COUNTER("Geometry/Curves", nCurves);
+STAT_COUNTER("Geometry/Bilinear patches created for diced curves", nBLPsForCurves);
+
+static Material getMaterial(const ShapeSceneEntity &shape,
+                            const std::map<std::string, Material> &namedMaterials,
+                            const std::vector<Material> &materials) {
+    if (!shape.materialName.empty()) {
+        auto iter = namedMaterials.find(shape.materialName);
+        if (iter == namedMaterials.end())
+            ErrorExit(&shape.loc, "%s: material not defined", shape.materialName);
+        return iter->second;
+    } else {
+        CHECK_NE(shape.materialIndex, -1);
+        return materials[shape.materialIndex];
+    }
+}
+
+static FloatTexture getAlphaTexture(
+    const ShapeSceneEntity &shape,
+    const std::map<std::string, FloatTexture> &floatTextures, Allocator alloc) {
+    FloatTexture alphaTexture;
+
+    std::string alphaTexName = shape.parameters.GetTexture("alpha");
+    if (alphaTexName.empty()) {
+        if (Float alpha = shape.parameters.GetOneFloat("alpha", 1.f); alpha < 1.f)
+            alphaTexture = alloc.new_object<FloatConstantTexture>(alpha);
+        else
+            return nullptr;
+    } else {
+        auto iter = floatTextures.find(alphaTexName);
+        if (iter == floatTextures.end())
+            ErrorExit(&shape.loc, "%s: alpha texture not defined.", alphaTexName);
+
+        alphaTexture = iter->second;
+    }
+
+    if (!BasicTextureEvaluator().CanEvaluate({alphaTexture}, {})) {
+        // It would be nice to just use the UniversalTextureEvaluator (maybe
+        // always), but optix complains "Error: Found call graph recursion"...
+        Warning(&shape.loc,
+                "%s: alpha texture too complex for BasicTextureEvaluator "
+                "(need fallback path). Ignoring for now.",
+                alphaTexName);
+        alphaTexture = nullptr;
+    }
+
+    return alphaTexture;
+}
+
+static MediumInterface *getMediumInterface(const ShapeSceneEntity &shape,
+                                           const std::map<std::string, Medium> &media,
+                                           Allocator alloc) {
+    if (shape.insideMedium.empty() && shape.outsideMedium.empty())
+        return nullptr;
+
+    auto getMedium = [&](const std::string &name) -> Medium {
+        if (name.empty())
+            return nullptr;
+
+        auto iter = media.find(name);
+        if (iter == media.end())
+            ErrorExit(&shape.loc, "%s: medium not defined", name);
+        return iter->second;
+    };
+
+    return alloc.new_object<MediumInterface>(getMedium(shape.insideMedium),
+                                             getMedium(shape.outsideMedium));
+}
+
+HiprtAggregate::HiprtAggregate(
+    const BasicScene &scene, CUDATrackedMemoryResource *memoryResource,
+    NamedTextures &textures,
+    const std::map<int, pstd::vector<Light> *> &shapeIndexToAreaLights,
+    const std::map<std::string, Medium> &media,
+    const std::map<std::string, pbrt::Material> &namedMaterials,
+    const std::vector<pbrt::Material> &materials, int maxQueueSize)
+    : memoryResource(memoryResource), cudaStream(nullptr), scene(nullptr) {
+    hipCtx_t cudaContext;
+    CUDA_CHECK(hipCtxGetCurrent(&cudaContext));
+    CHECK(cudaContext != nullptr);
+
+#if defined(PBRT_IS_WINDOWS)
+    // On Windows, it is unfortunately necessary to disable
+    // multithreading here.  The issue is that GPU managed memory can
+    // only be accessed by one of the CPU or the GPU at a time; the
+    // program crashes if this is restriction is violated.  Thus, it's
+    // bad news if we are simultaneously, say, reading PLY files on
+    // the CPU and storing them in managed memory while an OptiX
+    // kernel is running on the GPU to build a BVH... (Issue #164).
+    if (Options->useGPU)
+        DisableThreadPool();
+#endif  // PBRT_IS_WINDOWS
+
+    ThreadLocal<hipStream_t> threadCUDAStreams([]() {
+        hipStream_t stream;
+        hipStreamCreate(&stream);
+        return stream;
+    });
+
+    paramsPool.resize(256);  // should be plenty
+    for (ParamBufferState &ps : paramsPool) {
+        void *ptr;
+        CUDA_CHECK(hipMalloc(&ptr, sizeof(RayIntersectParameters)));
+        ps.ptr = (hipDeviceptr_t)ptr;
+        CUDA_CHECK(hipEventCreate(&ps.finishedEvent));
+        CUDA_CHECK(hipHostMalloc(&ps.hostPtr, sizeof(RayIntersectParameters)));
+    }
+
+    // Create HIPRT context
+    LOG_VERBOSE("Starting HIPRT initialization");
+
+    int current_device;
+    CUDA_CHECK(hipGetDevice(&current_device));
+
+    hiprtContextCreationInput ctxInput;
+    ctxInput.ctxt = cudaContext;
+    ctxInput.device = current_device;
+    ctxInput.deviceType = hiprtDeviceAMD;
+    HIPRT_CHECK(hiprtCreateContext(HIPRT_API_VERSION, ctxInput, context));
+
+#ifdef NDEBUG
+    hiprtSetLogLevel(hiprtLogLevelError);
+#endif
+    LOG_VERBOSE("HIPRT version %d.%d.%x successfully initialized", HIPRT_MAJOR_VERSION,
+                HIPRT_MINOR_VERSION, HIPRT_PATCH_VERSION);
+
+    // HIPRT module
+    module = compileHiprtModule(context);
+
+    LOG_VERBOSE("Finished HIPRT initialization");
+
+    // Note: do not delete the pointers in threadBufferResources, since doing
+    // so would cause the memory they manage to be freed.
+    ThreadLocal<Allocator> threadAllocators([memoryResource]() {
+        pstd::pmr::monotonic_buffer_resource *resource =
+            new pstd::pmr::monotonic_buffer_resource(1024 * 1024, memoryResource);
+        return Allocator(resource);
+    });
+
+    ///////////////////////////////////////////////////////////////////////////
+    // Build top-level acceleration structures for non-instanced shapes
+    LOG_VERBOSE("Starting to create shapes and acceleration structures");
+    for (const auto &shape : scene.shapes)
+        if (shape.name != "sphere" && shape.name != "cylinder" && shape.name != "disk" &&
+            shape.name != "trianglemesh" && shape.name != "plymesh" &&
+            shape.name != "loopsubdiv" && shape.name != "bilinearmesh" &&
+            shape.name != "curve")
+            ErrorExit(&shape.loc, "%s: unknown shape", shape.name);
+
+    LOG_VERBOSE("Starting to read PLY meshes");
+    std::map<int, TriQuadMesh> plyMeshes =
+        PreparePLYMeshes(scene.shapes, textures.floatTextures);
+    LOG_VERBOSE("Finished reading PLY meshes");
+
+    LOG_VERBOSE("Starting to build geometries (BLAS)");
+    geomGroups.resize(scene.instanceDefinitions.size() + 1);
+
+    geomGroups[0].triGeomContainer = buildBVHForTriangles(
+        scene.shapes, plyMeshes, context, textures.floatTextures, namedMaterials,
+        materials, media, shapeIndexToAreaLights, threadAllocators, threadCUDAStreams);
+
+    geomGroups[0].blpGeomContainer = buildBVHForBLPs(
+        scene.shapes, context, textures.floatTextures, namedMaterials, materials, media,
+        shapeIndexToAreaLights, threadAllocators, threadCUDAStreams);
+
+    geomGroups[0].quadricGeomContainer = buildBVHForQuadrics(
+        scene.shapes, context, textures.floatTextures, namedMaterials, materials, media,
+        shapeIndexToAreaLights, threadAllocators, threadCUDAStreams);
+    LOG_VERBOSE("Finished building geometries (BLAS)");
+
+    ///////////////////////////////////////////////////////////////////////////
+    // Create instanced geometries
+    LOG_VERBOSE("Starting to build instanced geometries",
+                scene.instanceDefinitions.size());
+
+    InstanceContainer instanceContainer;
+    instanceContainer.appendRecords(geomGroups[0].triGeomContainer);
+    instanceContainer.appendRecords(geomGroups[0].blpGeomContainer);
+    instanceContainer.appendRecords(geomGroups[0].quadricGeomContainer);
+
+    std::vector<InternedString> allInstanceNames;
+    for (const auto &def : scene.instanceDefinitions)
+        allInstanceNames.push_back(def.first);
+
+    std::unordered_map<InternedString, GeometryGroup, InternedStringHash> instanceMap;
+    for (int i = 0; i < scene.instanceDefinitions.size(); ++i) {
+        InternedString name = allInstanceNames[i];
+        auto iter = scene.instanceDefinitions.find(name);
+        CHECK(iter != scene.instanceDefinitions.end());
+        const auto &def = *iter;
+
+        if (!def.second->animatedShapes.empty())
+            Warning("Ignoring %d animated shapes in instance \"%s\".",
+                    def.second->animatedShapes.size(), def.first);
+
+        std::map<int, TriQuadMesh> meshes =
+            PreparePLYMeshes(def.second->shapes, textures.floatTextures);
+
+        GeometryGroup &geomGroup = geomGroups[i + 1];
+
+        geomGroup.triGeomContainer = buildBVHForTriangles(
+            def.second->shapes, meshes, context, textures.floatTextures, namedMaterials,
+            materials, media, {}, threadAllocators, threadCUDAStreams);
+
+        geomGroup.blpGeomContainer = buildBVHForBLPs(
+            def.second->shapes, context, textures.floatTextures, namedMaterials,
+            materials, media, {}, threadAllocators, threadCUDAStreams);
+
+        geomGroup.quadricGeomContainer = buildBVHForQuadrics(
+            def.second->shapes, context, textures.floatTextures, namedMaterials,
+            materials, media, {}, threadAllocators, threadCUDAStreams);
+
+        instanceContainer.appendRecords(geomGroup.triGeomContainer);
+        instanceContainer.appendRecords(geomGroup.blpGeomContainer);
+        instanceContainer.appendRecords(geomGroup.quadricGeomContainer);
+
+        meshes.clear();
+        instanceMap[def.first] = geomGroup;
+    }
+    LOG_VERBOSE("Finished building instanced geometries");
+
+    ///////////////////////////////////////////////////////////////////////////
+    // Instancing
+    LOG_VERBOSE("Starting to build scene (TLAS)");
+    hiprtFrameMatrix identity{};
+    for (size_t i = 0; i < 3; ++i)
+        identity.matrix[i][i] = 1.0f;
+
+    instanceContainer.insertInstance(geomGroups[0].triGeomContainer, identity);
+    instanceContainer.insertInstance(geomGroups[0].blpGeomContainer, identity);
+    instanceContainer.insertInstance(geomGroups[0].quadricGeomContainer, identity);
+    size_t nNoInstancedGeoms = instanceContainer.instances.size();
+
+    for (size_t i = 0; i < scene.instances.size(); ++i) {
+        const auto &sceneInstance = scene.instances[i];
+        auto iter = instanceMap.find(sceneInstance.name);
+
+        if (iter != instanceMap.end()) {
+            SquareMatrix<4> O2WTransfrom = sceneInstance.renderFromInstance->GetMatrix();
+            SquareMatrix<4> W2OTransfrom = Inverse(O2WTransfrom).value();
+
+            hiprtFrameMatrix transform;
+            for (int j = 0; j < 3; ++j)
+                for (int k = 0; k < 4; ++k)
+                    transform.matrix[j][k] = O2WTransfrom[j][k];
+
+            GeometryGroup &geomGroup = iter->second;
+            instanceContainer.insertInstance(geomGroup.triGeomContainer, transform);
+            instanceContainer.insertInstance(geomGroup.blpGeomContainer, transform);
+            instanceContainer.insertInstance(geomGroup.quadricGeomContainer, transform);
+        }
+    }
+
+    this->scene =
+        buildBVHForInstances(instanceContainer.instances, instanceContainer.transforms,
+                             context, threadAllocators, threadCUDAStreams);
+    LOG_VERBOSE("Finished building scene (TLAS)");
+
+    ///////////////////////////////////////////////////////////////////////////
+    // Bounds
+    hiprtFloat3 aabbMin, aabbMax;
+    HIPRT_CHECK(hiprtExportSceneAabb(context, this->scene, aabbMin, aabbMax));
+    bounds.pMin = {aabbMin.x, aabbMin.y, aabbMin.z};
+    bounds.pMax = {aabbMax.x, aabbMax.y, aabbMax.z};
+
+    ///////////////////////////////////////////////////////////////////////////
+    // Copy hitgroup records and offsets
+    hipStream_t buildStream = threadCUDAStreams.Get();
+    CUDA_CHECK(hipMalloc(&hgRecords,
+                         instanceContainer.hgRecords.size() * sizeof(HitgroupRecord)));
+    CUDA_CHECK(hipMemcpyAsync((void *)hgRecords, instanceContainer.hgRecords.data(),
+                              sizeof(HitgroupRecord) * instanceContainer.hgRecords.size(),
+                              hipMemcpyHostToDevice, buildStream));
+    CUDA_CHECK(hipMalloc(&offsets, instanceContainer.offsets.size() * sizeof(uint32_t)));
+    CUDA_CHECK(hipMemcpyAsync((void *)offsets, instanceContainer.offsets.data(),
+                              sizeof(uint32_t) * instanceContainer.offsets.size(),
+                              hipMemcpyHostToDevice, buildStream));
+
+    ///////////////////////////////////////////////////////////////////////////
+    // Create stack buffers
+    constexpr uint32_t StackSize = 64;
+    hiprtGlobalStackBufferInput stackBufferInput{hiprtStackTypeGlobal,
+                                                 hiprtStackEntryTypeInteger, StackSize,
+                                                 (uint32_t)maxQueueSize};
+    HIPRT_CHECK(
+        hiprtCreateGlobalStackBuffer(context, stackBufferInput, globalStackBuffer));
+
+    constexpr uint32_t InstanceStackSize = 1;
+    hiprtGlobalStackBufferInput instanceStackBufferInput{
+        hiprtStackTypeGlobal, hiprtStackEntryTypeInstance, InstanceStackSize,
+        (uint32_t)maxQueueSize};
+    HIPRT_CHECK(hiprtCreateGlobalStackBuffer(context, instanceStackBufferInput,
+                                             globalInstanceStackBuffer));
+
+#if defined(PBRT_IS_WINDOWS)
+    if (Options->useGPU)
+        ReenableThreadPool();
+#endif  // PBRT_IS_WINDOWS
+}
+
+void HiprtAggregate::IntersectClosest(int maxRays, const RayQueue *rayQueue,
+                                      EscapedRayQueue *escapedRayQueue,
+                                      HitAreaLightQueue *hitAreaLightQueue,
+                                      MaterialEvalQueue *basicEvalMaterialQueue,
+                                      MaterialEvalQueue *universalEvalMaterialQueue,
+                                      MediumSampleQueue *mediumSampleQueue,
+                                      RayQueue *nextRayQueue) const {
+    std::pair<hipEvent_t, hipEvent_t> events =
+        GetProfilerEvents("Trace closest hit rays");
+
+    hipEventRecord(events.first);
+
+    if (scene) {
+        RayIntersectParameters params;
+        params.traversable = scene;
+        params.rayQueue = rayQueue;
+        params.nextRayQueue = nextRayQueue;
+        params.escapedRayQueue = escapedRayQueue;
+        params.hitAreaLightQueue = hitAreaLightQueue;
+        params.basicEvalMaterialQueue = basicEvalMaterialQueue;
+        params.universalEvalMaterialQueue = universalEvalMaterialQueue;
+        params.mediumSampleQueue = mediumSampleQueue;
+        params.globalStackBuffer = globalStackBuffer;
+        params.globalInstanceStackBuffer = globalInstanceStackBuffer;
+        params.funcTable = module.funcTable;
+        params.hgRecords = hgRecords;
+        params.offsets = offsets;
+
+        ParamBufferState &pbs = getParamBuffer(params);
+
+#ifndef NDEBUG
+        LOG_VERBOSE("Launching intersect closest");
+#endif
+
+        hipDeviceptr_t paramsPtr;
+        size_t paramsSize;
+        CUDA_CHECK(
+            hipModuleGetGlobal(&paramsPtr, &paramsSize, module.hipModule, "paramBuffer"));
+        CUDA_CHECK(hipMemcpyAsync((void *)paramsPtr, (const void *)pbs.ptr,
+                                  sizeof(params), hipMemcpyDeviceToDevice, cudaStream));
+
+        hiprtLaunch(module.closestFunction, maxRays, 1, BlockSize, 1, nullptr,
+                    cudaStream);
+        CUDA_CHECK(hipEventRecord(pbs.finishedEvent));
+
+#ifndef NDEBUG
+        CUDA_CHECK(hipDeviceSynchronize());
+        LOG_VERBOSE("Post-sync triangle intersect closest");
+#endif
+    }
+
+    hipEventRecord(events.second);
+};
+
+void HiprtAggregate::IntersectShadow(int maxRays, ShadowRayQueue *shadowRayQueue,
+                                     SOA<PixelSampleState> *pixelSampleState) const {
+    std::pair<hipEvent_t, hipEvent_t> events = GetProfilerEvents("Trace shadow rays");
+
+    hipEventRecord(events.first);
+
+    if (scene) {
+        RayIntersectParameters params;
+        params.traversable = scene;
+        params.shadowRayQueue = shadowRayQueue;
+        params.pixelSampleState = *pixelSampleState;
+        params.globalStackBuffer = globalStackBuffer;
+        params.globalInstanceStackBuffer = globalInstanceStackBuffer;
+        params.funcTable = module.funcTable;
+        params.hgRecords = hgRecords;
+        params.offsets = offsets;
+
+        ParamBufferState &pbs = getParamBuffer(params);
+
+#ifndef NDEBUG
+        LOG_VERBOSE("Launching intersect shadow");
+#endif
+
+        hipDeviceptr_t paramsPtr;
+        size_t paramsSize;
+        CUDA_CHECK(
+            hipModuleGetGlobal(&paramsPtr, &paramsSize, module.hipModule, "paramBuffer"));
+        CUDA_CHECK(hipMemcpyAsync((void *)paramsPtr, (const void *)pbs.ptr,
+                                  sizeof(params), hipMemcpyDeviceToDevice, cudaStream));
+
+        hiprtLaunch(module.shadowFunction, maxRays, 1, BlockSize, 1, nullptr, cudaStream);
+        CUDA_CHECK(hipEventRecord(pbs.finishedEvent));
+
+#ifndef NDEBUG
+        CUDA_CHECK(hipDeviceSynchronize());
+        LOG_VERBOSE("Post-sync intersect shadow");
+#endif
+    }
+
+    hipEventRecord(events.second);
+}
+
+void HiprtAggregate::IntersectShadowTr(int maxRays, ShadowRayQueue *shadowRayQueue,
+                                       SOA<PixelSampleState> *pixelSampleState) const {
+    std::pair<hipEvent_t, hipEvent_t> events =
+        GetProfilerEvents("Tracing shadow Tr rays");
+
+    hipEventRecord(events.first);
+
+    if (scene) {
+        RayIntersectParameters params;
+        params.traversable = scene;
+        params.shadowRayQueue = shadowRayQueue;
+        params.pixelSampleState = *pixelSampleState;
+        params.globalStackBuffer = globalStackBuffer;
+        params.globalInstanceStackBuffer = globalInstanceStackBuffer;
+        params.funcTable = module.funcTable;
+        params.hgRecords = hgRecords;
+        params.offsets = offsets;
+
+        ParamBufferState &pbs = getParamBuffer(params);
+
+#ifndef NDEBUG
+        LOG_VERBOSE("Launching intersect shadow Tr");
+#endif
+        hipDeviceptr_t paramsPtr;
+        size_t paramsSize;
+        CUDA_CHECK(
+            hipModuleGetGlobal(&paramsPtr, &paramsSize, module.hipModule, "paramBuffer"));
+        CUDA_CHECK(hipMemcpyAsync((void *)paramsPtr, (const void *)pbs.ptr,
+                                  sizeof(params), hipMemcpyDeviceToDevice, cudaStream));
+
+        hiprtLaunch(module.shadowTrFunction, maxRays, 1, BlockSize, 1, nullptr,
+                    cudaStream);
+        CUDA_CHECK(hipEventRecord(pbs.finishedEvent));
+
+#ifndef NDEBUG
+        CUDA_CHECK(hipDeviceSynchronize());
+        LOG_VERBOSE("Post-sync intersect shadow Tr");
+#endif
+    }
+
+    hipEventRecord(events.second);
+}
+
+void HiprtAggregate::IntersectOneRandom(
+    int maxRays, SubsurfaceScatterQueue *subsurfaceScatterQueue) const {
+    std::pair<hipEvent_t, hipEvent_t> events =
+        GetProfilerEvents("Tracing subsurface scattering probe rays");
+
+    hipEventRecord(events.first);
+
+    if (scene) {
+        RayIntersectParameters params;
+        params.traversable = scene;
+        params.subsurfaceScatterQueue = subsurfaceScatterQueue;
+        params.globalStackBuffer = globalStackBuffer;
+        params.globalInstanceStackBuffer = globalInstanceStackBuffer;
+        params.funcTable = module.funcTable;
+        params.hgRecords = hgRecords;
+        params.offsets = offsets;
+
+        ParamBufferState &pbs = getParamBuffer(params);
+
+#ifndef NDEBUG
+        LOG_VERBOSE("Launching intersect random");
+#endif
+        hipDeviceptr_t paramsPtr;
+        size_t paramsSize;
+        CUDA_CHECK(
+            hipModuleGetGlobal(&paramsPtr, &paramsSize, module.hipModule, "paramBuffer"));
+        CUDA_CHECK(hipMemcpyAsync((void *)paramsPtr, (const void *)pbs.ptr,
+                                  sizeof(params), hipMemcpyDeviceToDevice, cudaStream));
+
+        hiprtLaunch(module.oneRandomFunction, maxRays, 1, BlockSize, 1, nullptr,
+                    cudaStream);
+        CUDA_CHECK(hipEventRecord(pbs.finishedEvent));
+
+#ifndef NDEBUG
+        CUDA_CHECK(hipDeviceSynchronize());
+        LOG_VERBOSE("Post-sync intersect random");
+#endif
+    }
+
+    hipEventRecord(events.second);
+}
+
+std::map<int, TriQuadMesh> HiprtAggregate::PreparePLYMeshes(
+    const std::vector<ShapeSceneEntity> &shapes,
+    const std::map<std::string, FloatTexture> &floatTextures) {
+    std::map<int, TriQuadMesh> plyMeshes;
+    std::mutex mutex;
+    ParallelFor(0, shapes.size(), [&](int64_t i) {
+        const auto &shape = shapes[i];
+        if (shape.name != "plymesh")
+            return;
+
+        std::string filename =
+            ResolveFilename(shape.parameters.GetOneString("filename", ""));
+        if (filename.empty())
+            ErrorExit(&shape.loc, "plymesh: \"filename\" must be provided.");
+        TriQuadMesh plyMesh = TriQuadMesh::ReadPLY(filename);  // todo: alloc
+        if (!plyMesh.triIndices.empty() || !plyMesh.quadIndices.empty()) {
+            plyMesh.ConvertToOnlyTriangles();
+
+            Float edgeLength =
+                shape.parameters.GetOneFloat("displacement.edgelength", 1.f);
+            edgeLength *= Options->displacementEdgeScale;
+
+            std::string displacementTexName = shape.parameters.GetTexture("displacement");
+            if (!displacementTexName.empty()) {
+                auto iter = floatTextures.find(displacementTexName);
+                if (iter == floatTextures.end())
+                    ErrorExit(&shape.loc, "%s: no such texture defined.",
+                              displacementTexName);
+                FloatTexture displacement = iter->second;
+
+                LOG_VERBOSE("Starting to displace mesh \"%s\" with \"%s\"", filename,
+                            displacementTexName);
+
+                size_t origNumTris = plyMesh.triIndices.size() / 3;
+
+                plyMesh = plyMesh.Displace(
+                    [&](Point3f v0, Point3f v1) {
+                        v0 = (*shape.renderFromObject)(v0);
+                        v1 = (*shape.renderFromObject)(v1);
+                        return Distance(v0, v1);
+                    },
+                    edgeLength,
+                    [&](Point3f *pCPU, const Normal3f *nCPU, const Point2f *uvCPU,
+                        int nVertices) {
+                        Point3f *p;
+                        Normal3f *n;
+                        Point2f *uv;
+                        CUDA_CHECK(hipMallocManaged(&p, nVertices * sizeof(Point3f)));
+                        CUDA_CHECK(hipMallocManaged(&n, nVertices * sizeof(Normal3f)));
+                        CUDA_CHECK(hipMallocManaged(&uv, nVertices * sizeof(Point2f)));
+
+                        std::memcpy(p, pCPU, nVertices * sizeof(Point3f));
+                        std::memcpy(n, nCPU, nVertices * sizeof(Normal3f));
+                        std::memcpy(uv, uvCPU, nVertices * sizeof(Point2f));
+
+                        GPUParallelFor(
+                            "Evaluate Displacement", nVertices, [=] PBRT_GPU(int i) {
+                                TextureEvalContext ctx;
+                                ctx.p = p[i];
+                                ctx.uv = uv[i];
+                                Float d = UniversalTextureEvaluator()(displacement, ctx);
+                                p[i] += Vector3f(d * n[i]);
+                            });
+                        GPUWait();
+
+                        std::memcpy(pCPU, p, nVertices * sizeof(Point3f));
+
+                        CUDA_CHECK(hipFree(p));
+                        CUDA_CHECK(hipFree(n));
+                        CUDA_CHECK(hipFree(uv));
+                    },
+                    &shape.loc);
+
+                displacedTrisDelta += plyMesh.triIndices.size() / 3 - origNumTris;
+
+                LOG_VERBOSE("Finished displacing mesh \"%s\" with \"%s\" -> %d tris",
+                            filename, displacementTexName, plyMesh.triIndices.size() / 3);
+            }
+        }
+
+        std::lock_guard<std::mutex> lock(mutex);
+        plyMeshes[i] = std::move(plyMesh);
+    });
+
+    return plyMeshes;
+}
+
+HiprtAggregate::ParamBufferState &HiprtAggregate::getParamBuffer(
+    const RayIntersectParameters &params) const {
+    CHECK(nextParamOffset < paramsPool.size());
+
+    ParamBufferState &pbs = paramsPool[nextParamOffset];
+    if (++nextParamOffset == paramsPool.size())
+        nextParamOffset = 0;
+    if (!pbs.used)
+        pbs.used = true;
+    else
+        CUDA_CHECK(hipEventSynchronize(pbs.finishedEvent));
+
+    // Copy to host-side pinned memory
+    memcpy(pbs.hostPtr, &params, sizeof(params));
+    CUDA_CHECK(hipMemcpyAsync((void *)pbs.ptr, pbs.hostPtr, sizeof(params),
+                              hipMemcpyHostToDevice, cudaStream));
+
+    return pbs;
+}
+
+BilinearPatchMesh *HiprtAggregate::diceCurveToBLP(const ShapeSceneEntity &shape,
+                                                  int nDiceU, int nDiceV,
+                                                  Allocator alloc) {
+    CHECK_EQ(shape.name, "curve");
+    const ParameterDictionary &parameters = shape.parameters;
+    const FileLoc *loc = &shape.loc;
+
+    ++nCurves;
+
+    // Extract parameters; the following ~90 lines of code are,
+    // unfortunately, copied from Curve::Create.  We would like to avoid
+    // the overhead of splitting the curve and creating Curve objects, so
+    // here we go..
+    Float width = parameters.GetOneFloat("width", 1.f);
+    Float width0 = parameters.GetOneFloat("width0", width);
+    Float width1 = parameters.GetOneFloat("width1", width);
+
+    int degree = parameters.GetOneInt("degree", 3);
+    if (degree != 2 && degree != 3) {
+        Error(loc, "Invalid degree %d: only degree 2 and 3 curves are supported.",
+              degree);
+        return {};
+    }
+
+    std::string basis = parameters.GetOneString("basis", "bezier");
+    if (basis != "bezier" && basis != "bspline") {
+        Error(loc,
+              "Invalid basis \"%s\": only \"bezier\" and \"bspline\" are "
+              "supported.",
+              basis);
+        return {};
+    }
+
+    int nSegments;
+    std::vector<Point3f> cp = parameters.GetPoint3fArray("P");
+    bool bezierBasis = (basis == "bezier");
+    if (bezierBasis) {
+        // After the first segment, which uses degree+1 control points,
+        // subsequent segments reuse the last control point of the previous
+        // one and then use degree more control points.
+        if (((cp.size() - 1 - degree) % degree) != 0) {
+            Error(loc,
+                  "Invalid number of control points %d: for the degree %d "
+                  "Bezier basis %d + n * %d are required, for n >= 0.",
+                  (int)cp.size(), degree, degree + 1, degree);
+            return {};
+        }
+        nSegments = (cp.size() - 1) / degree;
+    } else {
+        if (cp.size() < degree + 1) {
+            Error(loc,
+                  "Invalid number of control points %d: for the degree %d "
+                  "b-spline basis, must have >= %d.",
+                  int(cp.size()), degree, degree + 1);
+            return {};
+        }
+        nSegments = cp.size() - degree;
+    }
+
+    CurveType type;
+    std::string curveType = parameters.GetOneString("type", "flat");
+    if (curveType == "flat")
+        type = CurveType::Flat;
+    else if (curveType == "ribbon")
+        type = CurveType::Ribbon;
+    else if (curveType == "cylinder")
+        type = CurveType::Cylinder;
+    else {
+        Error(loc, R"(Unknown curve type "%s".  Using "cylinder".)", curveType);
+        type = CurveType::Cylinder;
+    }
+
+    std::vector<Normal3f> n = parameters.GetNormal3fArray("N");
+    if (!n.empty()) {
+        if (type != CurveType::Ribbon) {
+            Warning("Curve normals are only used with \"ribbon\" type curves.");
+            n = {};
+        } else if (n.size() != nSegments + 1) {
+            Error(loc,
+                  "Invalid number of normals %d: must provide %d normals for "
+                  "ribbon curves with %d segments.",
+                  int(n.size()), nSegments + 1, nSegments);
+            return {};
+        }
+        for (Normal3f &nn : n)
+            Normalize(nn);
+    } else if (type == CurveType::Ribbon) {
+        Error(loc, "Must provide normals \"N\" at curve endpoints with ribbon "
+                   "curves.");
+        return {};
+    }
+
+    // Start dicing...
+    std::vector<int> blpIndices;
+    std::vector<Point3f> blpP;
+    std::vector<Normal3f> blpN;
+    std::vector<Point2f> blpUV;
+
+    int lastCPOffset = -1;
+    pstd::array<Point3f, 4> segCpBezier;
+
+    for (int i = 0; i <= nDiceU; ++i) {
+        Float u = Float(i) / Float(nDiceU);
+        Float width = Lerp(u, width0, width1);
+
+        int segmentIndex = int(u * nSegments);
+        if (segmentIndex == nSegments)  // u == 1...
+            --segmentIndex;
+
+        // Compute offset into original control points for current u
+        int cpOffset;
+        if (bezierBasis)
+            cpOffset = segmentIndex * degree;
+        else
+            // Uniform b-spline.
+            cpOffset = segmentIndex;
+
+        if (cpOffset != lastCPOffset) {
+            // update segCpBezier
+            if (bezierBasis) {
+                if (degree == 2) {
+                    // Elevate to degree 3.
+                    segCpBezier = ElevateQuadraticBezierToCubic(
+                        pstd::MakeConstSpan(cp).subspan(cpOffset, 3));
+                } else {
+                    // All set.
+                    for (int i = 0; i < 4; ++i)
+                        segCpBezier[i] = cp[cpOffset + i];
+                }
+            } else {
+                // Uniform b-spline.
+                if (degree == 2) {
+                    pstd::array<Point3f, 3> bezCp = QuadraticBSplineToBezier(
+                        pstd::MakeConstSpan(cp).subspan(cpOffset, 3));
+                    segCpBezier =
+                        ElevateQuadraticBezierToCubic(pstd::MakeConstSpan(bezCp));
+                } else {
+                    segCpBezier = CubicBSplineToBezier(
+                        pstd::MakeConstSpan(cp).subspan(cpOffset, 4));
+                }
+            }
+            lastCPOffset = cpOffset;
+        }
+
+        Float uSeg = (u * nSegments) - segmentIndex;
+        DCHECK(uSeg >= 0 && uSeg <= 1);
+
+        Vector3f dpdu;
+        Point3f p = EvaluateCubicBezier(segCpBezier, uSeg, &dpdu);
+
+        switch (type) {
+        case CurveType::Ribbon: {
+            Float normalAngle = AngleBetween(n[segmentIndex], n[segmentIndex + 1]);
+            Float invSinNormalAngle = 1 / std::sin(normalAngle);
+
+            Normal3f nu;
+            if (normalAngle == 0)
+                nu = n[segmentIndex];
+            else {
+                Float sin0 = std::sin((1 - uSeg) * normalAngle) * invSinNormalAngle;
+                Float sin1 = std::sin(uSeg * normalAngle) * invSinNormalAngle;
+                nu = sin0 * n[segmentIndex] + sin1 * n[segmentIndex + 1];
+            }
+            Vector3f dpdv = Normalize(Cross(nu, dpdu)) * width;
+
+            blpP.push_back(p - dpdv / 2);
+            blpP.push_back(p + dpdv / 2);
+            blpUV.push_back(Point2f(u, 0));
+            blpUV.push_back(Point2f(u, 1));
+
+            if (i > 0) {
+                blpIndices.push_back(2 * (i - 1));
+                blpIndices.push_back(2 * (i - 1) + 1);
+                blpIndices.push_back(2 * i);
+                blpIndices.push_back(2 * i + 1);
+            }
+            break;
+        }
+        case CurveType::Flat:
+        case CurveType::Cylinder: {
+            Vector3f ortho[2];
+            CoordinateSystem(Normalize(dpdu), &ortho[0], &ortho[1]);
+            ortho[0] *= width / 2;
+            ortho[1] *= width / 2;
+
+            // Repeat the first/last vertex so we can assign different
+            // texture coordinates...
+            for (int v = 0; v <= nDiceV; ++v) {
+                Float angle = Float(v) / nDiceV * 2 * Pi;
+                blpP.push_back(p + ortho[0] * std::cos(angle) +
+                               ortho[1] * std::sin(angle));
+                blpN.push_back(Normal3f(Normalize(blpP.back() - p)));
+                blpUV.push_back(Point2f(u, Float(v) / nDiceV));
+            }
+
+            if (i > 0) {
+                for (int v = 0; v < nDiceV; ++v) {
+                    // Indexing is funny due to doubled-up last vertex
+                    blpIndices.push_back((nDiceV + 1) * (i - 1) + v);
+                    blpIndices.push_back((nDiceV + 1) * (i - 1) + v + 1);
+                    blpIndices.push_back((nDiceV + 1) * i + v);
+                    blpIndices.push_back((nDiceV + 1) * i + v + 1);
+                }
+            }
+            break;
+        }
+        }
+    }
+
+    nBLPsForCurves += blpIndices.size() / 4;
+
+    return alloc.new_object<BilinearPatchMesh>(
+        *shape.renderFromObject, shape.reverseOrientation, blpIndices, blpP, blpN, blpUV,
+        std::vector<int>(), nullptr, alloc);
+}
+
+HiprtAggregate::GeometryContainer HiprtAggregate::buildBVHForTriangles(
+    const std::vector<ShapeSceneEntity> &shapes,
+    const std::map<int, TriQuadMesh> &plyMeshes, hiprtContext context,
+    const std::map<std::string, FloatTexture> &floatTextures,
+    const std::map<std::string, Material> &namedMaterials,
+    const std::vector<Material> &materials, const std::map<std::string, Medium> &media,
+    const std::map<int, pstd::vector<Light> *> &shapeIndexToAreaLights,
+    ThreadLocal<Allocator> &threadAllocators,
+    ThreadLocal<hipStream_t> &threadCUDAStreams) {
+    // Count how many of the shapes are triangle meshes
+    std::vector<size_t> meshIndexToShapeIndex;
+    for (size_t i = 0; i < shapes.size(); ++i) {
+        const auto &shape = shapes[i];
+        if (shape.name == "trianglemesh" || shape.name == "plymesh" ||
+            shape.name == "loopsubdiv")
+            meshIndexToShapeIndex.push_back(i);
+    }
+
+    size_t nMeshes = meshIndexToShapeIndex.size();
+    if (nMeshes == 0)
+        return {};
+
+    LOG_VERBOSE("Building triangle BLAS");
+
+    std::vector<TriangleMesh *> meshes(nMeshes, nullptr);
+    ParallelFor(0, nMeshes, [&](int64_t meshIndex) {
+        Allocator alloc = threadAllocators.Get();
+        size_t shapeIndex = meshIndexToShapeIndex[meshIndex];
+        const auto &shape = shapes[shapeIndex];
+
+        TriangleMesh *mesh = nullptr;
+        if (shape.name == "trianglemesh") {
+            mesh = Triangle::CreateMesh(shape.renderFromObject, shape.reverseOrientation,
+                                        shape.parameters, &shape.loc, alloc);
+            CHECK(mesh != nullptr);
+        } else if (shape.name == "loopsubdiv") {
+            // Copied from pbrt/shapes.cpp... :-p
+            int nLevels = shape.parameters.GetOneInt("levels", 3);
+            std::vector<int> vertexIndices = shape.parameters.GetIntArray("indices");
+            if (vertexIndices.empty())
+                ErrorExit(&shape.loc, "Vertex indices \"indices\" not "
+                                      "provided for LoopSubdiv shape.");
+
+            std::vector<Point3f> P = shape.parameters.GetPoint3fArray("P");
+            if (P.empty())
+                ErrorExit(&shape.loc, "Vertex positions \"P\" not provided "
+                                      "for LoopSubdiv shape.");
+
+            // don't actually use this for now...
+            std::string scheme = shape.parameters.GetOneString("scheme", "loop");
+
+            mesh = LoopSubdivide(shape.renderFromObject, shape.reverseOrientation,
+                                 nLevels, vertexIndices, P, alloc);
+            CHECK(mesh != nullptr);
+        } else if (shape.name == "plymesh") {
+            auto plyIter = plyMeshes.find(shapeIndex);
+            CHECK(plyIter != plyMeshes.end());
+            const TriQuadMesh &plyMesh = plyIter->second;
+
+            if (!plyMesh.quadIndices.empty() && shape.lightIndex != -1) {
+#if 0
+                // If you'd like to know what they are...
+                for (int i = 0; i < plyMesh.quadIndices.size(); ++i)
+                    Printf("%s\n", plyMesh.p[plyMesh.quadIndices[i]]);
+#endif
+                // This would be nice to fix, but it involves some
+                // plumbing and it's a rare case. The underlying issue
+                // is that when we create AreaLights for emissive
+                // shapes earlier, we're not expecting this..
+                std::string filename =
+                    ResolveFilename(shape.parameters.GetOneString("filename", ""));
+                ErrorExit(&shape.loc,
+                          "%s: PLY file being used as an area light has quads--"
+                          "this is currently unsupported. Please replace them with "
+                          "\"bilinearmesh\" "
+                          "shapes as a workaround. (Sorry!).",
+                          filename);
+            }
+
+            mesh = alloc.new_object<TriangleMesh>(
+                *shape.renderFromObject, shape.reverseOrientation, plyMesh.triIndices,
+                plyMesh.p, std::vector<Vector3f>(), plyMesh.n, plyMesh.uv,
+                plyMesh.faceIndices, alloc);
+        } else
+            LOG_FATAL("Logic error in GPUAggregate::buildBVHForTriangles()");
+
+        meshes[meshIndex] = mesh;
+    });
+
+    GeometryContainer geomContainer(nMeshes);
+
+    ParallelFor(0, nMeshes, [&](int64_t startIndex, int64_t endIndex) {
+        Allocator alloc = threadAllocators.Get();
+
+        for (int meshIndex = startIndex; meshIndex < endIndex; ++meshIndex) {
+            TriangleMesh *mesh = meshes[meshIndex];
+            size_t shapeIndex = meshIndexToShapeIndex[meshIndex];
+            const auto &shape = shapes[shapeIndex];
+            FloatTexture alphaTexture = getAlphaTexture(shape, floatTextures, alloc);
+            Material material = getMaterial(shape, namedMaterials, materials);
+
+            // Do this here, after the alpha texture has been consumed.
+            shape.parameters.ReportUnused();
+
+            HitgroupRecord &hgRecord = geomContainer.hgRecords[meshIndex];
+            hgRecord.type = HitgroupRecord::TriangleMesh;
+            hgRecord.triRec.mesh = mesh;
+            hgRecord.triRec.material = material;
+            hgRecord.triRec.alphaTexture = alphaTexture;
+            hgRecord.triRec.areaLights = {};
+            if (shape.lightIndex != -1) {
+                if (!material)
+                    Warning(&shape.loc, "Ignoring area light specification for shape "
+                                        "with \"interface\" material.");
+                else {
+                    // Note: this will hit if we try to have an instance as an area
+                    // light.
+                    auto iter = shapeIndexToAreaLights.find(shapeIndex);
+                    CHECK(iter != shapeIndexToAreaLights.end());
+                    CHECK_EQ(iter->second->size(), mesh->nTriangles);
+                    hgRecord.triRec.areaLights = pstd::MakeSpan(*iter->second);
+                }
+            }
+            hgRecord.triRec.mediumInterface = getMediumInterface(shape, media, alloc);
+        }
+    });
+
+    hiprtBuildOptions options;
+    options.buildFlags = hiprtBuildFlagBitPreferBalancedBuild;
+    options.batchBuildMaxPrimCount = 512u;
+
+    size_t nTris = 0;
+    size_t nVerts = 0;
+    std::vector<size_t> triOffsets;
+    std::vector<size_t> vertsOffsets;
+    for (size_t meshIndex = 0; meshIndex < nMeshes; ++meshIndex) {
+        TriangleMesh *mesh = meshes[meshIndex];
+        triOffsets.push_back(nTris);
+        vertsOffsets.push_back(nVerts);
+        nTris += mesh->nTriangles;
+        nVerts += mesh->nVertices;
+    }
+
+    hiprtInt3 *tris;
+    CUDA_CHECK(hipMalloc(&tris, sizeof(hiprtInt3) * nTris));
+
+    hiprtFloat3 *verts;
+    CUDA_CHECK(hipMalloc(&verts, sizeof(hiprtFloat3) * nVerts));
+
+    std::vector<hiprtInt3> triBuffer(nTris);
+    std::vector<hiprtFloat3> vertBuffer(nVerts);
+
+    std::vector<hiprtGeometryBuildInput> geomInputs(nMeshes);
+    std::vector<hiprtGeometry> geoms(nMeshes);
+    std::vector<hiprtGeometry *> geomAddrs(nMeshes);
+    ParallelFor(0, nMeshes, [&](int64_t meshIndex) {
+        TriangleMesh *mesh = meshes[meshIndex];
+        size_t triOffset = triOffsets[meshIndex];
+        size_t vertOffset = vertsOffsets[meshIndex];
+
+        hiprtTriangleMeshPrimitive prim;
+        prim.triangleCount = mesh->nTriangles;
+        prim.triangleStride = sizeof(hiprtInt3);
+        prim.triangleIndices = tris + triOffset;
+        prim.vertexCount = mesh->nVertices;
+        prim.vertexStride = sizeof(hiprtFloat3);
+        prim.vertices = verts + vertOffset;
+
+        hiprtGeometryBuildInput &geomInput = geomInputs[meshIndex];
+        geomInput.type = hiprtPrimitiveTypeTriangleMesh;
+        geomInput.primitive.triangleMesh = prim;
+        geomInput.geomType = 0;
+
+        geomAddrs[meshIndex] = &geoms[meshIndex];
+
+        std::memcpy(&triBuffer[triOffset], mesh->vertexIndices,
+                    sizeof(hiprtInt3) * mesh->nTriangles);
+        std::memcpy(&vertBuffer[vertOffset], mesh->p,
+                    sizeof(hiprtFloat3) * mesh->nVertices);
+    });
+
+    hipStream_t buildStream = threadCUDAStreams.Get();
+    CUDA_CHECK(hipMemcpyAsync(tris, triBuffer.data(), sizeof(hiprtInt3) * nTris,
+                              hipMemcpyHostToDevice, buildStream));
+    CUDA_CHECK(hipMemcpyAsync(verts, vertBuffer.data(), sizeof(hiprtFloat3) * nVerts,
+                              hipMemcpyHostToDevice, buildStream));
+
+    size_t geomTempSize;
+    HIPRT_CHECK(hiprtGetGeometriesBuildTemporaryBufferSize(
+        context, nMeshes, geomInputs.data(), options, geomTempSize));
+
+    hiprtDevicePtr tempBuffer;
+    CUDA_CHECK(hipMalloc(&tempBuffer, geomTempSize));
+
+    HIPRT_CHECK(hiprtCreateGeometries(context, nMeshes, geomInputs.data(), options,
+                                      geomAddrs.data()));
+
+    LOG_VERBOSE("Starting to build triangle mesh geometries");
+    HIPRT_CHECK(hiprtBuildGeometries(context, hiprtBuildOperationBuild, nMeshes,
+                                     geomInputs.data(), options, tempBuffer, buildStream,
+                                     geoms.data()));
+    LOG_VERBOSE("Finished building triangle mesh geometries");
+
+    CUDA_CHECK(hipFree(tris));
+    CUDA_CHECK(hipFree(verts));
+    CUDA_CHECK(hipFree(tempBuffer));
+
+    hiprtFrameMatrix identity{};
+    for (size_t i = 0; i < 3; ++i)
+        identity.matrix[i][i] = 1.0f;
+
+    std::vector<hiprtInstance> instances(nMeshes);
+    std::vector<hiprtFrameMatrix> transforms(nMeshes);
+    ParallelFor(0, nMeshes, [&](int64_t meshIndex) {
+        instances[meshIndex].type = hiprtInstanceTypeGeometry;
+        instances[meshIndex].geometry = geoms[meshIndex];
+        transforms[meshIndex] = identity;
+    });
+
+    geomContainer.scene = buildBVHForInstances(instances, transforms, context,
+                                               threadAllocators, threadCUDAStreams);
+
+    return geomContainer;
+}
+
+HiprtAggregate::GeometryContainer HiprtAggregate::buildBVHForBLPs(
+    const std::vector<ShapeSceneEntity> &shapes, hiprtContext context,
+    const std::map<std::string, FloatTexture> &floatTextures,
+    const std::map<std::string, Material> &namedMaterials,
+    const std::vector<Material> &materials, const std::map<std::string, Medium> &media,
+    const std::map<int, pstd::vector<Light> *> &shapeIndexToAreaLights,
+    ThreadLocal<Allocator> &threadAllocators,
+    ThreadLocal<hipStream_t> &threadCUDAStreams) {
+    // Count how many BLP meshes there are in shapes
+    std::vector<size_t> meshIndexToShapeIndex;
+    for (size_t i = 0; i < shapes.size(); ++i) {
+        const auto &shape = shapes[i];
+        if (shape.name == "bilinearmesh" || shape.name == "curve")
+            meshIndexToShapeIndex.push_back(i);
+    }
+
+    size_t nMeshes = meshIndexToShapeIndex.size();
+    if (nMeshes == 0)
+        return {};
+
+    LOG_VERBOSE("Building bilinear patch BLAS");
+
+    // Create meshes
+    std::vector<BilinearPatchMesh *> meshes(nMeshes, nullptr);
+    ParallelFor(0, nMeshes, [&](int64_t meshIndex) {
+        Allocator alloc = threadAllocators.Get();
+        size_t shapeIndex = meshIndexToShapeIndex[meshIndex];
+        const auto &shape = shapes[shapeIndex];
+
+        if (shape.name == "bilinearmesh") {
+            BilinearPatchMesh *mesh = BilinearPatch::CreateMesh(
+                shape.renderFromObject, shape.reverseOrientation, shape.parameters,
+                &shape.loc, alloc);
+            meshes[meshIndex] = mesh;
+        } else if (shape.name == "curve") {
+            BilinearPatchMesh *curveMesh =
+                diceCurveToBLP(shape, 5 /* nseg */, 5 /* nvert */, alloc);
+            if (curveMesh) {
+                meshes[meshIndex] = curveMesh;
+            }
+        }
+    });
+
+    GeometryContainer geomContainer(nMeshes);
+
+    ParallelFor(0, nMeshes, [&](int64_t startIndex, int64_t endIndex) {
+        Allocator alloc = threadAllocators.Get();
+
+        for (int meshIndex = startIndex; meshIndex < endIndex; ++meshIndex) {
+            BilinearPatchMesh *mesh = meshes[meshIndex];
+            size_t shapeIndex = meshIndexToShapeIndex[meshIndex];
+            const auto &shape = shapes[shapeIndex];
+            Material material = getMaterial(shape, namedMaterials, materials);
+            FloatTexture alphaTexture = getAlphaTexture(shape, floatTextures, alloc);
+
+            // After "alpha" has been consumed...
+            shape.parameters.ReportUnused();
+
+            HitgroupRecord &hgRecord = geomContainer.hgRecords[meshIndex];
+            hgRecord.type = HitgroupRecord::BilinearMesh;
+            hgRecord.blpRec.mesh = mesh;
+            hgRecord.blpRec.material = material;
+            hgRecord.blpRec.alphaTexture = alphaTexture;
+            hgRecord.blpRec.areaLights = {};
+            if (shape.lightIndex != -1) {
+                if (!material)
+                    Warning(&shape.loc,
+                            "Ignoring area light specification for shape with "
+                            "\"interface\" material.");
+                else {
+                    auto iter = shapeIndexToAreaLights.find(shapeIndex);
+                    // Note: this will hit if we try to have an instance as an area
+                    // light.
+                    CHECK(iter != shapeIndexToAreaLights.end());
+                    CHECK_EQ(iter->second->size(), mesh->nPatches);
+                    hgRecord.blpRec.areaLights = pstd::MakeSpan(*iter->second);
+                }
+            }
+            hgRecord.blpRec.mediumInterface = getMediumInterface(shape, media, alloc);
+        }
+    });
+
+    hiprtBuildOptions options;
+    options.buildFlags = hiprtBuildFlagBitPreferBalancedBuild;
+    options.batchBuildMaxPrimCount = 512u;
+
+    size_t nAabbs = 0;
+    std::vector<size_t> aabbOffsets;
+    for (size_t meshIndex = 0; meshIndex < nMeshes; ++meshIndex) {
+        BilinearPatchMesh *mesh = meshes[meshIndex];
+        aabbOffsets.push_back(nAabbs);
+        nAabbs += mesh->nPatches;
+    }
+
+    hiprtFloat4 *aabbs;
+    CUDA_CHECK(hipMalloc(&aabbs, sizeof(hiprtFloat4) * 2 * nAabbs));
+
+    std::vector<hiprtFloat4> aabbBuffer(2 * nAabbs);
+
+    std::vector<hiprtGeometryBuildInput> geomInputs(nMeshes);
+    std::vector<hiprtGeometry> geoms(nMeshes);
+    std::vector<hiprtGeometry *> geomAddrs(nMeshes);
+    ParallelFor(0, nMeshes, [&](int64_t meshIndex) {
+        BilinearPatchMesh *mesh = meshes[meshIndex];
+        size_t aabbIndex = aabbOffsets[meshIndex];
+
+        hiprtAABBListPrimitive prim;
+        prim.aabbCount = mesh->nPatches;
+        prim.aabbStride = 2 * sizeof(hiprtFloat4);
+        prim.aabbs = aabbs + 2 * aabbIndex;
+
+        hiprtGeometryBuildInput &geomInput = geomInputs[meshIndex];
+        geomInput.type = hiprtPrimitiveTypeAABBList;
+        geomInput.primitive.aabbList = prim;
+        geomInput.geomType = 1;
+
+        geomAddrs[meshIndex] = &geoms[meshIndex];
+
+        for (int patchIndex = 0; patchIndex < mesh->nPatches; ++patchIndex) {
+            Bounds3f patchBounds;
+            for (int i = 0; i < 4; ++i)
+                patchBounds =
+                    Union(patchBounds, mesh->p[mesh->vertexIndices[4 * patchIndex + i]]);
+            hiprtFloat4 aabbMin =
+                hiprtFloat4{float(patchBounds.pMin.x), float(patchBounds.pMin.y),
+                            float(patchBounds.pMin.z), 0.0f};
+            hiprtFloat4 aabbMax =
+                hiprtFloat4{float(patchBounds.pMax.x), float(patchBounds.pMax.y),
+                            float(patchBounds.pMax.z), 0.0f};
+            aabbBuffer[2 * aabbIndex + 0] = aabbMin;
+            aabbBuffer[2 * aabbIndex + 1] = aabbMax;
+            ++aabbIndex;
+        }
+    });
+
+    hipStream_t buildStream = threadCUDAStreams.Get();
+    CUDA_CHECK(hipMemcpyAsync(aabbs, aabbBuffer.data(), sizeof(hiprtFloat4) * 2 * nAabbs,
+                              hipMemcpyHostToDevice, buildStream));
+
+    size_t geomTempSize;
+    HIPRT_CHECK(hiprtGetGeometriesBuildTemporaryBufferSize(
+        context, nMeshes, geomInputs.data(), options, geomTempSize));
+
+    hiprtDevicePtr tempBuffer;
+    CUDA_CHECK(hipMalloc(&tempBuffer, geomTempSize));
+
+    HIPRT_CHECK(hiprtCreateGeometries(context, nMeshes, geomInputs.data(), options,
+                                      geomAddrs.data()));
+
+    LOG_VERBOSE("Starting to build bilinear patch mesh geometries");
+    HIPRT_CHECK(hiprtBuildGeometries(context, hiprtBuildOperationBuild, nMeshes,
+                                     geomInputs.data(), options, tempBuffer, buildStream,
+                                     geoms.data()));
+    LOG_VERBOSE("Finished building bilinear patch mesh geometries");
+
+    CUDA_CHECK(hipFree(aabbs));
+    CUDA_CHECK(hipFree(tempBuffer));
+
+    hiprtFrameMatrix identity{};
+    for (size_t i = 0; i < 3; ++i)
+        identity.matrix[i][i] = 1.0f;
+
+    std::vector<hiprtInstance> instances(nMeshes);
+    std::vector<hiprtFrameMatrix> transforms(nMeshes);
+    ParallelFor(0, nMeshes, [&](int64_t meshIndex) {
+        instances[meshIndex].type = hiprtInstanceTypeGeometry;
+        instances[meshIndex].geometry = geoms[meshIndex];
+        transforms[meshIndex] = identity;
+    });
+
+    geomContainer.scene = buildBVHForInstances(instances, transforms, context,
+                                               threadAllocators, threadCUDAStreams);
+
+    return geomContainer;
+}
+
+HiprtAggregate::GeometryContainer HiprtAggregate::buildBVHForQuadrics(
+    const std::vector<ShapeSceneEntity> &shapes, hiprtContext context,
+    const std::map<std::string, FloatTexture> &floatTextures,
+    const std::map<std::string, Material> &namedMaterials,
+    const std::vector<Material> &materials, const std::map<std::string, Medium> &media,
+    const std::map<int, pstd::vector<Light> *> &shapeIndexToAreaLights,
+    ThreadLocal<Allocator> &threadAllocators,
+    ThreadLocal<hipStream_t> &threadCUDAStreams) {
+    int nQuadrics = 0;
+    for (size_t shapeIndex = 0; shapeIndex < shapes.size(); ++shapeIndex) {
+        const auto &s = shapes[shapeIndex];
+        if (s.name == "sphere" || s.name == "cylinder" || s.name == "disk")
+            ++nQuadrics;
+    }
+
+    if (nQuadrics == 0)
+        return {};
+
+    LOG_VERBOSE("Building quadric BLAS");
+
+    hiprtBuildOptions options;
+    options.buildFlags = hiprtBuildFlagBitPreferBalancedBuild;
+    options.batchBuildMaxPrimCount = 512u;
+
+    Allocator alloc = threadAllocators.Get();
+    hiprtFloat4 *aabbs;
+    CUDA_CHECK(hipMalloc(&aabbs, sizeof(hiprtFloat4) * 2 * nQuadrics));
+
+    std::vector<hiprtFloat4> aabbBuffer(2 * nQuadrics);
+
+    int quadricIndex = 0;
+    GeometryContainer geomContainer(nQuadrics);
+    for (size_t shapeIndex = 0; shapeIndex < shapes.size(); ++shapeIndex) {
+        const auto &s = shapes[shapeIndex];
+        if (s.name != "sphere" && s.name != "cylinder" && s.name != "disk")
+            continue;
+
+        pstd::vector<Shape> shapes = Shape::Create(
+            s.name, s.renderFromObject, s.objectFromRender, s.reverseOrientation,
+            s.parameters, floatTextures, &s.loc, alloc);
+        if (shapes.empty())
+            continue;
+        CHECK_EQ(1, shapes.size());
+        Shape shape = shapes[0];
+
+        Bounds3f shapeBounds = shape.Bounds();
+        hiprtFloat4 aabbMin =
+            hiprtFloat4{float(shapeBounds.pMin.x), float(shapeBounds.pMin.y),
+                        float(shapeBounds.pMin.z), 0.0f};
+        hiprtFloat4 aabbMax =
+            hiprtFloat4{float(shapeBounds.pMax.x), float(shapeBounds.pMax.y),
+                        float(shapeBounds.pMax.z), 0.0f};
+        aabbBuffer[2 * quadricIndex + 0] = aabbMin;
+        aabbBuffer[2 * quadricIndex + 1] = aabbMax;
+
+        // Find alpha texture, if present.
+        Material material = getMaterial(s, namedMaterials, materials);
+        FloatTexture alphaTexture = getAlphaTexture(s, floatTextures, alloc);
+
+        // Once again, after any alpha texture is created...
+        s.parameters.ReportUnused();
+
+        HitgroupRecord &hgRecord = geomContainer.hgRecords[quadricIndex];
+        hgRecord.type = HitgroupRecord::Quadric;
+        hgRecord.quadricRec.shape = shape;
+        hgRecord.quadricRec.material = material;
+        hgRecord.quadricRec.alphaTexture = alphaTexture;
+        hgRecord.quadricRec.areaLight = nullptr;
+        if (s.lightIndex != -1) {
+            if (!material)
+                Warning(&s.loc, "Ignoring area light specification for shape with "
+                                "\"interface\" material.");
+            else {
+                auto iter = shapeIndexToAreaLights.find(shapeIndex);
+                // Note: this will hit if we try to have an instance as an area
+                // light.
+                CHECK(iter != shapeIndexToAreaLights.end());
+                CHECK_EQ(iter->second->size(), 1);
+                hgRecord.quadricRec.areaLight = (*iter->second)[0];
+            }
+        }
+        hgRecord.quadricRec.mediumInterface = getMediumInterface(s, media, alloc);
+
+        ++quadricIndex;
+    }
+    nQuadrics = quadricIndex;
+    geomContainer.resize(nQuadrics);
+
+    std::vector<hiprtGeometryBuildInput> geomInputs(nQuadrics);
+    std::vector<hiprtGeometry> geoms(nQuadrics);
+    std::vector<hiprtGeometry *> geomAddrs(nQuadrics);
+    ParallelFor(0, nQuadrics, [&](int64_t quadricIndex) {
+        hiprtAABBListPrimitive prim;
+        prim.aabbCount = 1;
+        prim.aabbStride = 2 * sizeof(hiprtFloat4);
+        prim.aabbs = aabbs + 2 * quadricIndex;
+
+        hiprtGeometryBuildInput &geomInput = geomInputs[quadricIndex];
+        geomInput.type = hiprtPrimitiveTypeAABBList;
+        geomInput.primitive.aabbList = prim;
+        geomInput.geomType = 2;
+
+        geomAddrs[quadricIndex] = &geoms[quadricIndex];
+    });
+
+    hipStream_t buildStream = threadCUDAStreams.Get();
+    CUDA_CHECK(hipMemcpyAsync(aabbs, aabbBuffer.data(),
+                              sizeof(hiprtFloat4) * 2 * nQuadrics, hipMemcpyHostToDevice,
+                              buildStream));
+
+    size_t geomTempSize;
+    HIPRT_CHECK(hiprtGetGeometriesBuildTemporaryBufferSize(
+        context, nQuadrics, geomInputs.data(), options, geomTempSize));
+
+    hiprtDevicePtr tempBuffer;
+    CUDA_CHECK(hipMalloc(&tempBuffer, geomTempSize));
+
+    HIPRT_CHECK(hiprtCreateGeometries(context, nQuadrics, geomInputs.data(), options,
+                                      geomAddrs.data()));
+
+    LOG_VERBOSE("Starting to build quadric geometries");
+    HIPRT_CHECK(hiprtBuildGeometries(context, hiprtBuildOperationBuild, nQuadrics,
+                                     geomInputs.data(), options, tempBuffer, buildStream,
+                                     geoms.data()));
+    LOG_VERBOSE("Finished building quadric geometries");
+
+    CUDA_CHECK(hipFree(aabbs));
+    CUDA_CHECK(hipFree(tempBuffer));
+
+    hiprtFrameMatrix identity{};
+    for (size_t i = 0; i < 3; ++i)
+        identity.matrix[i][i] = 1.0f;
+
+    std::vector<hiprtInstance> instances(nQuadrics);
+    std::vector<hiprtFrameMatrix> transforms(nQuadrics);
+    ParallelFor(0, nQuadrics, [&](int64_t quadricIndex) {
+        instances[quadricIndex].type = hiprtInstanceTypeGeometry;
+        instances[quadricIndex].geometry = geoms[quadricIndex];
+        transforms[quadricIndex] = identity;
+    });
+
+    geomContainer.scene = buildBVHForInstances(instances, transforms, context,
+                                               threadAllocators, threadCUDAStreams);
+
+    return geomContainer;
+}
+
+hiprtScene HiprtAggregate::buildBVHForInstances(
+    const std::vector<hiprtInstance> &instances,
+    const std::vector<hiprtFrameMatrix> &transforms, hiprtContext context,
+    ThreadLocal<Allocator> &threadAllocators,
+    ThreadLocal<hipStream_t> &threadCUDAStreams) {
+    CHECK(transforms.size() == instances.size());
+    hiprtBuildOptions options;
+    options.buildFlags = hiprtBuildFlagBitPreferBalancedBuild;
+    options.batchBuildMaxPrimCount = 512u;
+
+    hiprtSceneBuildInput sceneInput;
+    sceneInput.instanceCount = instances.size();
+    sceneInput.frameCount = instances.size();
+    sceneInput.instanceFrames = nullptr;
+    sceneInput.instanceMasks = nullptr;
+    sceneInput.instanceTransformHeaders = nullptr;
+    sceneInput.instances = nullptr;
+    sceneInput.frameType = hiprtFrameTypeMatrix;
+
+    size_t sceneTempSize;
+    HIPRT_CHECK(hiprtGetSceneBuildTemporaryBufferSize(context, sceneInput, options,
+                                                      sceneTempSize));
+
+    hiprtDevicePtr tempBuffer;
+    CUDA_CHECK(hipMalloc(&tempBuffer, sceneTempSize));
+    CUDA_CHECK(
+        hipMalloc(&sceneInput.instances, sizeof(hiprtInstance) * instances.size()));
+    CUDA_CHECK(hipMalloc(&sceneInput.instanceFrames,
+                         sizeof(hiprtFrameMatrix) * transforms.size()));
+
+    hipStream_t buildStream = threadCUDAStreams.Get();
+    CUDA_CHECK(hipMemcpyAsync((void *)sceneInput.instances, instances.data(),
+                              sizeof(hiprtInstance) * instances.size(),
+                              hipMemcpyHostToDevice, buildStream));
+    CUDA_CHECK(hipMemcpyAsync((void *)sceneInput.instanceFrames, transforms.data(),
+                              sizeof(hiprtFrameMatrix) * transforms.size(),
+                              hipMemcpyHostToDevice, buildStream));
+
+    hiprtScene scene;
+    HIPRT_CHECK(hiprtCreateScene(context, sceneInput, options, scene));
+
+    LOG_VERBOSE("Started to build scene");
+    HIPRT_CHECK(hiprtBuildScene(context, hiprtBuildOperationBuild, sceneInput, options,
+                                tempBuffer, buildStream, scene));
+    LOG_VERBOSE("Finished building scene");
+
+    CUDA_CHECK(hipFree(tempBuffer));
+    CUDA_CHECK(hipFree(sceneInput.instances));
+    CUDA_CHECK(hipFree(sceneInput.instanceFrames));
+
+    return scene;
+}
+
+static void loadFile(const std::string& path, std::vector<char> &dst) {
+    std::fstream f(path, std::ios::binary | std::ios::in);
+    if (f.is_open()) {
+        size_t sizeFile;
+        f.seekg(0, std::fstream::end);
+        size_t size = sizeFile = (size_t)f.tellg();
+        dst.resize(size);
+        f.seekg(0, std::fstream::beg);
+        f.read(dst.data(), size);
+        f.close();
+    }
+}
+
+HiprtAggregate::Module HiprtAggregate::compileHiprtModule(hiprtContext context) {
+    const std::string path = "../src/pbrt/gpu/hiprt/hiprt.cu";
+    const std::string closestFunction = "__raygen__findClosest";
+    const std::string shadowFunction = "__raygen__shadow";
+    const std::string shadowTrFunction = "__raygen__shadow_Tr";
+    const std::string oneRandomFunction = "__raygen__randomHit";
+    const std::string binFilename = "hiprt.hipfb";
+
+    std::vector<char> binary;
+    loadFile(binFilename, binary);
+
+    Module module;
+    CUDA_CHECK(hipModuleLoadData(&module.hipModule, binary.data()));
+
+    HIPRT_CHECK(hiprtCreateFuncTable(context, 3, 1, module.funcTable));
+
+    CUDA_CHECK(hipModuleGetFunction(&module.closestFunction, module.hipModule,
+                                    closestFunction.c_str()));
+    CUDA_CHECK(hipModuleGetFunction(&module.shadowFunction, module.hipModule,
+                                    shadowFunction.c_str()));
+    CUDA_CHECK(hipModuleGetFunction(&module.shadowTrFunction, module.hipModule,
+                                    shadowTrFunction.c_str()));
+    CUDA_CHECK(hipModuleGetFunction(&module.oneRandomFunction, module.hipModule,
+                                    oneRandomFunction.c_str()));
+
+    return module;
+}
+
+void HiprtAggregate::hiprtLaunch(hipFunction_t func, int nx, int ny, int tx, int ty,
+                                 void **args, hipStream_t cudaStream,
+                                 size_t sharedMemoryBytes) {
+    int3 tpb = {tx, ty, 1};
+    int3 nb;
+    nb.x = (nx + tpb.x - 1) / tpb.x;
+    nb.y = (ny + tpb.y - 1) / tpb.y;
+    CUDA_CHECK(hipModuleLaunchKernel(func, nb.x, nb.y, 1, tpb.x, tpb.y, 1,
+                                     sharedMemoryBytes, cudaStream, args, 0));
+}
+
+}  // namespace pbrt
diff --git a/src/pbrt/gpu/hiprt/aggregate.h b/src/pbrt/gpu/hiprt/aggregate.h
new file mode 100644
index 000000000..e9fdc61bc
--- /dev/null
+++ b/src/pbrt/gpu/hiprt/aggregate.h
@@ -0,0 +1,191 @@
+#ifndef PBRT_HIPRT_AGGREGATE_H
+#define PBRT_HIPRT_AGGREGATE_H
+
+#include <pbrt/pbrt.h>
+
+#include <pbrt/gpu/common.h>
+#include <pbrt/gpu/memory.h>
+#include <pbrt/scene.h>
+#include <pbrt/util/containers.h>
+#include <pbrt/util/pstd.h>
+#include <pbrt/util/soa.h>
+#include <pbrt/util/vecmath.h>
+#include <pbrt/wavefront/integrator.h>
+#include <pbrt/wavefront/workitems.h>
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include <hip/hip_runtime.h>
+#include <hiprt/hiprt.h>
+
+namespace pbrt {
+
+class HiprtAggregate : public WavefrontAggregate {
+  public:
+    HiprtAggregate(const BasicScene &scene, CUDATrackedMemoryResource *memoryResource,
+                   NamedTextures &textures,
+                   const std::map<int, pstd::vector<Light> *> &shapeIndexToAreaLights,
+                   const std::map<std::string, Medium> &media,
+                   const std::map<std::string, pbrt::Material> &namedMaterials,
+                   const std::vector<pbrt::Material> &materials, int maxQueueSize);
+
+    Bounds3f Bounds() const { return bounds; }
+
+    void IntersectClosest(int maxRays, const RayQueue *rayQueue,
+                          EscapedRayQueue *escapedRayQueue,
+                          HitAreaLightQueue *hitAreaLightQueue,
+                          MaterialEvalQueue *basicEvalMaterialQueue,
+                          MaterialEvalQueue *universalEvalMaterialQueue,
+                          MediumSampleQueue *mediumSampleQueue,
+                          RayQueue *nextRayQueue) const;
+
+    void IntersectShadow(int maxRays, ShadowRayQueue *shadowRayQueue,
+                         SOA<PixelSampleState> *pixelSampleState) const;
+
+    void IntersectShadowTr(int maxRays, ShadowRayQueue *shadowRayQueue,
+                           SOA<PixelSampleState> *pixelSampleState) const;
+
+    void IntersectOneRandom(int maxRays,
+                            SubsurfaceScatterQueue *subsurfaceScatterQueue) const;
+
+    // WAR: The enclosing parent function ("PreparePLYMeshes") for an
+    // extended __device__ lambda cannot have private or protected access
+    // within its class, so it's public...
+    static std::map<int, TriQuadMesh> PreparePLYMeshes(
+        const std::vector<ShapeSceneEntity> &shapes,
+        const std::map<std::string, FloatTexture> &floatTextures);
+
+  private:
+    static constexpr size_t BlockSize = 64u;
+
+    struct ParamBufferState {
+        bool used = false;
+        hipEvent_t finishedEvent;
+        hipDeviceptr_t ptr = 0;
+        void *hostPtr = nullptr;
+    };
+
+    mutable std::vector<ParamBufferState> paramsPool;
+    mutable size_t nextParamOffset = 0;
+
+    struct Module {
+        hipModule_t hipModule;
+        hipFunction_t closestFunction;
+        hipFunction_t shadowFunction;
+        hipFunction_t shadowTrFunction;
+        hipFunction_t oneRandomFunction;
+        hiprtFuncTable funcTable;
+    };
+
+    struct GeometryContainer {
+        GeometryContainer() = default;
+        GeometryContainer(size_t size) : hgRecords(size) {}
+
+        void resize(size_t size) {
+            hgRecords.resize(size);
+        }
+
+        uint32_t offset;
+        hiprtScene scene;
+        std::vector<HitgroupRecord> hgRecords;
+    };
+
+    struct GeometryGroup {
+        GeometryContainer triGeomContainer;
+        GeometryContainer blpGeomContainer;
+        GeometryContainer quadricGeomContainer;
+    };
+
+    struct InstanceContainer {
+        void appendRecords(GeometryContainer &g) {
+            g.offset = hgRecords.size();
+            hgRecords.insert(hgRecords.end(), g.hgRecords.begin(), g.hgRecords.end());
+        }
+
+        void insertInstance(GeometryContainer &g, const hiprtFrameMatrix& m) { 
+            if (g.scene != nullptr) {
+                hiprtInstance instance;
+                instance.type = hiprtInstanceTypeScene;
+                instance.scene = g.scene;
+                instances.push_back(instance);
+                transforms.push_back(m);
+                offsets.push_back(g.offset);
+            }
+        }
+
+        std::vector<uint32_t> offsets;
+        std::vector<hiprtFrameMatrix> transforms;
+        std::vector<hiprtInstance> instances;
+        std::vector<HitgroupRecord> hgRecords;
+    };
+
+    ParamBufferState &getParamBuffer(const RayIntersectParameters &) const;
+
+    static BilinearPatchMesh *diceCurveToBLP(const ShapeSceneEntity &shape, int nDiceU,
+                                             int nDiceV, Allocator alloc);
+
+    static GeometryContainer buildBVHForTriangles(
+        const std::vector<ShapeSceneEntity> &shapes,
+        const std::map<int, TriQuadMesh> &plyMeshes, hiprtContext context,
+        const std::map<std::string, FloatTexture> &floatTextures,
+        const std::map<std::string, Material> &namedMaterials,
+        const std::vector<Material> &materials,
+        const std::map<std::string, Medium> &media,
+        const std::map<int, pstd::vector<Light> *> &shapeIndexToAreaLights,
+        ThreadLocal<Allocator> &threadAllocators,
+        ThreadLocal<hipStream_t> &threadCUDAStreams);
+
+    static GeometryContainer buildBVHForBLPs(
+        const std::vector<ShapeSceneEntity> &shapes, hiprtContext context,
+        const std::map<std::string, FloatTexture> &floatTextures,
+        const std::map<std::string, Material> &namedMaterials,
+        const std::vector<Material> &materials,
+        const std::map<std::string, Medium> &media,
+        const std::map<int, pstd::vector<Light> *> &shapeIndexToAreaLights,
+        ThreadLocal<Allocator> &threadAllocators,
+        ThreadLocal<hipStream_t> &threadCUDAStreams);
+
+    static GeometryContainer buildBVHForQuadrics(
+        const std::vector<ShapeSceneEntity> &shapes, hiprtContext context,
+        const std::map<std::string, FloatTexture> &floatTextures,
+        const std::map<std::string, Material> &namedMaterials,
+        const std::vector<Material> &materials,
+        const std::map<std::string, Medium> &media,
+        const std::map<int, pstd::vector<Light> *> &shapeIndexToAreaLights,
+        ThreadLocal<Allocator> &threadAllocators,
+        ThreadLocal<hipStream_t> &threadCUDAStreams);
+
+    static hiprtScene buildBVHForInstances(
+        const std::vector<hiprtInstance> &instances,
+        const std::vector<hiprtFrameMatrix> &transforms, hiprtContext context,
+        ThreadLocal<Allocator> &threadAllocators,
+        ThreadLocal<hipStream_t> &threadCUDAStreams);
+
+    static Module compileHiprtModule(hiprtContext context);
+
+    static void hiprtLaunch(hipFunction_t func, int nx, int ny, void **args,
+                            hipStream_t cudaStream, size_t sharedMemoryBytes = 0);
+
+    static void hiprtLaunch(hipFunction_t func, int nx, int ny, int tx, int ty,
+                            void **args, hipStream_t cudaStream,
+                            size_t sharedMemoryBytes = 0);
+    Module module;
+    hiprtContext context;
+
+    hiprtScene scene;
+    hiprtGlobalStackBuffer globalStackBuffer;
+    hiprtGlobalStackBuffer globalInstanceStackBuffer;
+    std::vector<GeometryGroup> geomGroups;
+    HitgroupRecord *hgRecords;
+    uint32_t *offsets;
+    
+    Bounds3f bounds;
+    CUDATrackedMemoryResource *memoryResource;
+    hipStream_t cudaStream;
+};
+
+}  // namespace pbrt
+
+#endif  // PBRT_HIPRT_AGGREGATE_H
diff --git a/src/pbrt/gpu/hiprt/hiprt.cu b/src/pbrt/gpu/hiprt/hiprt.cu
new file mode 100644
index 000000000..c713ae14f
--- /dev/null
+++ b/src/pbrt/gpu/hiprt/hiprt.cu
@@ -0,0 +1,609 @@
+#include <hip/hip_runtime.h>
+#include <hiprt/hiprt_device.h>
+
+__device__ bool __filter__alphaKilled(const hiprtRay &ray, const void *data,
+                                      void *payload, const hiprtHit &hit);
+__device__ bool __intersection__bilinearPatch(const hiprtRay &ray, const void *data,
+                                              void *payload, hiprtHit &hit);
+__device__ bool __intersection__quadric(const hiprtRay &ray, const void *data,
+                                        void *payload, hiprtHit &hit);
+
+HIPRT_DEVICE bool intersectFunc(uint32_t geomType, uint32_t rayType,
+                                const hiprtFuncTableHeader &tableHeader,
+                                const hiprtRay &ray, void *payload, hiprtHit &hit) {
+    const uint32_t index = tableHeader.numGeomTypes * rayType + geomType;
+    const void *data = tableHeader.funcDataSets[index].intersectFuncData;
+    switch (index) {
+    case 1: {
+        return __intersection__bilinearPatch(ray, data, payload, hit);
+    }
+    case 2: {
+        return __intersection__quadric(ray, data, payload, hit);
+    }
+    default: {
+        return false;
+    }
+    }
+}
+
+HIPRT_DEVICE bool filterFunc(uint32_t geomType, uint32_t rayType,
+                             const hiprtFuncTableHeader &tableHeader, const hiprtRay &ray,
+                             void *payload, const hiprtHit &hit) {
+    const uint32_t index = tableHeader.numGeomTypes * rayType + geomType;
+    const void *data = tableHeader.funcDataSets[index].filterFuncData;
+    switch (index) {
+    case 0: {
+        return __filter__alphaKilled(ray, data, payload, hit);
+    }
+    default: {
+        return false;
+    }
+    }
+}
+
+// pbrt is Copyright(c) 1998-2020 Matt Pharr, Wenzel Jakob, and Greg Humphreys.
+// The pbrt source code is licensed under the Apache License, Version 2.0.
+// SPDX: Apache-2.0
+
+#include <pbrt/pbrt.h>
+
+#include <pbrt/gpu/common.h>
+#include <pbrt/interaction.h>
+#include <pbrt/materials.h>
+#include <pbrt/media.h>
+#include <pbrt/shapes.h>
+#include <pbrt/textures.h>
+#include <pbrt/util/float.h>
+#include <pbrt/util/rng.h>
+#include <pbrt/util/transform.h>
+#include <pbrt/util/vecmath.h>
+#include <pbrt/wavefront/intersect.h>
+
+// Make various functions visible to HIPRT, which doesn't get to link
+// shader code with the HIP code in the main executable...
+#include <pbrt/util/color.cpp>
+#include <pbrt/util/colorspace.cpp>
+#include <pbrt/util/log.cpp>
+#include <pbrt/util/noise.cpp>
+#include <pbrt/util/spectrum.cpp>
+#include <pbrt/util/transform.cpp>
+
+#include <utility>
+
+using namespace pbrt;
+
+alignas(alignof(RayIntersectParameters)) __constant__
+    unsigned char paramBuffer[sizeof(RayIntersectParameters)];
+#define params (*(RayIntersectParameters *)paramBuffer)
+
+///////////////////////////////////////////////////////////////////////////
+// Utility functions
+
+template <bool AnyHit>
+__device__ inline hiprtHit Trace(hiprtScene scene, Ray ray, Float tMax, uint32_t &missed,
+                                 void *payload) {
+    hiprtRay hiprtRay;
+    hiprtRay.origin = make_float3(ray.o.x, ray.o.y, ray.o.z);
+    hiprtRay.direction = make_float3(ray.d.x, ray.d.y, ray.d.z);
+    hiprtRay.minT = 1e-7f;
+    hiprtRay.maxT = tMax;
+
+    __shared__ int sharedStackCache[SHARED_STACK_SIZE * BLOCK_SIZE];
+    hiprtSharedStackBuffer sharedStackBuffer{SHARED_STACK_SIZE, sharedStackCache};
+    hiprtGlobalStack stack(params.globalStackBuffer, sharedStackBuffer);
+
+    hiprtSharedStackBuffer sharedInstanceStackBuffer{};
+    hiprtGlobalInstanceStack instanceStack(params.globalInstanceStackBuffer,
+                                           sharedInstanceStackBuffer);
+
+    hiprtHit hit;
+    if constexpr (!AnyHit) {
+        hiprtSceneTraversalClosestCustomStack<hiprtGlobalStack, hiprtGlobalInstanceStack>
+            tr(scene, hiprtRay, stack, instanceStack, hiprtFullRayMask,
+               hiprtTraversalHintDefault, payload, params.funcTable, 0, ray.time);
+        hit = tr.getNextHit();
+    } else {
+        hiprtSceneTraversalAnyHitCustomStack<hiprtGlobalStack, hiprtGlobalInstanceStack>
+            tr(scene, hiprtRay, stack, instanceStack, hiprtFullRayMask,
+               hiprtTraversalHintDefault, payload, params.funcTable, 0, ray.time);
+        hit = tr.getNextHit();
+        if (hit.t == 0.0f)
+            hit.primID = hiprtInvalidValue;
+    }
+
+    missed = uint32_t(hit.primID == hiprtInvalidValue);
+    return hit;
+}
+
+static __device__ uint32_t recordIndex(const hiprtHit &hit) {
+    return hit.instanceIDs[1] + params.offsets[hit.instanceIDs[0]];
+}
+
+static __forceinline__ __device__ Transform getWorldFromInstance(const hiprtHit &hit) {
+    hiprtFrameMatrix hiprtWorldFromObjM =
+        hiprtGetObjectToWorldFrameMatrix(params.traversable, hit.instanceIDs, 0.0f);
+    hiprtFrameMatrix hiprtObjFromWorldM =
+        hiprtGetWorldToObjectFrameMatrix(params.traversable, hit.instanceIDs, 0.0f);
+    SquareMatrix<4> worldFromObjM, objFromWorldM;
+    for (int i = 0; i < 3; ++i) {
+        for (int j = 0; j < 4; ++j) {
+            worldFromObjM[i][j] = hiprtWorldFromObjM.matrix[i][j];
+            objFromWorldM[i][j] = hiprtObjFromWorldM.matrix[i][j];
+        }
+    }
+    return Transform(worldFromObjM, objFromWorldM);
+}
+
+static __forceinline__ __device__ SurfaceInteraction
+getTriangleIntersection(const Ray &rayWorld, const hiprtHit &hit) {
+    const TriangleMeshRecord &rec = params.hgRecords[recordIndex(hit)].triRec;
+
+    float b1 = hit.uv.x;
+    float b2 = hit.uv.y;
+    float b0 = 1 - b1 - b2;
+
+    float3 rd = make_float3(rayWorld.d.x, rayWorld.d.y, rayWorld.d.z);
+    Vector3f wo = -Vector3f(rd.x, rd.y, rd.z);
+
+    Transform worldFromInstance = getWorldFromInstance(hit);
+
+    Float time = rayWorld.time;
+    wo = worldFromInstance.ApplyInverse(wo);
+
+    TriangleIntersection ti{b0, b1, b2, hit.t};
+    SurfaceInteraction intr =
+        Triangle::InteractionFromIntersection(rec.mesh, hit.primID, ti, time, wo);
+    return worldFromInstance(intr);
+}
+
+static __forceinline__ __device__ SurfaceInteraction
+getBilinearPatchIntersection(const Ray &rayWorld, const hiprtHit &hit, Point2f uv) {
+    BilinearMeshRecord &rec = params.hgRecords[recordIndex(hit)].blpRec;
+
+    float3 rd = make_float3(rayWorld.d.x, rayWorld.d.y, rayWorld.d.z);
+    Vector3f wo = -Vector3f(rd.x, rd.y, rd.z);
+
+    return BilinearPatch::InteractionFromIntersection(rec.mesh, hit.primID, uv,
+                                                      rayWorld.time, wo);
+}
+
+static __device__ inline SurfaceInteraction getQuadricIntersection(
+    const Ray &rayWorld, const hiprtHit &hit, const QuadricIntersection &si) {
+    QuadricRecord &rec = params.hgRecords[recordIndex(hit)].quadricRec;
+
+    float3 rd = make_float3(rayWorld.d.x, rayWorld.d.y, rayWorld.d.z);
+    Vector3f wo = -Vector3f(rd.x, rd.y, rd.z);
+    Float time = rayWorld.time;
+
+    SurfaceInteraction intr;
+    if (const Sphere *sphere = rec.shape.CastOrNullptr<Sphere>())
+        intr = sphere->InteractionFromIntersection(si, wo, time);
+    else if (const Cylinder *cylinder = rec.shape.CastOrNullptr<Cylinder>())
+        intr = cylinder->InteractionFromIntersection(si, wo, time);
+    else if (const Disk *disk = rec.shape.CastOrNullptr<Disk>())
+        intr = disk->InteractionFromIntersection(si, wo, time);
+    else
+        CHECK(!"unexpected quadric");
+
+    return intr;
+}
+
+///////////////////////////////////////////////////////////////////////////
+// Intersection and filter functions
+
+__device__ bool __filter__alphaKilled(const hiprtRay &ray, const void *data,
+                                      void *payload, const hiprtHit &hit) {
+    const TriangleMeshRecord &rec = params.hgRecords[recordIndex(hit)].triRec;
+    if (!rec.alphaTexture)
+        return false;
+
+    Ray rayWorld = *(Ray *)payload;
+
+    SurfaceInteraction intr = getTriangleIntersection(rayWorld, hit);
+
+    BasicTextureEvaluator eval;
+    Float alpha = eval(rec.alphaTexture, intr);
+    if (alpha >= 1)
+        return false;
+    if (alpha <= 0)
+        return true;
+    else {
+        float3 o = make_float3(rayWorld.o.x, rayWorld.o.y, rayWorld.o.z);
+        float3 d = make_float3(rayWorld.d.x, rayWorld.d.y, rayWorld.d.z);
+        Float u = HashFloat(o, d);
+        return u > alpha;
+    }
+}
+
+__device__ bool __intersection__bilinearPatch(const hiprtRay &ray_, const void *data,
+                                              void *payload, hiprtHit &hit) {
+    BilinearMeshRecord &rec = params.hgRecords[recordIndex(hit)].blpRec;
+
+    float3 org = ray_.origin;
+    float3 dir = ray_.direction;
+    Float tMax = ray_.maxT;
+    Ray ray(Point3f(org.x, org.y, org.z), Vector3f(dir.x, dir.y, dir.z));
+    Ray rayWorld = *(Ray *)payload;
+
+    int vertexIndex = 4 * hit.primID;
+    Point3f p00 = rec.mesh->p[rec.mesh->vertexIndices[vertexIndex]];
+    Point3f p10 = rec.mesh->p[rec.mesh->vertexIndices[vertexIndex + 1]];
+    Point3f p01 = rec.mesh->p[rec.mesh->vertexIndices[vertexIndex + 2]];
+    Point3f p11 = rec.mesh->p[rec.mesh->vertexIndices[vertexIndex + 3]];
+    pstd::optional<BilinearIntersection> isect =
+        IntersectBilinearPatch(ray, tMax, p00, p10, p01, p11);
+
+    if (!isect)
+        return false;
+
+    if (rec.alphaTexture) {
+        SurfaceInteraction intr = getBilinearPatchIntersection(rayWorld, hit, isect->uv);
+        BasicTextureEvaluator eval;
+        Float alpha = eval(rec.alphaTexture, intr);
+        if (alpha < 1) {
+            if (alpha == 0)
+                // No hit
+                return false;
+
+            float3 o = make_float3(rayWorld.o.x, rayWorld.o.y, rayWorld.o.z);
+            float3 d = make_float3(rayWorld.d.x, rayWorld.d.y, rayWorld.d.z);
+            Float u = HashFloat(o, d);
+            if (u > alpha)
+                // no hit
+                return false;
+        }
+    }
+
+    hit.t = isect->t;
+    hit.uv.x = isect->uv[0];
+    hit.uv.y = isect->uv[1];
+
+    return true;
+}
+
+__device__ bool __intersection__quadric(const hiprtRay &ray_, const void *data,
+                                        void *payload, hiprtHit &hit) {
+    QuadricRecord &rec = params.hgRecords[recordIndex(hit)].quadricRec;
+
+    float3 org = ray_.origin;
+    float3 dir = ray_.direction;
+    Float tMax = ray_.maxT;
+    Ray ray(Point3f(org.x, org.y, org.z), Vector3f(dir.x, dir.y, dir.z));
+    pstd::optional<QuadricIntersection> isect;
+
+    if (const Sphere *sphere = rec.shape.CastOrNullptr<Sphere>())
+        isect = sphere->BasicIntersect(ray, tMax);
+    else if (const Cylinder *cylinder = rec.shape.CastOrNullptr<Cylinder>())
+        isect = cylinder->BasicIntersect(ray, tMax);
+    else if (const Disk *disk = rec.shape.CastOrNullptr<Disk>())
+        isect = disk->BasicIntersect(ray, tMax);
+
+    if (!isect)
+        return false;
+
+    Ray rayWorld = *(Ray *)payload;
+
+    if (rec.alphaTexture) {
+        SurfaceInteraction intr = getQuadricIntersection(rayWorld, hit, *isect);
+
+        BasicTextureEvaluator eval;
+        Float alpha = eval(rec.alphaTexture, intr);
+        if (alpha < 1) {
+            if (alpha == 0)
+                // No hit
+                return false;
+
+            float3 o = make_float3(rayWorld.o.x, rayWorld.o.y, rayWorld.o.z);
+            float3 d = make_float3(rayWorld.d.x, rayWorld.d.y, rayWorld.d.z);
+            Float u = HashFloat(o.x, o.y, o.z, d.x, d.y, d.z);
+            if (u > alpha)
+                // no hit
+                return false;
+        }
+    }
+
+    hit.t = isect->tHit;
+    hit.normal = make_float3(isect->pObj.x, isect->pObj.y, isect->pObj.z);
+    hit.uv.x = isect->phi;
+
+    return true;
+}
+
+///////////////////////////////////////////////////////////////////////////
+// Closest hit
+
+struct ClosestHitContext {
+    ClosestHitContext() = default;
+    __device__ ClosestHitContext(Medium rayMedium, bool shadowRay)
+        : rayMedium(rayMedium), shadowRay(shadowRay) {}
+
+    Medium rayMedium;
+    bool shadowRay;
+
+    // out
+    Point3fi piHit;
+    Normal3f nHit;
+    Material material;
+    MediumInterface mediumInterface;
+
+    __device__ Ray SpawnRayTo(const Point3f &p) const {
+        Interaction intr(piHit, nHit);
+        intr.mediumInterface = &mediumInterface;
+        return intr.SpawnRayTo(p);
+    }
+};
+
+static __forceinline__ __device__ void ProcessClosestIntersection(
+    SurfaceInteraction intr, const hiprtHit &hit, ClosestHitContext &ctx) {
+    int rayIndex = blockIdx.x * blockDim.x + threadIdx.x;
+
+    Medium rayMedium = ctx.rayMedium;
+
+    if (intr.mediumInterface)
+        ctx.mediumInterface = *intr.mediumInterface;
+    else
+        ctx.mediumInterface = MediumInterface(rayMedium);
+
+    ctx.piHit = intr.pi;
+    ctx.nHit = intr.n;
+    ctx.material = intr.material;
+
+    if (ctx.shadowRay)
+        return;
+
+    // We only have the ray queue (and it only makes sense to access) for
+    // regular closest hit rays.
+    RayWorkItem r = (*params.rayQueue)[rayIndex];
+
+    EnqueueWorkAfterIntersection(r, rayMedium, hit.t, intr, params.mediumSampleQueue,
+                                 params.nextRayQueue, params.hitAreaLightQueue,
+                                 params.basicEvalMaterialQueue,
+                                 params.universalEvalMaterialQueue);
+}
+__device__ void closesthitTriangle(const Ray &rayWorld, const hiprtHit &hit,
+                                   ClosestHitContext &ctx) {
+    const TriangleMeshRecord &rec = params.hgRecords[recordIndex(hit)].triRec;
+
+    SurfaceInteraction intr = getTriangleIntersection(rayWorld, hit);
+
+    if (rec.mediumInterface && rec.mediumInterface->IsMediumTransition())
+        intr.mediumInterface = rec.mediumInterface;
+    intr.material = rec.material;
+    if (!rec.areaLights.empty())
+        intr.areaLight = rec.areaLights[hit.primID];
+
+    ProcessClosestIntersection(intr, hit, ctx);
+}
+
+__device__ void closesthitBilinearPatch(const Ray &rayWorld, const hiprtHit &hit,
+                                        ClosestHitContext &ctx) {
+    BilinearMeshRecord &rec = params.hgRecords[recordIndex(hit)].blpRec;
+
+    Point2f uv(hit.uv.x, hit.uv.y);
+
+    SurfaceInteraction intr = getBilinearPatchIntersection(rayWorld, hit, uv);
+    if (rec.mediumInterface && rec.mediumInterface->IsMediumTransition())
+        intr.mediumInterface = rec.mediumInterface;
+    intr.material = rec.material;
+    if (!rec.areaLights.empty())
+        intr.areaLight = rec.areaLights[hit.primID];
+
+    Transform worldFromInstance = getWorldFromInstance(hit);
+    intr = worldFromInstance(intr);
+
+    ProcessClosestIntersection(intr, hit, ctx);
+}
+
+__device__ void closesthitQuadric(const Ray &rayWorld, const hiprtHit &hit,
+                                  ClosestHitContext &ctx) {
+    QuadricRecord &rec = params.hgRecords[recordIndex(hit)].quadricRec;
+    QuadricIntersection qi;
+    qi.pObj = Point3f(hit.normal.x, hit.normal.y, hit.normal.z);
+    qi.phi = hit.uv.x;
+
+    SurfaceInteraction intr = getQuadricIntersection(rayWorld, hit, qi);
+    if (rec.mediumInterface && rec.mediumInterface->IsMediumTransition())
+        intr.mediumInterface = rec.mediumInterface;
+    intr.material = rec.material;
+    if (rec.areaLight)
+        intr.areaLight = rec.areaLight;
+
+    Transform worldFromInstance = getWorldFromInstance(hit);
+    intr = worldFromInstance(intr);
+
+    ProcessClosestIntersection(intr, hit, ctx);
+}
+
+extern "C" __global__ void __raygen__findClosest() {
+    int rayIndex = blockIdx.x * blockDim.x + threadIdx.x;
+    if (rayIndex >= params.rayQueue->Size())
+        return;
+
+    RayWorkItem r = (*params.rayQueue)[rayIndex];
+    Ray ray = r.ray;
+    Float tMax = 1e30f;
+
+    PBRT_DBG("ray o %f %f %f dir %f %f %f tmax %f\n", ray.o.x, ray.o.y, ray.o.z, ray.d.x,
+             ray.d.y, ray.d.z, tMax);
+
+    uint32_t missed = 0;
+    hiprtHit hit = Trace<false>(params.traversable, ray, tMax, missed, &ray);
+
+    if (missed) {
+        EnqueueWorkAfterMiss(r, params.mediumSampleQueue, params.escapedRayQueue);
+        return;
+    }
+
+    ClosestHitContext ctx(ray.medium, false);
+    if (params.hgRecords[recordIndex(hit)].type == HitgroupRecord::TriangleMesh)
+        closesthitTriangle(ray, hit, ctx);
+    else if (params.hgRecords[recordIndex(hit)].type == HitgroupRecord::BilinearMesh)
+        closesthitBilinearPatch(ray, hit, ctx);
+    else if (params.hgRecords[recordIndex(hit)].type == HitgroupRecord::Quadric)
+        closesthitQuadric(ray, hit, ctx);
+    else
+        CHECK(!"unexpected primitive type");
+}
+
+///////////////////////////////////////////////////////////////////////////
+// Shadow rays
+
+extern "C" __global__ void __raygen__shadow() {
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index >= params.shadowRayQueue->Size())
+        return;
+
+    ShadowRayWorkItem sr = (*params.shadowRayQueue)[index];
+    PBRT_DBG("Tracing shadow ray index %d o %f %f %f d %f %f %f\n", index, sr.ray.o.x,
+             sr.ray.o.y, sr.ray.o.z, sr.ray.d.x, sr.ray.d.y, sr.ray.d.z);
+
+    uint32_t missed = 0;
+    Trace<true>(params.traversable, sr.ray, sr.tMax, missed, &sr.ray);
+
+    RecordShadowRayResult(sr, &params.pixelSampleState, !missed);
+}
+
+extern "C" __global__ void __raygen__shadow_Tr() {
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index >= params.shadowRayQueue->Size())
+        return;
+
+    ShadowRayWorkItem sr = (*params.shadowRayQueue)[index];
+
+    ClosestHitContext ctx;
+
+    TraceTransmittance(
+        sr, &params.pixelSampleState,
+        [&](Ray ray, Float tMax) -> TransmittanceTraceResult {
+            uint32_t missed = 0;
+            hiprtHit hit = Trace<false>(params.traversable, ray, tMax, missed,
+                                        &ray);  // the closest hit is actually used
+
+            ctx = ClosestHitContext(ray.medium, true);
+
+            if (!missed) {
+                if (params.hgRecords[recordIndex(hit)].type ==
+                    HitgroupRecord::TriangleMesh)
+                    closesthitTriangle(ray, hit, ctx);
+                else if (params.hgRecords[recordIndex(hit)].type ==
+                         HitgroupRecord::BilinearMesh)
+                    closesthitBilinearPatch(ray, hit, ctx);
+                else if (params.hgRecords[recordIndex(hit)].type ==
+                         HitgroupRecord::Quadric)
+                    closesthitQuadric(ray, hit, ctx);
+                else
+                    CHECK(!"unexpected primitive type");
+            }
+
+            return TransmittanceTraceResult{!missed, Point3f(ctx.piHit), ctx.material};
+        },
+        [&](Point3f p) -> Ray { return ctx.SpawnRayTo(p); });
+}
+
+///////////////////////////////////////////////////////////////////////////
+// Random hit (for subsurface scattering)
+
+struct RandomHitPayload {
+    WeightedReservoirSampler<SubsurfaceInteraction> wrs;
+    Material material;
+    pstd::optional<SurfaceInteraction> intr;
+};
+
+__device__ void closesthitRandomHitTriangle(const Ray &rayWorld, const hiprtHit &hit,
+                                            RandomHitPayload *p) {
+    const TriangleMeshRecord &rec = params.hgRecords[recordIndex(hit)].triRec;
+
+    PBRT_DBG("Anyhit triangle for random hit: rec.material %p params.materials %p\n",
+             rec.material.ptr(), p->material.ptr());
+
+    SurfaceInteraction intr = getTriangleIntersection(rayWorld, hit);
+    p->intr = intr;
+
+    if (rec.material == p->material)
+        p->wrs.Add([&] __device__() { return intr; }, 1.f);
+}
+
+__device__ void closesthitRandomHitBilinearPatch(const Ray &rayWorld, const hiprtHit &hit,
+                                                 RandomHitPayload *p) {
+    BilinearMeshRecord &rec = params.hgRecords[recordIndex(hit)].blpRec;
+
+    PBRT_DBG("Anyhit blp for random hit: rec.material %p params.materials %p\n",
+             rec.material.ptr(), p->material.ptr());
+
+    Point2f uv(hit.uv.x, hit.uv.y);
+    SurfaceInteraction intr = getBilinearPatchIntersection(rayWorld, hit, uv);
+    p->intr = intr;
+
+    if (rec.material == p->material)
+        p->wrs.Add([&] __device__() { return intr; }, 1.f);
+}
+
+__device__ void closesthitRandomHitQuadric(const Ray &rayWorld, const hiprtHit &hit,
+                                           RandomHitPayload *p) {
+    QuadricRecord &rec = params.hgRecords[recordIndex(hit)].quadricRec;
+
+    PBRT_DBG("Anyhit quadric for random hit: rec.material %p params.materials %p\n",
+             rec.material.ptr(), p->material.ptr());
+
+    QuadricIntersection qi;
+    qi.pObj = Point3f(hit.normal.x, hit.normal.y, hit.normal.z);
+    qi.phi = hit.uv.x;
+
+    SurfaceInteraction intr = getQuadricIntersection(rayWorld, hit, qi);
+    p->intr = intr;
+
+    if (rec.material == p->material)
+        p->wrs.Add([&] __device__() { return intr; }, 1.f);
+}
+
+extern "C" __global__ void __raygen__randomHit() {
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index >= params.subsurfaceScatterQueue->Size())
+        return;
+
+    SubsurfaceScatterWorkItem s = (*params.subsurfaceScatterQueue)[index];
+
+    Ray ray(s.p0, s.p1 - s.p0);
+
+    RandomHitPayload payload;
+    payload.wrs.Seed(Hash(s.p0, s.p1));
+    payload.material = s.material;
+
+    PBRT_DBG("Randomhit raygen ray.o %f %f %f ray.d %f %f %f\n", ray.o.x, ray.o.y,
+             ray.o.z, ray.d.x, ray.d.y, ray.d.z);
+
+    int depth = 0;
+    while (LengthSquared(ray.d) > 0 && ++depth < 100) {
+        uint32_t missed = 0;
+        hiprtHit hit =
+            Trace<false>(params.traversable, ray, 1.f /* tMax */, missed, &ray);
+
+        if (!missed) {
+            if (params.hgRecords[recordIndex(hit)].type == HitgroupRecord::TriangleMesh)
+                closesthitRandomHitTriangle(ray, hit, &payload);
+            else if (params.hgRecords[recordIndex(hit)].type ==
+                     HitgroupRecord::BilinearMesh)
+                closesthitRandomHitBilinearPatch(ray, hit, &payload);
+            else if (params.hgRecords[recordIndex(hit)].type == HitgroupRecord::Quadric)
+                closesthitRandomHitQuadric(ray, hit, &payload);
+            else
+                CHECK(!"unexpected primitive type");
+        }
+
+        if (payload.intr) {
+            ray = payload.intr->SpawnRayTo(s.p1);
+            payload.intr.reset();
+        } else
+            break;
+    }
+
+    if (payload.wrs.HasSample() &&
+        payload.wrs.WeightSum() > 0) {  // TODO: latter check shouldn't be needed...
+        const SubsurfaceInteraction &si = payload.wrs.GetSample();
+
+        params.subsurfaceScatterQueue->reservoirPDF[index] =
+            payload.wrs.SampleProbability();
+        params.subsurfaceScatterQueue->ssi[index] = payload.wrs.GetSample();
+    } else
+        params.subsurfaceScatterQueue->reservoirPDF[index] = 0;
+}
diff --git a/src/pbrt/gpu/memory.cpp b/src/pbrt/gpu/memory.cpp
index 0cec0a9ab..da3601884 100644
--- a/src/pbrt/gpu/memory.cpp
+++ b/src/pbrt/gpu/memory.cpp
@@ -8,8 +8,12 @@
 #include <pbrt/util/check.h>
 #include <pbrt/util/log.h>
 
+#if defined(__HIPCC__)
+#include <pbrt/util/hip_aliases.h>
+#else
 #include <cuda.h>
 #include <cuda_runtime.h>
+#endif
 
 namespace pbrt {
 
diff --git a/src/pbrt/gpu/util.cpp b/src/pbrt/gpu/util.cpp
index aa89707c7..b0a631d8e 100644
--- a/src/pbrt/gpu/util.cpp
+++ b/src/pbrt/gpu/util.cpp
@@ -76,21 +76,27 @@ void GPUInit() {
 #endif
     CUDA_CHECK(cudaSetDevice(device));
 
+// there was a bug in HIP stating unsupported while actually supported
+// not sure whether has been fixed
+#ifdef __NVCC__ 
     int hasUnifiedAddressing;
     CUDA_CHECK(cudaDeviceGetAttribute(&hasUnifiedAddressing, cudaDevAttrUnifiedAddressing,
                                       device));
     if (!hasUnifiedAddressing)
         LOG_FATAL("The selected GPU device (%d) does not support unified addressing.",
                   device);
+#endif
 
     CUDA_CHECK(cudaDeviceSetLimit(cudaLimitStackSize, 8192));
     size_t stackSize;
     CUDA_CHECK(cudaDeviceGetLimit(&stackSize, cudaLimitStackSize));
     LOG_VERBOSE("Reset stack size to %d", stackSize);
 
+#ifdef __NVCC__
     CUDA_CHECK(cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 32 * 1024 * 1024));
 
     CUDA_CHECK(cudaDeviceSetCacheConfig(cudaFuncCachePreferL1));
+#endif
 
 #ifdef NVTX
 #ifdef PBRT_IS_WINDOWS
diff --git a/src/pbrt/gpu/util.h b/src/pbrt/gpu/util.h
index 72b8cc4cc..98f782aa6 100644
--- a/src/pbrt/gpu/util.h
+++ b/src/pbrt/gpu/util.h
@@ -18,8 +18,12 @@
 #include <utility>
 #include <vector>
 
+#if defined(__HIPCC__)
+#include <pbrt/util/hip_aliases.h>
+#else
 #include <cuda.h>
 #include <cuda_runtime_api.h>
+#endif
 
 #ifdef NVTX
 #ifdef UNICODE
@@ -38,6 +42,7 @@
         LOG_FATAL("CUDA error: %s", cudaGetErrorString(error)); \
     } else /* eat semicolon */
 
+#ifdef __NVCC__  // only used in denoiser.cpp
 #define CU_CHECK(EXPR)                                              \
     do {                                                            \
         CUresult result = EXPR;                                     \
@@ -47,6 +52,7 @@
             LOG_FATAL("CUDA error: %s", str);                       \
         }                                                           \
     } while (false) /* eat semicolon */
+#endif
 
 namespace pbrt {
 
@@ -64,15 +70,20 @@ inline int GetBlockSize(const char *description, F kernel) {
         return iter->second;
 
     int minGridSize, blockSize;
+// this API is not reliable in HIP sometimes returning even negative values
+#ifdef __HIPCC__  
+    blockSize = 64;
+#else
     CUDA_CHECK(
         cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, kernel, 0, 0));
+#endif
     kernelBlockSizes[index] = blockSize;
     LOG_VERBOSE("[%s]: block size %d", description, blockSize);
 
     return blockSize;
 }
 
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
 template <typename F>
 __global__ void Kernel(F func, int nItems) {
     int tid = blockIdx.x * blockDim.x + threadIdx.x;
@@ -113,7 +124,7 @@ void GPUParallelFor(const char *description, int nItems, F func) {
 #endif
 }
 
-#endif  // __NVCC__
+#endif  // __NVCC__ || __HIPCC__
 
 // GPU Synchronization Function Declarations
 void GPUWait();
diff --git a/src/pbrt/media.h b/src/pbrt/media.h
index 8a5677a74..d31d02aa2 100644
--- a/src/pbrt/media.h
+++ b/src/pbrt/media.h
@@ -24,9 +24,6 @@
 #include <nanovdb/NanoVDB.h>
 #include <nanovdb/util/GridHandle.h>
 #include <nanovdb/util/SampleFromVoxels.h>
-#if defined(PBRT_BUILD_GPU_RENDERER) && defined(__NVCC__)
-#include <nanovdb/util/CudaDeviceBuffer.h>
-#endif  // PBRT_BUILD_GPU_RENDERER
 
 #include <algorithm>
 #include <limits>
diff --git a/src/pbrt/options.h b/src/pbrt/options.h
index 56f8e67be..397ef9710 100644
--- a/src/pbrt/options.h
+++ b/src/pbrt/options.h
@@ -62,9 +62,9 @@ struct PBRTOptions : BasicPBRTOptions {
 extern PBRTOptions *Options;
 
 #if defined(PBRT_BUILD_GPU_RENDERER)
-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(__HIPCC__)
 extern __constant__ BasicPBRTOptions OptionsGPU;
-#endif  // __CUDACC__
+#endif  // __CUDACC__ || __HIPCC__
 
 void CopyOptionsToGPU();
 #endif  // PBRT_BUILD_GPU_RENDERER
diff --git a/src/pbrt/pbrt.h b/src/pbrt/pbrt.h
index 8207a4800..e52297032 100644
--- a/src/pbrt/pbrt.h
+++ b/src/pbrt/pbrt.h
@@ -18,11 +18,11 @@
 #endif  // PBRT_IS_WINDOWS
 
 // GPU Macro Definitions
-#if defined(__CUDA_ARCH__)
+#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
 #define PBRT_IS_GPU_CODE
 #endif
 
-#if defined(PBRT_BUILD_GPU_RENDERER) && defined(__CUDACC__)
+#if defined(PBRT_BUILD_GPU_RENDERER) && (defined(__CUDACC__) || defined(__HIPCC__))
 #ifndef PBRT_NOINLINE
 #define PBRT_NOINLINE __attribute__((noinline))
 #endif
diff --git a/src/pbrt/shapes.h b/src/pbrt/shapes.h
index d775b6469..f74df4ba9 100644
--- a/src/pbrt/shapes.h
+++ b/src/pbrt/shapes.h
@@ -812,7 +812,7 @@ inline Cylinder::Cylinder(const Transform *renderFromObject,
       phiMax(Radians(Clamp(phiMax, 0, 360))) {}
 
 // Triangle Declarations
-#if defined(PBRT_BUILD_GPU_RENDERER) && defined(__CUDACC__)
+#if defined(PBRT_BUILD_GPU_RENDERER) && (defined(__HIPCC__) || defined(__CUDACC__))
 extern PBRT_GPU pstd::vector<const TriangleMesh *> *allTriangleMeshesGPU;
 #endif
 
@@ -1264,7 +1264,7 @@ class Curve {
 };
 
 // BilinearPatch Declarations
-#if defined(PBRT_BUILD_GPU_RENDERER) && defined(__CUDACC__)
+#if defined(PBRT_BUILD_GPU_RENDERER) && (defined(__HIPCC__) || defined(__CUDACC__))
 extern PBRT_GPU pstd::vector<const BilinearPatchMesh *> *allBilinearMeshesGPU;
 #endif
 
diff --git a/src/pbrt/textures.cpp b/src/pbrt/textures.cpp
index 48a0aeb2b..d21c72f28 100644
--- a/src/pbrt/textures.cpp
+++ b/src/pbrt/textures.cpp
@@ -180,7 +180,7 @@ std::string SpectrumBilerpTexture::ToString() const {
 }
 
 // CheckerboardTexture Function Definitions
-Float Checkerboard(TextureEvalContext ctx, TextureMapping2D map2D,
+PBRT_CPU_GPU Float Checkerboard(TextureEvalContext ctx, TextureMapping2D map2D,
                    TextureMapping3D map3D) {
     // Define 1D checkerboard filtered integral functions
     auto d = [](Float x) {
@@ -285,7 +285,7 @@ std::string SpectrumCheckerboardTexture::ToString() const {
 }
 
 // InsidePolkaDot Function Definition
-bool InsidePolkaDot(Point2f st) {
+PBRT_CPU_GPU bool InsidePolkaDot(Point2f st) {
     // Compute cell indices (_sCell_,_tCell_ for dots
     int sCell = pstd::floor(st[0] + .5f), tCell = pstd::floor(st[1] + .5f);
 
@@ -356,10 +356,10 @@ std::string FBmTexture::ToString() const {
 }
 
 // SpectrumImageTexture Method Definitions
-SampledSpectrum SpectrumImageTexture::Evaluate(TextureEvalContext ctx,
+PBRT_CPU_GPU SampledSpectrum SpectrumImageTexture::Evaluate(TextureEvalContext ctx,
                                                SampledWavelengths lambda) const {
 #ifdef PBRT_IS_GPU_CODE
-    assert(!"Should not be called in GPU code");
+    CHECK(!"Should not be called in GPU code");
     return SampledSpectrum(0);
 #else
     // Apply texture mapping and flip $t$ coordinate for image texture lookup
@@ -477,7 +477,7 @@ SpectrumImageTexture *SpectrumImageTexture::Create(
 }
 
 // MarbleTexture Method Definitions
-SampledSpectrum MarbleTexture::Evaluate(TextureEvalContext ctx,
+PBRT_CPU_GPU SampledSpectrum MarbleTexture::Evaluate(TextureEvalContext ctx,
                                         SampledWavelengths lambda) const {
     TexCoord3D c = mapping.Map(ctx);
     c.p *= scale;
@@ -582,7 +582,6 @@ SpectrumDirectionMixTexture *SpectrumDirectionMixTexture::Create(
         parameters.GetSpectrumTexture("tex2", one, spectrumType, alloc), dir);
 }
 
-static std::mutex ptexMutex;
 static Ptex::PtexCache *cache;
 
 STAT_COUNTER("Texture/Ptex lookups", nLookups);
@@ -601,7 +600,8 @@ struct : public PtexErrorHandler {
 PtexTextureBase::PtexTextureBase(const std::string &filename, ColorEncoding encoding,
                                  Float scale)
     : filename(filename), encoding(encoding), scale(scale) {
-    ptexMutex.lock();
+    std::mutex mutex;
+    mutex.lock();
     if (!cache) {
         int maxFiles = 100;
         size_t maxMem = 1ull << 32;  // 4GB
@@ -611,7 +611,7 @@ PtexTextureBase::PtexTextureBase(const std::string &filename, ColorEncoding enco
                                         &errorHandler);
         // TODO? cache->setSearchPath(...);
     }
-    ptexMutex.unlock();
+    mutex.unlock();
 
     // Issue an error if the texture doesn't exist or has an unsupported
     // number of channels.
@@ -695,7 +695,7 @@ std::string SpectrumPtexTexture::ToString() const {
     return StringPrintf("[ SpectrumPtexTexture %s ]", BaseToString());
 }
 
-Float FloatPtexTexture::Evaluate(TextureEvalContext ctx) const {
+PBRT_CPU_GPU Float FloatPtexTexture::Evaluate(TextureEvalContext ctx) const {
 #ifdef PBRT_IS_GPU_CODE
     LOG_FATAL("Ptex not supported with GPU renderer");
     return 0;
@@ -709,7 +709,7 @@ Float FloatPtexTexture::Evaluate(TextureEvalContext ctx) const {
 #endif
 }
 
-SampledSpectrum SpectrumPtexTexture::Evaluate(TextureEvalContext ctx,
+PBRT_CPU_GPU SampledSpectrum SpectrumPtexTexture::Evaluate(TextureEvalContext ctx,
                                               SampledWavelengths lambda) const {
 #ifdef PBRT_IS_GPU_CODE
     LOG_FATAL("Ptex not supported with GPU renderer");
@@ -989,15 +989,23 @@ WrinkledTexture *WrinkledTexture::Create(const Transform &renderFromTexture,
 
 #if defined(PBRT_BUILD_GPU_RENDERER)
 
+// mipmaps were not fully supported in HIP
+// we use just the base resolution for now
+#ifdef __HIPCC__
+using TextureArray = cudaArray_t;
+#else
+using TextureArray = cudaMipmappedArray_t;
+#endif
+
 struct LuminanceTextureCacheItem {
-    cudaMipmappedArray_t mipArray;
+    TextureArray texArray;
     cudaTextureReadMode readMode;
     int nMIPMapLevels;
     bool originallySingleChannel;
 };
 
 struct RGBTextureCacheItem {
-    cudaMipmappedArray_t mipArray;
+    TextureArray texArray;
     cudaTextureReadMode readMode;
     int nMIPMapLevels;
     const RGBColorSpace *colorSpace;
@@ -1009,10 +1017,10 @@ static std::map<std::string, RGBTextureCacheItem> rgbTextureCache;
 
 STAT_MEMORY_COUNTER("Memory/ImageTextures", gpuImageTextureBytes);
 
-static cudaMipmappedArray_t createSingleChannelTextureArray(
+static TextureArray createSingleChannelTextureArray(
     const Image &image, const RGBColorSpace *colorSpace, int *nMIPMapLevels) {
     CHECK_EQ(1, image.NChannels());
-    cudaMipmappedArray_t mipArray;
+    TextureArray texArray;
 
     cudaChannelFormatDesc channelDesc;
     switch (image.Format()) {
@@ -1036,13 +1044,37 @@ static cudaMipmappedArray_t createSingleChannelTextureArray(
     const Image &baseImage = mipmap.GetLevel(0);
     cudaExtent extent =
         make_cudaExtent(baseImage.Resolution().x, baseImage.Resolution().y, 0);
-    CUDA_CHECK(cudaMallocMipmappedArray(&mipArray, &channelDesc, extent, mipmap.Levels(),
+
+#ifdef __HIPCC__
+    int pitch;
+    switch (image.Format()) {
+    case PixelFormat::U256:
+        pitch = baseImage.Resolution().x * sizeof(uint8_t);
+        break;
+    case PixelFormat::Half:
+        pitch = baseImage.Resolution().x * sizeof(Half);
+        break;
+    case PixelFormat::Float:
+        pitch = baseImage.Resolution().x * sizeof(float);
+        break;
+    default:
+        LOG_FATAL("Unhandled PixelFormat");
+    }
+
+    gpuImageTextureBytes += pitch * baseImage.Resolution().y;
+
+    CUDA_CHECK(cudaMallocArray(&texArray, &channelDesc, extent.width, extent.height));
+    CUDA_CHECK(cudaMemcpy2DToArray(texArray, /* offset */ 0, 0,
+                                  baseImage.RawPointer({0, 0}), pitch, pitch,
+                                  baseImage.Resolution().y, cudaMemcpyHostToDevice));
+#else
+    CUDA_CHECK(cudaMallocMipmappedArray(&texArray, &channelDesc, extent, mipmap.Levels(),
                                         0 /* flags */));
 
     for (int level = 0; level < mipmap.Levels(); ++level) {
         const Image &levelImage = mipmap.GetLevel(level);
         cudaArray_t levelArray;
-        CUDA_CHECK(cudaGetMipmappedArrayLevel(&levelArray, mipArray, level));
+        CUDA_CHECK(cudaGetMipmappedArrayLevel(&levelArray, texArray, level));
 
         int pitch;
         switch (image.Format()) {
@@ -1065,8 +1097,9 @@ static cudaMipmappedArray_t createSingleChannelTextureArray(
             levelArray, /* offset */ 0, 0, levelImage.RawPointer({0, 0}), pitch, pitch,
             levelImage.Resolution().y, cudaMemcpyHostToDevice));
     }
+#endif
 
-    return mipArray;
+    return texArray;
 }
 
 static cudaTextureAddressMode convertAddressMode(const std::string &mode) {
@@ -1107,7 +1140,7 @@ GPUSpectrumImageTexture *GPUSpectrumImageTexture::Create(
     ColorEncoding encoding = ColorEncoding::Get(encodingString, alloc);
 
     // These have to be initialized one way or another in the below
-    cudaMipmappedArray_t mipArray;
+    TextureArray texArray;
     int nMIPMapLevels = 0;
     cudaTextureReadMode readMode;
     const RGBColorSpace *colorSpace = nullptr;
@@ -1117,7 +1150,7 @@ GPUSpectrumImageTexture *GPUSpectrumImageTexture::Create(
     auto rgbIter = rgbTextureCache.find(filename);
     if (rgbIter != rgbTextureCache.end()) {
         LOG_VERBOSE("Found %s in RGB tex array cache!", filename);
-        mipArray = rgbIter->second.mipArray;
+        texArray = rgbIter->second.texArray;
         readMode = rgbIter->second.readMode;
         nMIPMapLevels = rgbIter->second.nMIPMapLevels;
         colorSpace = rgbIter->second.colorSpace;
@@ -1128,7 +1161,7 @@ GPUSpectrumImageTexture *GPUSpectrumImageTexture::Create(
         // GPUFloatImageTexture converted it to single channel
         if (lumIter != lumTextureCache.end() && lumIter->second.originallySingleChannel) {
             LOG_VERBOSE("Found %s in luminance tex array cache!", filename);
-            mipArray = lumIter->second.mipArray;
+            texArray = lumIter->second.texArray;
             readMode = lumIter->second.readMode;
             nMIPMapLevels = lumIter->second.nMIPMapLevels;
             colorSpace = RGBColorSpace::sRGB;
@@ -1162,14 +1195,35 @@ GPUSpectrumImageTexture *GPUSpectrumImageTexture::Create(
 
                         cudaExtent extent = make_cudaExtent(baseImage.Resolution().x,
                                                             baseImage.Resolution().y, 0);
-                        CUDA_CHECK(cudaMallocMipmappedArray(&mipArray, &channelDesc,
+#ifdef __HIPCC__
+                        std::vector<uint8_t> rgba(4 * baseImage.Resolution().x *
+                                                  baseImage.Resolution().y);
+                        size_t offset = 0;
+                        for (int y = 0; y < baseImage.Resolution().y; ++y)
+                            for (int x = 0; x < baseImage.Resolution().x; ++x) {
+                                for (int c = 0; c < 3; ++c)
+                                    rgba[offset++] =
+                                        ((uint8_t *)baseImage.RawPointer({x, y}))[c];
+                                rgba[offset++] = 255;
+                            }
+
+                        int pitch = baseImage.Resolution().x * 4 * sizeof(uint8_t);
+                        gpuImageTextureBytes += pitch * baseImage.Resolution().y;
+
+                        CUDA_CHECK(cudaMallocArray(&texArray, &channelDesc, extent.width,
+                                                  extent.height));
+                        CUDA_CHECK(cudaMemcpy2DToArray(
+                            texArray, /* offset */ 0, 0, rgba.data(), pitch, pitch,
+                            baseImage.Resolution().y, cudaMemcpyHostToDevice));
+#else
+                        CUDA_CHECK(cudaMallocMipmappedArray(&texArray, &channelDesc,
                                                             extent, mipmap.Levels(),
                                                             0 /* flags */));
                         for (int level = 0; level < mipmap.Levels(); ++level) {
                             const Image &levelImage = mipmap.GetLevel(level);
                             cudaArray_t levelArray;
                             CUDA_CHECK(
-                                cudaGetMipmappedArrayLevel(&levelArray, mipArray, level));
+                                cudaGetMipmappedArrayLevel(&levelArray, texArray, level));
 
                             std::vector<uint8_t> rgba(4 * levelImage.Resolution().x *
                                                       levelImage.Resolution().y);
@@ -1190,6 +1244,7 @@ GPUSpectrumImageTexture *GPUSpectrumImageTexture::Create(
                                 /* offset */ 0, 0, rgba.data(), pitch, pitch,
                                 levelImage.Resolution().y, cudaMemcpyHostToDevice));
                         }
+#endif
                         break;
                     }
                     case PixelFormat::Half: {
@@ -1198,7 +1253,28 @@ GPUSpectrumImageTexture *GPUSpectrumImageTexture::Create(
 
                         cudaExtent extent = make_cudaExtent(baseImage.Resolution().x,
                                                             baseImage.Resolution().y, 0);
-                        CUDA_CHECK(cudaMallocMipmappedArray(&mipArray, &channelDesc,
+#ifdef __HIPCC__
+                        std::vector<Half> rgba(4 * baseImage.Resolution().x *
+                                               baseImage.Resolution().y);
+                        size_t offset = 0;
+                        for (int y = 0; y < baseImage.Resolution().y; ++y)
+                            for (int x = 0; x < baseImage.Resolution().x; ++x) {
+                                for (int c = 0; c < 3; ++c)
+                                    rgba[offset++] =
+                                        Half(baseImage.GetChannel({x, y}, c));
+                                rgba[offset++] = Half(1.f);
+                            }
+
+                        int pitch = baseImage.Resolution().x * 4 * sizeof(Half);
+                        gpuImageTextureBytes += pitch * baseImage.Resolution().y;
+
+                        CUDA_CHECK(cudaMallocArray(&texArray, &channelDesc, extent.width,
+                                                  extent.height));
+                        CUDA_CHECK(cudaMemcpy2DToArray(
+                            texArray, /* offset */ 0, 0, rgba.data(), pitch, pitch,
+                            baseImage.Resolution().y, cudaMemcpyHostToDevice));
+#else
+                        CUDA_CHECK(cudaMallocMipmappedArray(&texArray, &channelDesc,
                                                             extent, mipmap.Levels(),
                                                             0 /* flags */));
 
@@ -1206,7 +1282,7 @@ GPUSpectrumImageTexture *GPUSpectrumImageTexture::Create(
                             const Image &levelImage = mipmap.GetLevel(level);
                             cudaArray_t levelArray;
                             CUDA_CHECK(
-                                cudaGetMipmappedArrayLevel(&levelArray, mipArray, level));
+                                cudaGetMipmappedArrayLevel(&levelArray, texArray, level));
 
                             std::vector<Half> rgba(4 * levelImage.Resolution().x *
                                                    levelImage.Resolution().y);
@@ -1228,6 +1304,7 @@ GPUSpectrumImageTexture *GPUSpectrumImageTexture::Create(
                                 /* offset */ 0, 0, rgba.data(), pitch, pitch,
                                 levelImage.Resolution().y, cudaMemcpyHostToDevice));
                         }
+#endif
                         break;
                     }
                     case PixelFormat::Float: {
@@ -1236,7 +1313,28 @@ GPUSpectrumImageTexture *GPUSpectrumImageTexture::Create(
 
                         cudaExtent extent = make_cudaExtent(baseImage.Resolution().x,
                                                             baseImage.Resolution().y, 0);
-                        CUDA_CHECK(cudaMallocMipmappedArray(&mipArray, &channelDesc,
+#ifdef __HIPCC__
+                        std::vector<float> rgba(4 * baseImage.Resolution().x *
+                                                baseImage.Resolution().y);
+
+                        size_t offset = 0;
+                        for (int y = 0; y < baseImage.Resolution().y; ++y)
+                            for (int x = 0; x < baseImage.Resolution().x; ++x) {
+                                for (int c = 0; c < 3; ++c)
+                                    rgba[offset++] = baseImage.GetChannel({x, y}, c);
+                                rgba[offset++] = 1.f;
+                            }
+
+                        int pitch = baseImage.Resolution().x * 4 * sizeof(float);
+                        gpuImageTextureBytes += pitch * baseImage.Resolution().y;
+
+                        CUDA_CHECK(cudaMallocArray(&texArray, &channelDesc, extent.width,
+                                                  extent.height));
+                        CUDA_CHECK(cudaMemcpy2DToArray(
+                            texArray, /* offset */ 0, 0, rgba.data(), pitch, pitch,
+                            baseImage.Resolution().y, cudaMemcpyHostToDevice));
+#else
+                        CUDA_CHECK(cudaMallocMipmappedArray(&texArray, &channelDesc,
                                                             extent, mipmap.Levels(),
                                                             0 /* flags */));
 
@@ -1244,7 +1342,7 @@ GPUSpectrumImageTexture *GPUSpectrumImageTexture::Create(
                             const Image &levelImage = mipmap.GetLevel(level);
                             cudaArray_t levelArray;
                             CUDA_CHECK(
-                                cudaGetMipmappedArrayLevel(&levelArray, mipArray, level));
+                                cudaGetMipmappedArrayLevel(&levelArray, texArray, level));
 
                             std::vector<float> rgba(4 * levelImage.Resolution().x *
                                                     levelImage.Resolution().y);
@@ -1265,6 +1363,7 @@ GPUSpectrumImageTexture *GPUSpectrumImageTexture::Create(
                                 /* offset */ 0, 0, rgba.data(), pitch, pitch,
                                 levelImage.Resolution().y, cudaMemcpyHostToDevice));
                         }
+#endif
                         break;
                     }
                     default:
@@ -1273,15 +1372,15 @@ GPUSpectrumImageTexture *GPUSpectrumImageTexture::Create(
 
                     textureCacheMutex.lock();
                     rgbTextureCache[filename] = RGBTextureCacheItem{
-                        mipArray, readMode, nMIPMapLevels, colorSpace};
+                        texArray, readMode, nMIPMapLevels, colorSpace};
                     textureCacheMutex.unlock();
                 } else if (image.NChannels() == 1) {
-                    mipArray = createSingleChannelTextureArray(image, colorSpace,
+                    texArray = createSingleChannelTextureArray(image, colorSpace,
                                                                &nMIPMapLevels);
 
                     textureCacheMutex.lock();
                     lumTextureCache[filename] = LuminanceTextureCacheItem{
-                        mipArray, readMode, nMIPMapLevels, true};
+                        texArray, readMode, nMIPMapLevels, true};
                     textureCacheMutex.unlock();
                     isSingleChannel = true;
                 } else {
@@ -1293,8 +1392,13 @@ GPUSpectrumImageTexture *GPUSpectrumImageTexture::Create(
     }
 
     cudaResourceDesc resDesc = {};
+#ifdef __HIPCC__
+    resDesc.resType = cudaResourceTypeArray;
+    resDesc.res.array.array = texArray;
+#else
     resDesc.resType = cudaResourceTypeMipmappedArray;
-    resDesc.res.mipmap.mipmap = mipArray;
+    resDesc.res.mipmap.mipmap = texArray;
+#endif
 
     cudaTextureDesc texDesc = {};
     texDesc.addressMode[0] = convertAddressMode(wrapString);
@@ -1302,6 +1406,7 @@ GPUSpectrumImageTexture *GPUSpectrumImageTexture::Create(
     texDesc.filterMode = filter == "point" ? cudaFilterModePoint : cudaFilterModeLinear;
     texDesc.readMode = readMode;
     texDesc.normalizedCoords = 1;
+#ifndef __HIPCC__
     texDesc.maxAnisotropy = Clamp(maxAniso, 1, 16);
     texDesc.maxMipmapLevelClamp = nMIPMapLevels - 1;
     texDesc.minMipmapLevelClamp = 0;
@@ -1309,6 +1414,7 @@ GPUSpectrumImageTexture *GPUSpectrumImageTexture::Create(
         (filter == "trilinear" || filter == "ewa" || filter == "EWA")
             ? cudaFilterModeLinear
             : cudaFilterModePoint;
+#endif
     texDesc.borderColor[0] = texDesc.borderColor[1] = texDesc.borderColor[2] =
         texDesc.borderColor[3] = 0.f;
     texDesc.sRGB = 1;
@@ -1356,7 +1462,7 @@ GPUFloatImageTexture *GPUFloatImageTexture::Create(
     std::string encodingString = parameters.GetOneString("encoding", defaultEncoding);
     ColorEncoding encoding = ColorEncoding::Get(encodingString, alloc);
 
-    cudaMipmappedArray_t mipArray;
+    TextureArray texArray;
     int nMIPMapLevels = 0;
     cudaTextureReadMode readMode;
 
@@ -1364,7 +1470,7 @@ GPUFloatImageTexture *GPUFloatImageTexture::Create(
     auto iter = lumTextureCache.find(filename);
     if (iter != lumTextureCache.end()) {
         LOG_VERBOSE("Found %s in luminance tex array cache!", filename);
-        mipArray = iter->second.mipArray;
+        texArray = iter->second.texArray;
         readMode = iter->second.readMode;
         nMIPMapLevels = iter->second.nMIPMapLevels;
         textureCacheMutex.unlock();
@@ -1407,19 +1513,24 @@ GPUFloatImageTexture *GPUFloatImageTexture::Create(
                           image.NChannels());
         }
 
-        mipArray = createSingleChannelTextureArray(image, colorSpace, &nMIPMapLevels);
+        texArray = createSingleChannelTextureArray(image, colorSpace, &nMIPMapLevels);
         readMode = (image.Format() == PixelFormat::U256) ? cudaReadModeNormalizedFloat
                                                          : cudaReadModeElementType;
 
         textureCacheMutex.lock();
         lumTextureCache[filename] =
-            LuminanceTextureCacheItem{mipArray, readMode, nMIPMapLevels, !convertedImage};
+            LuminanceTextureCacheItem{texArray, readMode, nMIPMapLevels, !convertedImage};
         textureCacheMutex.unlock();
     }
 
     cudaResourceDesc resDesc = {};
+#ifdef __HIPCC__
+    resDesc.resType = cudaResourceTypeArray;
+    resDesc.res.array.array = texArray;
+#else
     resDesc.resType = cudaResourceTypeMipmappedArray;
-    resDesc.res.mipmap.mipmap = mipArray;
+    resDesc.res.mipmap.mipmap = texArray;
+#endif
 
     cudaTextureDesc texDesc = {};
     texDesc.addressMode[0] = convertAddressMode(wrapString);
@@ -1427,6 +1538,7 @@ GPUFloatImageTexture *GPUFloatImageTexture::Create(
     texDesc.filterMode = filter == "point" ? cudaFilterModePoint : cudaFilterModeLinear;
     texDesc.readMode = readMode;
     texDesc.normalizedCoords = 1;
+#ifndef __HIPCC__
     texDesc.maxAnisotropy = Clamp(maxAniso, 1, 16);
     texDesc.maxMipmapLevelClamp = nMIPMapLevels - 1;
     texDesc.minMipmapLevelClamp = 0;
@@ -1434,6 +1546,7 @@ GPUFloatImageTexture *GPUFloatImageTexture::Create(
         (filter == "trilinear" || filter == "ewa" || filter == "EWA")
             ? cudaFilterModeLinear
             : cudaFilterModePoint;
+#endif
     texDesc.borderColor[0] = texDesc.borderColor[1] = texDesc.borderColor[2] =
         texDesc.borderColor[3] = 0.f;
     texDesc.sRGB = 1;
@@ -1562,11 +1675,11 @@ SpectrumTexture SpectrumTexture::Create(const std::string &name,
 }
 
 // UniversalTextureEvaluator Method Definitions
-Float UniversalTextureEvaluator::operator()(FloatTexture tex, TextureEvalContext ctx) {
+PBRT_CPU_GPU Float UniversalTextureEvaluator::operator()(FloatTexture tex, TextureEvalContext ctx) {
     return tex.Evaluate(ctx);
 }
 
-SampledSpectrum UniversalTextureEvaluator::operator()(SpectrumTexture tex,
+PBRT_CPU_GPU SampledSpectrum UniversalTextureEvaluator::operator()(SpectrumTexture tex,
                                                       TextureEvalContext ctx,
                                                       SampledWavelengths lambda) {
     return tex.Evaluate(ctx, lambda);
diff --git a/src/pbrt/textures.h b/src/pbrt/textures.h
index 4a074e967..14b0bb433 100644
--- a/src/pbrt/textures.h
+++ b/src/pbrt/textures.h
@@ -24,6 +24,10 @@
 #include <mutex>
 #include <string>
 
+#if defined(__HIPCC__)
+#include <pbrt/util/hip_aliases.h>
+#endif
+
 namespace pbrt {
 
 // TextureEvalContext Definition
@@ -220,7 +224,7 @@ class TextureMapping2D : public TaggedPointer<UVMapping, SphericalMapping,
 };
 
 // TextureMapping2D Inline Functions
-inline TexCoord2D TextureMapping2D::Map(TextureEvalContext ctx) const {
+PBRT_CPU_GPU inline TexCoord2D TextureMapping2D::Map(TextureEvalContext ctx) const {
     auto map = [&](auto ptr) { return ptr->Map(ctx); };
     return Dispatch(map);
 }
@@ -260,7 +264,7 @@ class TextureMapping3D : public TaggedPointer<PointTransformMapping> {
     TexCoord3D Map(TextureEvalContext ctx) const;
 };
 
-inline TexCoord3D TextureMapping3D::Map(TextureEvalContext ctx) const {
+PBRT_CPU_GPU inline TexCoord3D TextureMapping3D::Map(TextureEvalContext ctx) const {
     auto map = [&](auto ptr) { return ptr->Map(ctx); };
     return Dispatch(map);
 }
@@ -578,7 +582,7 @@ class FloatImageTexture : public ImageTextureBase {
     PBRT_CPU_GPU
     Float Evaluate(TextureEvalContext ctx) const {
 #ifdef PBRT_IS_GPU_CODE
-        assert(!"Should not be called in GPU code");
+        CHECK(!"Should not be called in GPU code");
         return 0;
 #else
         TexCoord2D c = mapping.Map(ctx);
@@ -624,7 +628,7 @@ class SpectrumImageTexture : public ImageTextureBase {
     SpectrumType spectrumType;
 };
 
-#if defined(PBRT_BUILD_GPU_RENDERER) && defined(__NVCC__)
+#if defined(PBRT_BUILD_GPU_RENDERER) && (defined(__NVCC__) || defined(__HIPCC__))
 class GPUSpectrumImageTexture {
   public:
     GPUSpectrumImageTexture(std::string filename, TextureMapping2D mapping,
@@ -732,7 +736,7 @@ class GPUFloatImageTexture {
     bool invert;
 };
 
-#else  // PBRT_BUILD_GPU_RENDERER && __NVCC__
+#else  // PBRT_BUILD_GPU_RENDERER && (__NVCC__ ||  __HIPCC__)
 
 class GPUSpectrumImageTexture {
   public:
@@ -769,7 +773,7 @@ class GPUFloatImageTexture {
     std::string ToString() const { return "GPUFloatImageTexture"; }
 };
 
-#endif  // PBRT_BUILD_GPU_RENDERER && __NVCC__
+#endif  // PBRT_BUILD_GPU_RENDERER && (__NVCC__ ||  __HIPCC__)
 
 // MarbleTexture Definition
 class MarbleTexture {
@@ -1125,13 +1129,13 @@ class WrinkledTexture {
     Float omega;
 };
 
-inline Float FloatTexture::Evaluate(TextureEvalContext ctx) const {
+PBRT_CPU_GPU inline Float FloatTexture::Evaluate(TextureEvalContext ctx) const {
     auto eval = [&](auto ptr) { return ptr->Evaluate(ctx); };
     return Dispatch(eval);
 }
 
-inline SampledSpectrum SpectrumTexture::Evaluate(TextureEvalContext ctx,
-                                                 SampledWavelengths lambda) const {
+PBRT_CPU_GPU inline SampledSpectrum SpectrumTexture::Evaluate(
+    TextureEvalContext ctx, SampledWavelengths lambda) const {
     auto eval = [&](auto ptr) { return ptr->Evaluate(ctx, lambda); };
     return Dispatch(eval);
 }
diff --git a/src/pbrt/util/check.h b/src/pbrt/util/check.h
index e2b513a2e..2128504b0 100644
--- a/src/pbrt/util/check.h
+++ b/src/pbrt/util/check.h
@@ -18,10 +18,19 @@ namespace pbrt {
 
 void PrintStackTrace();
 
+#define EMPTY_CHECK \
+    do {            \
+    } while (false) /* swallow semicolon */
+
 #ifdef PBRT_IS_GPU_CODE
 
+#ifdef __HIP_DEVICE_COMPILE__
+#define CHECK(x) EMPTY_CHECK
+#define CHECK_IMPL(a, b, op) EMPTY_CHECK
+#else
 #define CHECK(x) assert(x)
 #define CHECK_IMPL(a, b, op) assert((a)op(b))
+#endif
 
 #define CHECK_EQ(a, b) CHECK_IMPL(a, b, ==)
 #define CHECK_NE(a, b) CHECK_IMPL(a, b, !=)
@@ -56,7 +65,7 @@ void PrintStackTrace();
 
 #ifdef PBRT_DEBUG_BUILD
 
-#define DCHECK(x) (CHECK(x))
+#define DCHECK(x) CHECK(x)
 #define DCHECK_EQ(a, b) CHECK_EQ(a, b)
 #define DCHECK_NE(a, b) CHECK_NE(a, b)
 #define DCHECK_GT(a, b) CHECK_GT(a, b)
@@ -66,10 +75,6 @@ void PrintStackTrace();
 
 #else
 
-#define EMPTY_CHECK \
-    do {            \
-    } while (false) /* swallow semicolon */
-
 // Use an empty check (rather than expanding the macros to nothing) to swallow the
 // semicolon at the end, and avoid empty if-statements.
 #define DCHECK(x) EMPTY_CHECK
diff --git a/src/pbrt/util/color.h b/src/pbrt/util/color.h
index c04536901..5d10c9911 100644
--- a/src/pbrt/util/color.h
+++ b/src/pbrt/util/color.h
@@ -476,18 +476,18 @@ class GammaColorEncoding {
     pstd::array<Float, 1024> inverseLUT;
 };
 
-inline void ColorEncoding::ToLinear(pstd::span<const uint8_t> vin,
+PBRT_CPU_GPU inline void ColorEncoding::ToLinear(pstd::span<const uint8_t> vin,
                                     pstd::span<Float> vout) const {
     auto tolin = [&](auto ptr) { return ptr->ToLinear(vin, vout); };
     Dispatch(tolin);
 }
 
-inline Float ColorEncoding::ToFloatLinear(Float v) const {
+PBRT_CPU_GPU inline Float ColorEncoding::ToFloatLinear(Float v) const {
     auto tfl = [&](auto ptr) { return ptr->ToFloatLinear(v); };
     return Dispatch(tfl);
 }
 
-inline void ColorEncoding::FromLinear(pstd::span<const Float> vin,
+PBRT_CPU_GPU inline void ColorEncoding::FromLinear(pstd::span<const Float> vin,
                                       pstd::span<uint8_t> vout) const {
     auto fl = [&](auto ptr) { return ptr->FromLinear(vin, vout); };
     Dispatch(fl);
diff --git a/src/pbrt/util/colorspace.cpp b/src/pbrt/util/colorspace.cpp
index aa6d5e5db..27994690a 100644
--- a/src/pbrt/util/colorspace.cpp
+++ b/src/pbrt/util/colorspace.cpp
@@ -3,19 +3,15 @@
 // SPDX: Apache-2.0
 
 #include <pbrt/util/colorspace.h>
-
-#ifdef PBRT_BUILD_GPU_RENDERER
-#include <pbrt/gpu/util.h>
-#endif
 #include <pbrt/options.h>
 
 namespace pbrt {
 
 #ifdef PBRT_BUILD_GPU_RENDERER
-PBRT_CONST RGBColorSpace *RGBColorSpace_sRGB;
-PBRT_CONST RGBColorSpace *RGBColorSpace_DCI_P3;
-PBRT_CONST RGBColorSpace *RGBColorSpace_Rec2020;
-PBRT_CONST RGBColorSpace *RGBColorSpace_ACES2065_1;
+PBRT_GPU __constant__ RGBColorSpace *RGBColorSpace_sRGB;
+PBRT_GPU __constant__ RGBColorSpace *RGBColorSpace_DCI_P3;
+PBRT_GPU __constant__ RGBColorSpace *RGBColorSpace_Rec2020;
+PBRT_GPU __constant__ RGBColorSpace *RGBColorSpace_ACES2065_1;
 #endif
 
 // RGBColorSpace Method Definitions
@@ -40,7 +36,7 @@ SquareMatrix<3> ConvertRGBColorSpace(const RGBColorSpace &from, const RGBColorSp
     return to.RGBFromXYZ * from.XYZFromRGB;
 }
 
-RGBSigmoidPolynomial RGBColorSpace::ToRGBCoeffs(RGB rgb) const {
+PBRT_CPU_GPU RGBSigmoidPolynomial RGBColorSpace::ToRGBCoeffs(RGB rgb) const {
     DCHECK(rgb.r >= 0 && rgb.g >= 0 && rgb.b >= 0);
     return (*rgbToSpectrumTable)(ClampZero(rgb));
 }
diff --git a/src/pbrt/util/colorspace.h b/src/pbrt/util/colorspace.h
index 4fc026a34..c015a8f28 100644
--- a/src/pbrt/util/colorspace.h
+++ b/src/pbrt/util/colorspace.h
@@ -11,6 +11,9 @@
 #include <pbrt/util/math.h>
 #include <pbrt/util/spectrum.h>
 #include <pbrt/util/vecmath.h>
+#ifdef PBRT_BUILD_GPU_RENDERER
+#include <pbrt/gpu/util.h>
+#endif
 
 #include <string>
 
@@ -66,10 +69,10 @@ class RGBColorSpace {
 };
 
 #ifdef PBRT_BUILD_GPU_RENDERER
-extern PBRT_CONST RGBColorSpace *RGBColorSpace_sRGB;
-extern PBRT_CONST RGBColorSpace *RGBColorSpace_DCI_P3;
-extern PBRT_CONST RGBColorSpace *RGBColorSpace_Rec2020;
-extern PBRT_CONST RGBColorSpace *RGBColorSpace_ACES2065_1;
+extern PBRT_GPU __constant__ RGBColorSpace *RGBColorSpace_sRGB;
+extern PBRT_GPU __constant__ RGBColorSpace *RGBColorSpace_DCI_P3;
+extern PBRT_GPU __constant__ RGBColorSpace *RGBColorSpace_Rec2020;
+extern PBRT_GPU __constant__ RGBColorSpace *RGBColorSpace_ACES2065_1;
 #endif
 
 SquareMatrix<3> ConvertRGBColorSpace(const RGBColorSpace &from, const RGBColorSpace &to);
diff --git a/src/pbrt/util/float.h b/src/pbrt/util/float.h
index ab9e6447d..a672331ee 100644
--- a/src/pbrt/util/float.h
+++ b/src/pbrt/util/float.h
@@ -16,8 +16,14 @@
 #include <string>
 
 #if defined(PBRT_BUILD_GPU_RENDERER) && defined(PBRT_IS_GPU_CODE)
+#if defined(__HIPCC__)
+#include <hip/hip_runtime.h>
+#include <hip/math_functions.h>
+#include <hip/hip_fp16.h>
+#else
 #include <cuda_fp16.h>
 #endif
+#endif
 
 namespace pbrt {
 
@@ -57,7 +63,7 @@ template <typename T>
 inline PBRT_CPU_GPU typename std::enable_if_t<std::is_floating_point_v<T>, bool> IsNaN(
     T v) {
 #ifdef PBRT_IS_GPU_CODE
-    return isnan(v);
+    return isnan((float)v);
 #else
     return std::isnan(v);
 #endif
@@ -199,9 +205,9 @@ inline constexpr Float gamma(int n) {
 inline PBRT_CPU_GPU Float AddRoundUp(Float a, Float b) {
 #ifdef PBRT_IS_GPU_CODE
 #ifdef PBRT_FLOAT_AS_DOUBLE
-    return __dadd_ru(a, b);
+    return __dadd_rn(a, b);
 #else
-    return __fadd_ru(a, b);
+    return __fadd_rn(a, b);
 #endif
 #else  // CPU
     return NextFloatUp(a + b);
@@ -210,9 +216,9 @@ inline PBRT_CPU_GPU Float AddRoundUp(Float a, Float b) {
 inline PBRT_CPU_GPU Float AddRoundDown(Float a, Float b) {
 #ifdef PBRT_IS_GPU_CODE
 #ifdef PBRT_FLOAT_AS_DOUBLE
-    return __dadd_rd(a, b);
+    return __dadd_rn(a, b);
 #else
-    return __fadd_rd(a, b);
+    return __fadd_rn(a, b);
 #endif
 #else  // CPU
     return NextFloatDown(a + b);
@@ -229,9 +235,9 @@ inline PBRT_CPU_GPU Float SubRoundDown(Float a, Float b) {
 inline PBRT_CPU_GPU Float MulRoundUp(Float a, Float b) {
 #ifdef PBRT_IS_GPU_CODE
 #ifdef PBRT_FLOAT_AS_DOUBLE
-    return __dmul_ru(a, b);
+    return __dmul_rn(a, b);
 #else
-    return __fmul_ru(a, b);
+    return __fmul_rn(a, b);
 #endif
 #else  // CPU
     return NextFloatUp(a * b);
@@ -241,9 +247,9 @@ inline PBRT_CPU_GPU Float MulRoundUp(Float a, Float b) {
 inline PBRT_CPU_GPU Float MulRoundDown(Float a, Float b) {
 #ifdef PBRT_IS_GPU_CODE
 #ifdef PBRT_FLOAT_AS_DOUBLE
-    return __dmul_rd(a, b);
+    return __dmul_rn(a, b);
 #else
-    return __fmul_rd(a, b);
+    return __fmul_rn(a, b);
 #endif
 #else  // CPU
     return NextFloatDown(a * b);
@@ -253,9 +259,9 @@ inline PBRT_CPU_GPU Float MulRoundDown(Float a, Float b) {
 inline PBRT_CPU_GPU Float DivRoundUp(Float a, Float b) {
 #ifdef PBRT_IS_GPU_CODE
 #ifdef PBRT_FLOAT_AS_DOUBLE
-    return __ddiv_ru(a, b);
+    return __ddiv_rn(a, b);
 #else
-    return __fdiv_ru(a, b);
+    return __fdiv_rn(a, b);
 #endif
 #else  // CPU
     return NextFloatUp(a / b);
@@ -265,9 +271,9 @@ inline PBRT_CPU_GPU Float DivRoundUp(Float a, Float b) {
 inline PBRT_CPU_GPU Float DivRoundDown(Float a, Float b) {
 #ifdef PBRT_IS_GPU_CODE
 #ifdef PBRT_FLOAT_AS_DOUBLE
-    return __ddiv_rd(a, b);
+    return __ddiv_rn(a, b);
 #else
-    return __fdiv_rd(a, b);
+    return __fdiv_rn(a, b);
 #endif
 #else  // CPU
     return NextFloatDown(a / b);
@@ -277,9 +283,9 @@ inline PBRT_CPU_GPU Float DivRoundDown(Float a, Float b) {
 inline PBRT_CPU_GPU Float SqrtRoundUp(Float a) {
 #ifdef PBRT_IS_GPU_CODE
 #ifdef PBRT_FLOAT_AS_DOUBLE
-    return __dsqrt_ru(a);
+    return __dsqrt_rn(a);
 #else
-    return __fsqrt_ru(a);
+    return __fsqrt_rn(a);
 #endif
 #else  // CPU
     return NextFloatUp(std::sqrt(a));
@@ -289,9 +295,9 @@ inline PBRT_CPU_GPU Float SqrtRoundUp(Float a) {
 inline PBRT_CPU_GPU Float SqrtRoundDown(Float a) {
 #ifdef PBRT_IS_GPU_CODE
 #ifdef PBRT_FLOAT_AS_DOUBLE
-    return __dsqrt_rd(a);
+    return __dsqrt_rn(a);
 #else
-    return __fsqrt_rd(a);
+    return __fsqrt_rn(a);
 #endif
 #else  // CPU
     return std::max<Float>(0, NextFloatDown(std::sqrt(a)));
@@ -301,9 +307,9 @@ inline PBRT_CPU_GPU Float SqrtRoundDown(Float a) {
 inline PBRT_CPU_GPU Float FMARoundUp(Float a, Float b, Float c) {
 #ifdef PBRT_IS_GPU_CODE
 #ifdef PBRT_FLOAT_AS_DOUBLE
-    return __fma_ru(a, b, c);  // FIXME: what to do here?
+    return __fma_rn(a, b, c);  // FIXME: what to do here?
 #else
-    return __fma_ru(a, b, c);
+    return __fma_rn(a, b, c);
 #endif
 #else  // CPU
     return NextFloatUp(FMA(a, b, c));
@@ -313,9 +319,9 @@ inline PBRT_CPU_GPU Float FMARoundUp(Float a, Float b, Float c) {
 inline PBRT_CPU_GPU Float FMARoundDown(Float a, Float b, Float c) {
 #ifdef PBRT_IS_GPU_CODE
 #ifdef PBRT_FLOAT_AS_DOUBLE
-    return __fma_rd(a, b, c);  // FIXME: what to do here?
+    return __fma_rn(a, b, c);  // FIXME: what to do here?
 #else
-    return __fma_rd(a, b, c);
+    return __fma_rn(a, b, c);
 #endif
 #else  // CPU
     return NextFloatDown(FMA(a, b, c));
diff --git a/src/pbrt/util/hash.h b/src/pbrt/util/hash.h
index 1c53b3a7b..84a0163c0 100644
--- a/src/pbrt/util/hash.h
+++ b/src/pbrt/util/hash.h
@@ -27,7 +27,7 @@ PBRT_CPU_GPU inline uint64_t MurmurHash64A(const unsigned char *key, size_t len,
 
     while (key != end) {
         uint64_t k;
-        std::memcpy(&k, key, sizeof(uint64_t));
+        memcpy(&k, key, sizeof(uint64_t));
         key += 8;
 
         k *= m;
@@ -67,7 +67,7 @@ PBRT_CPU_GPU inline uint64_t MurmurHash64A(const unsigned char *key, size_t len,
 // http://zimbry.blogspot.ch/2011/09/better-bit-mixing-improving-on.html
 PBRT_CPU_GPU inline uint64_t MixBits(uint64_t v);
 
-inline uint64_t MixBits(uint64_t v) {
+PBRT_CPU_GPU inline uint64_t MixBits(uint64_t v) {
     v ^= (v >> 31);
     v *= 0x7fb5d329728ea185;
     v ^= (v >> 27);
diff --git a/src/pbrt/util/hip_aliases.h b/src/pbrt/util/hip_aliases.h
new file mode 100644
index 000000000..fb5e01601
--- /dev/null
+++ b/src/pbrt/util/hip_aliases.h
@@ -0,0 +1,105 @@
+#ifndef PBRT_UTIL_HIP_ALIASES_H
+#define PBRT_UTIL_HIP_ALIASES_H
+
+#include <hip/hip_runtime.h>
+#include <hip/hip_runtime_api.h>
+#include <hip/hip_gl_interop.h>
+
+#define cudaError_t hipError_t
+#define cudaSuccess hipSuccess
+#define cudaErrorNotReady hipErrorNotReady
+#define cudaGetLastError hipGetLastError
+#define cudaGetErrorString hipGetErrorString
+
+#define cudaGetDevice hipGetDevice
+#define cudaSetDevice hipSetDevice
+#define cudaGetDeviceCount hipGetDeviceCount
+#define cudaDeviceGetAttribute hipDeviceGetAttribute
+#define cudaDeviceProp hipDeviceProp_t
+#define cudaDeviceGetLimit hipDeviceGetLimit
+#define cudaDeviceSetLimit hipDeviceSetLimit
+#define cudaDeviceSetCacheConfig hipDeviceSetCacheConfig
+#define cudaLimitStackSize hipLimitStackSize
+#define cudaLimitPrintfFifoSize hipLimitPrintfFifoSize
+#define cudaFuncCachePreferL1 hipFuncCachePreferL1
+#define cudaGetDeviceProperties hipGetDeviceProperties
+#define cudaDevAttrKernelExecTimeout hipDeviceAttributeKernelExecTimeout
+#define cudaDevAttrConcurrentManagedAccess hipDeviceAttributeConcurrentManagedAccess
+#define cudaDriverGetVersion hipDriverGetVersion
+#define cudaRuntimeGetVersion hipRuntimeGetVersion
+
+#define cudaGraphicsMapResources hipGraphicsMapResources
+#define cudaGraphicsUnmapResources hipGraphicsUnmapResources
+#define cudaGraphicsResourceGetMappedPointer hipGraphicsResourceGetMappedPointer
+#define cudaGraphicsResource hipGraphicsResource 
+#define cudaGraphicsGLRegisterBuffer hipGraphicsGLRegisterBuffer
+#define cudaGraphicsMapResources hipGraphicsMapResources
+#define cudaGraphicsMapFlagsWriteDiscard hipGraphicsRegisterFlagsWriteDiscard
+
+#define cudaGLGetDevices hipGLGetDevices
+#define cudaGLDeviceListAll hipGLDeviceListAll
+
+#define CUdeviceptr hipDeviceptr_t
+#define cudaMalloc hipMalloc
+#define cudaMallocHost hipHostMalloc
+#define cudaMallocManaged hipMallocManaged
+#define cudaFree hipFree
+
+#define cudaMemcpy hipMemcpy
+#define cudaMemcpyAsync hipMemcpyAsync
+#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
+#define cudaMemcpyToSymbol hipMemcpyToSymbol
+#define cudaMemcpyFromSymbol hipMemcpyFromSymbol
+#define cudaMemPrefetchAsync hipMemPrefetchAsync
+#define cudaMemset hipMemset
+#define cudaMemAdvise hipMemAdvise
+#define cudaMemAdviseSetReadMostly hipMemAdviseSetReadMostly
+#define cudaMemAdviseSetPreferredLocation hipMemAdviseSetPreferredLocation
+
+#define cudaArray_t hipArray_t
+#define cudaMallocArray hipMallocArray
+#define cudaMemcpy2DToArray hipMemcpy2DToArray
+#define cudaMipmappedArray_t  hipMipmappedArray_t
+#define cudaMallocMipmappedArray hipMallocMipmappedArray
+#define cudaGetMipmappedArrayLevel hipGetMipmappedArrayLevel
+
+#define cudaExtent hipExtent
+#define make_cudaExtent make_hipExtent
+
+#define cudaTextureObject_t hipTextureObject_t
+#define cudaCreateTextureObject hipCreateTextureObject
+#define cudaChannelFormatDesc hipChannelFormatDesc
+#define cudaCreateChannelDesc hipCreateChannelDesc
+#define cudaChannelFormatKindUnsigned hipChannelFormatKindUnsigned
+#define cudaChannelFormatKindFloat hipChannelFormatKindFloat
+#define cudaFilterModePoint hipFilterModePoint
+#define cudaFilterModeLinear hipFilterModeLinear
+#define cudaTextureDesc hipTextureDesc
+#define cudaTextureAddressMode hipTextureAddressMode
+#define cudaTextureReadMode hipTextureReadMode
+#define cudaAddressModeWrap hipAddressModeWrap
+#define cudaAddressModeClamp hipAddressModeClamp
+#define cudaAddressModeBorder hipAddressModeBorder
+#define cudaReadModeNormalizedFloat hipReadModeNormalizedFloat
+#define cudaReadModeElementType hipReadModeElementType
+#define cudaResourceDesc hipResourceDesc
+#define cudaResourceTypeArray hipResourceTypeArray
+#define cudaResourceTypeMipmappedArray hipResourceTypeMipmappedArray
+
+#define cudaEvent_t hipEvent_t
+#define cudaEventCreate hipEventCreate
+#define cudaEventRecord hipEventRecord
+#define cudaEventElapsedTime hipEventElapsedTime
+#define cudaEventSynchronize hipEventSynchronize
+#define cudaEventQuery hipEventQuery
+
+#define CUstream hipStream_t
+#define cudaStream_t hipStream_t
+#define cudaStreamCreate hipStreamCreate
+#define cudaStreamSynchronize hipStreamSynchronize
+#define cudaDeviceSynchronize hipDeviceSynchronize
+
+#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
+
+#endif  // PBRT_UTIL_HIP_ALIASES_H
diff --git a/src/pbrt/util/image.h b/src/pbrt/util/image.h
index 6771eeb9a..a54eec228 100644
--- a/src/pbrt/util/image.h
+++ b/src/pbrt/util/image.h
@@ -422,7 +422,7 @@ class Image {
 };
 
 // Image Inline Method Definitions
-inline void Image::SetChannel(Point2i p, int c, Float value) {
+PBRT_CPU_GPU inline void Image::SetChannel(Point2i p, int c, Float value) {
     // CHECK(!IsNaN(value));
     if (IsNaN(value)) {
 #ifndef PBRT_IS_GPU_CODE
diff --git a/src/pbrt/util/log.cpp b/src/pbrt/util/log.cpp
index df730c028..2064f1a07 100644
--- a/src/pbrt/util/log.cpp
+++ b/src/pbrt/util/log.cpp
@@ -327,7 +327,7 @@ std::string ToString(LogLevel level) {
     }
 }
 
-void Log(LogLevel level, const char *file, int line, const char *s) {
+PBRT_CPU_GPU void Log(LogLevel level, const char *file, int line, const char *s) {
 #ifdef PBRT_IS_GPU_CODE
     auto strlen = [](const char *ptr) {
         int len = 0;
@@ -389,7 +389,7 @@ void Log(LogLevel level, const char *file, int line, const char *s) {
 #endif
 }
 
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
 // warning #1305-D: function declared with "noreturn" does return
 #ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
 #pragma nv_diag_suppress 1305
@@ -398,7 +398,7 @@ void Log(LogLevel level, const char *file, int line, const char *s) {
 #endif
 #endif
 
-void LogFatal(LogLevel level, const char *file, int line, const char *s) {
+PBRT_CPU_GPU void LogFatal(LogLevel level, const char *file, int line, const char *s) {
 #ifdef PBRT_IS_GPU_CODE
     Log(LogLevel::Fatal, file, line, s);
     __threadfence();
diff --git a/src/pbrt/util/log.h b/src/pbrt/util/log.h
index 0cde178d5..45b34a178 100644
--- a/src/pbrt/util/log.h
+++ b/src/pbrt/util/log.h
@@ -44,16 +44,17 @@ extern FILE *logFile;
 PBRT_CPU_GPU
 void Log(LogLevel level, const char *file, int line, const char *s);
 
-PBRT_CPU_GPU [[noreturn]] void LogFatal(LogLevel level, const char *file, int line,
-                                        const char *s);
+template <typename... Args>
+PBRT_CPU_GPU inline void LogFatal(LogLevel level, const char *file, int line,
+                                  const char *fmt, Args &&...args);
+
+#ifndef __HIPCC__
+PBRT_CPU_GPU void LogFatal(LogLevel level, const char *file, int line, const char *s);
 
 template <typename... Args>
 PBRT_CPU_GPU inline void Log(LogLevel level, const char *file, int line, const char *fmt,
                              Args &&...args);
-
-template <typename... Args>
-PBRT_CPU_GPU [[noreturn]] inline void LogFatal(LogLevel level, const char *file, int line,
-                                               const char *fmt, Args &&...args);
+#endif
 
 #define TO_STRING(x) TO_STRING2(x)
 #define TO_STRING2(x) #x
@@ -62,6 +63,23 @@ PBRT_CPU_GPU [[noreturn]] inline void LogFatal(LogLevel level, const char *file,
 
 extern __constant__ LogLevel LOGGING_LogLevelGPU;
 
+// printing may cause hang in the device code
+#ifdef __HIP_DEVICE_COMPILE__
+
+#define LOG_VERBOSE(...) \
+    do {                 \
+    } while (false) /* swallow semicolon */
+
+#define LOG_ERROR(...) \
+    do {               \
+    } while (false) /* swallow semicolon */
+
+#define LOG_FATAL(...) \
+    do {               \
+    } while (false) /* swallow semicolon */
+
+#else
+
 #define LOG_VERBOSE(...)                               \
     (pbrt::LogLevel::Verbose >= LOGGING_LogLevelGPU && \
      (pbrt::Log(LogLevel::Verbose, __FILE__, __LINE__, __VA_ARGS__), true))
@@ -73,6 +91,8 @@ extern __constant__ LogLevel LOGGING_LogLevelGPU;
 #define LOG_FATAL(...) \
     pbrt::LogFatal(pbrt::LogLevel::Fatal, __FILE__, __LINE__, __VA_ARGS__)
 
+#endif
+
 #else
 
 // Logging Macros
@@ -96,9 +116,9 @@ extern __constant__ LogLevel LOGGING_LogLevelGPU;
 namespace pbrt {
 
 template <typename... Args>
-inline void Log(LogLevel level, const char *file, int line, const char *fmt,
-                Args &&...args) {
-#ifdef PBRT_IS_GPU_CODE
+PBRT_CPU_GPU inline void Log(LogLevel level, const char *file, int line, const char *fmt,
+                             Args &&...args) {
+#if defined(PBRT_IS_GPU_CODE)
     Log(level, file, line, fmt);  // just the format string #yolo
 #else
     std::string s = StringPrintf(fmt, std::forward<Args>(args)...);
@@ -107,9 +127,9 @@ inline void Log(LogLevel level, const char *file, int line, const char *fmt,
 }
 
 template <typename... Args>
-inline void LogFatal(LogLevel level, const char *file, int line, const char *fmt,
-                     Args &&...args) {
-#ifdef PBRT_IS_GPU_CODE
+PBRT_CPU_GPU inline void LogFatal(LogLevel level, const char *file, int line,
+                                  const char *fmt, Args &&...args) {
+#if defined(PBRT_IS_GPU_CODE) || defined(__HIPCC__)
     LogFatal(level, file, line, fmt);  // just the format string #yolo
 #else
     std::string s = StringPrintf(fmt, std::forward<Args>(args)...);
diff --git a/src/pbrt/util/math.h b/src/pbrt/util/math.h
index 48f50ddc8..1568a8a42 100644
--- a/src/pbrt/util/math.h
+++ b/src/pbrt/util/math.h
@@ -25,7 +25,7 @@
 
 namespace pbrt {
 
-#ifdef PBRT_IS_GPU_CODE
+#if defined(PBRT_IS_GPU_CODE) && defined(__CUDACC__)
 
 #define ShadowEpsilon 0.0001f
 #define Pi Float(3.14159265358979323846)
diff --git a/src/pbrt/util/noise.cpp b/src/pbrt/util/noise.cpp
index 77373bea3..646c26800 100644
--- a/src/pbrt/util/noise.cpp
+++ b/src/pbrt/util/noise.cpp
@@ -53,7 +53,7 @@ static PBRT_CONST int NoisePerm[2 * NoisePermSize] = {
 };
 
 // Noise Function Definitions
-Float Noise(Float x, Float y, Float z) {
+PBRT_CPU_GPU Float Noise(Float x, Float y, Float z) {
     // Compute noise cell coordinates and offsets
     // Avoid overflow when computing deltas if the coordinates are too large to store in
     // int32s.
@@ -87,11 +87,11 @@ Float Noise(Float x, Float y, Float z) {
     return Lerp(wz, y0, y1);
 }
 
-Float Noise(Point3f p) {
+PBRT_CPU_GPU Float Noise(Point3f p) {
     return Noise(p.x, p.y, p.z);
 }
 
-inline Float Grad(int x, int y, int z, Float dx, Float dy, Float dz) {
+PBRT_CPU_GPU inline Float Grad(int x, int y, int z, Float dx, Float dy, Float dz) {
     int h = NoisePerm[NoisePerm[NoisePerm[x] + y] + z];
     h &= 15;
     Float u = h < 8 || h == 12 || h == 13 ? dx : dy;
@@ -99,11 +99,11 @@ inline Float Grad(int x, int y, int z, Float dx, Float dy, Float dz) {
     return ((h & 1) ? -u : u) + ((h & 2) ? -v : v);
 }
 
-inline Float NoiseWeight(Float t) {
+PBRT_CPU_GPU inline Float NoiseWeight(Float t) {
     return 6 * Pow<5>(t) - 15 * Pow<4>(t) + 10 * Pow<3>(t);
 }
 
-Vector3f DNoise(Point3f p) {
+PBRT_CPU_GPU Vector3f DNoise(Point3f p) {
     Float delta = .01f;
     Float n = Noise(p);
     Point3f noiseDelta(Noise(p + Vector3f(delta, 0, 0)), Noise(p + Vector3f(0, delta, 0)),
@@ -111,7 +111,8 @@ Vector3f DNoise(Point3f p) {
     return (noiseDelta - Point3f(n, n, n)) / delta;
 }
 
-Float FBm(Point3f p, Vector3f dpdx, Vector3f dpdy, Float omega, int maxOctaves) {
+PBRT_CPU_GPU Float FBm(Point3f p, Vector3f dpdx, Vector3f dpdy, Float omega,
+                       int maxOctaves) {
     // Compute number of octaves for antialiased FBm
     Float len2 = std::max(LengthSquared(dpdx), LengthSquared(dpdy));
     Float n = Clamp(-1 - Log2(len2) / 2, 0, maxOctaves);
@@ -130,7 +131,7 @@ Float FBm(Point3f p, Vector3f dpdx, Vector3f dpdy, Float omega, int maxOctaves)
     return sum;
 }
 
-Float Turbulence(Point3f p, Vector3f dpdx, Vector3f dpdy, Float omega, int maxOctaves) {
+PBRT_CPU_GPU Float Turbulence(Point3f p, Vector3f dpdx, Vector3f dpdy, Float omega, int maxOctaves) {
     // Compute number of octaves for antialiased FBm
     Float len2 = std::max(LengthSquared(dpdx), LengthSquared(dpdy));
     Float n = Clamp(-1 - Log2(len2) / 2, 0, maxOctaves);
diff --git a/src/pbrt/util/parallel.h b/src/pbrt/util/parallel.h
index 48bd6c00b..827d0d3f1 100644
--- a/src/pbrt/util/parallel.h
+++ b/src/pbrt/util/parallel.h
@@ -213,7 +213,7 @@ class AtomicDouble {
     void Add(double v) {
 #if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600)
         atomicAdd(&value, v);
-#elif defined(__CUDA_ARCH__)
+#elif defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
         uint64_t old = bits, assumed;
 
         do {
@@ -235,7 +235,7 @@ class AtomicDouble {
     // AtomicDouble Private Data
 #if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600)
     double value;
-#elif defined(__CUDA_ARCH__)
+#elif (defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__))
     uint64_t bits;
 #else
     std::atomic<uint64_t> bits;
diff --git a/src/pbrt/util/progressreporter.h b/src/pbrt/util/progressreporter.h
index f018f1b46..a4028fa84 100644
--- a/src/pbrt/util/progressreporter.h
+++ b/src/pbrt/util/progressreporter.h
@@ -16,7 +16,11 @@
 #include <thread>
 
 #ifdef PBRT_BUILD_GPU_RENDERER
+#if defined(__HIPCC__)
+#include <pbrt/util/hip_aliases.h>
+#else
 #include <cuda_runtime.h>
+#endif
 #include <vector>
 #endif
 
diff --git a/src/pbrt/util/pstd.h b/src/pbrt/util/pstd.h
index 2d6065fef..52c3e240b 100644
--- a/src/pbrt/util/pstd.h
+++ b/src/pbrt/util/pstd.h
@@ -60,7 +60,7 @@ class array<T, 0> {
     array() = default;
 
     PBRT_CPU_GPU
-    void fill(const T &v) { assert(!"should never be called"); }
+    void fill(const T &v) { CHECK(!"should never be called"); }
 
     PBRT_CPU_GPU
     bool operator==(const array<T, 0> &a) const { return true; }
@@ -81,13 +81,13 @@ class array<T, 0> {
 
     PBRT_CPU_GPU
     T &operator[](size_t i) {
-        assert(!"should never be called");
+        CHECK(!"should never be called");
         static T t;
         return t;
     }
     PBRT_CPU_GPU
     const T &operator[](size_t i) const {
-        assert(!"should never be called");
+        CHECK(!"should never be called");
         static T t;
         return t;
     }
@@ -253,7 +253,7 @@ class optional {
     bool has_value() const { return set; }
 
   private:
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
     // Work-around NVCC bug
     PBRT_CPU_GPU
     T *ptr() { return reinterpret_cast<T *>(&optionalValue); }
diff --git a/src/pbrt/util/sampling.h b/src/pbrt/util/sampling.h
index e5caf7831..f0f844c1f 100644
--- a/src/pbrt/util/sampling.h
+++ b/src/pbrt/util/sampling.h
@@ -1300,7 +1300,7 @@ class PiecewiseLinear2D {
   private:
     using FloatStorage = pstd::vector<float>;
 
-#if !defined(_MSC_VER) && !defined(__CUDACC__)
+#if !defined(_MSC_VER) && !(defined(__CUDACC__) || defined(__HIPCC__))
     static constexpr size_t ArraySize = Dimension;
 #else
     static constexpr size_t ArraySize = (Dimension != 0) ? Dimension : 1;
diff --git a/src/pbrt/util/scattering.cpp b/src/pbrt/util/scattering.cpp
index 0353d1a1f..5dde4610a 100644
--- a/src/pbrt/util/scattering.cpp
+++ b/src/pbrt/util/scattering.cpp
@@ -7,7 +7,7 @@
 namespace pbrt {
 
 // BSSRDF Utility Functions
-Float FresnelMoment1(Float eta) {
+PBRT_CPU_GPU Float FresnelMoment1(Float eta) {
     Float eta2 = eta * eta, eta3 = eta2 * eta, eta4 = eta3 * eta, eta5 = eta4 * eta;
     if (eta < 1)
         return 0.45966f - 1.73965f * eta + 3.37668f * eta2 - 3.904945 * eta3 +
@@ -17,7 +17,7 @@ Float FresnelMoment1(Float eta) {
                1.27198f * eta4 + 0.12746f * eta5;
 }
 
-Float FresnelMoment2(Float eta) {
+PBRT_CPU_GPU Float FresnelMoment2(Float eta) {
     Float eta2 = eta * eta, eta3 = eta2 * eta, eta4 = eta3 * eta, eta5 = eta4 * eta;
     if (eta < 1) {
         return 0.27614f - 0.87350f * eta + 1.12077f * eta2 - 0.65095f * eta3 +
diff --git a/src/pbrt/util/taggedptr.h b/src/pbrt/util/taggedptr.h
index a430d575e..d9fb067c7 100644
--- a/src/pbrt/util/taggedptr.h
+++ b/src/pbrt/util/taggedptr.h
@@ -743,9 +743,14 @@ class TaggedPointer {
     template <typename T>
     PBRT_CPU_GPU TaggedPointer(T *ptr) {
         uint64_t iptr = reinterpret_cast<uint64_t>(ptr);
-        DCHECK_EQ(iptr & ptrMask, iptr);
         constexpr unsigned int type = TypeIndex<T>();
+#if defined(PBRT_IS_WINDOWS) && defined(__HIPCC__)
+        uint64_t aptr = (iptr & tagMask) >> 5ull;
+        bits = (iptr & ptrMask) | aptr | ((uint64_t)type << (tagShift + 2ull));
+#else
+        DCHECK_EQ(iptr & ptrMask, iptr);
         bits = iptr | ((uint64_t)type << tagShift);
+#endif
     }
 
     PBRT_CPU_GPU
@@ -769,7 +774,13 @@ class TaggedPointer {
     }
 
     PBRT_CPU_GPU
-    unsigned int Tag() const { return ((bits & tagMask) >> tagShift); }
+    unsigned int Tag() const {
+#if defined(PBRT_IS_WINDOWS) && defined(__HIPCC__)
+        return ((bits & tagMask) >> (tagShift + 2ull));
+#else
+        return ((bits & tagMask) >> tagShift);
+#endif
+    }
 
     template <typename T>
     PBRT_CPU_GPU bool Is() const {
@@ -825,10 +836,42 @@ class TaggedPointer {
     bool operator!=(const TaggedPointer &tp) const { return bits != tp.bits; }
 
     PBRT_CPU_GPU
-    void *ptr() { return reinterpret_cast<void *>(bits & ptrMask); }
+    void *ptr() {
+#if defined(PBRT_IS_WINDOWS) && defined(__HIPCC__)
+        unsigned int aptr = (bits >> tagShift) & 3;
+        uint64_t iptr = bits & ptrMask;
+        // lld crashes on Windows
+        // iptr |= (uint64_t)aptr << 60;
+        if (aptr == 1)
+            iptr |= 1ull << 60;
+        if (aptr == 2)
+            iptr |= 2ull << 60;
+        if (aptr == 3)
+            iptr |= 3ull << 60;
+        return reinterpret_cast<void *>(iptr);
+#else
+        return reinterpret_cast<void *>(bits & ptrMask);
+#endif
+    }
 
     PBRT_CPU_GPU
-    const void *ptr() const { return reinterpret_cast<const void *>(bits & ptrMask); }
+    const void *ptr() const {
+#if defined(PBRT_IS_WINDOWS) && defined(__HIPCC__)
+        unsigned int aptr = (bits >> tagShift) & 3;
+        uint64_t iptr = bits & ptrMask;
+        // lld crashes on Windows
+        // iptr |= (uint64_t)aptr << 60;
+        if (aptr == 1)
+            iptr |= 1ull << 60;
+        if (aptr == 2)
+            iptr |= 2ull << 60;
+        if (aptr == 3)
+            iptr |= 3ull << 60;
+        return reinterpret_cast<const void *>(iptr);
+#else
+        return reinterpret_cast<const void *>(bits & ptrMask);
+#endif
+    }
 
     template <typename F>
     PBRT_CPU_GPU decltype(auto) Dispatch(F &&func) {
@@ -862,7 +905,12 @@ class TaggedPointer {
     static_assert(sizeof(uintptr_t) <= sizeof(uint64_t),
                   "Expected pointer size to be <= 64 bits");
     // TaggedPointer Private Members
+#if defined(PBRT_IS_WINDOWS) && defined(__HIPCC__)
+    // top two bits are already used by the system on Windows
+    static constexpr int tagShift = 55;
+#else
     static constexpr int tagShift = 57;
+#endif
     static constexpr int tagBits = 64 - tagShift;
     static constexpr uint64_t tagMask = ((1ull << tagBits) - 1) << tagShift;
     static constexpr uint64_t ptrMask = ~tagMask;
diff --git a/src/pbrt/util/vecmath_test.cpp b/src/pbrt/util/vecmath_test.cpp
index 4569c6e6f..8ad27e1bf 100644
--- a/src/pbrt/util/vecmath_test.cpp
+++ b/src/pbrt/util/vecmath_test.cpp
@@ -104,6 +104,7 @@ TEST(Point2, InvertBilinear) {
     }
 }
 
+#if !defined(__HIPCC__)
 TEST(Vector, AngleBetween) {
     EXPECT_EQ(0, AngleBetween(Vector3f(1, 0, 0), Vector3f(1, 0, 0)));
 
@@ -176,6 +177,7 @@ TEST(Vector, AngleBetween) {
     Float abet = AngleBetween(a, b);
     EXPECT_EQ(abet, precise) << StringPrintf("vs naive %f", naive);
 }
+#endif
 
 TEST(Vector, CoordinateSystem) {
     // Duff et al 2017 footnote 1
diff --git a/src/pbrt/wavefront/integrator.cpp b/src/pbrt/wavefront/integrator.cpp
index c97d995e8..6729a7be5 100644
--- a/src/pbrt/wavefront/integrator.cpp
+++ b/src/pbrt/wavefront/integrator.cpp
@@ -9,7 +9,11 @@
 #include <pbrt/film.h>
 #include <pbrt/filters.h>
 #ifdef PBRT_BUILD_GPU_RENDERER
+#if defined(__HIPCC__)
+#include <pbrt/gpu/hiprt/aggregate.h>
+#else
 #include <pbrt/gpu/optix/aggregate.h>
+#endif
 #include <pbrt/gpu/memory.h>
 #endif  // PBRT_BUILD_GPU_RENDERER
 #include <pbrt/lights.h>
@@ -36,8 +40,12 @@
 #include <map>
 
 #ifdef PBRT_BUILD_GPU_RENDERER
+#if defined(__HIPCC__)
+#include <pbrt/util/hip_aliases.h>
+#else
 #include <cuda.h>
 #include <cuda_runtime.h>
+#endif
 #endif  // PBRT_BUILD_GPU_RENDERER
 
 namespace pbrt {
@@ -154,13 +162,28 @@ WavefrontPathIntegrator::WavefrontPathIntegrator(
     filter = film.GetFilter();
     sampler = scene.GetSampler();
 
+    // Compute number of scanlines to render per pass
+    Vector2i resolution = film.PixelBounds().Diagonal();
+    // TODO: make this configurable. Base it on the amount of GPU memory?
+    int maxSamples = 1024 * 1024;
+    scanlinesPerPass = std::max(1, maxSamples / resolution.x);
+    int nPasses = (resolution.y + scanlinesPerPass - 1) / scanlinesPerPass;
+    scanlinesPerPass = (resolution.y + nPasses - 1) / nPasses;
+    maxQueueSize = resolution.x * scanlinesPerPass;
+
     if (Options->useGPU) {
 #ifdef PBRT_BUILD_GPU_RENDERER
         CUDATrackedMemoryResource *mr =
             dynamic_cast<CUDATrackedMemoryResource *>(memoryResource);
         CHECK(mr);
+#ifdef __HIPCC__
+        aggregate = new HiprtAggregate(scene, mr, textures, shapeIndexToAreaLights, media,
+                                       namedMaterials, materials, maxQueueSize);
+#else
         aggregate = new OptiXAggregate(scene, mr, textures, shapeIndexToAreaLights, media,
                                        namedMaterials, materials);
+#endif
+
 #else
         LOG_FATAL("Options->useGPU was set without PBRT_BUILD_GPU_RENDERER enabled");
 #endif
@@ -224,14 +247,6 @@ WavefrontPathIntegrator::WavefrontPathIntegrator(
     }
 #endif  // PBRT_BUILD_GPU_RENDERER
 
-    // Compute number of scanlines to render per pass
-    Vector2i resolution = film.PixelBounds().Diagonal();
-    // TODO: make this configurable. Base it on the amount of GPU memory?
-    int maxSamples = 1024 * 1024;
-    scanlinesPerPass = std::max(1, maxSamples / resolution.x);
-    int nPasses = (resolution.y + scanlinesPerPass - 1) / scanlinesPerPass;
-    scanlinesPerPass = (resolution.y + nPasses - 1) / nPasses;
-    maxQueueSize = resolution.x * scanlinesPerPass;
     LOG_VERBOSE("Will render in %d passes %d scanlines per pass\n", nPasses,
                 scanlinesPerPass);
 
diff --git a/src/pbrt/wavefront/intersect.h b/src/pbrt/wavefront/intersect.h
index 310173a1e..690aa10e8 100644
--- a/src/pbrt/wavefront/intersect.h
+++ b/src/pbrt/wavefront/intersect.h
@@ -54,7 +54,7 @@ inline PBRT_CPU_GPU void EnqueueWorkAfterIntersection(
         intr.mediumInterface ? *intr.mediumInterface : MediumInterface(rayMedium);
 
     if (rayMedium) {
-        assert(mediumSampleQueue);
+        CHECK(mediumSampleQueue);
         PBRT_DBG("Enqueuing into medium sample queue\n");
         mediumSampleQueue->Push(MediumSampleWorkItem{r.ray,
                                                      r.depth,
diff --git a/src/pbrt/wavefront/wavefront.cpp b/src/pbrt/wavefront/wavefront.cpp
index 7482fdb84..c5d86cc68 100644
--- a/src/pbrt/wavefront/wavefront.cpp
+++ b/src/pbrt/wavefront/wavefront.cpp
@@ -66,7 +66,14 @@ void RenderWavefront(BasicScene &scene) {
     integrator->camera.InitMetadata(&metadata);
     metadata.renderTimeSeconds = seconds;
     metadata.samplesPerPixel = integrator->sampler.SamplesPerPixel();
+    // multithreading somehow hangs in HIP
+#ifdef __HIPCC__
+    if (Options->useGPU) DisableThreadPool();
+#endif
     integrator->film.WriteImage(metadata);
+#ifdef __HIPCC__
+    if (Options->useGPU) ReenableThreadPool();
+#endif
 }
 
 }  // namespace pbrt
diff --git a/src/pbrt/wavefront/workqueue.h b/src/pbrt/wavefront/workqueue.h
index 2f0827e4d..10e81f5a9 100644
--- a/src/pbrt/wavefront/workqueue.h
+++ b/src/pbrt/wavefront/workqueue.h
@@ -33,6 +33,9 @@
 #include <cuda/atomic>
 #endif
 
+#elif defined(__HIPCC__)
+#define PBRT_USE_LEGACY_CUDA_ATOMICS
+
 #endif  // __CUDACC__
 
 namespace pbrt {

From 1b53728b4acf4d2a168584a2975139c568f81f47 Mon Sep 17 00:00:00 2001
From: Aaryaman Vasishta <jem456.vasishta@gmail.com>
Date: Mon, 30 Dec 2024 13:42:06 +0000
Subject: [PATCH 2/3] Update LICENSE.txt

---
 LICENSE.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/LICENSE.txt b/LICENSE.txt
index acd97298b..37965801b 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -1,4 +1,4 @@
-All of the modifications made in commit ID dfd293c76b33ec03df55284283622868ea10aea9 are covered by the MIT license (below).
+All of the modifications made in commit ID 14582eebabcf1ca33323e6b6d6757a49d2306dbf are covered by the MIT license (below).
 The rest would be covered by the existing Apache license.
 
 ---------------------------------------------------------
@@ -228,4 +228,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
\ No newline at end of file
+THE SOFTWARE.

From 8d6e17f2028d5bf81c251d56cebb660afd688282 Mon Sep 17 00:00:00 2001
From: Daniel Meister <daniel.meister@amd.com>
Date: Thu, 20 Feb 2025 07:41:56 +0900
Subject: [PATCH 3/3] Reverting .gitignore

---
 .gitignore | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 4d3e0f77d..21b5c9b1f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,7 +2,6 @@
 .#*
 #*#
 src/build
-src/ext/hiprtSdk/hiprt
 .DS_Store
 .ipynb_checkpoints/
 build/