diff --git a/0001-MSVC-HIP.patch b/0001-MSVC-HIP.patch
new file mode 100644
index 000000000..75dfb6092
--- /dev/null
+++ b/0001-MSVC-HIP.patch
@@ -0,0 +1,24 @@
+From 56933471af36147e1032fbbc7912ca0088797b78 Mon Sep 17 00:00:00 2001
+From: Aaryaman Vasishta <aaryaman.vasishta@amd.com>
+Date: Sun, 10 Dec 2023 18:40:22 +0900
+Subject: [PATCH] Fix BitScanreverse by including intrin.h for MSVC
+
+---
+ common/compiler_msc.h | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/common/compiler_msc.h b/common/compiler_msc.h
+index 18cfa12..2eafd55 100644
+--- a/common/compiler_msc.h
++++ b/common/compiler_msc.h
+@@ -4,6 +4,7 @@
+ 
+ #include <stdint.h>
+ #include <stdlib.h> /* for _byteswap_*() */
++#include <intrin.h>
+ 
+ #define LIBEXPORT	__declspec(dllexport)
+ 
+-- 
+2.33.0.windows.2
+
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6c6a50166..cf4120218 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -11,11 +11,17 @@ if (NOT DEFINED PBRT_SOURCE_CODE)
   set (PBRT_SOURCE_CODE ${CMAKE_SOURCE_DIR})
 endif ()
 
+# HIP
+list (APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
+set (AMDGPU_TARGETS "gfx1100" CACHE STRING "List of specific machine types for library to target") # User variable
+
 # For sanitizers
 list (INSERT CMAKE_MODULE_PATH 0 "${PBRT_SOURCE_CODE}/cmake")
 
 # Configuration options
 
+option (PBRT_HIPRT "Use HIPRT as backend" OFF)
+option (PBRT_HIPRT_SDK "Use HIPRT SDK instead of HIPRT in HIP SDK" OFF)
 option (PBRT_FLOAT_AS_DOUBLE "Use 64-bit floats" OFF)
 option (PBRT_BUILD_NATIVE_EXECUTABLE "Build executable optimized for CPU architecture of system pbrt was built on" ON)
 option (PBRT_DBG_LOGGING "Enable (very verbose!) debug logging" OFF)
@@ -23,6 +29,7 @@ option (PBRT_NVTX "Insert NVTX annotations for NVIDIA Profiling and Debugging To
 option (PBRT_NVML "Use NVML for GPU performance measurement" OFF)
 option (PBRT_USE_PREGENERATED_RGB_TO_SPECTRUM_TABLES "Use pregenerated rgbspectrum_*.cpp files rather than running rgb2spec_opt to generate them at build time" OFF)
 set (PBRT_OPTIX7_PATH $ENV{PBRT_OPTIX7_PATH} CACHE PATH "Path to OptiX 7 SDK")
+set (PBRT_HIPRT_PATH $ENV{PBRT_HIPRT_PATH} CACHE PATH "Path to HIPRT SDK")
 set (PBRT_GPU_SHADER_MODEL "" CACHE STRING "")
 
 
@@ -64,7 +71,7 @@ function (CHECK_EXT NAME DIR HASH)
 endfunction ()
 
 check_ext ("OpenEXR" "openexr/OpenEXR" ca757f97033ae7cfcc6130f23c0e9ee86b108b07)
-check_ext ("OpenVDB" "openvdb/nanovdb" 414bed84c2fc22e188eac7b611aa85c7edd7a5a9)
+check_ext ("OpenVDB" "openvdb/nanovdb" 385516321b272c423e2a514b41bd21ce6ffdda72)
 check_ext ("Ptex" "ptex/src" 054047d02b9e06e690420b407114d2872435b953)
 check_ext ("double-conversion" "double-conversion/cmake" cc1f75a114aca8d2af69f73a5a959aecbab0e87a)
 check_ext ("filesystem" "filesystem/filesystem" c5f9de30142453eb3c6fe991e82dfc2583373116)
@@ -86,7 +93,7 @@ find_package(OpenGL REQUIRED)
 
 set_property (GLOBAL PROPERTY USE_FOLDERS ON)
 
-if (MSVC)
+if (WIN32)
   list (APPEND PBRT_DEFINITIONS "PBRT_IS_MSVC" "_CRT_SECURE_NO_WARNINGS")
   list (APPEND PBRT_DEFINITIONS "PBRT_IS_MSVC" "_ENABLE_EXTENDED_ALIGNED_STORAGE")
 endif ()
@@ -155,207 +162,268 @@ target_compile_options (
 add_library (pbrt_opt INTERFACE)
 
 #########################################
-## CUDA / OptiX
+## CUDA / OptiX / HIP / HIPRT
 
 add_library (cuda_build_configuration INTERFACE)
 
 include (CheckLanguage)
 
-check_language(CUDA)
-
-if (CMAKE_CUDA_COMPILER)
-  if (CUDA_VERSION_MAJOR LESS 11)
-    message (WARNING "pbrt-v4 requires CUDA version 11.0 or later but version ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR}\
-was found. GPU support is therefore disabled. If you have multiple versions\
-of CUDA installed, please update your PATH.")
+# Function to get the short path
+function(get_short_path input_path output_var)
+    # Create a command to get the short path using cmd.exe
+    execute_process(
+        COMMAND cmd /c for %A in ("${input_path}") do @echo %~sA
+        OUTPUT_VARIABLE short_path
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+    )
+    # Set the result to the output variable
+    set(${output_var} "${short_path}" PARENT_SCOPE)
+endfunction()
+
+if (PBRT_HIPRT)
+  find_package (hip REQUIRED)
+  set (PBRT_HIP_PATH $ENV{HIP_PATH})
+  get_short_path("${PBRT_HIP_PATH}" PBRT_HIP_PATH)
+  string(REGEX REPLACE "[/\\]$" "" PBRT_HIP_PATH "${PBRT_HIP_PATH}")
+
+  set (CUDA_COMPILE_LANGUAGE CXX)
+
+  if (WIN32)
+    if (PBRT_HIPRT_SDK)
+      set (PBRT_HIPRT_BINARY_PATH ${PBRT_HIPRT_PATH}/hiprt/win)
+    else ()
+      set (PBRT_HIPRT_PATH ${PBRT_HIP_PATH}/include)
+      set (PBRT_HIPRT_BINARY_PATH ${PBRT_HIP_PATH}/bin)
+    endif ()
+  else ()
+    set (PBRT_HIPRT_BINARY_PATH ${PBRT_HIPRT_PATH}/hiprt/linux64)
+  endif ()
+  link_directories (${PBRT_HIPRT_BINARY_PATH})
+  include_directories (${PBRT_HIPRT_PATH})
+  list (APPEND PBRT_DEFINITIONS "PBRT_BUILD_GPU_RENDERER")
+  list (APPEND PBRT_DEFINITIONS "GTEST_HAS_SEH=0")
+  list (APPEND PBRT_CXX_FLAGS "-std=c++17")
+  list (APPEND PBRT_CXX_FLAGS "-fgpu-rdc")
+  list (APPEND PBRT_LINK_FLAGS "-fgpu-rdc")
+  if (WIN32)
+	  list (APPEND PBRT_LINK_FLAGS "-fuse-ld=lld")
+	endif()
+  list (APPEND PBRT_LINK_FLAGS "--hip-link")
+  list (APPEND PBRT_LINK_FLAGS "-Xoffload-linker")
+  list (APPEND PBRT_LINK_FLAGS "--whole-archive")
+  set (PBRT_HIP_ENABLED ON)
+  file (GLOB PBRT_HIPRT_BITCODES ${PBRT_HIPRT_BINARY_PATH}/*.bc)
+  file (GLOB PBRT_HIPRT_FATBINS ${PBRT_HIPRT_BINARY_PATH}/*.hipfb)
+  file (COPY ${PBRT_HIPRT_BITCODES} DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
+  file (COPY ${PBRT_HIPRT_FATBINS} DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
+  get_filename_component(PBRT_HIPRT_BITCODE_FILENAME ${PBRT_HIPRT_BITCODES} NAME)
+  if (WIN32)
+    file (GLOB PBRT_HIPRT_DLLS ${PBRT_HIPRT_BINARY_PATH}/*.dll)
+    file (GLOB PBRT_HIPRT_LIB_NAME ${PBRT_HIPRT_BINARY_PATH}/*.lib)
+    file (COPY ${PBRT_HIPRT_DLLS} DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
+    get_filename_component(PBRT_HIPRT_LIB_NAME ${PBRT_HIPRT_LIB_NAME} NAME_WE)
   else ()
-    find_package (CUDA REQUIRED)
+    file (GLOB PBRT_HIPRT_LIB_NAME ${PBRT_HIPRT_BINARY_PATH}/*.so)
+    get_filename_component(PBRT_HIPRT_LIB_NAME ${PBRT_HIPRT_LIB_NAME} NAME_WE)
+    string(REGEX REPLACE "^lib" "" PBRT_HIPRT_LIB_NAME ${PBRT_HIPRT_LIB_NAME})
+  endif ()
+else ()
+  check_language(CUDA)
 
-    # This seems to be necessary starting with 3.17.1, but gives an error
-    # about 17 being an unsupported version earlier...
-    if (${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.17.0")
-        set (CMAKE_CUDA_STANDARD 17)
-    endif ()
+  if (CMAKE_CUDA_COMPILER)
+    if (CUDA_VERSION_MAJOR LESS 11)
+      message (WARNING "pbrt-v4 requires CUDA version 11.0 or later but version ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR}\
+  was found. GPU support is therefore disabled. If you have multiple versions\
+  of CUDA installed, please update your PATH.")
+    else ()
+      find_package (CUDA REQUIRED)
 
-    # https://github.com/VIAME/VIAME/blob/aa6c5f56a898b08e4da102c400b453e23952199c/CMakeLists.txt#L291
-    if (NOT CUDA_VERSION_PATCH)
-      if (CUDA_NVCC_EXECUTABLE AND
-          CUDA_NVCC_EXECUTABLE STREQUAL CMAKE_CUDA_COMPILER AND
-          CMAKE_CUDA_COMPILER_VERSION MATCHES [=[([0-9]+)\.([0-9]+)\.([0-9]+)]=])
-        set (CUDA_VERSION_PATCH "${CMAKE_MATCH_3}")
-      elseif (CUDA_NVCC_EXECUTABLE)
-        execute_process (COMMAND ${CUDA_NVCC_EXECUTABLE} "--version" OUTPUT_VARIABLE NOUT)
-        if (NOUT MATCHES [=[ V([0-9]+)\.([0-9]+)\.([0-9]+)]=])
-          set (CUDA_VERSION_PATCH "${CMAKE_MATCH_3}")
-        endif ()
+      # This seems to be necessary starting with 3.17.1, but gives an error
+      # about 17 being an unsupported version earlier...
+      if (${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.17.0")
+          set (CMAKE_CUDA_STANDARD 17)
       endif ()
-    endif ()
-
-    message (STATUS "Found CUDA: ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR}.${CUDA_VERSION_PATCH}")
-    if (CUDA_VERSION_MAJOR EQUAL 11 AND CUDA_VERSION_MINOR EQUAL 3 AND CUDA_VERSION_PATCH LESS 109)
-      message (SEND_ERROR "Unfortunately, pbrt-v4 triggers an internal compiler error in CUDA 11.3.0. Please either use CUDA 11.0-11.2 or 11.3.1 or later.")
-    endif ()
 
-    if ("${PBRT_OPTIX7_PATH}" STREQUAL "")
-        message (WARNING "Found CUDA but PBRT_OPTIX7_PATH is not set. Disabling GPU compilation.")
-    else ()
-        enable_language (CUDA)
-        list (APPEND PBRT_DEFINITIONS "PBRT_BUILD_GPU_RENDERER")
-        if (PBRT_NVTX)
-            list (APPEND PBRT_DEFINITIONS "NVTX")
-        endif ()
-        if (PBRT_NVML)
-            list (APPEND PBRT_DEFINITIONS "PBRT_USE_NVML")
+      # https://github.com/VIAME/VIAME/blob/aa6c5f56a898b08e4da102c400b453e23952199c/CMakeLists.txt#L291
+      if (NOT CUDA_VERSION_PATCH)
+        if (CUDA_NVCC_EXECUTABLE AND
+            CUDA_NVCC_EXECUTABLE STREQUAL CMAKE_CUDA_COMPILER AND
+            CMAKE_CUDA_COMPILER_VERSION MATCHES [=[([0-9]+)\.([0-9]+)\.([0-9]+)]=])
+          set (CUDA_VERSION_PATCH "${CMAKE_MATCH_3}")
+        elseif (CUDA_NVCC_EXECUTABLE)
+          execute_process (COMMAND ${CUDA_NVCC_EXECUTABLE} "--version" OUTPUT_VARIABLE NOUT)
+          if (NOUT MATCHES [=[ V([0-9]+)\.([0-9]+)\.([0-9]+)]=])
+            set (CUDA_VERSION_PATCH "${CMAKE_MATCH_3}")
+          endif ()
         endif ()
-        set (PBRT_CUDA_ENABLED ON)
-
-        # FIXME
-        include_directories (${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})  # for regular c++ compiles
-
-        # http://www.ssl.berkeley.edu/~jimm/grizzly_docs/SSL/opt/intel/cc/9.0/lib/locale/en_US/mcpcom.msg
-        target_compile_options (
-            pbrt_warnings
-            INTERFACE
-                #"$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xptxas --warn-on-double-precision-use>"
-                "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcudafe --diag_suppress=partial_override>"
-                "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcudafe --diag_suppress=virtual_function_decl_hidden>"
-                "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcudafe --diag_suppress=integer_sign_change>"
-                "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcudafe --diag_suppress=declared_but_not_referenced>"
-                # WAR invalid warnings about this with "if constexpr"
-                "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcudafe --diag_suppress=implicit_return_from_non_void_function>"
-        )
-
-        # Willie hears yeh..
-        string (APPEND CMAKE_CUDA_FLAGS " -Xnvlink -suppress-stack-size-warning")
-
-        target_compile_options (
-            cuda_build_configuration
-            INTERFACE
-                "$<$<COMPILE_LANGUAGE:CUDA>:--std=c++17;--use_fast_math;--expt-relaxed-constexpr;--extended-lambda;--forward-unknown-to-host-compiler>"
-                # The "$<NOT:$<BOOL:$<TARGET_PROPERTY:CUDA_PTX_COMPILATION>>>" part is to not add debugging symbols when generating PTX files for OptiX; see https://github.com/mmp/pbrt-v4/issues/69#issuecomment-715499748.
-                "$<$<COMPILE_LANGUAGE:CUDA>:$<IF:$<AND:$<CONFIG:Debug>,$<NOT:$<BOOL:$<TARGET_PROPERTY:CUDA_PTX_COMPILATION>>>>,-G;-g,-lineinfo;-maxrregcount;128>>"
-        )
-
-        if (PBRT_GPU_SHADER_MODEL STREQUAL "")
-            # https://wagonhelm.github.io/articles/2018-03/detecting-cuda-capability-with-cmake
-            # Get CUDA compute capability
-            set (CHECK_CUDA_OUTPUT_EXE ${CMAKE_BINARY_DIR}/checkcuda)
-            if (MSVC)
-                execute_process (COMMAND ${CMAKE_CUDA_COMPILER} -lcuda ${PBRT_SOURCE_CODE}/cmake/checkcuda.cu -ccbin ${CMAKE_CXX_COMPILER} -o ${CHECK_CUDA_OUTPUT_EXE}
-                                 RESULT_VARIABLE BUILD_CHECK_CUDA_RETURN_CODE)
-            else  ()
-                execute_process (COMMAND ${CMAKE_CUDA_COMPILER} -lcuda ${PBRT_SOURCE_CODE}/cmake/checkcuda.cu -o ${CHECK_CUDA_OUTPUT_EXE}
-                                 RESULT_VARIABLE BUILD_CHECK_CUDA_RETURN_CODE)
-            endif ()
+      endif ()
 
-            if (NOT ${BUILD_CHECK_CUDA_RETURN_CODE} EQUAL 0)
-                message (SEND_ERROR "Was unable to build checkcuda, consider manually setting PBRT_GPU_SHADER_MODEL")
-            endif ()
+      message (STATUS "Found CUDA: ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR}.${CUDA_VERSION_PATCH}")
+      if (CUDA_VERSION_MAJOR EQUAL 11 AND CUDA_VERSION_MINOR EQUAL 3 AND CUDA_VERSION_PATCH LESS 109)
+        message (SEND_ERROR "Unfortunately, pbrt-v4 triggers an internal compiler error in CUDA 11.3.0. Please either use CUDA 11.0-11.2 or 11.3.1 or later.")
+      endif ()
 
-            execute_process (COMMAND ${CHECK_CUDA_OUTPUT_EXE}
-                             RESULT_VARIABLE CUDA_RETURN_CODE
-                             OUTPUT_VARIABLE CHECK_CUDA_OUTPUT)
+      if ("${PBRT_OPTIX7_PATH}" STREQUAL "")
+          message (WARNING "Found CUDA but PBRT_OPTIX7_PATH is not set. Disabling GPU compilation.")
+      else ()
+          enable_language (CUDA)
+          list (APPEND PBRT_DEFINITIONS "PBRT_BUILD_GPU_RENDERER")
+          if (PBRT_NVTX)
+              list (APPEND PBRT_DEFINITIONS "NVTX")
+          endif ()
+          if (PBRT_NVML)
+              list (APPEND PBRT_DEFINITIONS "PBRT_USE_NVML")
+          endif ()
+          set (PBRT_CUDA_ENABLED ON)
+
+          # FIXME
+          include_directories (${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})  # for regular c++ compiles
+
+          # http://www.ssl.berkeley.edu/~jimm/grizzly_docs/SSL/opt/intel/cc/9.0/lib/locale/en_US/mcpcom.msg
+          target_compile_options (
+              pbrt_warnings
+              INTERFACE
+                  #"$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xptxas --warn-on-double-precision-use>"
+                  "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcudafe --diag_suppress=partial_override>"
+                  "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcudafe --diag_suppress=virtual_function_decl_hidden>"
+                  "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcudafe --diag_suppress=integer_sign_change>"
+                  "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcudafe --diag_suppress=declared_but_not_referenced>"
+                  # WAR invalid warnings about this with "if constexpr"
+                  "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcudafe --diag_suppress=implicit_return_from_non_void_function>"
+          )
 
-            if (NOT ${CUDA_RETURN_CODE} EQUAL 0)
-                message (SEND_ERROR ${CHECK_CUDA_OUTPUT})
-              else ()
-                set(ARCH "${CHECK_CUDA_OUTPUT}")
-                message (STATUS "Detected CUDA Architecture: ${ARCH}")
-                string (APPEND CMAKE_CUDA_FLAGS " --gpu-architecture=${ARCH}")
-            endif ()
-        else ()
-            set(ARCH "${PBRT_GPU_SHADER_MODEL}")
-            message (STATUS "Specified CUDA Architecture: ${ARCH}")
-            string (APPEND CMAKE_CUDA_FLAGS " --gpu-architecture=${ARCH}")
-        endif ()
+          # Willie hears yeh..
+          string (APPEND CMAKE_CUDA_FLAGS " -Xnvlink -suppress-stack-size-warning")
 
-        set (PBRT_CUDA_LIB cuda)
-        # optix
-        # FIXME
-        include_directories (${PBRT_OPTIX7_PATH}/include)
-
-        # find CUDA's bin2c executable
-        get_filename_component (cuda_compiler_bin "${CMAKE_CUDA_COMPILER}" DIRECTORY)
-        find_program (BIN2C
-                      NAMES bin2c
-                      PATHS ${cuda_compiler_bin}
-                      DOC "Path to the CUDA SDK bin2c executable."
-                      NO_DEFAULT_PATH)
-        if (NOT BIN2C)
-            message (FATAL_ERROR
-                     "bin2c not found:\n"
-                     "  CMAKE_CUDA_COMPILER='${CMAKE_CUDA_COMPILER}'\n"
-                     "  cuda_compiler_bin='${cuda_compiler_bin}'\n"
-            )
-        endif ()
+          target_compile_options (
+              cuda_build_configuration
+              INTERFACE
+                  "$<$<COMPILE_LANGUAGE:CUDA>:--std=c++17;--use_fast_math;--expt-relaxed-constexpr;--extended-lambda;--forward-unknown-to-host-compiler>"
+                  # The "$<NOT:$<BOOL:$<TARGET_PROPERTY:CUDA_PTX_COMPILATION>>>" part is to not add debugging symbols when generating PTX files for OptiX; see https://github.com/mmp/pbrt-v4/issues/69#issuecomment-715499748.
+                  "$<$<COMPILE_LANGUAGE:CUDA>:$<IF:$<AND:$<CONFIG:Debug>,$<NOT:$<BOOL:$<TARGET_PROPERTY:CUDA_PTX_COMPILATION>>>>,-G;-g,-lineinfo;-maxrregcount;128>>"
+          )
 
-        # this macro defines cmake rules that execute the following four steps:
-        # 1) compile the given cuda file ${cuda_file} to an intermediary PTX file
-        # 2) use the 'bin2c' tool (that comes with CUDA) to
-        #    create a second intermediary (.c-)file which defines a const string variable
-        #    (named '${c_var_name}') whose (constant) value is the PTX output
-        #    from the previous step.
-        # 3) compile the given .c file to an intermediary object file (why thus has
-        #    that PTX string 'embedded' as a global constant.
-        # 4) assign the name of the intermediary .o file to the cmake variable
-        #    'output_var', which can then be added to cmake targets.
-        macro (cuda_compile_and_embed output_var cuda_file lib_name)
-          add_library ("${lib_name}" OBJECT "${cuda_file}")
-          set_property (TARGET "${lib_name}" PROPERTY CUDA_PTX_COMPILATION ON)
-
-          # disable "extern declaration... is treated as a static definition" warning
-          if (CUDA_VERSION_MAJOR EQUAL 11 AND CUDA_VERSION_MINOR LESS 2)
-              target_compile_options ("${lib_name}" PRIVATE
-                                      -Xcudafe=--display_error_number -Xcudafe=--diag_suppress=3089)
+          if (PBRT_GPU_SHADER_MODEL STREQUAL "")
+              # https://wagonhelm.github.io/articles/2018-03/detecting-cuda-capability-with-cmake
+              # Get CUDA compute capability
+              set (CHECK_CUDA_OUTPUT_EXE ${CMAKE_BINARY_DIR}/checkcuda)
+              if (MSVC)
+                  execute_process (COMMAND ${CMAKE_CUDA_COMPILER} -lcuda ${PBRT_SOURCE_CODE}/cmake/checkcuda.cu -ccbin ${CMAKE_CXX_COMPILER} -o ${CHECK_CUDA_OUTPUT_EXE}
+                                  RESULT_VARIABLE BUILD_CHECK_CUDA_RETURN_CODE)
+              else  ()
+                  execute_process (COMMAND ${CMAKE_CUDA_COMPILER} -lcuda ${PBRT_SOURCE_CODE}/cmake/checkcuda.cu -o ${CHECK_CUDA_OUTPUT_EXE}
+                                  RESULT_VARIABLE BUILD_CHECK_CUDA_RETURN_CODE)
+              endif ()
+
+              if (NOT ${BUILD_CHECK_CUDA_RETURN_CODE} EQUAL 0)
+                  message (SEND_ERROR "Was unable to build checkcuda, consider manually setting PBRT_GPU_SHADER_MODEL")
+              endif ()
+
+              execute_process (COMMAND ${CHECK_CUDA_OUTPUT_EXE}
+                              RESULT_VARIABLE CUDA_RETURN_CODE
+                              OUTPUT_VARIABLE CHECK_CUDA_OUTPUT)
+
+              if (NOT ${CUDA_RETURN_CODE} EQUAL 0)
+                  message (SEND_ERROR ${CHECK_CUDA_OUTPUT})
+                else ()
+                  set(ARCH "${CHECK_CUDA_OUTPUT}")
+                  message (STATUS "Detected CUDA Architecture: ${ARCH}")
+                  string (APPEND CMAKE_CUDA_FLAGS " --gpu-architecture=${ARCH}")
+              endif ()
           else ()
-              target_compile_options ("${lib_name}" PRIVATE
-                                      -Xcudafe=--display_error_number -Xcudafe=--diag_suppress=20044)
+              set(ARCH "${PBRT_GPU_SHADER_MODEL}")
+              message (STATUS "Specified CUDA Architecture: ${ARCH}")
+              string (APPEND CMAKE_CUDA_FLAGS " --gpu-architecture=${ARCH}")
           endif ()
 
-          # CUDA integration in Visual Studio seems broken as even if "Use
-          # Host Preprocessor Definitions" is checked, the host preprocessor
-          # definitions are still not used when compiling device code.
-          # To work around that, define the macros using --define-macro to
-          # avoid CMake identifying those as macros and using the proper (but
-          # broken) way of specifying them.
-          if (${CMAKE_GENERATOR} MATCHES "^Visual Studio")
-            # As PBRT_DEBUG_BUILD is specified globally as a definition, we need to
-            # manually add it due to the bug mentioned earlier and due to it
-            # not being found in PBRT_DEFINITIONS.
-            if (CMAKE_BUILD_TYPE MATCHES Debug)
-                set (cuda_definitions "--define-macro=PBRT_DEBUG_BUILD")
-            endif ()
-            foreach (arg ${PBRT_DEFINITIONS})
-              list (APPEND cuda_definitions "--define-macro=${arg}")
-            endforeach ()
-            target_compile_options ("${lib_name}" PRIVATE ${cuda_definitions})
-          else ()
-            target_compile_definitions ("${lib_name}" PRIVATE ${PBRT_DEFINITIONS})
+          set (PBRT_CUDA_LIB cuda)
+          # optix
+          # FIXME
+          include_directories (${PBRT_OPTIX7_PATH}/include)
+
+          # find CUDA's bin2c executable
+          get_filename_component (cuda_compiler_bin "${CMAKE_CUDA_COMPILER}" DIRECTORY)
+          find_program (BIN2C
+                        NAMES bin2c
+                        PATHS ${cuda_compiler_bin}
+                        DOC "Path to the CUDA SDK bin2c executable."
+                        NO_DEFAULT_PATH)
+          if (NOT BIN2C)
+              message (FATAL_ERROR
+                      "bin2c not found:\n"
+                      "  CMAKE_CUDA_COMPILER='${CMAKE_CUDA_COMPILER}'\n"
+                      "  cuda_compiler_bin='${cuda_compiler_bin}'\n"
+              )
           endif ()
-          target_include_directories ("${lib_name}" PRIVATE src ${CMAKE_BINARY_DIR})
-          target_include_directories ("${lib_name}" SYSTEM PRIVATE ${NANOVDB_INCLUDE})
-          target_link_libraries ("${lib_name}" PRIVATE cuda_build_configuration pbrt_opt pbrt_warnings)
-          add_dependencies ("${lib_name}" pbrt_soa_generated)
-          set (c_var_name ${output_var})
-          set (embedded_file ${cuda_file}.ptx_embedded.c)
-          add_custom_command (
-            OUTPUT "${embedded_file}"
-            COMMAND ${CMAKE_COMMAND}
-              "-DBIN_TO_C_COMMAND=${BIN2C}"
-              "-DOBJECTS=$<TARGET_OBJECTS:${lib_name}>"
-              "-DVAR_NAME=${c_var_name}"
-              "-DOUTPUT=${embedded_file}"
-              -P ${CMAKE_CURRENT_SOURCE_DIR}/cmake/bin2c_wrapper.cmake
-            VERBATIM
-            DEPENDS "${lib_name}" $<TARGET_OBJECTS:${lib_name}>
-            COMMENT "Embedding PTX generated from ${cuda_file}"
-          )
-          set (${output_var} ${embedded_file})
-        endmacro ()
-    endif ()
-  endif()
-else ()
-    message (STATUS "CUDA not found")
+
+          # this macro defines cmake rules that execute the following four steps:
+          # 1) compile the given cuda file ${cuda_file} to an intermediary PTX file
+          # 2) use the 'bin2c' tool (that comes with CUDA) to
+          #    create a second intermediary (.c-)file which defines a const string variable
+          #    (named '${c_var_name}') whose (constant) value is the PTX output
+          #    from the previous step.
+          # 3) compile the given .c file to an intermediary object file (why thus has
+          #    that PTX string 'embedded' as a global constant.
+          # 4) assign the name of the intermediary .o file to the cmake variable
+          #    'output_var', which can then be added to cmake targets.
+          macro (cuda_compile_and_embed output_var cuda_file lib_name)
+            add_library ("${lib_name}" OBJECT "${cuda_file}")
+            set_property (TARGET "${lib_name}" PROPERTY CUDA_PTX_COMPILATION ON)
+
+            # disable "extern declaration... is treated as a static definition" warning
+            if (CUDA_VERSION_MAJOR EQUAL 11 AND CUDA_VERSION_MINOR LESS 2)
+                target_compile_options ("${lib_name}" PRIVATE
+                                        -Xcudafe=--display_error_number -Xcudafe=--diag_suppress=3089)
+            else ()
+                target_compile_options ("${lib_name}" PRIVATE
+                                        -Xcudafe=--display_error_number -Xcudafe=--diag_suppress=20044)
+            endif ()
+
+            # CUDA integration in Visual Studio seems broken as even if "Use
+            # Host Preprocessor Definitions" is checked, the host preprocessor
+            # definitions are still not used when compiling device code.
+            # To work around that, define the macros using --define-macro to
+            # avoid CMake identifying those as macros and using the proper (but
+            # broken) way of specifying them.
+            if (${CMAKE_GENERATOR} MATCHES "^Visual Studio")
+              # As PBRT_DEBUG_BUILD is specified globally as a definition, we need to
+              # manually add it due to the bug mentioned earlier and due to it
+              # not being found in PBRT_DEFINITIONS.
+              if (CMAKE_BUILD_TYPE MATCHES Debug)
+                  set (cuda_definitions "--define-macro=PBRT_DEBUG_BUILD")
+              endif ()
+              foreach (arg ${PBRT_DEFINITIONS})
+                list (APPEND cuda_definitions "--define-macro=${arg}")
+              endforeach ()
+              target_compile_options ("${lib_name}" PRIVATE ${cuda_definitions})
+            else ()
+              target_compile_definitions ("${lib_name}" PRIVATE ${PBRT_DEFINITIONS})
+            endif ()
+            target_include_directories ("${lib_name}" PRIVATE src ${CMAKE_BINARY_DIR})
+            target_include_directories ("${lib_name}" SYSTEM PRIVATE ${NANOVDB_INCLUDE})
+            target_link_libraries ("${lib_name}" PRIVATE cuda_build_configuration pbrt_opt pbrt_warnings)
+            add_dependencies ("${lib_name}" pbrt_soa_generated)
+            set (c_var_name ${output_var})
+            set (embedded_file ${cuda_file}.ptx_embedded.c)
+            add_custom_command (
+              OUTPUT "${embedded_file}"
+              COMMAND ${CMAKE_COMMAND}
+                "-DBIN_TO_C_COMMAND=${BIN2C}"
+                "-DOBJECTS=$<TARGET_OBJECTS:${lib_name}>"
+                "-DVAR_NAME=${c_var_name}"
+                "-DOUTPUT=${embedded_file}"
+                -P ${CMAKE_CURRENT_SOURCE_DIR}/cmake/bin2c_wrapper.cmake
+              VERBATIM
+              DEPENDS "${lib_name}" $<TARGET_OBJECTS:${lib_name}>
+              COMMENT "Embedding PTX generated from ${cuda_file}"
+            )
+            set (${output_var} ${embedded_file})
+          endmacro ()
+      endif ()
+    endif()
+  else ()
+      message (STATUS "CUDA not found")
+  endif ()
 endif ()
 
 ###########################################################################
@@ -363,10 +431,12 @@ endif ()
 
 include (CheckCXXCompilerFlag)
 
-check_cxx_compiler_flag ("-march=native" COMPILER_SUPPORTS_MARCH_NATIVE)
-if (COMPILER_SUPPORTS_MARCH_NATIVE AND PBRT_BUILD_NATIVE_EXECUTABLE)
-    target_compile_options (pbrt_opt INTERFACE
-          "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcompiler >-march=native")
+if (NOT PBRT_HIP_ENABLED)
+  check_cxx_compiler_flag ("-march=native" COMPILER_SUPPORTS_MARCH_NATIVE)
+  if (COMPILER_SUPPORTS_MARCH_NATIVE AND PBRT_BUILD_NATIVE_EXECUTABLE)
+      target_compile_options (pbrt_opt INTERFACE
+            "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcompiler >-march=native")
+  endif ()
 endif ()
 
 if (CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
@@ -692,85 +762,72 @@ SET (PBRT_UTIL_SOURCE_HEADERS
   src/pbrt/util/vecmath.h
   )
 
-if (PBRT_CUDA_ENABLED)
-  set (PBRT_GPU_SOURCE
-    src/pbrt/gpu/optix/aggregate.cpp
-    src/pbrt/gpu/optix/denoiser.cpp
-    src/pbrt/gpu/memory.cpp
-    src/pbrt/gpu/util.cpp
-  )
-  set (PBRT_GPU_SOURCE_HEADERS
-    src/pbrt/gpu/optix/aggregate.h
-    src/pbrt/gpu/cudagl.h
-    src/pbrt/gpu/optix/denoiser.h
-    src/pbrt/gpu/memory.h
-    src/pbrt/gpu/optix/optix.h
-    src/pbrt/gpu/util.h
-  )
-
-  set_source_files_properties (
-   src/pbrt/bsdf.cpp
-   src/pbrt/bssrdf.cpp
-   src/pbrt/bxdfs.cpp
-   src/pbrt/cameras.cpp
-   src/pbrt/film.cpp
-   src/pbrt/filters.cpp
-#   src/pbrt/genscene.cpp
-   src/pbrt/interaction.cpp
-   src/pbrt/lights.cpp
-   src/pbrt/lightsamplers.cpp
-   src/pbrt/materials.cpp
-#   src/pbrt/media.cpp
-   src/pbrt/options.cpp
-#   src/pbrt/paramdict.cpp
-#   src/pbrt/parser.cpp
-   src/pbrt/pbrt.cpp
-   src/pbrt/samplers.cpp
-   src/pbrt/shapes.cpp
-   src/pbrt/textures.cpp
-
-   src/pbrt/util/bluenoise.cpp
-   src/pbrt/util/check.cpp
-   src/pbrt/util/color.cpp
-   src/pbrt/util/colorspace.cpp
-   src/pbrt/util/error.cpp
-#   src/pbrt/util/file.cpp
-#   src/pbrt/util/float.cpp
-#   src/pbrt/util/image.cpp
-   src/pbrt/util/log.cpp
-#   src/pbrt/util/loopsubdiv.cpp
-   src/pbrt/util/lowdiscrepancy.cpp
-   src/pbrt/util/math.cpp
-#   src/pbrt/util/memory.cpp
-   src/pbrt/util/mesh.cpp
-#   src/pbrt/util/mipmap.cpp
-   src/pbrt/util/noise.cpp
-#   src/pbrt/util/parallel.cpp
-   src/pbrt/util/pmj02tables.cpp
-   src/pbrt/util/primes.cpp
-#   src/pbrt/util/print.cpp
-#   src/pbrt/util/progressreporter.cpp
-   src/pbrt/util/pstd.cpp
-   src/pbrt/util/rng.cpp
-   src/pbrt/util/sampling.cpp
-   src/pbrt/util/scattering.cpp
-   src/pbrt/util/sobolmatrices.cpp
-   src/pbrt/util/spectrum.cpp
-   src/pbrt/util/stats.cpp
-#   src/pbrt/util/stbimage.cpp
-#   src/pbrt/util/string.cpp
-   src/pbrt/util/transform.cpp
-   src/pbrt/util/vecmath.cpp
+if (PBRT_CUDA_ENABLED OR PBRT_HIP_ENABLED)
+  if (PBRT_HIP_ENABLED)
+    set (PBRT_GPU_SOURCE src/pbrt/gpu/hiprt/aggregate.cpp src/pbrt/gpu/memory.cpp src/pbrt/gpu/util.cpp)
+    set (PBRT_GPU_SOURCE_HEADERS src/pbrt/gpu/hiprt/aggregate.h src/pbrt/gpu/common.h src/pbrt/gpu/cudagl.h src/pbrt/gpu/memory.h src/pbrt/gpu/util.h)
+  else ()
+    set (PBRT_GPU_SOURCE src/pbrt/gpu/optix/aggregate.cpp src/pbrt/gpu/optix/denoiser.cpp src/pbrt/gpu/memory.cpp src/pbrt/gpu/util.cpp)
+    set (PBRT_GPU_SOURCE_HEADERS src/pbrt/gpu/optix/aggregate.h src/pbrt/gpu/optix/denoiser.h src/pbrt/gpu/common.h src/pbrt/gpu/cudagl.h src/pbrt/gpu/memory.h src/pbrt/gpu/util.h)
+  endif ()
 
+set(PBRT_GPU_SOURCE_ALL
+    src/pbrt/bsdf.cpp
+    src/pbrt/bssrdf.cpp
+    src/pbrt/bxdfs.cpp
+    src/pbrt/cameras.cpp
+    src/pbrt/film.cpp
+    src/pbrt/filters.cpp
+    # src/pbrt/genscene.cpp
+    src/pbrt/interaction.cpp
+    src/pbrt/lights.cpp
+    src/pbrt/lightsamplers.cpp
+    src/pbrt/materials.cpp
+    # src/pbrt/media.cpp
+    src/pbrt/options.cpp
+    # src/pbrt/paramdict.cpp src/pbrt/parser.cpp
+    src/pbrt/pbrt.cpp
+    src/pbrt/samplers.cpp
+    src/pbrt/shapes.cpp
+    src/pbrt/textures.cpp
+    src/pbrt/util/bluenoise.cpp
+    src/pbrt/util/check.cpp
+    src/pbrt/util/color.cpp
+    src/pbrt/util/colorspace.cpp
+    src/pbrt/util/error.cpp
+    # src/pbrt/util/file.cpp src/pbrt/util/float.cpp src/pbrt/util/image.cpp
+    src/pbrt/util/log.cpp
+    # src/pbrt/util/loopsubdiv.cpp
+    src/pbrt/util/lowdiscrepancy.cpp
+    src/pbrt/util/math.cpp
+    # src/pbrt/util/memory.cpp
+    src/pbrt/util/mesh.cpp
+    # src/pbrt/util/mipmap.cpp
+    src/pbrt/util/noise.cpp
+    # src/pbrt/util/parallel.cpp
+    src/pbrt/util/pmj02tables.cpp
+    src/pbrt/util/primes.cpp
+    # src/pbrt/util/print.cpp src/pbrt/util/progressreporter.cpp
+    src/pbrt/util/pstd.cpp
+    src/pbrt/util/rng.cpp
+    src/pbrt/util/sampling.cpp
+    src/pbrt/util/scattering.cpp
+    src/pbrt/util/sobolmatrices.cpp
+    src/pbrt/util/spectrum.cpp
+    src/pbrt/util/stats.cpp
+    # src/pbrt/util/stbimage.cpp src/pbrt/util/string.cpp
+    src/pbrt/util/transform.cpp
+    src/pbrt/util/vecmath.cpp
     ${PBRT_WAVEFRONT_SOURCE}
     ${PBRT_GPU_SOURCE}
+    src/pbrt/cmd/pspec_gpu.cpp)
 
-    src/pbrt/cmd/pspec_gpu.cpp
-
-    PROPERTIES LANGUAGE CUDA
-  )
-
-  cuda_compile_and_embed (PBRT_EMBEDDED_PTX src/pbrt/gpu/optix/optix.cu optix.cu)
+  if(PBRT_HIP_ENABLED)
+    set_source_files_properties(${PBRT_GPU_SOURCE_ALL} PROPERTIES LANGUAGE CXX)
+  else()
+    set_source_files_properties(${PBRT_GPU_SOURCE_ALL} PROPERTIES LANGUAGE CUDA)
+    cuda_compile_and_embed(PBRT_EMBEDDED_PTX src/pbrt/gpu/optix/optix.cu optix.cu)
+  endif()
 endif ()
 
 source_group ("Source Files" FILES ${PBRT_SOURCE})
@@ -781,7 +838,7 @@ source_group ("Source Files/util" FILES ${PBRT_UTIL_SOURCE})
 source_group ("Header Files/util" FILES ${PBRT_UTIL_SOURCE_HEADERS})
 source_group ("Source Files/wavefront" FILES ${PBRT_WAVEFRONT_SOURCE})
 source_group ("Header Files/wavefront" FILES ${PBRT_WAVEFRONT_SOURCE_HEADERS})
-if (PBRT_CUDA_ENABLED)
+if (PBRT_CUDA_ENABLED OR PBRT_HIP_ENABLED)
   source_group ("Source Files/gpu" FILES ${PBRT_GPU_SOURCE})
   source_group ("Header Files/gpu" FILES ${PBRT_GPU_SOURCE_HEADERS})
 endif ()
@@ -875,7 +932,12 @@ endif ()
 
 target_compile_options (pbrt_lib PUBLIC ${PBRT_CXX_FLAGS})
 
-target_link_libraries (pbrt_lib PRIVATE pbrt_warnings pbrt_opt $<$<BOOL:PBRT_CUDA_ENABLED>:cuda_build_configuration>)
+if (PBRT_HIP_ENABLED)
+  target_link_libraries (pbrt_lib PUBLIC hip::device)
+  target_link_libraries (pbrt_lib PRIVATE pbrt_warnings pbrt_opt)
+else ()
+  target_link_libraries (pbrt_lib PRIVATE pbrt_warnings pbrt_opt $<$<BOOL:PBRT_CUDA_ENABLED>:cuda_build_configuration>)
+endif()
 
 add_sanitizers (pbrt_lib)
 
@@ -910,6 +972,84 @@ if (PBRT_CUDA_ENABLED)
   endif ()
 endif ()
 
+if (PBRT_HIP_ENABLED)
+  list (APPEND ALL_PBRT_LIBS ${PBRT_HIPRT_LIB_NAME})
+  if (CMAKE_SYSTEM_NAME STREQUAL Windows)
+    set(HIPRT_BUILD_CMD "call ${PBRT_HIP_PATH}/bin/hipcc.bat")
+    string (APPEND HIPRT_BUILD_CMD " -DPBRT_IS_WINDOWS")
+    string (APPEND HIPRT_BUILD_CMD " -DNOMINMAX")
+    string (APPEND HIPRT_BUILD_CMD " -DPBRT_IS_MSVC")
+    string (APPEND HIPRT_BUILD_CMD " -D_CRT_SECURE_NO_WARNINGS")
+    string (APPEND HIPRT_BUILD_CMD " -D_ENABLE_EXTENDED_ALIGNED_STORAGE")
+    set(HIPRT_LINK_CMD "call ${PBRT_HIP_PATH}/bin/clang++.exe")
+  elseif (CMAKE_SYSTEM_NAME STREQUAL Linux)
+    set(HIPRT_BUILD_CMD "hipcc")
+    string (APPEND HIPRT_BUILD_CMD " -DPBRT_IS_LINUX")
+    set(HIPRT_LINK_CMD "/opt/rocm/bin/amdclang++" )
+  endif ()
+  foreach (TARGET IN LISTS AMDGPU_TARGETS)
+    message (STATUS "Current item: ${item}")
+    string (APPEND HIPRT_BUILD_CMD " --offload-arch=")
+    string (APPEND HIPRT_BUILD_CMD ${TARGET})
+    string (APPEND HIPRT_LINK_CMD " --offload-arch=")
+    string (APPEND HIPRT_LINK_CMD ${TARGET})
+  endforeach ()
+  string (APPEND HIPRT_BUILD_CMD " -I.")
+  string (APPEND HIPRT_BUILD_CMD " -I../src")
+  string (APPEND HIPRT_BUILD_CMD " -I${PBRT_HIPRT_PATH}")
+  string (APPEND HIPRT_BUILD_CMD " -I../src/ext/openvdb/nanovdb")
+  string (APPEND HIPRT_BUILD_CMD " -x hip")
+  string (APPEND HIPRT_BUILD_CMD " ../src/pbrt/gpu/hiprt/hiprt.cu")
+  string (APPEND HIPRT_BUILD_CMD " ../src/pbrt/util/sobolmatrices.cpp")
+  string (APPEND HIPRT_BUILD_CMD " ../src/pbrt/util/primes.cpp")
+  string (APPEND HIPRT_BUILD_CMD " ../src/pbrt/options.cpp")
+  string (APPEND HIPRT_BUILD_CMD " ../src/pbrt/shapes.cpp")
+  string (APPEND HIPRT_BUILD_CMD " -O3")
+  string (APPEND HIPRT_BUILD_CMD " -ffast-math")
+  string (APPEND HIPRT_BUILD_CMD " -std=c++17")
+  string (APPEND HIPRT_BUILD_CMD " --cuda-device-only")
+  string (APPEND HIPRT_BUILD_CMD " -fgpu-rdc -c")
+  string (APPEND HIPRT_BUILD_CMD " --gpu-bundle-output -c -emit-llvm")
+  string (APPEND HIPRT_BUILD_CMD " -DWARP_THREADS=32 ")
+  string (APPEND HIPRT_BUILD_CMD " -DPBRT_BUILD_GPU_RENDERER")
+  string (APPEND HIPRT_BUILD_CMD " -DPBRT_RESTRICT=__restrict__ ")
+  string (APPEND HIPRT_BUILD_CMD " -DBLOCK_SIZE=64")
+  string (APPEND HIPRT_BUILD_CMD " -DSHARED_STACK_SIZE=16")
+  string (APPEND HIPRT_LINK_CMD " -o hiprt.hipfb")
+  string (APPEND HIPRT_LINK_CMD " -fgpu-rdc")
+  string (APPEND HIPRT_LINK_CMD " --hip-link")
+  string (APPEND HIPRT_LINK_CMD " -Xoffload-linker")
+  string (APPEND HIPRT_LINK_CMD " --whole-archive")
+  string (APPEND HIPRT_LINK_CMD " --cuda-device-only")
+  string (APPEND HIPRT_LINK_CMD " ${PBRT_HIPRT_BITCODE_FILENAME}")
+  string (APPEND HIPRT_LINK_CMD " hiprt-hip-amdgcn-amd-amdhsa.bc")
+  string (APPEND HIPRT_LINK_CMD " primes-hip-amdgcn-amd-amdhsa.bc")
+  string (APPEND HIPRT_LINK_CMD " sobolmatrices-hip-amdgcn-amd-amdhsa.bc")
+  string (APPEND HIPRT_LINK_CMD " options-hip-amdgcn-amd-amdhsa.bc")
+  string (APPEND HIPRT_LINK_CMD " shapes-hip-amdgcn-amd-amdhsa.bc")
+  if (CMAKE_SYSTEM_NAME STREQUAL Windows)
+    add_custom_target (
+        libdeflate_msvc_patch
+        COMMAND cd ${CMAKE_CURRENT_SOURCE_DIR}/src/ext/libdeflate && git apply ${CMAKE_CURRENT_SOURCE_DIR}/0001-MSVC-HIP.patch > nul 2> nul & exit 0
+    )
+    add_dependencies (pbrt_lib libdeflate_msvc_patch)
+	  message (STATUS "Applying patch to libdeflate to support _BitScanReverse for Windows HIP build.")
+    file (WRITE "${CMAKE_CURRENT_BINARY_DIR}/hiprt.bat" "${HIPRT_BUILD_CMD}\n${HIPRT_LINK_CMD}\n")
+    add_custom_target (
+      hiprt_kernels ALL
+      DEPENDS pbrt_lib
+      COMMAND ${CMAKE_CURRENT_BINARY_DIR}/hiprt.bat
+      COMMENT "Compiling HIPRT kernels...")
+  elseif (CMAKE_SYSTEM_NAME STREQUAL Linux)
+    file (WRITE "${CMAKE_CURRENT_BINARY_DIR}/hiprt.sh" "${HIPRT_BUILD_CMD}\n${HIPRT_LINK_CMD}\n")
+    add_custom_target (
+      hiprt_kernels ALL
+      DEPENDS pbrt_lib
+      COMMAND chmod 777 ${CMAKE_CURRENT_BINARY_DIR}/hiprt.sh && ${CMAKE_CURRENT_BINARY_DIR}/hiprt.sh
+      COMMENT "Compiling HIPRT kernels...")
+  endif ()
+endif ()
+
 if (WIN32)
   list (APPEND ALL_PBRT_LIBS "dbghelp" "wsock32" "ws2_32")
 endif ()
@@ -956,6 +1096,7 @@ add_executable (pbrt::pbrt_exe ALIAS pbrt_exe)
 
 target_compile_definitions (pbrt_exe PRIVATE ${PBRT_DEFINITIONS})
 target_compile_options (pbrt_exe PRIVATE ${PBRT_CXX_FLAGS})
+target_link_options (pbrt_exe PRIVATE ${PBRT_LINK_FLAGS})
 target_include_directories (pbrt_exe PRIVATE src src/ext)
 target_link_libraries (pbrt_exe PRIVATE ${ALL_PBRT_LIBS} pbrt_opt pbrt_warnings)
 
@@ -976,6 +1117,7 @@ set_property (TARGET sky_lib PROPERTY FOLDER "ext")
 
 target_compile_definitions (imgtool PRIVATE ${PBRT_DEFINITIONS})
 target_compile_options (imgtool PRIVATE ${PBRT_CXX_FLAGS})
+target_link_options (imgtool PRIVATE ${PBRT_LINK_FLAGS})
 target_include_directories (imgtool PRIVATE src src/ext ${FLIP_INCLUDE})
 target_link_libraries (imgtool PRIVATE ${ALL_PBRT_LIBS} pbrt_opt pbrt_warnings sky_lib flip_lib)
 
@@ -991,6 +1133,7 @@ add_executable (pbrt::pspec ALIAS pspec)
 
 target_compile_definitions (pspec PRIVATE ${PBRT_DEFINITIONS})
 target_compile_options (pspec PRIVATE ${PBRT_CXX_FLAGS})
+target_link_options (pspec PRIVATE ${PBRT_LINK_FLAGS})
 target_include_directories (pspec PRIVATE src src/ext)
 target_link_libraries (pspec PRIVATE ${ALL_PBRT_LIBS} pbrt_warnings)
 
@@ -1006,6 +1149,7 @@ add_executable (pbrt::plytool ALIAS plytool)
 
 target_compile_definitions (plytool PRIVATE ${PBRT_DEFINITIONS})
 target_compile_options (plytool PUBLIC ${PBRT_CXX_FLAGS})
+target_link_options (plytool PRIVATE ${PBRT_LINK_FLAGS})
 target_include_directories (plytool PUBLIC src src/ext)
 target_link_libraries (plytool PRIVATE ${ALL_PBRT_LIBS} pbrt_warnings pbrt_opt)
 
@@ -1025,6 +1169,7 @@ target_compile_definitions (nanovdb2pbrt PRIVATE ${PBRT_DEFINITIONS})
 target_compile_options (nanovdb2pbrt PUBLIC ${PBRT_CXX_FLAGS})
 target_include_directories (nanovdb2pbrt PUBLIC src src/ext)
 target_link_libraries (nanovdb2pbrt PRIVATE ${ALL_PBRT_LIBS} pbrt_warnings pbrt_opt)
+target_link_options (nanovdb2pbrt PRIVATE ${PBRT_LINK_FLAGS})
 
 set_target_properties (nanovdb2pbrt PROPERTIES OUTPUT_NAME nanovdb2pbrt)
 
@@ -1087,7 +1232,8 @@ add_executable (pbrt_test src/pbrt/cmd/pbrt_test.cpp ${PBRT_TEST_SOURCE})
 target_link_libraries (pbrt_test PRIVATE ${ALL_PBRT_LIBS} pbrt_opt pbrt_warnings)
 target_compile_definitions (pbrt_test PRIVATE ${PBRT_DEFINITIONS})
 target_include_directories (pbrt_test PRIVATE src src/ext ${DOUBLE_CONVERSION_INCLUDE})
-target_compile_options(pbrt_test PUBLIC ${PBRT_CXX_FLAGS})
+target_compile_options (pbrt_test PUBLIC ${PBRT_CXX_FLAGS})
+target_link_options (pbrt_test PRIVATE ${PBRT_LINK_FLAGS})
 
 add_sanitizers (pbrt_test)
 
diff --git a/LICENSE.txt b/LICENSE.txt
index d64569567..37965801b 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -1,3 +1,7 @@
+All of the modifications made in commit ID 14582eebabcf1ca33323e6b6d6757a49d2306dbf are covered by the MIT license (below).
+The rest would be covered by the existing Apache license.
+
+---------------------------------------------------------
 
                                  Apache License
                            Version 2.0, January 2004
@@ -200,3 +204,28 @@
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
+
+
+---------------------------------------------------------
+
+The MIT License (MIT)
+
+Copyright (c) 2022-2024 Advanced Micro Devices, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/README.md b/README.md
index 0454dda15..d1d822297 100644
--- a/README.md
+++ b/README.md
@@ -228,3 +228,33 @@ is straightforward:
 ```bash
 $ imgtool denoise-optix noisy.exr --outfile denoised.exr
 ```
+
+Instructions to build the HIP port
+--------
+
+Linux:
+* Install [ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/native-install/ubuntu.html)
+* Download and extract [HIPRT](https://gpuopen.com/hiprt/)
+  * Note that HIP is very sensitive to the difference between versions of HIPRT and the application on Linux
+  * If you encounter linking error, we recommend to compile HIPRT by yourself
+* `cmake -DCMAKE_C_COMPILER=/opt/rocm/llvm/bin/clang -DCMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++ -DPBRT_HIPRT=ON -DPBRT_HIPRT_PATH=~/hiprtSdk ..`
+* `make` or `make pbrt_exe`
+
+Windows:
+* Download and install [HIP SDK](https://www.amd.com/en/developer/resources/rocm-hub/hip-sdk.html)
+  * Make sure that you checked HIPRT to be installed
+* Download and install [Strawberry Perl](https://strawberryperl.com/)
+* Run `x64 Native Tools Command Prompt for VS 2022` as administrator
+* `set CC=clang`
+* `set CXX=clang++`
+* `mkdir build` and `cd build`
+* `cmake -G "Ninja" -DCMAKE_BUILD_TYPE=Release -DPBRT_HIPRT=ON ..`
+* `cmake --build . --config=Release`
+
+Example: `./pbrt --gpu ~/pbrt-v4-scenes/killeroos/killeroo-simple.pbrt`
+Add `--interactive` for the interactive mode.
+
+GPU architecture and ROCm version
+--------
+
+The instructions above assume the `gfx1100` architecture. You can specify other architectures via the CMake `AMDGPU_TARGETS` variable. Note that PBRT should be compiled with the same version as HIPRT binaries (e.g., using ROCm 6.0 and `hiprt02004_6.0_amd_lib_linux.bc`).
\ No newline at end of file
diff --git a/src/ext/flip/flip.cpp b/src/ext/flip/flip.cpp
index 0eedeed44..ba0f18541 100644
--- a/src/ext/flip/flip.cpp
+++ b/src/ext/flip/flip.cpp
@@ -50,6 +50,10 @@
 #include <fstream>
 #include <cassert>
 
+#ifndef M_PI
+#define M_PI 3.14159265358979323846f
+#endif
+
 namespace flip_detail {
 
 class histogram
diff --git a/src/ext/openvdb b/src/ext/openvdb
index 414bed84c..385516321 160000
--- a/src/ext/openvdb
+++ b/src/ext/openvdb
@@ -1 +1 @@
-Subproject commit 414bed84c2fc22e188eac7b611aa85c7edd7a5a9
+Subproject commit 385516321b272c423e2a514b41bd21ce6ffdda72
diff --git a/src/pbrt/base/medium.h b/src/pbrt/base/medium.h
index 698b04816..30500d006 100644
--- a/src/pbrt/base/medium.h
+++ b/src/pbrt/base/medium.h
@@ -88,7 +88,6 @@ class Medium
 
     std::string ToString() const;
 
-    PBRT_CPU_GPU
     bool IsEmissive() const;
 
     PBRT_CPU_GPU
diff --git a/src/pbrt/cmd/imgtool.cpp b/src/pbrt/cmd/imgtool.cpp
index a812fc7bb..e2c14dbf0 100644
--- a/src/pbrt/cmd/imgtool.cpp
+++ b/src/pbrt/cmd/imgtool.cpp
@@ -7,9 +7,9 @@
 #include <pbrt/filters.h>
 #include <pbrt/options.h>
 #ifdef PBRT_BUILD_GPU_RENDERER
-#ifndef __HIP_PLATFORM_AMD__
+#ifdef __NVCC__
 #include <pbrt/gpu/optix/denoiser.h>
-#endif // __HIP_PLATFORM_AMD__
+#endif // __NVCC__
 #include <pbrt/gpu/util.h>
 #endif  // PBRT_BUILD_GPU_RENDERER
 #include <pbrt/util/args.h>
@@ -2220,7 +2220,7 @@ int makeequiarea(std::vector<std::string> args) {
     return 0;
 }
 
-#ifdef PBRT_BUILD_GPU_RENDERER
+#ifdef __NVCC__
 int denoise_optix(std::vector<std::string> args) {
     std::string inFilename, outFilename;
 
@@ -2335,7 +2335,7 @@ int denoise_optix(std::vector<std::string> args) {
 
     return 0;
 }
-#endif  // PBRT_BUILD_GPU_RENDERER
+#endif  // __NVCC__
 
 int main(int argc, char *argv[]) {
     PBRTOptions opt;
@@ -2362,10 +2362,10 @@ int main(int argc, char *argv[]) {
         return convert(args);
     else if (cmd == "diff")
         return diff(args);
-#ifdef PBRT_BUILD_GPU_RENDERER
+#ifdef __NVCC__
     else if (cmd == "denoise-optix")
         return denoise_optix(args);
-#endif  // PBRT_BUILD_GPU_RENDERER
+#endif  // __NVCC__
     else if (cmd == "error")
         return error(args);
     else if (cmd == "falsecolor")
diff --git a/src/pbrt/cmd/pbrt_test.cpp b/src/pbrt/cmd/pbrt_test.cpp
index 4bd7b23d3..3789dae8b 100644
--- a/src/pbrt/cmd/pbrt_test.cpp
+++ b/src/pbrt/cmd/pbrt_test.cpp
@@ -2,6 +2,7 @@
 // The pbrt source code is licensed under the Apache License, Version 2.0.
 // SPDX: Apache-2.0
 
+#include <gtest/gtest.h>
 #include <pbrt/pbrt.h>
 
 #include <pbrt/options.h>
@@ -9,7 +10,6 @@
 #include <pbrt/util/error.h>
 #include <pbrt/util/print.h>
 
-#include <gtest/gtest.h>
 #include <string>
 
 using namespace pbrt;
diff --git a/src/pbrt/cmd/pspec_gpu.cpp b/src/pbrt/cmd/pspec_gpu.cpp
index ce5b1a1f3..5cc0ffa00 100644
--- a/src/pbrt/cmd/pspec_gpu.cpp
+++ b/src/pbrt/cmd/pspec_gpu.cpp
@@ -16,8 +16,12 @@
 #include <pbrt/util/image.h>
 #include <pbrt/util/vecmath.h>
 
+#if defined(__HIPCC__)
+#include <pbrt/util/hip_aliases.h>
+#else
 #include <cuda.h>
 #include <cuda_runtime_api.h>
+#endif
 
 #include <vector>
 
diff --git a/src/pbrt/gpu/common.h b/src/pbrt/gpu/common.h
new file mode 100644
index 000000000..7451d958d
--- /dev/null
+++ b/src/pbrt/gpu/common.h
@@ -0,0 +1,111 @@
+#ifndef PBRT_GPU_COMMON_H
+#define PBRT_GPU_COMMON_H
+
+#include <pbrt/pbrt.h>
+
+#include <pbrt/base/light.h>
+#include <pbrt/base/material.h>
+#include <pbrt/base/medium.h>
+#include <pbrt/base/shape.h>
+#include <pbrt/base/texture.h>
+#include <pbrt/util/pstd.h>
+#include <pbrt/wavefront/workitems.h>
+#include <pbrt/wavefront/workqueue.h>
+
+#if defined(__HIPCC__)
+#include <hiprt/hiprt.h>
+#include <hiprt/hiprt_vec.h>
+#else
+#include <optix.h>
+#endif
+
+namespace pbrt {
+
+class TriangleMesh;
+class BilinearPatchMesh;
+
+struct TriangleMeshRecord {
+    const TriangleMesh *mesh;
+    Material material;
+    FloatTexture alphaTexture;
+    pstd::span<Light> areaLights;
+    MediumInterface *mediumInterface;
+};
+
+struct BilinearMeshRecord {
+    const BilinearPatchMesh *mesh;
+    Material material;
+    FloatTexture alphaTexture;
+    pstd::span<Light> areaLights;
+    MediumInterface *mediumInterface;
+};
+
+struct QuadricRecord {
+    Shape shape;
+    Material material;
+    FloatTexture alphaTexture;
+    Light areaLight;
+    MediumInterface *mediumInterface;
+};
+
+#if defined(__HIP_PLATFORM_AMD__)
+static constexpr size_t HitgroupAlignment = 16u;
+
+struct alignas(HitgroupAlignment) HitgroupRecord {
+    PBRT_CPU_GPU HitgroupRecord() {}
+    PBRT_CPU_GPU HitgroupRecord(const HitgroupRecord &r) {
+        memcpy(this, &r, sizeof(HitgroupRecord));
+    }
+    PBRT_CPU_GPU HitgroupRecord &operator=(const HitgroupRecord &r) {
+        if (this != &r)
+            memcpy(this, &r, sizeof(HitgroupRecord));
+        return *this;
+    }
+
+    union {
+        TriangleMeshRecord triRec;
+        BilinearMeshRecord blpRec;
+        QuadricRecord quadricRec;
+    };
+    enum { TriangleMesh, BilinearMesh, Quadric } type;
+};
+#endif
+
+struct RayIntersectParameters {
+#if defined(__HIPCC__)
+    hiprtScene traversable;
+#else
+    OptixTraversableHandle traversable;
+#endif
+
+    const RayQueue *rayQueue;
+
+    // Closest hit
+    RayQueue *nextRayQueue;
+    EscapedRayQueue *escapedRayQueue;
+    HitAreaLightQueue *hitAreaLightQueue;
+    MaterialEvalQueue *basicEvalMaterialQueue, *universalEvalMaterialQueue;
+    MediumSampleQueue *mediumSampleQueue;
+
+    // Shadow rays
+    ShadowRayQueue *shadowRayQueue;
+    SOA<PixelSampleState> pixelSampleState;
+
+    // Subsurface scattering...
+    SubsurfaceScatterQueue *subsurfaceScatterQueue;
+
+#if defined(__HIPCC__)
+    // Stack buffers
+    hiprtGlobalStackBuffer globalStackBuffer;
+    hiprtGlobalStackBuffer globalInstanceStackBuffer;
+    // Custom function table
+    hiprtFuncTable funcTable;
+    // Hitgroup records
+    HitgroupRecord *hgRecords;
+    // Offsets for hitgroup records
+    uint32_t *offsets;
+#endif
+};
+}  // namespace pbrt
+
+#endif  // PBRT_GPU_COMMON_H
diff --git a/src/pbrt/gpu/cudagl.h b/src/pbrt/gpu/cudagl.h
index 2695844f2..d04dd6351 100644
--- a/src/pbrt/gpu/cudagl.h
+++ b/src/pbrt/gpu/cudagl.h
@@ -34,9 +34,13 @@
 
 #include <glad/glad.h>
 
+#if defined(__HIPCC__)
+#include <pbrt/util/hip_aliases.h>
+#else
 #include <cuda.h>
 #include <cuda_gl_interop.h>
 #include <cuda_runtime.h>
+#endif
 
 #define GL_CHECK(call)                                                   \
     do {                                                                 \
@@ -370,6 +374,15 @@ CUDAOutputBuffer<PIXEL_FORMAT>::CUDAOutputBuffer(int32_t width, int32_t height)
                           nullptr, GL_STREAM_DRAW));
     GL_CHECK(glBindBuffer(GL_ARRAY_BUFFER, 0u));
 
+#ifdef __HIPCC__
+    uint32_t num_gl_devices = 0;
+
+    int glDevice;
+    cudaGLGetDevices(&num_gl_devices, &glDevice, 1, cudaGLDeviceListAll);
+
+    if (glDevice != current_device)
+        LOG_FATAL("Multi-GPU not supported with GL interop yet");
+#endif
     CUDA_CHECK(cudaGraphicsGLRegisterBuffer(&m_cuda_gfx_resource, m_pbo,
                                             cudaGraphicsMapFlagsWriteDiscard));
 
diff --git a/src/pbrt/gpu/hiprt/aggregate.cpp b/src/pbrt/gpu/hiprt/aggregate.cpp
new file mode 100644
index 000000000..d99f2fa96
--- /dev/null
+++ b/src/pbrt/gpu/hiprt/aggregate.cpp
@@ -0,0 +1,1499 @@
+// pbrt is Copyright(c) 1998-2020 Matt Pharr, Wenzel Jakob, and Greg Humphreys.
+// The pbrt source code is licensed under the Apache License, Version 2.0.
+// SPDX: Apache-2.0
+
+#include <pbrt/gpu/hiprt/aggregate.h>
+
+#include <pbrt/gpu/common.h>
+#include <pbrt/gpu/util.h>
+#include <pbrt/lights.h>
+#include <pbrt/materials.h>
+#include <pbrt/scene.h>
+#include <pbrt/textures.h>
+#include <pbrt/util/error.h>
+#include <pbrt/util/file.h>
+#include <pbrt/util/log.h>
+#include <pbrt/util/loopsubdiv.h>
+#include <pbrt/util/mesh.h>
+#include <pbrt/util/parallel.h>
+#include <pbrt/util/pstd.h>
+#include <pbrt/util/splines.h>
+#include <pbrt/util/stats.h>
+#include <pbrt/wavefront/intersect.h>
+
+#include <atomic>
+#include <fstream>
+#include <mutex>
+#include <unordered_map>
+
+#include <hiprt/hiprt.h>
+
+#define HIPRT_CHECK(EXPR)                                                    \
+    do {                                                                     \
+        hiprtError res = EXPR;                                               \
+        if (res != hiprtSuccess)                                             \
+            LOG_FATAL("HIPRT call " #EXPR " failed with code %d", int(res)); \
+    } while (false) /* eat semicolon */
+
+namespace pbrt {
+
+STAT_MEMORY_COUNTER("Memory/Acceleration structures", gpuBVHBytes);
+STAT_COUNTER("Geometry/Triangles added from displacement mapping", displacedTrisDelta);
+STAT_COUNTER("Geometry/Curves", nCurves);
+STAT_COUNTER("Geometry/Bilinear patches created for diced curves", nBLPsForCurves);
+
+static Material getMaterial(const ShapeSceneEntity &shape,
+                            const std::map<std::string, Material> &namedMaterials,
+                            const std::vector<Material> &materials) {
+    if (!shape.materialName.empty()) {
+        auto iter = namedMaterials.find(shape.materialName);
+        if (iter == namedMaterials.end())
+            ErrorExit(&shape.loc, "%s: material not defined", shape.materialName);
+        return iter->second;
+    } else {
+        CHECK_NE(shape.materialIndex, -1);
+        return materials[shape.materialIndex];
+    }
+}
+
+static FloatTexture getAlphaTexture(
+    const ShapeSceneEntity &shape,
+    const std::map<std::string, FloatTexture> &floatTextures, Allocator alloc) {
+    FloatTexture alphaTexture;
+
+    std::string alphaTexName = shape.parameters.GetTexture("alpha");
+    if (alphaTexName.empty()) {
+        if (Float alpha = shape.parameters.GetOneFloat("alpha", 1.f); alpha < 1.f)
+            alphaTexture = alloc.new_object<FloatConstantTexture>(alpha);
+        else
+            return nullptr;
+    } else {
+        auto iter = floatTextures.find(alphaTexName);
+        if (iter == floatTextures.end())
+            ErrorExit(&shape.loc, "%s: alpha texture not defined.", alphaTexName);
+
+        alphaTexture = iter->second;
+    }
+
+    if (!BasicTextureEvaluator().CanEvaluate({alphaTexture}, {})) {
+        // It would be nice to just use the UniversalTextureEvaluator (maybe
+        // always), but optix complains "Error: Found call graph recursion"...
+        Warning(&shape.loc,
+                "%s: alpha texture too complex for BasicTextureEvaluator "
+                "(need fallback path). Ignoring for now.",
+                alphaTexName);
+        alphaTexture = nullptr;
+    }
+
+    return alphaTexture;
+}
+
+static MediumInterface *getMediumInterface(const ShapeSceneEntity &shape,
+                                           const std::map<std::string, Medium> &media,
+                                           Allocator alloc) {
+    if (shape.insideMedium.empty() && shape.outsideMedium.empty())
+        return nullptr;
+
+    auto getMedium = [&](const std::string &name) -> Medium {
+        if (name.empty())
+            return nullptr;
+
+        auto iter = media.find(name);
+        if (iter == media.end())
+            ErrorExit(&shape.loc, "%s: medium not defined", name);
+        return iter->second;
+    };
+
+    return alloc.new_object<MediumInterface>(getMedium(shape.insideMedium),
+                                             getMedium(shape.outsideMedium));
+}
+
+HiprtAggregate::HiprtAggregate(
+    const BasicScene &scene, CUDATrackedMemoryResource *memoryResource,
+    NamedTextures &textures,
+    const std::map<int, pstd::vector<Light> *> &shapeIndexToAreaLights,
+    const std::map<std::string, Medium> &media,
+    const std::map<std::string, pbrt::Material> &namedMaterials,
+    const std::vector<pbrt::Material> &materials, int maxQueueSize)
+    : memoryResource(memoryResource), cudaStream(nullptr), scene(nullptr) {
+    hipCtx_t cudaContext;
+    CUDA_CHECK(hipCtxGetCurrent(&cudaContext));
+    CHECK(cudaContext != nullptr);
+
+#if defined(PBRT_IS_WINDOWS)
+    // On Windows, it is unfortunately necessary to disable
+    // multithreading here.  The issue is that GPU managed memory can
+    // only be accessed by one of the CPU or the GPU at a time; the
+    // program crashes if this is restriction is violated.  Thus, it's
+    // bad news if we are simultaneously, say, reading PLY files on
+    // the CPU and storing them in managed memory while an OptiX
+    // kernel is running on the GPU to build a BVH... (Issue #164).
+    if (Options->useGPU)
+        DisableThreadPool();
+#endif  // PBRT_IS_WINDOWS
+
+    ThreadLocal<hipStream_t> threadCUDAStreams([]() {
+        hipStream_t stream;
+        hipStreamCreate(&stream);
+        return stream;
+    });
+
+    paramsPool.resize(256);  // should be plenty
+    for (ParamBufferState &ps : paramsPool) {
+        void *ptr;
+        CUDA_CHECK(hipMalloc(&ptr, sizeof(RayIntersectParameters)));
+        ps.ptr = (hipDeviceptr_t)ptr;
+        CUDA_CHECK(hipEventCreate(&ps.finishedEvent));
+        CUDA_CHECK(hipHostMalloc(&ps.hostPtr, sizeof(RayIntersectParameters)));
+    }
+
+    // Create HIPRT context
+    LOG_VERBOSE("Starting HIPRT initialization");
+
+    int current_device;
+    CUDA_CHECK(hipGetDevice(&current_device));
+
+    hiprtContextCreationInput ctxInput;
+    ctxInput.ctxt = cudaContext;
+    ctxInput.device = current_device;
+    ctxInput.deviceType = hiprtDeviceAMD;
+    HIPRT_CHECK(hiprtCreateContext(HIPRT_API_VERSION, ctxInput, context));
+
+#ifdef NDEBUG
+    hiprtSetLogLevel(hiprtLogLevelError);
+#endif
+    LOG_VERBOSE("HIPRT version %d.%d.%x successfully initialized", HIPRT_MAJOR_VERSION,
+                HIPRT_MINOR_VERSION, HIPRT_PATCH_VERSION);
+
+    // HIPRT module
+    module = compileHiprtModule(context);
+
+    LOG_VERBOSE("Finished HIPRT initialization");
+
+    // Note: do not delete the pointers in threadBufferResources, since doing
+    // so would cause the memory they manage to be freed.
+    ThreadLocal<Allocator> threadAllocators([memoryResource]() {
+        pstd::pmr::monotonic_buffer_resource *resource =
+            new pstd::pmr::monotonic_buffer_resource(1024 * 1024, memoryResource);
+        return Allocator(resource);
+    });
+
+    ///////////////////////////////////////////////////////////////////////////
+    // Build top-level acceleration structures for non-instanced shapes
+    LOG_VERBOSE("Starting to create shapes and acceleration structures");
+    for (const auto &shape : scene.shapes)
+        if (shape.name != "sphere" && shape.name != "cylinder" && shape.name != "disk" &&
+            shape.name != "trianglemesh" && shape.name != "plymesh" &&
+            shape.name != "loopsubdiv" && shape.name != "bilinearmesh" &&
+            shape.name != "curve")
+            ErrorExit(&shape.loc, "%s: unknown shape", shape.name);
+
+    LOG_VERBOSE("Starting to read PLY meshes");
+    std::map<int, TriQuadMesh> plyMeshes =
+        PreparePLYMeshes(scene.shapes, textures.floatTextures);
+    LOG_VERBOSE("Finished reading PLY meshes");
+
+    LOG_VERBOSE("Starting to build geometries (BLAS)");
+    geomGroups.resize(scene.instanceDefinitions.size() + 1);
+
+    geomGroups[0].triGeomContainer = buildBVHForTriangles(
+        scene.shapes, plyMeshes, context, textures.floatTextures, namedMaterials,
+        materials, media, shapeIndexToAreaLights, threadAllocators, threadCUDAStreams);
+
+    geomGroups[0].blpGeomContainer = buildBVHForBLPs(
+        scene.shapes, context, textures.floatTextures, namedMaterials, materials, media,
+        shapeIndexToAreaLights, threadAllocators, threadCUDAStreams);
+
+    geomGroups[0].quadricGeomContainer = buildBVHForQuadrics(
+        scene.shapes, context, textures.floatTextures, namedMaterials, materials, media,
+        shapeIndexToAreaLights, threadAllocators, threadCUDAStreams);
+    LOG_VERBOSE("Finished building geometries (BLAS)");
+
+    ///////////////////////////////////////////////////////////////////////////
+    // Create instanced geometries
+    LOG_VERBOSE("Starting to build instanced geometries",
+                scene.instanceDefinitions.size());
+
+    InstanceContainer instanceContainer;
+    instanceContainer.appendRecords(geomGroups[0].triGeomContainer);
+    instanceContainer.appendRecords(geomGroups[0].blpGeomContainer);
+    instanceContainer.appendRecords(geomGroups[0].quadricGeomContainer);
+
+    std::vector<InternedString> allInstanceNames;
+    for (const auto &def : scene.instanceDefinitions)
+        allInstanceNames.push_back(def.first);
+
+    std::unordered_map<InternedString, GeometryGroup, InternedStringHash> instanceMap;
+    for (int i = 0; i < scene.instanceDefinitions.size(); ++i) {
+        InternedString name = allInstanceNames[i];
+        auto iter = scene.instanceDefinitions.find(name);
+        CHECK(iter != scene.instanceDefinitions.end());
+        const auto &def = *iter;
+
+        if (!def.second->animatedShapes.empty())
+            Warning("Ignoring %d animated shapes in instance \"%s\".",
+                    def.second->animatedShapes.size(), def.first);
+
+        std::map<int, TriQuadMesh> meshes =
+            PreparePLYMeshes(def.second->shapes, textures.floatTextures);
+
+        GeometryGroup &geomGroup = geomGroups[i + 1];
+
+        geomGroup.triGeomContainer = buildBVHForTriangles(
+            def.second->shapes, meshes, context, textures.floatTextures, namedMaterials,
+            materials, media, {}, threadAllocators, threadCUDAStreams);
+
+        geomGroup.blpGeomContainer = buildBVHForBLPs(
+            def.second->shapes, context, textures.floatTextures, namedMaterials,
+            materials, media, {}, threadAllocators, threadCUDAStreams);
+
+        geomGroup.quadricGeomContainer = buildBVHForQuadrics(
+            def.second->shapes, context, textures.floatTextures, namedMaterials,
+            materials, media, {}, threadAllocators, threadCUDAStreams);
+
+        instanceContainer.appendRecords(geomGroup.triGeomContainer);
+        instanceContainer.appendRecords(geomGroup.blpGeomContainer);
+        instanceContainer.appendRecords(geomGroup.quadricGeomContainer);
+
+        meshes.clear();
+        instanceMap[def.first] = geomGroup;
+    }
+    LOG_VERBOSE("Finished building instanced geometries");
+
+    ///////////////////////////////////////////////////////////////////////////
+    // Instancing
+    LOG_VERBOSE("Starting to build scene (TLAS)");
+    hiprtFrameMatrix identity{};
+    for (size_t i = 0; i < 3; ++i)
+        identity.matrix[i][i] = 1.0f;
+
+    instanceContainer.insertInstance(geomGroups[0].triGeomContainer, identity);
+    instanceContainer.insertInstance(geomGroups[0].blpGeomContainer, identity);
+    instanceContainer.insertInstance(geomGroups[0].quadricGeomContainer, identity);
+    size_t nNoInstancedGeoms = instanceContainer.instances.size();
+
+    for (size_t i = 0; i < scene.instances.size(); ++i) {
+        const auto &sceneInstance = scene.instances[i];
+        auto iter = instanceMap.find(sceneInstance.name);
+
+        if (iter != instanceMap.end()) {
+            SquareMatrix<4> O2WTransfrom = sceneInstance.renderFromInstance->GetMatrix();
+            SquareMatrix<4> W2OTransfrom = Inverse(O2WTransfrom).value();
+
+            hiprtFrameMatrix transform;
+            for (int j = 0; j < 3; ++j)
+                for (int k = 0; k < 4; ++k)
+                    transform.matrix[j][k] = O2WTransfrom[j][k];
+
+            GeometryGroup &geomGroup = iter->second;
+            instanceContainer.insertInstance(geomGroup.triGeomContainer, transform);
+            instanceContainer.insertInstance(geomGroup.blpGeomContainer, transform);
+            instanceContainer.insertInstance(geomGroup.quadricGeomContainer, transform);
+        }
+    }
+
+    this->scene =
+        buildBVHForInstances(instanceContainer.instances, instanceContainer.transforms,
+                             context, threadAllocators, threadCUDAStreams);
+    LOG_VERBOSE("Finished building scene (TLAS)");
+
+    ///////////////////////////////////////////////////////////////////////////
+    // Bounds
+    hiprtFloat3 aabbMin, aabbMax;
+    HIPRT_CHECK(hiprtExportSceneAabb(context, this->scene, aabbMin, aabbMax));
+    bounds.pMin = {aabbMin.x, aabbMin.y, aabbMin.z};
+    bounds.pMax = {aabbMax.x, aabbMax.y, aabbMax.z};
+
+    ///////////////////////////////////////////////////////////////////////////
+    // Copy hitgroup records and offsets
+    hipStream_t buildStream = threadCUDAStreams.Get();
+    CUDA_CHECK(hipMalloc(&hgRecords,
+                         instanceContainer.hgRecords.size() * sizeof(HitgroupRecord)));
+    CUDA_CHECK(hipMemcpyAsync((void *)hgRecords, instanceContainer.hgRecords.data(),
+                              sizeof(HitgroupRecord) * instanceContainer.hgRecords.size(),
+                              hipMemcpyHostToDevice, buildStream));
+    CUDA_CHECK(hipMalloc(&offsets, instanceContainer.offsets.size() * sizeof(uint32_t)));
+    CUDA_CHECK(hipMemcpyAsync((void *)offsets, instanceContainer.offsets.data(),
+                              sizeof(uint32_t) * instanceContainer.offsets.size(),
+                              hipMemcpyHostToDevice, buildStream));
+
+    ///////////////////////////////////////////////////////////////////////////
+    // Create stack buffers
+    constexpr uint32_t StackSize = 64;
+    hiprtGlobalStackBufferInput stackBufferInput{hiprtStackTypeGlobal,
+                                                 hiprtStackEntryTypeInteger, StackSize,
+                                                 (uint32_t)maxQueueSize};
+    HIPRT_CHECK(
+        hiprtCreateGlobalStackBuffer(context, stackBufferInput, globalStackBuffer));
+
+    constexpr uint32_t InstanceStackSize = 1;
+    hiprtGlobalStackBufferInput instanceStackBufferInput{
+        hiprtStackTypeGlobal, hiprtStackEntryTypeInstance, InstanceStackSize,
+        (uint32_t)maxQueueSize};
+    HIPRT_CHECK(hiprtCreateGlobalStackBuffer(context, instanceStackBufferInput,
+                                             globalInstanceStackBuffer));
+
+#if defined(PBRT_IS_WINDOWS)
+    if (Options->useGPU)
+        ReenableThreadPool();
+#endif  // PBRT_IS_WINDOWS
+}
+
+void HiprtAggregate::IntersectClosest(int maxRays, const RayQueue *rayQueue,
+                                      EscapedRayQueue *escapedRayQueue,
+                                      HitAreaLightQueue *hitAreaLightQueue,
+                                      MaterialEvalQueue *basicEvalMaterialQueue,
+                                      MaterialEvalQueue *universalEvalMaterialQueue,
+                                      MediumSampleQueue *mediumSampleQueue,
+                                      RayQueue *nextRayQueue) const {
+    std::pair<hipEvent_t, hipEvent_t> events =
+        GetProfilerEvents("Trace closest hit rays");
+
+    hipEventRecord(events.first);
+
+    if (scene) {
+        RayIntersectParameters params;
+        params.traversable = scene;
+        params.rayQueue = rayQueue;
+        params.nextRayQueue = nextRayQueue;
+        params.escapedRayQueue = escapedRayQueue;
+        params.hitAreaLightQueue = hitAreaLightQueue;
+        params.basicEvalMaterialQueue = basicEvalMaterialQueue;
+        params.universalEvalMaterialQueue = universalEvalMaterialQueue;
+        params.mediumSampleQueue = mediumSampleQueue;
+        params.globalStackBuffer = globalStackBuffer;
+        params.globalInstanceStackBuffer = globalInstanceStackBuffer;
+        params.funcTable = module.funcTable;
+        params.hgRecords = hgRecords;
+        params.offsets = offsets;
+
+        ParamBufferState &pbs = getParamBuffer(params);
+
+#ifndef NDEBUG
+        LOG_VERBOSE("Launching intersect closest");
+#endif
+
+        hipDeviceptr_t paramsPtr;
+        size_t paramsSize;
+        CUDA_CHECK(
+            hipModuleGetGlobal(&paramsPtr, &paramsSize, module.hipModule, "paramBuffer"));
+        CUDA_CHECK(hipMemcpyAsync((void *)paramsPtr, (const void *)pbs.ptr,
+                                  sizeof(params), hipMemcpyDeviceToDevice, cudaStream));
+
+        hiprtLaunch(module.closestFunction, maxRays, 1, BlockSize, 1, nullptr,
+                    cudaStream);
+        CUDA_CHECK(hipEventRecord(pbs.finishedEvent));
+
+#ifndef NDEBUG
+        CUDA_CHECK(hipDeviceSynchronize());
+        LOG_VERBOSE("Post-sync triangle intersect closest");
+#endif
+    }
+
+    hipEventRecord(events.second);
+};
+
+void HiprtAggregate::IntersectShadow(int maxRays, ShadowRayQueue *shadowRayQueue,
+                                     SOA<PixelSampleState> *pixelSampleState) const {
+    std::pair<hipEvent_t, hipEvent_t> events = GetProfilerEvents("Trace shadow rays");
+
+    hipEventRecord(events.first);
+
+    if (scene) {
+        RayIntersectParameters params;
+        params.traversable = scene;
+        params.shadowRayQueue = shadowRayQueue;
+        params.pixelSampleState = *pixelSampleState;
+        params.globalStackBuffer = globalStackBuffer;
+        params.globalInstanceStackBuffer = globalInstanceStackBuffer;
+        params.funcTable = module.funcTable;
+        params.hgRecords = hgRecords;
+        params.offsets = offsets;
+
+        ParamBufferState &pbs = getParamBuffer(params);
+
+#ifndef NDEBUG
+        LOG_VERBOSE("Launching intersect shadow");
+#endif
+
+        hipDeviceptr_t paramsPtr;
+        size_t paramsSize;
+        CUDA_CHECK(
+            hipModuleGetGlobal(&paramsPtr, &paramsSize, module.hipModule, "paramBuffer"));
+        CUDA_CHECK(hipMemcpyAsync((void *)paramsPtr, (const void *)pbs.ptr,
+                                  sizeof(params), hipMemcpyDeviceToDevice, cudaStream));
+
+        hiprtLaunch(module.shadowFunction, maxRays, 1, BlockSize, 1, nullptr, cudaStream);
+        CUDA_CHECK(hipEventRecord(pbs.finishedEvent));
+
+#ifndef NDEBUG
+        CUDA_CHECK(hipDeviceSynchronize());
+        LOG_VERBOSE("Post-sync intersect shadow");
+#endif
+    }
+
+    hipEventRecord(events.second);
+}
+
+void HiprtAggregate::IntersectShadowTr(int maxRays, ShadowRayQueue *shadowRayQueue,
+                                       SOA<PixelSampleState> *pixelSampleState) const {
+    std::pair<hipEvent_t, hipEvent_t> events =
+        GetProfilerEvents("Tracing shadow Tr rays");
+
+    hipEventRecord(events.first);
+
+    if (scene) {
+        RayIntersectParameters params;
+        params.traversable = scene;
+        params.shadowRayQueue = shadowRayQueue;
+        params.pixelSampleState = *pixelSampleState;
+        params.globalStackBuffer = globalStackBuffer;
+        params.globalInstanceStackBuffer = globalInstanceStackBuffer;
+        params.funcTable = module.funcTable;
+        params.hgRecords = hgRecords;
+        params.offsets = offsets;
+
+        ParamBufferState &pbs = getParamBuffer(params);
+
+#ifndef NDEBUG
+        LOG_VERBOSE("Launching intersect shadow Tr");
+#endif
+        hipDeviceptr_t paramsPtr;
+        size_t paramsSize;
+        CUDA_CHECK(
+            hipModuleGetGlobal(&paramsPtr, &paramsSize, module.hipModule, "paramBuffer"));
+        CUDA_CHECK(hipMemcpyAsync((void *)paramsPtr, (const void *)pbs.ptr,
+                                  sizeof(params), hipMemcpyDeviceToDevice, cudaStream));
+
+        hiprtLaunch(module.shadowTrFunction, maxRays, 1, BlockSize, 1, nullptr,
+                    cudaStream);
+        CUDA_CHECK(hipEventRecord(pbs.finishedEvent));
+
+#ifndef NDEBUG
+        CUDA_CHECK(hipDeviceSynchronize());
+        LOG_VERBOSE("Post-sync intersect shadow Tr");
+#endif
+    }
+
+    hipEventRecord(events.second);
+}
+
+void HiprtAggregate::IntersectOneRandom(
+    int maxRays, SubsurfaceScatterQueue *subsurfaceScatterQueue) const {
+    std::pair<hipEvent_t, hipEvent_t> events =
+        GetProfilerEvents("Tracing subsurface scattering probe rays");
+
+    hipEventRecord(events.first);
+
+    if (scene) {
+        RayIntersectParameters params;
+        params.traversable = scene;
+        params.subsurfaceScatterQueue = subsurfaceScatterQueue;
+        params.globalStackBuffer = globalStackBuffer;
+        params.globalInstanceStackBuffer = globalInstanceStackBuffer;
+        params.funcTable = module.funcTable;
+        params.hgRecords = hgRecords;
+        params.offsets = offsets;
+
+        ParamBufferState &pbs = getParamBuffer(params);
+
+#ifndef NDEBUG
+        LOG_VERBOSE("Launching intersect random");
+#endif
+        hipDeviceptr_t paramsPtr;
+        size_t paramsSize;
+        CUDA_CHECK(
+            hipModuleGetGlobal(&paramsPtr, &paramsSize, module.hipModule, "paramBuffer"));
+        CUDA_CHECK(hipMemcpyAsync((void *)paramsPtr, (const void *)pbs.ptr,
+                                  sizeof(params), hipMemcpyDeviceToDevice, cudaStream));
+
+        hiprtLaunch(module.oneRandomFunction, maxRays, 1, BlockSize, 1, nullptr,
+                    cudaStream);
+        CUDA_CHECK(hipEventRecord(pbs.finishedEvent));
+
+#ifndef NDEBUG
+        CUDA_CHECK(hipDeviceSynchronize());
+        LOG_VERBOSE("Post-sync intersect random");
+#endif
+    }
+
+    hipEventRecord(events.second);
+}
+
+std::map<int, TriQuadMesh> HiprtAggregate::PreparePLYMeshes(
+    const std::vector<ShapeSceneEntity> &shapes,
+    const std::map<std::string, FloatTexture> &floatTextures) {
+    std::map<int, TriQuadMesh> plyMeshes;
+    std::mutex mutex;
+    ParallelFor(0, shapes.size(), [&](int64_t i) {
+        const auto &shape = shapes[i];
+        if (shape.name != "plymesh")
+            return;
+
+        std::string filename =
+            ResolveFilename(shape.parameters.GetOneString("filename", ""));
+        if (filename.empty())
+            ErrorExit(&shape.loc, "plymesh: \"filename\" must be provided.");
+        TriQuadMesh plyMesh = TriQuadMesh::ReadPLY(filename);  // todo: alloc
+        if (!plyMesh.triIndices.empty() || !plyMesh.quadIndices.empty()) {
+            plyMesh.ConvertToOnlyTriangles();
+
+            Float edgeLength =
+                shape.parameters.GetOneFloat("displacement.edgelength", 1.f);
+            edgeLength *= Options->displacementEdgeScale;
+
+            std::string displacementTexName = shape.parameters.GetTexture("displacement");
+            if (!displacementTexName.empty()) {
+                auto iter = floatTextures.find(displacementTexName);
+                if (iter == floatTextures.end())
+                    ErrorExit(&shape.loc, "%s: no such texture defined.",
+                              displacementTexName);
+                FloatTexture displacement = iter->second;
+
+                LOG_VERBOSE("Starting to displace mesh \"%s\" with \"%s\"", filename,
+                            displacementTexName);
+
+                size_t origNumTris = plyMesh.triIndices.size() / 3;
+
+                plyMesh = plyMesh.Displace(
+                    [&](Point3f v0, Point3f v1) {
+                        v0 = (*shape.renderFromObject)(v0);
+                        v1 = (*shape.renderFromObject)(v1);
+                        return Distance(v0, v1);
+                    },
+                    edgeLength,
+                    [&](Point3f *pCPU, const Normal3f *nCPU, const Point2f *uvCPU,
+                        int nVertices) {
+                        Point3f *p;
+                        Normal3f *n;
+                        Point2f *uv;
+                        CUDA_CHECK(hipMallocManaged(&p, nVertices * sizeof(Point3f)));
+                        CUDA_CHECK(hipMallocManaged(&n, nVertices * sizeof(Normal3f)));
+                        CUDA_CHECK(hipMallocManaged(&uv, nVertices * sizeof(Point2f)));
+
+                        std::memcpy(p, pCPU, nVertices * sizeof(Point3f));
+                        std::memcpy(n, nCPU, nVertices * sizeof(Normal3f));
+                        std::memcpy(uv, uvCPU, nVertices * sizeof(Point2f));
+
+                        GPUParallelFor(
+                            "Evaluate Displacement", nVertices, [=] PBRT_GPU(int i) {
+                                TextureEvalContext ctx;
+                                ctx.p = p[i];
+                                ctx.uv = uv[i];
+                                Float d = UniversalTextureEvaluator()(displacement, ctx);
+                                p[i] += Vector3f(d * n[i]);
+                            });
+                        GPUWait();
+
+                        std::memcpy(pCPU, p, nVertices * sizeof(Point3f));
+
+                        CUDA_CHECK(hipFree(p));
+                        CUDA_CHECK(hipFree(n));
+                        CUDA_CHECK(hipFree(uv));
+                    },
+                    &shape.loc);
+
+                displacedTrisDelta += plyMesh.triIndices.size() / 3 - origNumTris;
+
+                LOG_VERBOSE("Finished displacing mesh \"%s\" with \"%s\" -> %d tris",
+                            filename, displacementTexName, plyMesh.triIndices.size() / 3);
+            }
+        }
+
+        std::lock_guard<std::mutex> lock(mutex);
+        plyMeshes[i] = std::move(plyMesh);
+    });
+
+    return plyMeshes;
+}
+
+HiprtAggregate::ParamBufferState &HiprtAggregate::getParamBuffer(
+    const RayIntersectParameters &params) const {
+    CHECK(nextParamOffset < paramsPool.size());
+
+    ParamBufferState &pbs = paramsPool[nextParamOffset];
+    if (++nextParamOffset == paramsPool.size())
+        nextParamOffset = 0;
+    if (!pbs.used)
+        pbs.used = true;
+    else
+        CUDA_CHECK(hipEventSynchronize(pbs.finishedEvent));
+
+    // Copy to host-side pinned memory
+    memcpy(pbs.hostPtr, &params, sizeof(params));
+    CUDA_CHECK(hipMemcpyAsync((void *)pbs.ptr, pbs.hostPtr, sizeof(params),
+                              hipMemcpyHostToDevice, cudaStream));
+
+    return pbs;
+}
+
+BilinearPatchMesh *HiprtAggregate::diceCurveToBLP(const ShapeSceneEntity &shape,
+                                                  int nDiceU, int nDiceV,
+                                                  Allocator alloc) {
+    CHECK_EQ(shape.name, "curve");
+    const ParameterDictionary &parameters = shape.parameters;
+    const FileLoc *loc = &shape.loc;
+
+    ++nCurves;
+
+    // Extract parameters; the following ~90 lines of code are,
+    // unfortunately, copied from Curve::Create.  We would like to avoid
+    // the overhead of splitting the curve and creating Curve objects, so
+    // here we go..
+    Float width = parameters.GetOneFloat("width", 1.f);
+    Float width0 = parameters.GetOneFloat("width0", width);
+    Float width1 = parameters.GetOneFloat("width1", width);
+
+    int degree = parameters.GetOneInt("degree", 3);
+    if (degree != 2 && degree != 3) {
+        Error(loc, "Invalid degree %d: only degree 2 and 3 curves are supported.",
+              degree);
+        return {};
+    }
+
+    std::string basis = parameters.GetOneString("basis", "bezier");
+    if (basis != "bezier" && basis != "bspline") {
+        Error(loc,
+              "Invalid basis \"%s\": only \"bezier\" and \"bspline\" are "
+              "supported.",
+              basis);
+        return {};
+    }
+
+    int nSegments;
+    std::vector<Point3f> cp = parameters.GetPoint3fArray("P");
+    bool bezierBasis = (basis == "bezier");
+    if (bezierBasis) {
+        // After the first segment, which uses degree+1 control points,
+        // subsequent segments reuse the last control point of the previous
+        // one and then use degree more control points.
+        if (((cp.size() - 1 - degree) % degree) != 0) {
+            Error(loc,
+                  "Invalid number of control points %d: for the degree %d "
+                  "Bezier basis %d + n * %d are required, for n >= 0.",
+                  (int)cp.size(), degree, degree + 1, degree);
+            return {};
+        }
+        nSegments = (cp.size() - 1) / degree;
+    } else {
+        if (cp.size() < degree + 1) {
+            Error(loc,
+                  "Invalid number of control points %d: for the degree %d "
+                  "b-spline basis, must have >= %d.",
+                  int(cp.size()), degree, degree + 1);
+            return {};
+        }
+        nSegments = cp.size() - degree;
+    }
+
+    CurveType type;
+    std::string curveType = parameters.GetOneString("type", "flat");
+    if (curveType == "flat")
+        type = CurveType::Flat;
+    else if (curveType == "ribbon")
+        type = CurveType::Ribbon;
+    else if (curveType == "cylinder")
+        type = CurveType::Cylinder;
+    else {
+        Error(loc, R"(Unknown curve type "%s".  Using "cylinder".)", curveType);
+        type = CurveType::Cylinder;
+    }
+
+    std::vector<Normal3f> n = parameters.GetNormal3fArray("N");
+    if (!n.empty()) {
+        if (type != CurveType::Ribbon) {
+            Warning("Curve normals are only used with \"ribbon\" type curves.");
+            n = {};
+        } else if (n.size() != nSegments + 1) {
+            Error(loc,
+                  "Invalid number of normals %d: must provide %d normals for "
+                  "ribbon curves with %d segments.",
+                  int(n.size()), nSegments + 1, nSegments);
+            return {};
+        }
+        for (Normal3f &nn : n)
+            Normalize(nn);
+    } else if (type == CurveType::Ribbon) {
+        Error(loc, "Must provide normals \"N\" at curve endpoints with ribbon "
+                   "curves.");
+        return {};
+    }
+
+    // Start dicing...
+    std::vector<int> blpIndices;
+    std::vector<Point3f> blpP;
+    std::vector<Normal3f> blpN;
+    std::vector<Point2f> blpUV;
+
+    int lastCPOffset = -1;
+    pstd::array<Point3f, 4> segCpBezier;
+
+    for (int i = 0; i <= nDiceU; ++i) {
+        Float u = Float(i) / Float(nDiceU);
+        Float width = Lerp(u, width0, width1);
+
+        int segmentIndex = int(u * nSegments);
+        if (segmentIndex == nSegments)  // u == 1...
+            --segmentIndex;
+
+        // Compute offset into original control points for current u
+        int cpOffset;
+        if (bezierBasis)
+            cpOffset = segmentIndex * degree;
+        else
+            // Uniform b-spline.
+            cpOffset = segmentIndex;
+
+        if (cpOffset != lastCPOffset) {
+            // update segCpBezier
+            if (bezierBasis) {
+                if (degree == 2) {
+                    // Elevate to degree 3.
+                    segCpBezier = ElevateQuadraticBezierToCubic(
+                        pstd::MakeConstSpan(cp).subspan(cpOffset, 3));
+                } else {
+                    // All set.
+                    for (int i = 0; i < 4; ++i)
+                        segCpBezier[i] = cp[cpOffset + i];
+                }
+            } else {
+                // Uniform b-spline.
+                if (degree == 2) {
+                    pstd::array<Point3f, 3> bezCp = QuadraticBSplineToBezier(
+                        pstd::MakeConstSpan(cp).subspan(cpOffset, 3));
+                    segCpBezier =
+                        ElevateQuadraticBezierToCubic(pstd::MakeConstSpan(bezCp));
+                } else {
+                    segCpBezier = CubicBSplineToBezier(
+                        pstd::MakeConstSpan(cp).subspan(cpOffset, 4));
+                }
+            }
+            lastCPOffset = cpOffset;
+        }
+
+        Float uSeg = (u * nSegments) - segmentIndex;
+        DCHECK(uSeg >= 0 && uSeg <= 1);
+
+        Vector3f dpdu;
+        Point3f p = EvaluateCubicBezier(segCpBezier, uSeg, &dpdu);
+
+        switch (type) {
+        case CurveType::Ribbon: {
+            Float normalAngle = AngleBetween(n[segmentIndex], n[segmentIndex + 1]);
+            Float invSinNormalAngle = 1 / std::sin(normalAngle);
+
+            Normal3f nu;
+            if (normalAngle == 0)
+                nu = n[segmentIndex];
+            else {
+                Float sin0 = std::sin((1 - uSeg) * normalAngle) * invSinNormalAngle;
+                Float sin1 = std::sin(uSeg * normalAngle) * invSinNormalAngle;
+                nu = sin0 * n[segmentIndex] + sin1 * n[segmentIndex + 1];
+            }
+            Vector3f dpdv = Normalize(Cross(nu, dpdu)) * width;
+
+            blpP.push_back(p - dpdv / 2);
+            blpP.push_back(p + dpdv / 2);
+            blpUV.push_back(Point2f(u, 0));
+            blpUV.push_back(Point2f(u, 1));
+
+            if (i > 0) {
+                blpIndices.push_back(2 * (i - 1));
+                blpIndices.push_back(2 * (i - 1) + 1);
+                blpIndices.push_back(2 * i);
+                blpIndices.push_back(2 * i + 1);
+            }
+            break;
+        }
+        case CurveType::Flat:
+        case CurveType::Cylinder: {
+            Vector3f ortho[2];
+            CoordinateSystem(Normalize(dpdu), &ortho[0], &ortho[1]);
+            ortho[0] *= width / 2;
+            ortho[1] *= width / 2;
+
+            // Repeat the first/last vertex so we can assign different
+            // texture coordinates...
+            for (int v = 0; v <= nDiceV; ++v) {
+                Float angle = Float(v) / nDiceV * 2 * Pi;
+                blpP.push_back(p + ortho[0] * std::cos(angle) +
+                               ortho[1] * std::sin(angle));
+                blpN.push_back(Normal3f(Normalize(blpP.back() - p)));
+                blpUV.push_back(Point2f(u, Float(v) / nDiceV));
+            }
+
+            if (i > 0) {
+                for (int v = 0; v < nDiceV; ++v) {
+                    // Indexing is funny due to doubled-up last vertex
+                    blpIndices.push_back((nDiceV + 1) * (i - 1) + v);
+                    blpIndices.push_back((nDiceV + 1) * (i - 1) + v + 1);
+                    blpIndices.push_back((nDiceV + 1) * i + v);
+                    blpIndices.push_back((nDiceV + 1) * i + v + 1);
+                }
+            }
+            break;
+        }
+        }
+    }
+
+    nBLPsForCurves += blpIndices.size() / 4;
+
+    return alloc.new_object<BilinearPatchMesh>(
+        *shape.renderFromObject, shape.reverseOrientation, blpIndices, blpP, blpN, blpUV,
+        std::vector<int>(), nullptr, alloc);
+}
+
+HiprtAggregate::GeometryContainer HiprtAggregate::buildBVHForTriangles(
+    const std::vector<ShapeSceneEntity> &shapes,
+    const std::map<int, TriQuadMesh> &plyMeshes, hiprtContext context,
+    const std::map<std::string, FloatTexture> &floatTextures,
+    const std::map<std::string, Material> &namedMaterials,
+    const std::vector<Material> &materials, const std::map<std::string, Medium> &media,
+    const std::map<int, pstd::vector<Light> *> &shapeIndexToAreaLights,
+    ThreadLocal<Allocator> &threadAllocators,
+    ThreadLocal<hipStream_t> &threadCUDAStreams) {
+    // Count how many of the shapes are triangle meshes
+    std::vector<size_t> meshIndexToShapeIndex;
+    for (size_t i = 0; i < shapes.size(); ++i) {
+        const auto &shape = shapes[i];
+        if (shape.name == "trianglemesh" || shape.name == "plymesh" ||
+            shape.name == "loopsubdiv")
+            meshIndexToShapeIndex.push_back(i);
+    }
+
+    size_t nMeshes = meshIndexToShapeIndex.size();
+    if (nMeshes == 0)
+        return {};
+
+    LOG_VERBOSE("Building triangle BLAS");
+
+    std::vector<TriangleMesh *> meshes(nMeshes, nullptr);
+    ParallelFor(0, nMeshes, [&](int64_t meshIndex) {
+        Allocator alloc = threadAllocators.Get();
+        size_t shapeIndex = meshIndexToShapeIndex[meshIndex];
+        const auto &shape = shapes[shapeIndex];
+
+        TriangleMesh *mesh = nullptr;
+        if (shape.name == "trianglemesh") {
+            mesh = Triangle::CreateMesh(shape.renderFromObject, shape.reverseOrientation,
+                                        shape.parameters, &shape.loc, alloc);
+            CHECK(mesh != nullptr);
+        } else if (shape.name == "loopsubdiv") {
+            // Copied from pbrt/shapes.cpp... :-p
+            int nLevels = shape.parameters.GetOneInt("levels", 3);
+            std::vector<int> vertexIndices = shape.parameters.GetIntArray("indices");
+            if (vertexIndices.empty())
+                ErrorExit(&shape.loc, "Vertex indices \"indices\" not "
+                                      "provided for LoopSubdiv shape.");
+
+            std::vector<Point3f> P = shape.parameters.GetPoint3fArray("P");
+            if (P.empty())
+                ErrorExit(&shape.loc, "Vertex positions \"P\" not provided "
+                                      "for LoopSubdiv shape.");
+
+            // don't actually use this for now...
+            std::string scheme = shape.parameters.GetOneString("scheme", "loop");
+
+            mesh = LoopSubdivide(shape.renderFromObject, shape.reverseOrientation,
+                                 nLevels, vertexIndices, P, alloc);
+            CHECK(mesh != nullptr);
+        } else if (shape.name == "plymesh") {
+            auto plyIter = plyMeshes.find(shapeIndex);
+            CHECK(plyIter != plyMeshes.end());
+            const TriQuadMesh &plyMesh = plyIter->second;
+
+            if (!plyMesh.quadIndices.empty() && shape.lightIndex != -1) {
+#if 0
+                // If you'd like to know what they are...
+                for (int i = 0; i < plyMesh.quadIndices.size(); ++i)
+                    Printf("%s\n", plyMesh.p[plyMesh.quadIndices[i]]);
+#endif
+                // This would be nice to fix, but it involves some
+                // plumbing and it's a rare case. The underlying issue
+                // is that when we create AreaLights for emissive
+                // shapes earlier, we're not expecting this..
+                std::string filename =
+                    ResolveFilename(shape.parameters.GetOneString("filename", ""));
+                ErrorExit(&shape.loc,
+                          "%s: PLY file being used as an area light has quads--"
+                          "this is currently unsupported. Please replace them with "
+                          "\"bilinearmesh\" "
+                          "shapes as a workaround. (Sorry!).",
+                          filename);
+            }
+
+            mesh = alloc.new_object<TriangleMesh>(
+                *shape.renderFromObject, shape.reverseOrientation, plyMesh.triIndices,
+                plyMesh.p, std::vector<Vector3f>(), plyMesh.n, plyMesh.uv,
+                plyMesh.faceIndices, alloc);
+        } else
+            LOG_FATAL("Logic error in GPUAggregate::buildBVHForTriangles()");
+
+        meshes[meshIndex] = mesh;
+    });
+
+    GeometryContainer geomContainer(nMeshes);
+
+    ParallelFor(0, nMeshes, [&](int64_t startIndex, int64_t endIndex) {
+        Allocator alloc = threadAllocators.Get();
+
+        for (int meshIndex = startIndex; meshIndex < endIndex; ++meshIndex) {
+            TriangleMesh *mesh = meshes[meshIndex];
+            size_t shapeIndex = meshIndexToShapeIndex[meshIndex];
+            const auto &shape = shapes[shapeIndex];
+            FloatTexture alphaTexture = getAlphaTexture(shape, floatTextures, alloc);
+            Material material = getMaterial(shape, namedMaterials, materials);
+
+            // Do this here, after the alpha texture has been consumed.
+            shape.parameters.ReportUnused();
+
+            HitgroupRecord &hgRecord = geomContainer.hgRecords[meshIndex];
+            hgRecord.type = HitgroupRecord::TriangleMesh;
+            hgRecord.triRec.mesh = mesh;
+            hgRecord.triRec.material = material;
+            hgRecord.triRec.alphaTexture = alphaTexture;
+            hgRecord.triRec.areaLights = {};
+            if (shape.lightIndex != -1) {
+                if (!material)
+                    Warning(&shape.loc, "Ignoring area light specification for shape "
+                                        "with \"interface\" material.");
+                else {
+                    // Note: this will hit if we try to have an instance as an area
+                    // light.
+                    auto iter = shapeIndexToAreaLights.find(shapeIndex);
+                    CHECK(iter != shapeIndexToAreaLights.end());
+                    CHECK_EQ(iter->second->size(), mesh->nTriangles);
+                    hgRecord.triRec.areaLights = pstd::MakeSpan(*iter->second);
+                }
+            }
+            hgRecord.triRec.mediumInterface = getMediumInterface(shape, media, alloc);
+        }
+    });
+
+    hiprtBuildOptions options;
+    options.buildFlags = hiprtBuildFlagBitPreferBalancedBuild;
+    options.batchBuildMaxPrimCount = 512u;
+
+    size_t nTris = 0;
+    size_t nVerts = 0;
+    std::vector<size_t> triOffsets;
+    std::vector<size_t> vertsOffsets;
+    for (size_t meshIndex = 0; meshIndex < nMeshes; ++meshIndex) {
+        TriangleMesh *mesh = meshes[meshIndex];
+        triOffsets.push_back(nTris);
+        vertsOffsets.push_back(nVerts);
+        nTris += mesh->nTriangles;
+        nVerts += mesh->nVertices;
+    }
+
+    hiprtInt3 *tris;
+    CUDA_CHECK(hipMalloc(&tris, sizeof(hiprtInt3) * nTris));
+
+    hiprtFloat3 *verts;
+    CUDA_CHECK(hipMalloc(&verts, sizeof(hiprtFloat3) * nVerts));
+
+    std::vector<hiprtInt3> triBuffer(nTris);
+    std::vector<hiprtFloat3> vertBuffer(nVerts);
+
+    std::vector<hiprtGeometryBuildInput> geomInputs(nMeshes);
+    std::vector<hiprtGeometry> geoms(nMeshes);
+    std::vector<hiprtGeometry *> geomAddrs(nMeshes);
+    ParallelFor(0, nMeshes, [&](int64_t meshIndex) {
+        TriangleMesh *mesh = meshes[meshIndex];
+        size_t triOffset = triOffsets[meshIndex];
+        size_t vertOffset = vertsOffsets[meshIndex];
+
+        hiprtTriangleMeshPrimitive prim;
+        prim.triangleCount = mesh->nTriangles;
+        prim.triangleStride = sizeof(hiprtInt3);
+        prim.triangleIndices = tris + triOffset;
+        prim.vertexCount = mesh->nVertices;
+        prim.vertexStride = sizeof(hiprtFloat3);
+        prim.vertices = verts + vertOffset;
+
+        hiprtGeometryBuildInput &geomInput = geomInputs[meshIndex];
+        geomInput.type = hiprtPrimitiveTypeTriangleMesh;
+        geomInput.primitive.triangleMesh = prim;
+        geomInput.geomType = 0;
+
+        geomAddrs[meshIndex] = &geoms[meshIndex];
+
+        std::memcpy(&triBuffer[triOffset], mesh->vertexIndices,
+                    sizeof(hiprtInt3) * mesh->nTriangles);
+        std::memcpy(&vertBuffer[vertOffset], mesh->p,
+                    sizeof(hiprtFloat3) * mesh->nVertices);
+    });
+
+    hipStream_t buildStream = threadCUDAStreams.Get();
+    CUDA_CHECK(hipMemcpyAsync(tris, triBuffer.data(), sizeof(hiprtInt3) * nTris,
+                              hipMemcpyHostToDevice, buildStream));
+    CUDA_CHECK(hipMemcpyAsync(verts, vertBuffer.data(), sizeof(hiprtFloat3) * nVerts,
+                              hipMemcpyHostToDevice, buildStream));
+
+    size_t geomTempSize;
+    HIPRT_CHECK(hiprtGetGeometriesBuildTemporaryBufferSize(
+        context, nMeshes, geomInputs.data(), options, geomTempSize));
+
+    hiprtDevicePtr tempBuffer;
+    CUDA_CHECK(hipMalloc(&tempBuffer, geomTempSize));
+
+    HIPRT_CHECK(hiprtCreateGeometries(context, nMeshes, geomInputs.data(), options,
+                                      geomAddrs.data()));
+
+    LOG_VERBOSE("Starting to build triangle mesh geometries");
+    HIPRT_CHECK(hiprtBuildGeometries(context, hiprtBuildOperationBuild, nMeshes,
+                                     geomInputs.data(), options, tempBuffer, buildStream,
+                                     geoms.data()));
+    LOG_VERBOSE("Finished building triangle mesh geometries");
+
+    CUDA_CHECK(hipFree(tris));
+    CUDA_CHECK(hipFree(verts));
+    CUDA_CHECK(hipFree(tempBuffer));
+
+    hiprtFrameMatrix identity{};
+    for (size_t i = 0; i < 3; ++i)
+        identity.matrix[i][i] = 1.0f;
+
+    std::vector<hiprtInstance> instances(nMeshes);
+    std::vector<hiprtFrameMatrix> transforms(nMeshes);
+    ParallelFor(0, nMeshes, [&](int64_t meshIndex) {
+        instances[meshIndex].type = hiprtInstanceTypeGeometry;
+        instances[meshIndex].geometry = geoms[meshIndex];
+        transforms[meshIndex] = identity;
+    });
+
+    geomContainer.scene = buildBVHForInstances(instances, transforms, context,
+                                               threadAllocators, threadCUDAStreams);
+
+    return geomContainer;
+}
+
+HiprtAggregate::GeometryContainer HiprtAggregate::buildBVHForBLPs(
+    const std::vector<ShapeSceneEntity> &shapes, hiprtContext context,
+    const std::map<std::string, FloatTexture> &floatTextures,
+    const std::map<std::string, Material> &namedMaterials,
+    const std::vector<Material> &materials, const std::map<std::string, Medium> &media,
+    const std::map<int, pstd::vector<Light> *> &shapeIndexToAreaLights,
+    ThreadLocal<Allocator> &threadAllocators,
+    ThreadLocal<hipStream_t> &threadCUDAStreams) {
+    // Count how many BLP meshes there are in shapes
+    std::vector<size_t> meshIndexToShapeIndex;
+    for (size_t i = 0; i < shapes.size(); ++i) {
+        const auto &shape = shapes[i];
+        if (shape.name == "bilinearmesh" || shape.name == "curve")
+            meshIndexToShapeIndex.push_back(i);
+    }
+
+    size_t nMeshes = meshIndexToShapeIndex.size();
+    if (nMeshes == 0)
+        return {};
+
+    LOG_VERBOSE("Building bilinear patch BLAS");
+
+    // Create meshes
+    std::vector<BilinearPatchMesh *> meshes(nMeshes, nullptr);
+    ParallelFor(0, nMeshes, [&](int64_t meshIndex) {
+        Allocator alloc = threadAllocators.Get();
+        size_t shapeIndex = meshIndexToShapeIndex[meshIndex];
+        const auto &shape = shapes[shapeIndex];
+
+        if (shape.name == "bilinearmesh") {
+            BilinearPatchMesh *mesh = BilinearPatch::CreateMesh(
+                shape.renderFromObject, shape.reverseOrientation, shape.parameters,
+                &shape.loc, alloc);
+            meshes[meshIndex] = mesh;
+        } else if (shape.name == "curve") {
+            BilinearPatchMesh *curveMesh =
+                diceCurveToBLP(shape, 5 /* nseg */, 5 /* nvert */, alloc);
+            if (curveMesh) {
+                meshes[meshIndex] = curveMesh;
+            }
+        }
+    });
+
+    GeometryContainer geomContainer(nMeshes);
+
+    ParallelFor(0, nMeshes, [&](int64_t startIndex, int64_t endIndex) {
+        Allocator alloc = threadAllocators.Get();
+
+        for (int meshIndex = startIndex; meshIndex < endIndex; ++meshIndex) {
+            BilinearPatchMesh *mesh = meshes[meshIndex];
+            size_t shapeIndex = meshIndexToShapeIndex[meshIndex];
+            const auto &shape = shapes[shapeIndex];
+            Material material = getMaterial(shape, namedMaterials, materials);
+            FloatTexture alphaTexture = getAlphaTexture(shape, floatTextures, alloc);
+
+            // After "alpha" has been consumed...
+            shape.parameters.ReportUnused();
+
+            HitgroupRecord &hgRecord = geomContainer.hgRecords[meshIndex];
+            hgRecord.type = HitgroupRecord::BilinearMesh;
+            hgRecord.blpRec.mesh = mesh;
+            hgRecord.blpRec.material = material;
+            hgRecord.blpRec.alphaTexture = alphaTexture;
+            hgRecord.blpRec.areaLights = {};
+            if (shape.lightIndex != -1) {
+                if (!material)
+                    Warning(&shape.loc,
+                            "Ignoring area light specification for shape with "
+                            "\"interface\" material.");
+                else {
+                    auto iter = shapeIndexToAreaLights.find(shapeIndex);
+                    // Note: this will hit if we try to have an instance as an area
+                    // light.
+                    CHECK(iter != shapeIndexToAreaLights.end());
+                    CHECK_EQ(iter->second->size(), mesh->nPatches);
+                    hgRecord.blpRec.areaLights = pstd::MakeSpan(*iter->second);
+                }
+            }
+            hgRecord.blpRec.mediumInterface = getMediumInterface(shape, media, alloc);
+        }
+    });
+
+    hiprtBuildOptions options;
+    options.buildFlags = hiprtBuildFlagBitPreferBalancedBuild;
+    options.batchBuildMaxPrimCount = 512u;
+
+    size_t nAabbs = 0;
+    std::vector<size_t> aabbOffsets;
+    for (size_t meshIndex = 0; meshIndex < nMeshes; ++meshIndex) {
+        BilinearPatchMesh *mesh = meshes[meshIndex];
+        aabbOffsets.push_back(nAabbs);
+        nAabbs += mesh->nPatches;
+    }
+
+    hiprtFloat4 *aabbs;
+    CUDA_CHECK(hipMalloc(&aabbs, sizeof(hiprtFloat4) * 2 * nAabbs));
+
+    std::vector<hiprtFloat4> aabbBuffer(2 * nAabbs);
+
+    std::vector<hiprtGeometryBuildInput> geomInputs(nMeshes);
+    std::vector<hiprtGeometry> geoms(nMeshes);
+    std::vector<hiprtGeometry *> geomAddrs(nMeshes);
+    ParallelFor(0, nMeshes, [&](int64_t meshIndex) {
+        BilinearPatchMesh *mesh = meshes[meshIndex];
+        size_t aabbIndex = aabbOffsets[meshIndex];
+
+        hiprtAABBListPrimitive prim;
+        prim.aabbCount = mesh->nPatches;
+        prim.aabbStride = 2 * sizeof(hiprtFloat4);
+        prim.aabbs = aabbs + 2 * aabbIndex;
+
+        hiprtGeometryBuildInput &geomInput = geomInputs[meshIndex];
+        geomInput.type = hiprtPrimitiveTypeAABBList;
+        geomInput.primitive.aabbList = prim;
+        geomInput.geomType = 1;
+
+        geomAddrs[meshIndex] = &geoms[meshIndex];
+
+        for (int patchIndex = 0; patchIndex < mesh->nPatches; ++patchIndex) {
+            Bounds3f patchBounds;
+            for (int i = 0; i < 4; ++i)
+                patchBounds =
+                    Union(patchBounds, mesh->p[mesh->vertexIndices[4 * patchIndex + i]]);
+            hiprtFloat4 aabbMin =
+                hiprtFloat4{float(patchBounds.pMin.x), float(patchBounds.pMin.y),
+                            float(patchBounds.pMin.z), 0.0f};
+            hiprtFloat4 aabbMax =
+                hiprtFloat4{float(patchBounds.pMax.x), float(patchBounds.pMax.y),
+                            float(patchBounds.pMax.z), 0.0f};
+            aabbBuffer[2 * aabbIndex + 0] = aabbMin;
+            aabbBuffer[2 * aabbIndex + 1] = aabbMax;
+            ++aabbIndex;
+        }
+    });
+
+    hipStream_t buildStream = threadCUDAStreams.Get();
+    CUDA_CHECK(hipMemcpyAsync(aabbs, aabbBuffer.data(), sizeof(hiprtFloat4) * 2 * nAabbs,
+                              hipMemcpyHostToDevice, buildStream));
+
+    size_t geomTempSize;
+    HIPRT_CHECK(hiprtGetGeometriesBuildTemporaryBufferSize(
+        context, nMeshes, geomInputs.data(), options, geomTempSize));
+
+    hiprtDevicePtr tempBuffer;
+    CUDA_CHECK(hipMalloc(&tempBuffer, geomTempSize));
+
+    HIPRT_CHECK(hiprtCreateGeometries(context, nMeshes, geomInputs.data(), options,
+                                      geomAddrs.data()));
+
+    LOG_VERBOSE("Starting to build bilinear patch mesh geometries");
+    HIPRT_CHECK(hiprtBuildGeometries(context, hiprtBuildOperationBuild, nMeshes,
+                                     geomInputs.data(), options, tempBuffer, buildStream,
+                                     geoms.data()));
+    LOG_VERBOSE("Finished building bilinear patch mesh geometries");
+
+    CUDA_CHECK(hipFree(aabbs));
+    CUDA_CHECK(hipFree(tempBuffer));
+
+    hiprtFrameMatrix identity{};
+    for (size_t i = 0; i < 3; ++i)
+        identity.matrix[i][i] = 1.0f;
+
+    std::vector<hiprtInstance> instances(nMeshes);
+    std::vector<hiprtFrameMatrix> transforms(nMeshes);
+    ParallelFor(0, nMeshes, [&](int64_t meshIndex) {
+        instances[meshIndex].type = hiprtInstanceTypeGeometry;
+        instances[meshIndex].geometry = geoms[meshIndex];
+        transforms[meshIndex] = identity;
+    });
+
+    geomContainer.scene = buildBVHForInstances(instances, transforms, context,
+                                               threadAllocators, threadCUDAStreams);
+
+    return geomContainer;
+}
+
+HiprtAggregate::GeometryContainer HiprtAggregate::buildBVHForQuadrics(
+    const std::vector<ShapeSceneEntity> &shapes, hiprtContext context,
+    const std::map<std::string, FloatTexture> &floatTextures,
+    const std::map<std::string, Material> &namedMaterials,
+    const std::vector<Material> &materials, const std::map<std::string, Medium> &media,
+    const std::map<int, pstd::vector<Light> *> &shapeIndexToAreaLights,
+    ThreadLocal<Allocator> &threadAllocators,
+    ThreadLocal<hipStream_t> &threadCUDAStreams) {
+    int nQuadrics = 0;
+    for (size_t shapeIndex = 0; shapeIndex < shapes.size(); ++shapeIndex) {
+        const auto &s = shapes[shapeIndex];
+        if (s.name == "sphere" || s.name == "cylinder" || s.name == "disk")
+            ++nQuadrics;
+    }
+
+    if (nQuadrics == 0)
+        return {};
+
+    LOG_VERBOSE("Building quadric BLAS");
+
+    hiprtBuildOptions options;
+    options.buildFlags = hiprtBuildFlagBitPreferBalancedBuild;
+    options.batchBuildMaxPrimCount = 512u;
+
+    Allocator alloc = threadAllocators.Get();
+    hiprtFloat4 *aabbs;
+    CUDA_CHECK(hipMalloc(&aabbs, sizeof(hiprtFloat4) * 2 * nQuadrics));
+
+    std::vector<hiprtFloat4> aabbBuffer(2 * nQuadrics);
+
+    int quadricIndex = 0;
+    GeometryContainer geomContainer(nQuadrics);
+    for (size_t shapeIndex = 0; shapeIndex < shapes.size(); ++shapeIndex) {
+        const auto &s = shapes[shapeIndex];
+        if (s.name != "sphere" && s.name != "cylinder" && s.name != "disk")
+            continue;
+
+        pstd::vector<Shape> shapes = Shape::Create(
+            s.name, s.renderFromObject, s.objectFromRender, s.reverseOrientation,
+            s.parameters, floatTextures, &s.loc, alloc);
+        if (shapes.empty())
+            continue;
+        CHECK_EQ(1, shapes.size());
+        Shape shape = shapes[0];
+
+        Bounds3f shapeBounds = shape.Bounds();
+        hiprtFloat4 aabbMin =
+            hiprtFloat4{float(shapeBounds.pMin.x), float(shapeBounds.pMin.y),
+                        float(shapeBounds.pMin.z), 0.0f};
+        hiprtFloat4 aabbMax =
+            hiprtFloat4{float(shapeBounds.pMax.x), float(shapeBounds.pMax.y),
+                        float(shapeBounds.pMax.z), 0.0f};
+        aabbBuffer[2 * quadricIndex + 0] = aabbMin;
+        aabbBuffer[2 * quadricIndex + 1] = aabbMax;
+
+        // Find alpha texture, if present.
+        Material material = getMaterial(s, namedMaterials, materials);
+        FloatTexture alphaTexture = getAlphaTexture(s, floatTextures, alloc);
+
+        // Once again, after any alpha texture is created...
+        s.parameters.ReportUnused();
+
+        HitgroupRecord &hgRecord = geomContainer.hgRecords[quadricIndex];
+        hgRecord.type = HitgroupRecord::Quadric;
+        hgRecord.quadricRec.shape = shape;
+        hgRecord.quadricRec.material = material;
+        hgRecord.quadricRec.alphaTexture = alphaTexture;
+        hgRecord.quadricRec.areaLight = nullptr;
+        if (s.lightIndex != -1) {
+            if (!material)
+                Warning(&s.loc, "Ignoring area light specification for shape with "
+                                "\"interface\" material.");
+            else {
+                auto iter = shapeIndexToAreaLights.find(shapeIndex);
+                // Note: this will hit if we try to have an instance as an area
+                // light.
+                CHECK(iter != shapeIndexToAreaLights.end());
+                CHECK_EQ(iter->second->size(), 1);
+                hgRecord.quadricRec.areaLight = (*iter->second)[0];
+            }
+        }
+        hgRecord.quadricRec.mediumInterface = getMediumInterface(s, media, alloc);
+
+        ++quadricIndex;
+    }
+    nQuadrics = quadricIndex;
+    geomContainer.resize(nQuadrics);
+
+    std::vector<hiprtGeometryBuildInput> geomInputs(nQuadrics);
+    std::vector<hiprtGeometry> geoms(nQuadrics);
+    std::vector<hiprtGeometry *> geomAddrs(nQuadrics);
+    ParallelFor(0, nQuadrics, [&](int64_t quadricIndex) {
+        hiprtAABBListPrimitive prim;
+        prim.aabbCount = 1;
+        prim.aabbStride = 2 * sizeof(hiprtFloat4);
+        prim.aabbs = aabbs + 2 * quadricIndex;
+
+        hiprtGeometryBuildInput &geomInput = geomInputs[quadricIndex];
+        geomInput.type = hiprtPrimitiveTypeAABBList;
+        geomInput.primitive.aabbList = prim;
+        geomInput.geomType = 2;
+
+        geomAddrs[quadricIndex] = &geoms[quadricIndex];
+    });
+
+    hipStream_t buildStream = threadCUDAStreams.Get();
+    CUDA_CHECK(hipMemcpyAsync(aabbs, aabbBuffer.data(),
+                              sizeof(hiprtFloat4) * 2 * nQuadrics, hipMemcpyHostToDevice,
+                              buildStream));
+
+    size_t geomTempSize;
+    HIPRT_CHECK(hiprtGetGeometriesBuildTemporaryBufferSize(
+        context, nQuadrics, geomInputs.data(), options, geomTempSize));
+
+    hiprtDevicePtr tempBuffer;
+    CUDA_CHECK(hipMalloc(&tempBuffer, geomTempSize));
+
+    HIPRT_CHECK(hiprtCreateGeometries(context, nQuadrics, geomInputs.data(), options,
+                                      geomAddrs.data()));
+
+    LOG_VERBOSE("Starting to build quadric geometries");
+    HIPRT_CHECK(hiprtBuildGeometries(context, hiprtBuildOperationBuild, nQuadrics,
+                                     geomInputs.data(), options, tempBuffer, buildStream,
+                                     geoms.data()));
+    LOG_VERBOSE("Finished building quadric geometries");
+
+    CUDA_CHECK(hipFree(aabbs));
+    CUDA_CHECK(hipFree(tempBuffer));
+
+    hiprtFrameMatrix identity{};
+    for (size_t i = 0; i < 3; ++i)
+        identity.matrix[i][i] = 1.0f;
+
+    std::vector<hiprtInstance> instances(nQuadrics);
+    std::vector<hiprtFrameMatrix> transforms(nQuadrics);
+    ParallelFor(0, nQuadrics, [&](int64_t quadricIndex) {
+        instances[quadricIndex].type = hiprtInstanceTypeGeometry;
+        instances[quadricIndex].geometry = geoms[quadricIndex];
+        transforms[quadricIndex] = identity;
+    });
+
+    geomContainer.scene = buildBVHForInstances(instances, transforms, context,
+                                               threadAllocators, threadCUDAStreams);
+
+    return geomContainer;
+}
+
+hiprtScene HiprtAggregate::buildBVHForInstances(
+    const std::vector<hiprtInstance> &instances,
+    const std::vector<hiprtFrameMatrix> &transforms, hiprtContext context,
+    ThreadLocal<Allocator> &threadAllocators,
+    ThreadLocal<hipStream_t> &threadCUDAStreams) {
+    CHECK(transforms.size() == instances.size());
+    hiprtBuildOptions options;
+    options.buildFlags = hiprtBuildFlagBitPreferBalancedBuild;
+    options.batchBuildMaxPrimCount = 512u;
+
+    hiprtSceneBuildInput sceneInput;
+    sceneInput.instanceCount = instances.size();
+    sceneInput.frameCount = instances.size();
+    sceneInput.instanceFrames = nullptr;
+    sceneInput.instanceMasks = nullptr;
+    sceneInput.instanceTransformHeaders = nullptr;
+    sceneInput.instances = nullptr;
+    sceneInput.frameType = hiprtFrameTypeMatrix;
+
+    size_t sceneTempSize;
+    HIPRT_CHECK(hiprtGetSceneBuildTemporaryBufferSize(context, sceneInput, options,
+                                                      sceneTempSize));
+
+    hiprtDevicePtr tempBuffer;
+    CUDA_CHECK(hipMalloc(&tempBuffer, sceneTempSize));
+    CUDA_CHECK(
+        hipMalloc(&sceneInput.instances, sizeof(hiprtInstance) * instances.size()));
+    CUDA_CHECK(hipMalloc(&sceneInput.instanceFrames,
+                         sizeof(hiprtFrameMatrix) * transforms.size()));
+
+    hipStream_t buildStream = threadCUDAStreams.Get();
+    CUDA_CHECK(hipMemcpyAsync((void *)sceneInput.instances, instances.data(),
+                              sizeof(hiprtInstance) * instances.size(),
+                              hipMemcpyHostToDevice, buildStream));
+    CUDA_CHECK(hipMemcpyAsync((void *)sceneInput.instanceFrames, transforms.data(),
+                              sizeof(hiprtFrameMatrix) * transforms.size(),
+                              hipMemcpyHostToDevice, buildStream));
+
+    hiprtScene scene;
+    HIPRT_CHECK(hiprtCreateScene(context, sceneInput, options, scene));
+
+    LOG_VERBOSE("Started to build scene");
+    HIPRT_CHECK(hiprtBuildScene(context, hiprtBuildOperationBuild, sceneInput, options,
+                                tempBuffer, buildStream, scene));
+    LOG_VERBOSE("Finished building scene");
+
+    CUDA_CHECK(hipFree(tempBuffer));
+    CUDA_CHECK(hipFree(sceneInput.instances));
+    CUDA_CHECK(hipFree(sceneInput.instanceFrames));
+
+    return scene;
+}
+
+static void loadFile(const std::string& path, std::vector<char> &dst) {
+    std::fstream f(path, std::ios::binary | std::ios::in);
+    if (f.is_open()) {
+        size_t sizeFile;
+        f.seekg(0, std::fstream::end);
+        size_t size = sizeFile = (size_t)f.tellg();
+        dst.resize(size);
+        f.seekg(0, std::fstream::beg);
+        f.read(dst.data(), size);
+        f.close();
+    }
+}
+
+HiprtAggregate::Module HiprtAggregate::compileHiprtModule(hiprtContext context) {
+    const std::string path = "../src/pbrt/gpu/hiprt/hiprt.cu";
+    const std::string closestFunction = "__raygen__findClosest";
+    const std::string shadowFunction = "__raygen__shadow";
+    const std::string shadowTrFunction = "__raygen__shadow_Tr";
+    const std::string oneRandomFunction = "__raygen__randomHit";
+    const std::string binFilename = "hiprt.hipfb";
+
+    std::vector<char> binary;
+    loadFile(binFilename, binary);
+
+    Module module;
+    CUDA_CHECK(hipModuleLoadData(&module.hipModule, binary.data()));
+
+    HIPRT_CHECK(hiprtCreateFuncTable(context, 3, 1, module.funcTable));
+
+    CUDA_CHECK(hipModuleGetFunction(&module.closestFunction, module.hipModule,
+                                    closestFunction.c_str()));
+    CUDA_CHECK(hipModuleGetFunction(&module.shadowFunction, module.hipModule,
+                                    shadowFunction.c_str()));
+    CUDA_CHECK(hipModuleGetFunction(&module.shadowTrFunction, module.hipModule,
+                                    shadowTrFunction.c_str()));
+    CUDA_CHECK(hipModuleGetFunction(&module.oneRandomFunction, module.hipModule,
+                                    oneRandomFunction.c_str()));
+
+    return module;
+}
+
+void HiprtAggregate::hiprtLaunch(hipFunction_t func, int nx, int ny, int tx, int ty,
+                                 void **args, hipStream_t cudaStream,
+                                 size_t sharedMemoryBytes) {
+    int3 tpb = {tx, ty, 1};
+    int3 nb;
+    nb.x = (nx + tpb.x - 1) / tpb.x;
+    nb.y = (ny + tpb.y - 1) / tpb.y;
+    CUDA_CHECK(hipModuleLaunchKernel(func, nb.x, nb.y, 1, tpb.x, tpb.y, 1,
+                                     sharedMemoryBytes, cudaStream, args, 0));
+}
+
+}  // namespace pbrt
diff --git a/src/pbrt/gpu/hiprt/aggregate.h b/src/pbrt/gpu/hiprt/aggregate.h
new file mode 100644
index 000000000..e9fdc61bc
--- /dev/null
+++ b/src/pbrt/gpu/hiprt/aggregate.h
@@ -0,0 +1,191 @@
+#ifndef PBRT_HIPRT_AGGREGATE_H
+#define PBRT_HIPRT_AGGREGATE_H
+
+#include <pbrt/pbrt.h>
+
+#include <pbrt/gpu/common.h>
+#include <pbrt/gpu/memory.h>
+#include <pbrt/scene.h>
+#include <pbrt/util/containers.h>
+#include <pbrt/util/pstd.h>
+#include <pbrt/util/soa.h>
+#include <pbrt/util/vecmath.h>
+#include <pbrt/wavefront/integrator.h>
+#include <pbrt/wavefront/workitems.h>
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include <hip/hip_runtime.h>
+#include <hiprt/hiprt.h>
+
+namespace pbrt {
+
+class HiprtAggregate : public WavefrontAggregate {
+  public:
+    HiprtAggregate(const BasicScene &scene, CUDATrackedMemoryResource *memoryResource,
+                   NamedTextures &textures,
+                   const std::map<int, pstd::vector<Light> *> &shapeIndexToAreaLights,
+                   const std::map<std::string, Medium> &media,
+                   const std::map<std::string, pbrt::Material> &namedMaterials,
+                   const std::vector<pbrt::Material> &materials, int maxQueueSize);
+
+    Bounds3f Bounds() const { return bounds; }
+
+    void IntersectClosest(int maxRays, const RayQueue *rayQueue,
+                          EscapedRayQueue *escapedRayQueue,
+                          HitAreaLightQueue *hitAreaLightQueue,
+                          MaterialEvalQueue *basicEvalMaterialQueue,
+                          MaterialEvalQueue *universalEvalMaterialQueue,
+                          MediumSampleQueue *mediumSampleQueue,
+                          RayQueue *nextRayQueue) const;
+
+    void IntersectShadow(int maxRays, ShadowRayQueue *shadowRayQueue,
+                         SOA<PixelSampleState> *pixelSampleState) const;
+
+    void IntersectShadowTr(int maxRays, ShadowRayQueue *shadowRayQueue,
+                           SOA<PixelSampleState> *pixelSampleState) const;
+
+    void IntersectOneRandom(int maxRays,
+                            SubsurfaceScatterQueue *subsurfaceScatterQueue) const;
+
+    // WAR: The enclosing parent function ("PreparePLYMeshes") for an
+    // extended __device__ lambda cannot have private or protected access
+    // within its class, so it's public...
+    static std::map<int, TriQuadMesh> PreparePLYMeshes(
+        const std::vector<ShapeSceneEntity> &shapes,
+        const std::map<std::string, FloatTexture> &floatTextures);
+
+  private:
+    static constexpr size_t BlockSize = 64u;
+
+    struct ParamBufferState {
+        bool used = false;
+        hipEvent_t finishedEvent;
+        hipDeviceptr_t ptr = 0;
+        void *hostPtr = nullptr;
+    };
+
+    mutable std::vector<ParamBufferState> paramsPool;
+    mutable size_t nextParamOffset = 0;
+
+    struct Module {
+        hipModule_t hipModule;
+        hipFunction_t closestFunction;
+        hipFunction_t shadowFunction;
+        hipFunction_t shadowTrFunction;
+        hipFunction_t oneRandomFunction;
+        hiprtFuncTable funcTable;
+    };
+
+    struct GeometryContainer {
+        GeometryContainer() = default;
+        GeometryContainer(size_t size) : hgRecords(size) {}
+
+        void resize(size_t size) {
+            hgRecords.resize(size);
+        }
+
+        uint32_t offset;
+        hiprtScene scene;
+        std::vector<HitgroupRecord> hgRecords;
+    };
+
+    struct GeometryGroup {
+        GeometryContainer triGeomContainer;
+        GeometryContainer blpGeomContainer;
+        GeometryContainer quadricGeomContainer;
+    };
+
+    struct InstanceContainer {
+        void appendRecords(GeometryContainer &g) {
+            g.offset = hgRecords.size();
+            hgRecords.insert(hgRecords.end(), g.hgRecords.begin(), g.hgRecords.end());
+        }
+
+        void insertInstance(GeometryContainer &g, const hiprtFrameMatrix& m) { 
+            if (g.scene != nullptr) {
+                hiprtInstance instance;
+                instance.type = hiprtInstanceTypeScene;
+                instance.scene = g.scene;
+                instances.push_back(instance);
+                transforms.push_back(m);
+                offsets.push_back(g.offset);
+            }
+        }
+
+        std::vector<uint32_t> offsets;
+        std::vector<hiprtFrameMatrix> transforms;
+        std::vector<hiprtInstance> instances;
+        std::vector<HitgroupRecord> hgRecords;
+    };
+
+    ParamBufferState &getParamBuffer(const RayIntersectParameters &) const;
+
+    static BilinearPatchMesh *diceCurveToBLP(const ShapeSceneEntity &shape, int nDiceU,
+                                             int nDiceV, Allocator alloc);
+
+    static GeometryContainer buildBVHForTriangles(
+        const std::vector<ShapeSceneEntity> &shapes,
+        const std::map<int, TriQuadMesh> &plyMeshes, hiprtContext context,
+        const std::map<std::string, FloatTexture> &floatTextures,
+        const std::map<std::string, Material> &namedMaterials,
+        const std::vector<Material> &materials,
+        const std::map<std::string, Medium> &media,
+        const std::map<int, pstd::vector<Light> *> &shapeIndexToAreaLights,
+        ThreadLocal<Allocator> &threadAllocators,
+        ThreadLocal<hipStream_t> &threadCUDAStreams);
+
+    static GeometryContainer buildBVHForBLPs(
+        const std::vector<ShapeSceneEntity> &shapes, hiprtContext context,
+        const std::map<std::string, FloatTexture> &floatTextures,
+        const std::map<std::string, Material> &namedMaterials,
+        const std::vector<Material> &materials,
+        const std::map<std::string, Medium> &media,
+        const std::map<int, pstd::vector<Light> *> &shapeIndexToAreaLights,
+        ThreadLocal<Allocator> &threadAllocators,
+        ThreadLocal<hipStream_t> &threadCUDAStreams);
+
+    static GeometryContainer buildBVHForQuadrics(
+        const std::vector<ShapeSceneEntity> &shapes, hiprtContext context,
+        const std::map<std::string, FloatTexture> &floatTextures,
+        const std::map<std::string, Material> &namedMaterials,
+        const std::vector<Material> &materials,
+        const std::map<std::string, Medium> &media,
+        const std::map<int, pstd::vector<Light> *> &shapeIndexToAreaLights,
+        ThreadLocal<Allocator> &threadAllocators,
+        ThreadLocal<hipStream_t> &threadCUDAStreams);
+
+    static hiprtScene buildBVHForInstances(
+        const std::vector<hiprtInstance> &instances,
+        const std::vector<hiprtFrameMatrix> &transforms, hiprtContext context,
+        ThreadLocal<Allocator> &threadAllocators,
+        ThreadLocal<hipStream_t> &threadCUDAStreams);
+
+    static Module compileHiprtModule(hiprtContext context);
+
+    static void hiprtLaunch(hipFunction_t func, int nx, int ny, void **args,
+                            hipStream_t cudaStream, size_t sharedMemoryBytes = 0);
+
+    static void hiprtLaunch(hipFunction_t func, int nx, int ny, int tx, int ty,
+                            void **args, hipStream_t cudaStream,
+                            size_t sharedMemoryBytes = 0);
+    Module module;
+    hiprtContext context;
+
+    hiprtScene scene;
+    hiprtGlobalStackBuffer globalStackBuffer;
+    hiprtGlobalStackBuffer globalInstanceStackBuffer;
+    std::vector<GeometryGroup> geomGroups;
+    HitgroupRecord *hgRecords;
+    uint32_t *offsets;
+    
+    Bounds3f bounds;
+    CUDATrackedMemoryResource *memoryResource;
+    hipStream_t cudaStream;
+};
+
+}  // namespace pbrt
+
+#endif  // PBRT_HIPRT_AGGREGATE_H
diff --git a/src/pbrt/gpu/hiprt/hiprt.cu b/src/pbrt/gpu/hiprt/hiprt.cu
new file mode 100644
index 000000000..c713ae14f
--- /dev/null
+++ b/src/pbrt/gpu/hiprt/hiprt.cu
@@ -0,0 +1,609 @@
+#include <hip/hip_runtime.h>
+#include <hiprt/hiprt_device.h>
+
+__device__ bool __filter__alphaKilled(const hiprtRay &ray, const void *data,
+                                      void *payload, const hiprtHit &hit);
+__device__ bool __intersection__bilinearPatch(const hiprtRay &ray, const void *data,
+                                              void *payload, hiprtHit &hit);
+__device__ bool __intersection__quadric(const hiprtRay &ray, const void *data,
+                                        void *payload, hiprtHit &hit);
+
+HIPRT_DEVICE bool intersectFunc(uint32_t geomType, uint32_t rayType,
+                                const hiprtFuncTableHeader &tableHeader,
+                                const hiprtRay &ray, void *payload, hiprtHit &hit) {
+    const uint32_t index = tableHeader.numGeomTypes * rayType + geomType;
+    const void *data = tableHeader.funcDataSets[index].intersectFuncData;
+    switch (index) {
+    case 1: {
+        return __intersection__bilinearPatch(ray, data, payload, hit);
+    }
+    case 2: {
+        return __intersection__quadric(ray, data, payload, hit);
+    }
+    default: {
+        return false;
+    }
+    }
+}
+
+HIPRT_DEVICE bool filterFunc(uint32_t geomType, uint32_t rayType,
+                             const hiprtFuncTableHeader &tableHeader, const hiprtRay &ray,
+                             void *payload, const hiprtHit &hit) {
+    const uint32_t index = tableHeader.numGeomTypes * rayType + geomType;
+    const void *data = tableHeader.funcDataSets[index].filterFuncData;
+    switch (index) {
+    case 0: {
+        return __filter__alphaKilled(ray, data, payload, hit);
+    }
+    default: {
+        return false;
+    }
+    }
+}
+
+// pbrt is Copyright(c) 1998-2020 Matt Pharr, Wenzel Jakob, and Greg Humphreys.
+// The pbrt source code is licensed under the Apache License, Version 2.0.
+// SPDX: Apache-2.0
+
+#include <pbrt/pbrt.h>
+
+#include <pbrt/gpu/common.h>
+#include <pbrt/interaction.h>
+#include <pbrt/materials.h>
+#include <pbrt/media.h>
+#include <pbrt/shapes.h>
+#include <pbrt/textures.h>
+#include <pbrt/util/float.h>
+#include <pbrt/util/rng.h>
+#include <pbrt/util/transform.h>
+#include <pbrt/util/vecmath.h>
+#include <pbrt/wavefront/intersect.h>
+
+// Make various functions visible to HIPRT, which doesn't get to link
+// shader code with the HIP code in the main executable...
+#include <pbrt/util/color.cpp>
+#include <pbrt/util/colorspace.cpp>
+#include <pbrt/util/log.cpp>
+#include <pbrt/util/noise.cpp>
+#include <pbrt/util/spectrum.cpp>
+#include <pbrt/util/transform.cpp>
+
+#include <utility>
+
+using namespace pbrt;
+
+alignas(alignof(RayIntersectParameters)) __constant__
+    unsigned char paramBuffer[sizeof(RayIntersectParameters)];
+#define params (*(RayIntersectParameters *)paramBuffer)
+
+///////////////////////////////////////////////////////////////////////////
+// Utility functions
+
+template <bool AnyHit>
+__device__ inline hiprtHit Trace(hiprtScene scene, Ray ray, Float tMax, uint32_t &missed,
+                                 void *payload) {
+    hiprtRay hiprtRay;
+    hiprtRay.origin = make_float3(ray.o.x, ray.o.y, ray.o.z);
+    hiprtRay.direction = make_float3(ray.d.x, ray.d.y, ray.d.z);
+    hiprtRay.minT = 1e-7f;
+    hiprtRay.maxT = tMax;
+
+    __shared__ int sharedStackCache[SHARED_STACK_SIZE * BLOCK_SIZE];
+    hiprtSharedStackBuffer sharedStackBuffer{SHARED_STACK_SIZE, sharedStackCache};
+    hiprtGlobalStack stack(params.globalStackBuffer, sharedStackBuffer);
+
+    hiprtSharedStackBuffer sharedInstanceStackBuffer{};
+    hiprtGlobalInstanceStack instanceStack(params.globalInstanceStackBuffer,
+                                           sharedInstanceStackBuffer);
+
+    hiprtHit hit;
+    if constexpr (!AnyHit) {
+        hiprtSceneTraversalClosestCustomStack<hiprtGlobalStack, hiprtGlobalInstanceStack>
+            tr(scene, hiprtRay, stack, instanceStack, hiprtFullRayMask,
+               hiprtTraversalHintDefault, payload, params.funcTable, 0, ray.time);
+        hit = tr.getNextHit();
+    } else {
+        hiprtSceneTraversalAnyHitCustomStack<hiprtGlobalStack, hiprtGlobalInstanceStack>
+            tr(scene, hiprtRay, stack, instanceStack, hiprtFullRayMask,
+               hiprtTraversalHintDefault, payload, params.funcTable, 0, ray.time);
+        hit = tr.getNextHit();
+        if (hit.t == 0.0f)
+            hit.primID = hiprtInvalidValue;
+    }
+
+    missed = uint32_t(hit.primID == hiprtInvalidValue);
+    return hit;
+}
+
+static __device__ uint32_t recordIndex(const hiprtHit &hit) {
+    return hit.instanceIDs[1] + params.offsets[hit.instanceIDs[0]];
+}
+
+static __forceinline__ __device__ Transform getWorldFromInstance(const hiprtHit &hit) {
+    hiprtFrameMatrix hiprtWorldFromObjM =
+        hiprtGetObjectToWorldFrameMatrix(params.traversable, hit.instanceIDs, 0.0f);
+    hiprtFrameMatrix hiprtObjFromWorldM =
+        hiprtGetWorldToObjectFrameMatrix(params.traversable, hit.instanceIDs, 0.0f);
+    SquareMatrix<4> worldFromObjM, objFromWorldM;
+    for (int i = 0; i < 3; ++i) {
+        for (int j = 0; j < 4; ++j) {
+            worldFromObjM[i][j] = hiprtWorldFromObjM.matrix[i][j];
+            objFromWorldM[i][j] = hiprtObjFromWorldM.matrix[i][j];
+        }
+    }
+    return Transform(worldFromObjM, objFromWorldM);
+}
+
+static __forceinline__ __device__ SurfaceInteraction
+getTriangleIntersection(const Ray &rayWorld, const hiprtHit &hit) {
+    const TriangleMeshRecord &rec = params.hgRecords[recordIndex(hit)].triRec;
+
+    float b1 = hit.uv.x;
+    float b2 = hit.uv.y;
+    float b0 = 1 - b1 - b2;
+
+    float3 rd = make_float3(rayWorld.d.x, rayWorld.d.y, rayWorld.d.z);
+    Vector3f wo = -Vector3f(rd.x, rd.y, rd.z);
+
+    Transform worldFromInstance = getWorldFromInstance(hit);
+
+    Float time = rayWorld.time;
+    wo = worldFromInstance.ApplyInverse(wo);
+
+    TriangleIntersection ti{b0, b1, b2, hit.t};
+    SurfaceInteraction intr =
+        Triangle::InteractionFromIntersection(rec.mesh, hit.primID, ti, time, wo);
+    return worldFromInstance(intr);
+}
+
+static __forceinline__ __device__ SurfaceInteraction
+getBilinearPatchIntersection(const Ray &rayWorld, const hiprtHit &hit, Point2f uv) {
+    BilinearMeshRecord &rec = params.hgRecords[recordIndex(hit)].blpRec;
+
+    float3 rd = make_float3(rayWorld.d.x, rayWorld.d.y, rayWorld.d.z);
+    Vector3f wo = -Vector3f(rd.x, rd.y, rd.z);
+
+    return BilinearPatch::InteractionFromIntersection(rec.mesh, hit.primID, uv,
+                                                      rayWorld.time, wo);
+}
+
+static __device__ inline SurfaceInteraction getQuadricIntersection(
+    const Ray &rayWorld, const hiprtHit &hit, const QuadricIntersection &si) {
+    QuadricRecord &rec = params.hgRecords[recordIndex(hit)].quadricRec;
+
+    float3 rd = make_float3(rayWorld.d.x, rayWorld.d.y, rayWorld.d.z);
+    Vector3f wo = -Vector3f(rd.x, rd.y, rd.z);
+    Float time = rayWorld.time;
+
+    SurfaceInteraction intr;
+    if (const Sphere *sphere = rec.shape.CastOrNullptr<Sphere>())
+        intr = sphere->InteractionFromIntersection(si, wo, time);
+    else if (const Cylinder *cylinder = rec.shape.CastOrNullptr<Cylinder>())
+        intr = cylinder->InteractionFromIntersection(si, wo, time);
+    else if (const Disk *disk = rec.shape.CastOrNullptr<Disk>())
+        intr = disk->InteractionFromIntersection(si, wo, time);
+    else
+        CHECK(!"unexpected quadric");
+
+    return intr;
+}
+
+///////////////////////////////////////////////////////////////////////////
+// Intersection and filter functions
+
+__device__ bool __filter__alphaKilled(const hiprtRay &ray, const void *data,
+                                      void *payload, const hiprtHit &hit) {
+    const TriangleMeshRecord &rec = params.hgRecords[recordIndex(hit)].triRec;
+    if (!rec.alphaTexture)
+        return false;
+
+    Ray rayWorld = *(Ray *)payload;
+
+    SurfaceInteraction intr = getTriangleIntersection(rayWorld, hit);
+
+    BasicTextureEvaluator eval;
+    Float alpha = eval(rec.alphaTexture, intr);
+    if (alpha >= 1)
+        return false;
+    if (alpha <= 0)
+        return true;
+    else {
+        float3 o = make_float3(rayWorld.o.x, rayWorld.o.y, rayWorld.o.z);
+        float3 d = make_float3(rayWorld.d.x, rayWorld.d.y, rayWorld.d.z);
+        Float u = HashFloat(o, d);
+        return u > alpha;
+    }
+}
+
+__device__ bool __intersection__bilinearPatch(const hiprtRay &ray_, const void *data,
+                                              void *payload, hiprtHit &hit) {
+    BilinearMeshRecord &rec = params.hgRecords[recordIndex(hit)].blpRec;
+
+    float3 org = ray_.origin;
+    float3 dir = ray_.direction;
+    Float tMax = ray_.maxT;
+    Ray ray(Point3f(org.x, org.y, org.z), Vector3f(dir.x, dir.y, dir.z));
+    Ray rayWorld = *(Ray *)payload;
+
+    int vertexIndex = 4 * hit.primID;
+    Point3f p00 = rec.mesh->p[rec.mesh->vertexIndices[vertexIndex]];
+    Point3f p10 = rec.mesh->p[rec.mesh->vertexIndices[vertexIndex + 1]];
+    Point3f p01 = rec.mesh->p[rec.mesh->vertexIndices[vertexIndex + 2]];
+    Point3f p11 = rec.mesh->p[rec.mesh->vertexIndices[vertexIndex + 3]];
+    pstd::optional<BilinearIntersection> isect =
+        IntersectBilinearPatch(ray, tMax, p00, p10, p01, p11);
+
+    if (!isect)
+        return false;
+
+    if (rec.alphaTexture) {
+        SurfaceInteraction intr = getBilinearPatchIntersection(rayWorld, hit, isect->uv);
+        BasicTextureEvaluator eval;
+        Float alpha = eval(rec.alphaTexture, intr);
+        if (alpha < 1) {
+            if (alpha == 0)
+                // No hit
+                return false;
+
+            float3 o = make_float3(rayWorld.o.x, rayWorld.o.y, rayWorld.o.z);
+            float3 d = make_float3(rayWorld.d.x, rayWorld.d.y, rayWorld.d.z);
+            Float u = HashFloat(o, d);
+            if (u > alpha)
+                // no hit
+                return false;
+        }
+    }
+
+    hit.t = isect->t;
+    hit.uv.x = isect->uv[0];
+    hit.uv.y = isect->uv[1];
+
+    return true;
+}
+
+__device__ bool __intersection__quadric(const hiprtRay &ray_, const void *data,
+                                        void *payload, hiprtHit &hit) {
+    QuadricRecord &rec = params.hgRecords[recordIndex(hit)].quadricRec;
+
+    float3 org = ray_.origin;
+    float3 dir = ray_.direction;
+    Float tMax = ray_.maxT;
+    Ray ray(Point3f(org.x, org.y, org.z), Vector3f(dir.x, dir.y, dir.z));
+    pstd::optional<QuadricIntersection> isect;
+
+    if (const Sphere *sphere = rec.shape.CastOrNullptr<Sphere>())
+        isect = sphere->BasicIntersect(ray, tMax);
+    else if (const Cylinder *cylinder = rec.shape.CastOrNullptr<Cylinder>())
+        isect = cylinder->BasicIntersect(ray, tMax);
+    else if (const Disk *disk = rec.shape.CastOrNullptr<Disk>())
+        isect = disk->BasicIntersect(ray, tMax);
+
+    if (!isect)
+        return false;
+
+    Ray rayWorld = *(Ray *)payload;
+
+    if (rec.alphaTexture) {
+        SurfaceInteraction intr = getQuadricIntersection(rayWorld, hit, *isect);
+
+        BasicTextureEvaluator eval;
+        Float alpha = eval(rec.alphaTexture, intr);
+        if (alpha < 1) {
+            if (alpha == 0)
+                // No hit
+                return false;
+
+            float3 o = make_float3(rayWorld.o.x, rayWorld.o.y, rayWorld.o.z);
+            float3 d = make_float3(rayWorld.d.x, rayWorld.d.y, rayWorld.d.z);
+            Float u = HashFloat(o.x, o.y, o.z, d.x, d.y, d.z);
+            if (u > alpha)
+                // no hit
+                return false;
+        }
+    }
+
+    hit.t = isect->tHit;
+    hit.normal = make_float3(isect->pObj.x, isect->pObj.y, isect->pObj.z);
+    hit.uv.x = isect->phi;
+
+    return true;
+}
+
+///////////////////////////////////////////////////////////////////////////
+// Closest hit
+
+struct ClosestHitContext {
+    ClosestHitContext() = default;
+    __device__ ClosestHitContext(Medium rayMedium, bool shadowRay)
+        : rayMedium(rayMedium), shadowRay(shadowRay) {}
+
+    Medium rayMedium;
+    bool shadowRay;
+
+    // out
+    Point3fi piHit;
+    Normal3f nHit;
+    Material material;
+    MediumInterface mediumInterface;
+
+    __device__ Ray SpawnRayTo(const Point3f &p) const {
+        Interaction intr(piHit, nHit);
+        intr.mediumInterface = &mediumInterface;
+        return intr.SpawnRayTo(p);
+    }
+};
+
+static __forceinline__ __device__ void ProcessClosestIntersection(
+    SurfaceInteraction intr, const hiprtHit &hit, ClosestHitContext &ctx) {
+    int rayIndex = blockIdx.x * blockDim.x + threadIdx.x;
+
+    Medium rayMedium = ctx.rayMedium;
+
+    if (intr.mediumInterface)
+        ctx.mediumInterface = *intr.mediumInterface;
+    else
+        ctx.mediumInterface = MediumInterface(rayMedium);
+
+    ctx.piHit = intr.pi;
+    ctx.nHit = intr.n;
+    ctx.material = intr.material;
+
+    if (ctx.shadowRay)
+        return;
+
+    // We only have the ray queue (and it only makes sense to access) for
+    // regular closest hit rays.
+    RayWorkItem r = (*params.rayQueue)[rayIndex];
+
+    EnqueueWorkAfterIntersection(r, rayMedium, hit.t, intr, params.mediumSampleQueue,
+                                 params.nextRayQueue, params.hitAreaLightQueue,
+                                 params.basicEvalMaterialQueue,
+                                 params.universalEvalMaterialQueue);
+}
+__device__ void closesthitTriangle(const Ray &rayWorld, const hiprtHit &hit,
+                                   ClosestHitContext &ctx) {
+    const TriangleMeshRecord &rec = params.hgRecords[recordIndex(hit)].triRec;
+
+    SurfaceInteraction intr = getTriangleIntersection(rayWorld, hit);
+
+    if (rec.mediumInterface && rec.mediumInterface->IsMediumTransition())
+        intr.mediumInterface = rec.mediumInterface;
+    intr.material = rec.material;
+    if (!rec.areaLights.empty())
+        intr.areaLight = rec.areaLights[hit.primID];
+
+    ProcessClosestIntersection(intr, hit, ctx);
+}
+
+__device__ void closesthitBilinearPatch(const Ray &rayWorld, const hiprtHit &hit,
+                                        ClosestHitContext &ctx) {
+    BilinearMeshRecord &rec = params.hgRecords[recordIndex(hit)].blpRec;
+
+    Point2f uv(hit.uv.x, hit.uv.y);
+
+    SurfaceInteraction intr = getBilinearPatchIntersection(rayWorld, hit, uv);
+    if (rec.mediumInterface && rec.mediumInterface->IsMediumTransition())
+        intr.mediumInterface = rec.mediumInterface;
+    intr.material = rec.material;
+    if (!rec.areaLights.empty())
+        intr.areaLight = rec.areaLights[hit.primID];
+
+    Transform worldFromInstance = getWorldFromInstance(hit);
+    intr = worldFromInstance(intr);
+
+    ProcessClosestIntersection(intr, hit, ctx);
+}
+
+__device__ void closesthitQuadric(const Ray &rayWorld, const hiprtHit &hit,
+                                  ClosestHitContext &ctx) {
+    QuadricRecord &rec = params.hgRecords[recordIndex(hit)].quadricRec;
+    QuadricIntersection qi;
+    qi.pObj = Point3f(hit.normal.x, hit.normal.y, hit.normal.z);
+    qi.phi = hit.uv.x;
+
+    SurfaceInteraction intr = getQuadricIntersection(rayWorld, hit, qi);
+    if (rec.mediumInterface && rec.mediumInterface->IsMediumTransition())
+        intr.mediumInterface = rec.mediumInterface;
+    intr.material = rec.material;
+    if (rec.areaLight)
+        intr.areaLight = rec.areaLight;
+
+    Transform worldFromInstance = getWorldFromInstance(hit);
+    intr = worldFromInstance(intr);
+
+    ProcessClosestIntersection(intr, hit, ctx);
+}
+
+extern "C" __global__ void __raygen__findClosest() {
+    int rayIndex = blockIdx.x * blockDim.x + threadIdx.x;
+    if (rayIndex >= params.rayQueue->Size())
+        return;
+
+    RayWorkItem r = (*params.rayQueue)[rayIndex];
+    Ray ray = r.ray;
+    Float tMax = 1e30f;
+
+    PBRT_DBG("ray o %f %f %f dir %f %f %f tmax %f\n", ray.o.x, ray.o.y, ray.o.z, ray.d.x,
+             ray.d.y, ray.d.z, tMax);
+
+    uint32_t missed = 0;
+    hiprtHit hit = Trace<false>(params.traversable, ray, tMax, missed, &ray);
+
+    if (missed) {
+        EnqueueWorkAfterMiss(r, params.mediumSampleQueue, params.escapedRayQueue);
+        return;
+    }
+
+    ClosestHitContext ctx(ray.medium, false);
+    if (params.hgRecords[recordIndex(hit)].type == HitgroupRecord::TriangleMesh)
+        closesthitTriangle(ray, hit, ctx);
+    else if (params.hgRecords[recordIndex(hit)].type == HitgroupRecord::BilinearMesh)
+        closesthitBilinearPatch(ray, hit, ctx);
+    else if (params.hgRecords[recordIndex(hit)].type == HitgroupRecord::Quadric)
+        closesthitQuadric(ray, hit, ctx);
+    else
+        CHECK(!"unexpected primitive type");
+}
+
+///////////////////////////////////////////////////////////////////////////
+// Shadow rays
+
+extern "C" __global__ void __raygen__shadow() {
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index >= params.shadowRayQueue->Size())
+        return;
+
+    ShadowRayWorkItem sr = (*params.shadowRayQueue)[index];
+    PBRT_DBG("Tracing shadow ray index %d o %f %f %f d %f %f %f\n", index, sr.ray.o.x,
+             sr.ray.o.y, sr.ray.o.z, sr.ray.d.x, sr.ray.d.y, sr.ray.d.z);
+
+    uint32_t missed = 0;
+    Trace<true>(params.traversable, sr.ray, sr.tMax, missed, &sr.ray);
+
+    RecordShadowRayResult(sr, &params.pixelSampleState, !missed);
+}
+
+extern "C" __global__ void __raygen__shadow_Tr() {
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index >= params.shadowRayQueue->Size())
+        return;
+
+    ShadowRayWorkItem sr = (*params.shadowRayQueue)[index];
+
+    ClosestHitContext ctx;
+
+    TraceTransmittance(
+        sr, &params.pixelSampleState,
+        [&](Ray ray, Float tMax) -> TransmittanceTraceResult {
+            uint32_t missed = 0;
+            hiprtHit hit = Trace<false>(params.traversable, ray, tMax, missed,
+                                        &ray);  // the closest hit is actually used
+
+            ctx = ClosestHitContext(ray.medium, true);
+
+            if (!missed) {
+                if (params.hgRecords[recordIndex(hit)].type ==
+                    HitgroupRecord::TriangleMesh)
+                    closesthitTriangle(ray, hit, ctx);
+                else if (params.hgRecords[recordIndex(hit)].type ==
+                         HitgroupRecord::BilinearMesh)
+                    closesthitBilinearPatch(ray, hit, ctx);
+                else if (params.hgRecords[recordIndex(hit)].type ==
+                         HitgroupRecord::Quadric)
+                    closesthitQuadric(ray, hit, ctx);
+                else
+                    CHECK(!"unexpected primitive type");
+            }
+
+            return TransmittanceTraceResult{!missed, Point3f(ctx.piHit), ctx.material};
+        },
+        [&](Point3f p) -> Ray { return ctx.SpawnRayTo(p); });
+}
+
+///////////////////////////////////////////////////////////////////////////
+// Random hit (for subsurface scattering)
+
+struct RandomHitPayload {
+    WeightedReservoirSampler<SubsurfaceInteraction> wrs;
+    Material material;
+    pstd::optional<SurfaceInteraction> intr;
+};
+
+__device__ void closesthitRandomHitTriangle(const Ray &rayWorld, const hiprtHit &hit,
+                                            RandomHitPayload *p) {
+    const TriangleMeshRecord &rec = params.hgRecords[recordIndex(hit)].triRec;
+
+    PBRT_DBG("Anyhit triangle for random hit: rec.material %p params.materials %p\n",
+             rec.material.ptr(), p->material.ptr());
+
+    SurfaceInteraction intr = getTriangleIntersection(rayWorld, hit);
+    p->intr = intr;
+
+    if (rec.material == p->material)
+        p->wrs.Add([&] __device__() { return intr; }, 1.f);
+}
+
+__device__ void closesthitRandomHitBilinearPatch(const Ray &rayWorld, const hiprtHit &hit,
+                                                 RandomHitPayload *p) {
+    BilinearMeshRecord &rec = params.hgRecords[recordIndex(hit)].blpRec;
+
+    PBRT_DBG("Anyhit blp for random hit: rec.material %p params.materials %p\n",
+             rec.material.ptr(), p->material.ptr());
+
+    Point2f uv(hit.uv.x, hit.uv.y);
+    SurfaceInteraction intr = getBilinearPatchIntersection(rayWorld, hit, uv);
+    p->intr = intr;
+
+    if (rec.material == p->material)
+        p->wrs.Add([&] __device__() { return intr; }, 1.f);
+}
+
+__device__ void closesthitRandomHitQuadric(const Ray &rayWorld, const hiprtHit &hit,
+                                           RandomHitPayload *p) {
+    QuadricRecord &rec = params.hgRecords[recordIndex(hit)].quadricRec;
+
+    PBRT_DBG("Anyhit quadric for random hit: rec.material %p params.materials %p\n",
+             rec.material.ptr(), p->material.ptr());
+
+    QuadricIntersection qi;
+    qi.pObj = Point3f(hit.normal.x, hit.normal.y, hit.normal.z);
+    qi.phi = hit.uv.x;
+
+    SurfaceInteraction intr = getQuadricIntersection(rayWorld, hit, qi);
+    p->intr = intr;
+
+    if (rec.material == p->material)
+        p->wrs.Add([&] __device__() { return intr; }, 1.f);
+}
+
+extern "C" __global__ void __raygen__randomHit() {
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index >= params.subsurfaceScatterQueue->Size())
+        return;
+
+    SubsurfaceScatterWorkItem s = (*params.subsurfaceScatterQueue)[index];
+
+    Ray ray(s.p0, s.p1 - s.p0);
+
+    RandomHitPayload payload;
+    payload.wrs.Seed(Hash(s.p0, s.p1));
+    payload.material = s.material;
+
+    PBRT_DBG("Randomhit raygen ray.o %f %f %f ray.d %f %f %f\n", ray.o.x, ray.o.y,
+             ray.o.z, ray.d.x, ray.d.y, ray.d.z);
+
+    int depth = 0;
+    while (LengthSquared(ray.d) > 0 && ++depth < 100) {
+        uint32_t missed = 0;
+        hiprtHit hit =
+            Trace<false>(params.traversable, ray, 1.f /* tMax */, missed, &ray);
+
+        if (!missed) {
+            if (params.hgRecords[recordIndex(hit)].type == HitgroupRecord::TriangleMesh)
+                closesthitRandomHitTriangle(ray, hit, &payload);
+            else if (params.hgRecords[recordIndex(hit)].type ==
+                     HitgroupRecord::BilinearMesh)
+                closesthitRandomHitBilinearPatch(ray, hit, &payload);
+            else if (params.hgRecords[recordIndex(hit)].type == HitgroupRecord::Quadric)
+                closesthitRandomHitQuadric(ray, hit, &payload);
+            else
+                CHECK(!"unexpected primitive type");
+        }
+
+        if (payload.intr) {
+            ray = payload.intr->SpawnRayTo(s.p1);
+            payload.intr.reset();
+        } else
+            break;
+    }
+
+    if (payload.wrs.HasSample() &&
+        payload.wrs.WeightSum() > 0) {  // TODO: latter check shouldn't be needed...
+        const SubsurfaceInteraction &si = payload.wrs.GetSample();
+
+        params.subsurfaceScatterQueue->reservoirPDF[index] =
+            payload.wrs.SampleProbability();
+        params.subsurfaceScatterQueue->ssi[index] = payload.wrs.GetSample();
+    } else
+        params.subsurfaceScatterQueue->reservoirPDF[index] = 0;
+}
diff --git a/src/pbrt/gpu/memory.cpp b/src/pbrt/gpu/memory.cpp
index 0cec0a9ab..da3601884 100644
--- a/src/pbrt/gpu/memory.cpp
+++ b/src/pbrt/gpu/memory.cpp
@@ -8,8 +8,12 @@
 #include <pbrt/util/check.h>
 #include <pbrt/util/log.h>
 
+#if defined(__HIPCC__)
+#include <pbrt/util/hip_aliases.h>
+#else
 #include <cuda.h>
 #include <cuda_runtime.h>
+#endif
 
 namespace pbrt {
 
diff --git a/src/pbrt/gpu/util.cpp b/src/pbrt/gpu/util.cpp
index aa89707c7..b0a631d8e 100644
--- a/src/pbrt/gpu/util.cpp
+++ b/src/pbrt/gpu/util.cpp
@@ -76,21 +76,27 @@ void GPUInit() {
 #endif
     CUDA_CHECK(cudaSetDevice(device));
 
+// there was a bug in HIP stating unsupported while actually supported
+// not sure whether has been fixed
+#ifdef __NVCC__ 
     int hasUnifiedAddressing;
     CUDA_CHECK(cudaDeviceGetAttribute(&hasUnifiedAddressing, cudaDevAttrUnifiedAddressing,
                                       device));
     if (!hasUnifiedAddressing)
         LOG_FATAL("The selected GPU device (%d) does not support unified addressing.",
                   device);
+#endif
 
     CUDA_CHECK(cudaDeviceSetLimit(cudaLimitStackSize, 8192));
     size_t stackSize;
     CUDA_CHECK(cudaDeviceGetLimit(&stackSize, cudaLimitStackSize));
     LOG_VERBOSE("Reset stack size to %d", stackSize);
 
+#ifdef __NVCC__
     CUDA_CHECK(cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 32 * 1024 * 1024));
 
     CUDA_CHECK(cudaDeviceSetCacheConfig(cudaFuncCachePreferL1));
+#endif
 
 #ifdef NVTX
 #ifdef PBRT_IS_WINDOWS
diff --git a/src/pbrt/gpu/util.h b/src/pbrt/gpu/util.h
index 72b8cc4cc..98f782aa6 100644
--- a/src/pbrt/gpu/util.h
+++ b/src/pbrt/gpu/util.h
@@ -18,8 +18,12 @@
 #include <utility>
 #include <vector>
 
+#if defined(__HIPCC__)
+#include <pbrt/util/hip_aliases.h>
+#else
 #include <cuda.h>
 #include <cuda_runtime_api.h>
+#endif
 
 #ifdef NVTX
 #ifdef UNICODE
@@ -38,6 +42,7 @@
         LOG_FATAL("CUDA error: %s", cudaGetErrorString(error)); \
     } else /* eat semicolon */
 
+#ifdef __NVCC__  // only used in denoiser.cpp
 #define CU_CHECK(EXPR)                                              \
     do {                                                            \
         CUresult result = EXPR;                                     \
@@ -47,6 +52,7 @@
             LOG_FATAL("CUDA error: %s", str);                       \
         }                                                           \
     } while (false) /* eat semicolon */
+#endif
 
 namespace pbrt {
 
@@ -64,15 +70,20 @@ inline int GetBlockSize(const char *description, F kernel) {
         return iter->second;
 
     int minGridSize, blockSize;
+// this API is not reliable in HIP sometimes returning even negative values
+#ifdef __HIPCC__  
+    blockSize = 64;
+#else
     CUDA_CHECK(
         cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, kernel, 0, 0));
+#endif
     kernelBlockSizes[index] = blockSize;
     LOG_VERBOSE("[%s]: block size %d", description, blockSize);
 
     return blockSize;
 }
 
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
 template <typename F>
 __global__ void Kernel(F func, int nItems) {
     int tid = blockIdx.x * blockDim.x + threadIdx.x;
@@ -113,7 +124,7 @@ void GPUParallelFor(const char *description, int nItems, F func) {
 #endif
 }
 
-#endif  // __NVCC__
+#endif  // __NVCC__ || __HIPCC__
 
 // GPU Synchronization Function Declarations
 void GPUWait();
diff --git a/src/pbrt/media.h b/src/pbrt/media.h
index 8a5677a74..d31d02aa2 100644
--- a/src/pbrt/media.h
+++ b/src/pbrt/media.h
@@ -24,9 +24,6 @@
 #include <nanovdb/NanoVDB.h>
 #include <nanovdb/util/GridHandle.h>
 #include <nanovdb/util/SampleFromVoxels.h>
-#if defined(PBRT_BUILD_GPU_RENDERER) && defined(__NVCC__)
-#include <nanovdb/util/CudaDeviceBuffer.h>
-#endif  // PBRT_BUILD_GPU_RENDERER
 
 #include <algorithm>
 #include <limits>
diff --git a/src/pbrt/options.h b/src/pbrt/options.h
index 56f8e67be..397ef9710 100644
--- a/src/pbrt/options.h
+++ b/src/pbrt/options.h
@@ -62,9 +62,9 @@ struct PBRTOptions : BasicPBRTOptions {
 extern PBRTOptions *Options;
 
 #if defined(PBRT_BUILD_GPU_RENDERER)
-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(__HIPCC__)
 extern __constant__ BasicPBRTOptions OptionsGPU;
-#endif  // __CUDACC__
+#endif  // __CUDACC__ || __HIPCC__
 
 void CopyOptionsToGPU();
 #endif  // PBRT_BUILD_GPU_RENDERER
diff --git a/src/pbrt/pbrt.h b/src/pbrt/pbrt.h
index 8207a4800..e52297032 100644
--- a/src/pbrt/pbrt.h
+++ b/src/pbrt/pbrt.h
@@ -18,11 +18,11 @@
 #endif  // PBRT_IS_WINDOWS
 
 // GPU Macro Definitions
-#if defined(__CUDA_ARCH__)
+#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
 #define PBRT_IS_GPU_CODE
 #endif
 
-#if defined(PBRT_BUILD_GPU_RENDERER) && defined(__CUDACC__)
+#if defined(PBRT_BUILD_GPU_RENDERER) && (defined(__CUDACC__) || defined(__HIPCC__))
 #ifndef PBRT_NOINLINE
 #define PBRT_NOINLINE __attribute__((noinline))
 #endif
diff --git a/src/pbrt/shapes.h b/src/pbrt/shapes.h
index d775b6469..f74df4ba9 100644
--- a/src/pbrt/shapes.h
+++ b/src/pbrt/shapes.h
@@ -812,7 +812,7 @@ inline Cylinder::Cylinder(const Transform *renderFromObject,
       phiMax(Radians(Clamp(phiMax, 0, 360))) {}
 
 // Triangle Declarations
-#if defined(PBRT_BUILD_GPU_RENDERER) && defined(__CUDACC__)
+#if defined(PBRT_BUILD_GPU_RENDERER) && (defined(__HIPCC__) || defined(__CUDACC__))
 extern PBRT_GPU pstd::vector<const TriangleMesh *> *allTriangleMeshesGPU;
 #endif
 
@@ -1264,7 +1264,7 @@ class Curve {
 };
 
 // BilinearPatch Declarations
-#if defined(PBRT_BUILD_GPU_RENDERER) && defined(__CUDACC__)
+#if defined(PBRT_BUILD_GPU_RENDERER) && (defined(__HIPCC__) || defined(__CUDACC__))
 extern PBRT_GPU pstd::vector<const BilinearPatchMesh *> *allBilinearMeshesGPU;
 #endif
 
diff --git a/src/pbrt/textures.cpp b/src/pbrt/textures.cpp
index 48a0aeb2b..d21c72f28 100644
--- a/src/pbrt/textures.cpp
+++ b/src/pbrt/textures.cpp
@@ -180,7 +180,7 @@ std::string SpectrumBilerpTexture::ToString() const {
 }
 
 // CheckerboardTexture Function Definitions
-Float Checkerboard(TextureEvalContext ctx, TextureMapping2D map2D,
+PBRT_CPU_GPU Float Checkerboard(TextureEvalContext ctx, TextureMapping2D map2D,
                    TextureMapping3D map3D) {
     // Define 1D checkerboard filtered integral functions
     auto d = [](Float x) {
@@ -285,7 +285,7 @@ std::string SpectrumCheckerboardTexture::ToString() const {
 }
 
 // InsidePolkaDot Function Definition
-bool InsidePolkaDot(Point2f st) {
+PBRT_CPU_GPU bool InsidePolkaDot(Point2f st) {
     // Compute cell indices (_sCell_,_tCell_ for dots
     int sCell = pstd::floor(st[0] + .5f), tCell = pstd::floor(st[1] + .5f);
 
@@ -356,10 +356,10 @@ std::string FBmTexture::ToString() const {
 }
 
 // SpectrumImageTexture Method Definitions
-SampledSpectrum SpectrumImageTexture::Evaluate(TextureEvalContext ctx,
+PBRT_CPU_GPU SampledSpectrum SpectrumImageTexture::Evaluate(TextureEvalContext ctx,
                                                SampledWavelengths lambda) const {
 #ifdef PBRT_IS_GPU_CODE
-    assert(!"Should not be called in GPU code");
+    CHECK(!"Should not be called in GPU code");
     return SampledSpectrum(0);
 #else
     // Apply texture mapping and flip $t$ coordinate for image texture lookup
@@ -477,7 +477,7 @@ SpectrumImageTexture *SpectrumImageTexture::Create(
 }
 
 // MarbleTexture Method Definitions
-SampledSpectrum MarbleTexture::Evaluate(TextureEvalContext ctx,
+PBRT_CPU_GPU SampledSpectrum MarbleTexture::Evaluate(TextureEvalContext ctx,
                                         SampledWavelengths lambda) const {
     TexCoord3D c = mapping.Map(ctx);
     c.p *= scale;
@@ -582,7 +582,6 @@ SpectrumDirectionMixTexture *SpectrumDirectionMixTexture::Create(
         parameters.GetSpectrumTexture("tex2", one, spectrumType, alloc), dir);
 }
 
-static std::mutex ptexMutex;
 static Ptex::PtexCache *cache;
 
 STAT_COUNTER("Texture/Ptex lookups", nLookups);
@@ -601,7 +600,8 @@ struct : public PtexErrorHandler {
 PtexTextureBase::PtexTextureBase(const std::string &filename, ColorEncoding encoding,
                                  Float scale)
     : filename(filename), encoding(encoding), scale(scale) {
-    ptexMutex.lock();
+    std::mutex mutex;
+    mutex.lock();
     if (!cache) {
         int maxFiles = 100;
         size_t maxMem = 1ull << 32;  // 4GB
@@ -611,7 +611,7 @@ PtexTextureBase::PtexTextureBase(const std::string &filename, ColorEncoding enco
                                         &errorHandler);
         // TODO? cache->setSearchPath(...);
     }
-    ptexMutex.unlock();
+    mutex.unlock();
 
     // Issue an error if the texture doesn't exist or has an unsupported
     // number of channels.
@@ -695,7 +695,7 @@ std::string SpectrumPtexTexture::ToString() const {
     return StringPrintf("[ SpectrumPtexTexture %s ]", BaseToString());
 }
 
-Float FloatPtexTexture::Evaluate(TextureEvalContext ctx) const {
+PBRT_CPU_GPU Float FloatPtexTexture::Evaluate(TextureEvalContext ctx) const {
 #ifdef PBRT_IS_GPU_CODE
     LOG_FATAL("Ptex not supported with GPU renderer");
     return 0;
@@ -709,7 +709,7 @@ Float FloatPtexTexture::Evaluate(TextureEvalContext ctx) const {
 #endif
 }
 
-SampledSpectrum SpectrumPtexTexture::Evaluate(TextureEvalContext ctx,
+PBRT_CPU_GPU SampledSpectrum SpectrumPtexTexture::Evaluate(TextureEvalContext ctx,
                                               SampledWavelengths lambda) const {
 #ifdef PBRT_IS_GPU_CODE
     LOG_FATAL("Ptex not supported with GPU renderer");
@@ -989,15 +989,23 @@ WrinkledTexture *WrinkledTexture::Create(const Transform &renderFromTexture,
 
 #if defined(PBRT_BUILD_GPU_RENDERER)
 
+// mipmaps were not fully supported in HIP
+// we use just the base resolution for now
+#ifdef __HIPCC__
+using TextureArray = cudaArray_t;
+#else
+using TextureArray = cudaMipmappedArray_t;
+#endif
+
 struct LuminanceTextureCacheItem {
-    cudaMipmappedArray_t mipArray;
+    TextureArray texArray;
     cudaTextureReadMode readMode;
     int nMIPMapLevels;
     bool originallySingleChannel;
 };
 
 struct RGBTextureCacheItem {
-    cudaMipmappedArray_t mipArray;
+    TextureArray texArray;
     cudaTextureReadMode readMode;
     int nMIPMapLevels;
     const RGBColorSpace *colorSpace;
@@ -1009,10 +1017,10 @@ static std::map<std::string, RGBTextureCacheItem> rgbTextureCache;
 
 STAT_MEMORY_COUNTER("Memory/ImageTextures", gpuImageTextureBytes);
 
-static cudaMipmappedArray_t createSingleChannelTextureArray(
+static TextureArray createSingleChannelTextureArray(
     const Image &image, const RGBColorSpace *colorSpace, int *nMIPMapLevels) {
     CHECK_EQ(1, image.NChannels());
-    cudaMipmappedArray_t mipArray;
+    TextureArray texArray;
 
     cudaChannelFormatDesc channelDesc;
     switch (image.Format()) {
@@ -1036,13 +1044,37 @@ static cudaMipmappedArray_t createSingleChannelTextureArray(
     const Image &baseImage = mipmap.GetLevel(0);
     cudaExtent extent =
         make_cudaExtent(baseImage.Resolution().x, baseImage.Resolution().y, 0);
-    CUDA_CHECK(cudaMallocMipmappedArray(&mipArray, &channelDesc, extent, mipmap.Levels(),
+
+#ifdef __HIPCC__
+    int pitch;
+    switch (image.Format()) {
+    case PixelFormat::U256:
+        pitch = baseImage.Resolution().x * sizeof(uint8_t);
+        break;
+    case PixelFormat::Half:
+        pitch = baseImage.Resolution().x * sizeof(Half);
+        break;
+    case PixelFormat::Float:
+        pitch = baseImage.Resolution().x * sizeof(float);
+        break;
+    default:
+        LOG_FATAL("Unhandled PixelFormat");
+    }
+
+    gpuImageTextureBytes += pitch * baseImage.Resolution().y;
+
+    CUDA_CHECK(cudaMallocArray(&texArray, &channelDesc, extent.width, extent.height));
+    CUDA_CHECK(cudaMemcpy2DToArray(texArray, /* offset */ 0, 0,
+                                  baseImage.RawPointer({0, 0}), pitch, pitch,
+                                  baseImage.Resolution().y, cudaMemcpyHostToDevice));
+#else
+    CUDA_CHECK(cudaMallocMipmappedArray(&texArray, &channelDesc, extent, mipmap.Levels(),
                                         0 /* flags */));
 
     for (int level = 0; level < mipmap.Levels(); ++level) {
         const Image &levelImage = mipmap.GetLevel(level);
         cudaArray_t levelArray;
-        CUDA_CHECK(cudaGetMipmappedArrayLevel(&levelArray, mipArray, level));
+        CUDA_CHECK(cudaGetMipmappedArrayLevel(&levelArray, texArray, level));
 
         int pitch;
         switch (image.Format()) {
@@ -1065,8 +1097,9 @@ static cudaMipmappedArray_t createSingleChannelTextureArray(
             levelArray, /* offset */ 0, 0, levelImage.RawPointer({0, 0}), pitch, pitch,
             levelImage.Resolution().y, cudaMemcpyHostToDevice));
     }
+#endif
 
-    return mipArray;
+    return texArray;
 }
 
 static cudaTextureAddressMode convertAddressMode(const std::string &mode) {
@@ -1107,7 +1140,7 @@ GPUSpectrumImageTexture *GPUSpectrumImageTexture::Create(
     ColorEncoding encoding = ColorEncoding::Get(encodingString, alloc);
 
     // These have to be initialized one way or another in the below
-    cudaMipmappedArray_t mipArray;
+    TextureArray texArray;
     int nMIPMapLevels = 0;
     cudaTextureReadMode readMode;
     const RGBColorSpace *colorSpace = nullptr;
@@ -1117,7 +1150,7 @@ GPUSpectrumImageTexture *GPUSpectrumImageTexture::Create(
     auto rgbIter = rgbTextureCache.find(filename);
     if (rgbIter != rgbTextureCache.end()) {
         LOG_VERBOSE("Found %s in RGB tex array cache!", filename);
-        mipArray = rgbIter->second.mipArray;
+        texArray = rgbIter->second.texArray;
         readMode = rgbIter->second.readMode;
         nMIPMapLevels = rgbIter->second.nMIPMapLevels;
         colorSpace = rgbIter->second.colorSpace;
@@ -1128,7 +1161,7 @@ GPUSpectrumImageTexture *GPUSpectrumImageTexture::Create(
         // GPUFloatImageTexture converted it to single channel
         if (lumIter != lumTextureCache.end() && lumIter->second.originallySingleChannel) {
             LOG_VERBOSE("Found %s in luminance tex array cache!", filename);
-            mipArray = lumIter->second.mipArray;
+            texArray = lumIter->second.texArray;
             readMode = lumIter->second.readMode;
             nMIPMapLevels = lumIter->second.nMIPMapLevels;
             colorSpace = RGBColorSpace::sRGB;
@@ -1162,14 +1195,35 @@ GPUSpectrumImageTexture *GPUSpectrumImageTexture::Create(
 
                         cudaExtent extent = make_cudaExtent(baseImage.Resolution().x,
                                                             baseImage.Resolution().y, 0);
-                        CUDA_CHECK(cudaMallocMipmappedArray(&mipArray, &channelDesc,
+#ifdef __HIPCC__
+                        std::vector<uint8_t> rgba(4 * baseImage.Resolution().x *
+                                                  baseImage.Resolution().y);
+                        size_t offset = 0;
+                        for (int y = 0; y < baseImage.Resolution().y; ++y)
+                            for (int x = 0; x < baseImage.Resolution().x; ++x) {
+                                for (int c = 0; c < 3; ++c)
+                                    rgba[offset++] =
+                                        ((uint8_t *)baseImage.RawPointer({x, y}))[c];
+                                rgba[offset++] = 255;
+                            }
+
+                        int pitch = baseImage.Resolution().x * 4 * sizeof(uint8_t);
+                        gpuImageTextureBytes += pitch * baseImage.Resolution().y;
+
+                        CUDA_CHECK(cudaMallocArray(&texArray, &channelDesc, extent.width,
+                                                  extent.height));
+                        CUDA_CHECK(cudaMemcpy2DToArray(
+                            texArray, /* offset */ 0, 0, rgba.data(), pitch, pitch,
+                            baseImage.Resolution().y, cudaMemcpyHostToDevice));
+#else
+                        CUDA_CHECK(cudaMallocMipmappedArray(&texArray, &channelDesc,
                                                             extent, mipmap.Levels(),
                                                             0 /* flags */));
                         for (int level = 0; level < mipmap.Levels(); ++level) {
                             const Image &levelImage = mipmap.GetLevel(level);
                             cudaArray_t levelArray;
                             CUDA_CHECK(
-                                cudaGetMipmappedArrayLevel(&levelArray, mipArray, level));
+                                cudaGetMipmappedArrayLevel(&levelArray, texArray, level));
 
                             std::vector<uint8_t> rgba(4 * levelImage.Resolution().x *
                                                       levelImage.Resolution().y);
@@ -1190,6 +1244,7 @@ GPUSpectrumImageTexture *GPUSpectrumImageTexture::Create(
                                 /* offset */ 0, 0, rgba.data(), pitch, pitch,
                                 levelImage.Resolution().y, cudaMemcpyHostToDevice));
                         }
+#endif
                         break;
                     }
                     case PixelFormat::Half: {
@@ -1198,7 +1253,28 @@ GPUSpectrumImageTexture *GPUSpectrumImageTexture::Create(
 
                         cudaExtent extent = make_cudaExtent(baseImage.Resolution().x,
                                                             baseImage.Resolution().y, 0);
-                        CUDA_CHECK(cudaMallocMipmappedArray(&mipArray, &channelDesc,
+#ifdef __HIPCC__
+                        std::vector<Half> rgba(4 * baseImage.Resolution().x *
+                                               baseImage.Resolution().y);
+                        size_t offset = 0;
+                        for (int y = 0; y < baseImage.Resolution().y; ++y)
+                            for (int x = 0; x < baseImage.Resolution().x; ++x) {
+                                for (int c = 0; c < 3; ++c)
+                                    rgba[offset++] =
+                                        Half(baseImage.GetChannel({x, y}, c));
+                                rgba[offset++] = Half(1.f);
+                            }
+
+                        int pitch = baseImage.Resolution().x * 4 * sizeof(Half);
+                        gpuImageTextureBytes += pitch * baseImage.Resolution().y;
+
+                        CUDA_CHECK(cudaMallocArray(&texArray, &channelDesc, extent.width,
+                                                  extent.height));
+                        CUDA_CHECK(cudaMemcpy2DToArray(
+                            texArray, /* offset */ 0, 0, rgba.data(), pitch, pitch,
+                            baseImage.Resolution().y, cudaMemcpyHostToDevice));
+#else
+                        CUDA_CHECK(cudaMallocMipmappedArray(&texArray, &channelDesc,
                                                             extent, mipmap.Levels(),
                                                             0 /* flags */));
 
@@ -1206,7 +1282,7 @@ GPUSpectrumImageTexture *GPUSpectrumImageTexture::Create(
                             const Image &levelImage = mipmap.GetLevel(level);
                             cudaArray_t levelArray;
                             CUDA_CHECK(
-                                cudaGetMipmappedArrayLevel(&levelArray, mipArray, level));
+                                cudaGetMipmappedArrayLevel(&levelArray, texArray, level));
 
                             std::vector<Half> rgba(4 * levelImage.Resolution().x *
                                                    levelImage.Resolution().y);
@@ -1228,6 +1304,7 @@ GPUSpectrumImageTexture *GPUSpectrumImageTexture::Create(
                                 /* offset */ 0, 0, rgba.data(), pitch, pitch,
                                 levelImage.Resolution().y, cudaMemcpyHostToDevice));
                         }
+#endif
                         break;
                     }
                     case PixelFormat::Float: {
@@ -1236,7 +1313,28 @@ GPUSpectrumImageTexture *GPUSpectrumImageTexture::Create(
 
                         cudaExtent extent = make_cudaExtent(baseImage.Resolution().x,
                                                             baseImage.Resolution().y, 0);
-                        CUDA_CHECK(cudaMallocMipmappedArray(&mipArray, &channelDesc,
+#ifdef __HIPCC__
+                        std::vector<float> rgba(4 * baseImage.Resolution().x *
+                                                baseImage.Resolution().y);
+
+                        size_t offset = 0;
+                        for (int y = 0; y < baseImage.Resolution().y; ++y)
+                            for (int x = 0; x < baseImage.Resolution().x; ++x) {
+                                for (int c = 0; c < 3; ++c)
+                                    rgba[offset++] = baseImage.GetChannel({x, y}, c);
+                                rgba[offset++] = 1.f;
+                            }
+
+                        int pitch = baseImage.Resolution().x * 4 * sizeof(float);
+                        gpuImageTextureBytes += pitch * baseImage.Resolution().y;
+
+                        CUDA_CHECK(cudaMallocArray(&texArray, &channelDesc, extent.width,
+                                                  extent.height));
+                        CUDA_CHECK(cudaMemcpy2DToArray(
+                            texArray, /* offset */ 0, 0, rgba.data(), pitch, pitch,
+                            baseImage.Resolution().y, cudaMemcpyHostToDevice));
+#else
+                        CUDA_CHECK(cudaMallocMipmappedArray(&texArray, &channelDesc,
                                                             extent, mipmap.Levels(),
                                                             0 /* flags */));
 
@@ -1244,7 +1342,7 @@ GPUSpectrumImageTexture *GPUSpectrumImageTexture::Create(
                             const Image &levelImage = mipmap.GetLevel(level);
                             cudaArray_t levelArray;
                             CUDA_CHECK(
-                                cudaGetMipmappedArrayLevel(&levelArray, mipArray, level));
+                                cudaGetMipmappedArrayLevel(&levelArray, texArray, level));
 
                             std::vector<float> rgba(4 * levelImage.Resolution().x *
                                                     levelImage.Resolution().y);
@@ -1265,6 +1363,7 @@ GPUSpectrumImageTexture *GPUSpectrumImageTexture::Create(
                                 /* offset */ 0, 0, rgba.data(), pitch, pitch,
                                 levelImage.Resolution().y, cudaMemcpyHostToDevice));
                         }
+#endif
                         break;
                     }
                     default:
@@ -1273,15 +1372,15 @@ GPUSpectrumImageTexture *GPUSpectrumImageTexture::Create(
 
                     textureCacheMutex.lock();
                     rgbTextureCache[filename] = RGBTextureCacheItem{
-                        mipArray, readMode, nMIPMapLevels, colorSpace};
+                        texArray, readMode, nMIPMapLevels, colorSpace};
                     textureCacheMutex.unlock();
                 } else if (image.NChannels() == 1) {
-                    mipArray = createSingleChannelTextureArray(image, colorSpace,
+                    texArray = createSingleChannelTextureArray(image, colorSpace,
                                                                &nMIPMapLevels);
 
                     textureCacheMutex.lock();
                     lumTextureCache[filename] = LuminanceTextureCacheItem{
-                        mipArray, readMode, nMIPMapLevels, true};
+                        texArray, readMode, nMIPMapLevels, true};
                     textureCacheMutex.unlock();
                     isSingleChannel = true;
                 } else {
@@ -1293,8 +1392,13 @@ GPUSpectrumImageTexture *GPUSpectrumImageTexture::Create(
     }
 
     cudaResourceDesc resDesc = {};
+#ifdef __HIPCC__
+    resDesc.resType = cudaResourceTypeArray;
+    resDesc.res.array.array = texArray;
+#else
     resDesc.resType = cudaResourceTypeMipmappedArray;
-    resDesc.res.mipmap.mipmap = mipArray;
+    resDesc.res.mipmap.mipmap = texArray;
+#endif
 
     cudaTextureDesc texDesc = {};
     texDesc.addressMode[0] = convertAddressMode(wrapString);
@@ -1302,6 +1406,7 @@ GPUSpectrumImageTexture *GPUSpectrumImageTexture::Create(
     texDesc.filterMode = filter == "point" ? cudaFilterModePoint : cudaFilterModeLinear;
     texDesc.readMode = readMode;
     texDesc.normalizedCoords = 1;
+#ifndef __HIPCC__
     texDesc.maxAnisotropy = Clamp(maxAniso, 1, 16);
     texDesc.maxMipmapLevelClamp = nMIPMapLevels - 1;
     texDesc.minMipmapLevelClamp = 0;
@@ -1309,6 +1414,7 @@ GPUSpectrumImageTexture *GPUSpectrumImageTexture::Create(
         (filter == "trilinear" || filter == "ewa" || filter == "EWA")
             ? cudaFilterModeLinear
             : cudaFilterModePoint;
+#endif
     texDesc.borderColor[0] = texDesc.borderColor[1] = texDesc.borderColor[2] =
         texDesc.borderColor[3] = 0.f;
     texDesc.sRGB = 1;
@@ -1356,7 +1462,7 @@ GPUFloatImageTexture *GPUFloatImageTexture::Create(
     std::string encodingString = parameters.GetOneString("encoding", defaultEncoding);
     ColorEncoding encoding = ColorEncoding::Get(encodingString, alloc);
 
-    cudaMipmappedArray_t mipArray;
+    TextureArray texArray;
     int nMIPMapLevels = 0;
     cudaTextureReadMode readMode;
 
@@ -1364,7 +1470,7 @@ GPUFloatImageTexture *GPUFloatImageTexture::Create(
     auto iter = lumTextureCache.find(filename);
     if (iter != lumTextureCache.end()) {
         LOG_VERBOSE("Found %s in luminance tex array cache!", filename);
-        mipArray = iter->second.mipArray;
+        texArray = iter->second.texArray;
         readMode = iter->second.readMode;
         nMIPMapLevels = iter->second.nMIPMapLevels;
         textureCacheMutex.unlock();
@@ -1407,19 +1513,24 @@ GPUFloatImageTexture *GPUFloatImageTexture::Create(
                           image.NChannels());
         }
 
-        mipArray = createSingleChannelTextureArray(image, colorSpace, &nMIPMapLevels);
+        texArray = createSingleChannelTextureArray(image, colorSpace, &nMIPMapLevels);
         readMode = (image.Format() == PixelFormat::U256) ? cudaReadModeNormalizedFloat
                                                          : cudaReadModeElementType;
 
         textureCacheMutex.lock();
         lumTextureCache[filename] =
-            LuminanceTextureCacheItem{mipArray, readMode, nMIPMapLevels, !convertedImage};
+            LuminanceTextureCacheItem{texArray, readMode, nMIPMapLevels, !convertedImage};
         textureCacheMutex.unlock();
     }
 
     cudaResourceDesc resDesc = {};
+#ifdef __HIPCC__
+    resDesc.resType = cudaResourceTypeArray;
+    resDesc.res.array.array = texArray;
+#else
     resDesc.resType = cudaResourceTypeMipmappedArray;
-    resDesc.res.mipmap.mipmap = mipArray;
+    resDesc.res.mipmap.mipmap = texArray;
+#endif
 
     cudaTextureDesc texDesc = {};
     texDesc.addressMode[0] = convertAddressMode(wrapString);
@@ -1427,6 +1538,7 @@ GPUFloatImageTexture *GPUFloatImageTexture::Create(
     texDesc.filterMode = filter == "point" ? cudaFilterModePoint : cudaFilterModeLinear;
     texDesc.readMode = readMode;
     texDesc.normalizedCoords = 1;
+#ifndef __HIPCC__
     texDesc.maxAnisotropy = Clamp(maxAniso, 1, 16);
     texDesc.maxMipmapLevelClamp = nMIPMapLevels - 1;
     texDesc.minMipmapLevelClamp = 0;
@@ -1434,6 +1546,7 @@ GPUFloatImageTexture *GPUFloatImageTexture::Create(
         (filter == "trilinear" || filter == "ewa" || filter == "EWA")
             ? cudaFilterModeLinear
             : cudaFilterModePoint;
+#endif
     texDesc.borderColor[0] = texDesc.borderColor[1] = texDesc.borderColor[2] =
         texDesc.borderColor[3] = 0.f;
     texDesc.sRGB = 1;
@@ -1562,11 +1675,11 @@ SpectrumTexture SpectrumTexture::Create(const std::string &name,
 }
 
 // UniversalTextureEvaluator Method Definitions
-Float UniversalTextureEvaluator::operator()(FloatTexture tex, TextureEvalContext ctx) {
+PBRT_CPU_GPU Float UniversalTextureEvaluator::operator()(FloatTexture tex, TextureEvalContext ctx) {
     return tex.Evaluate(ctx);
 }
 
-SampledSpectrum UniversalTextureEvaluator::operator()(SpectrumTexture tex,
+PBRT_CPU_GPU SampledSpectrum UniversalTextureEvaluator::operator()(SpectrumTexture tex,
                                                       TextureEvalContext ctx,
                                                       SampledWavelengths lambda) {
     return tex.Evaluate(ctx, lambda);
diff --git a/src/pbrt/textures.h b/src/pbrt/textures.h
index 4a074e967..14b0bb433 100644
--- a/src/pbrt/textures.h
+++ b/src/pbrt/textures.h
@@ -24,6 +24,10 @@
 #include <mutex>
 #include <string>
 
+#if defined(__HIPCC__)
+#include <pbrt/util/hip_aliases.h>
+#endif
+
 namespace pbrt {
 
 // TextureEvalContext Definition
@@ -220,7 +224,7 @@ class TextureMapping2D : public TaggedPointer<UVMapping, SphericalMapping,
 };
 
 // TextureMapping2D Inline Functions
-inline TexCoord2D TextureMapping2D::Map(TextureEvalContext ctx) const {
+PBRT_CPU_GPU inline TexCoord2D TextureMapping2D::Map(TextureEvalContext ctx) const {
     auto map = [&](auto ptr) { return ptr->Map(ctx); };
     return Dispatch(map);
 }
@@ -260,7 +264,7 @@ class TextureMapping3D : public TaggedPointer<PointTransformMapping> {
     TexCoord3D Map(TextureEvalContext ctx) const;
 };
 
-inline TexCoord3D TextureMapping3D::Map(TextureEvalContext ctx) const {
+PBRT_CPU_GPU inline TexCoord3D TextureMapping3D::Map(TextureEvalContext ctx) const {
     auto map = [&](auto ptr) { return ptr->Map(ctx); };
     return Dispatch(map);
 }
@@ -578,7 +582,7 @@ class FloatImageTexture : public ImageTextureBase {
     PBRT_CPU_GPU
     Float Evaluate(TextureEvalContext ctx) const {
 #ifdef PBRT_IS_GPU_CODE
-        assert(!"Should not be called in GPU code");
+        CHECK(!"Should not be called in GPU code");
         return 0;
 #else
         TexCoord2D c = mapping.Map(ctx);
@@ -624,7 +628,7 @@ class SpectrumImageTexture : public ImageTextureBase {
     SpectrumType spectrumType;
 };
 
-#if defined(PBRT_BUILD_GPU_RENDERER) && defined(__NVCC__)
+#if defined(PBRT_BUILD_GPU_RENDERER) && (defined(__NVCC__) || defined(__HIPCC__))
 class GPUSpectrumImageTexture {
   public:
     GPUSpectrumImageTexture(std::string filename, TextureMapping2D mapping,
@@ -732,7 +736,7 @@ class GPUFloatImageTexture {
     bool invert;
 };
 
-#else  // PBRT_BUILD_GPU_RENDERER && __NVCC__
+#else  // PBRT_BUILD_GPU_RENDERER && (__NVCC__ ||  __HIPCC__)
 
 class GPUSpectrumImageTexture {
   public:
@@ -769,7 +773,7 @@ class GPUFloatImageTexture {
     std::string ToString() const { return "GPUFloatImageTexture"; }
 };
 
-#endif  // PBRT_BUILD_GPU_RENDERER && __NVCC__
+#endif  // PBRT_BUILD_GPU_RENDERER && (__NVCC__ ||  __HIPCC__)
 
 // MarbleTexture Definition
 class MarbleTexture {
@@ -1125,13 +1129,13 @@ class WrinkledTexture {
     Float omega;
 };
 
-inline Float FloatTexture::Evaluate(TextureEvalContext ctx) const {
+PBRT_CPU_GPU inline Float FloatTexture::Evaluate(TextureEvalContext ctx) const {
     auto eval = [&](auto ptr) { return ptr->Evaluate(ctx); };
     return Dispatch(eval);
 }
 
-inline SampledSpectrum SpectrumTexture::Evaluate(TextureEvalContext ctx,
-                                                 SampledWavelengths lambda) const {
+PBRT_CPU_GPU inline SampledSpectrum SpectrumTexture::Evaluate(
+    TextureEvalContext ctx, SampledWavelengths lambda) const {
     auto eval = [&](auto ptr) { return ptr->Evaluate(ctx, lambda); };
     return Dispatch(eval);
 }
diff --git a/src/pbrt/util/check.h b/src/pbrt/util/check.h
index e2b513a2e..2128504b0 100644
--- a/src/pbrt/util/check.h
+++ b/src/pbrt/util/check.h
@@ -18,10 +18,19 @@ namespace pbrt {
 
 void PrintStackTrace();
 
+#define EMPTY_CHECK \
+    do {            \
+    } while (false) /* swallow semicolon */
+
 #ifdef PBRT_IS_GPU_CODE
 
+#ifdef __HIP_DEVICE_COMPILE__
+#define CHECK(x) EMPTY_CHECK
+#define CHECK_IMPL(a, b, op) EMPTY_CHECK
+#else
 #define CHECK(x) assert(x)
 #define CHECK_IMPL(a, b, op) assert((a)op(b))
+#endif
 
 #define CHECK_EQ(a, b) CHECK_IMPL(a, b, ==)
 #define CHECK_NE(a, b) CHECK_IMPL(a, b, !=)
@@ -56,7 +65,7 @@ void PrintStackTrace();
 
 #ifdef PBRT_DEBUG_BUILD
 
-#define DCHECK(x) (CHECK(x))
+#define DCHECK(x) CHECK(x)
 #define DCHECK_EQ(a, b) CHECK_EQ(a, b)
 #define DCHECK_NE(a, b) CHECK_NE(a, b)
 #define DCHECK_GT(a, b) CHECK_GT(a, b)
@@ -66,10 +75,6 @@ void PrintStackTrace();
 
 #else
 
-#define EMPTY_CHECK \
-    do {            \
-    } while (false) /* swallow semicolon */
-
 // Use an empty check (rather than expanding the macros to nothing) to swallow the
 // semicolon at the end, and avoid empty if-statements.
 #define DCHECK(x) EMPTY_CHECK
diff --git a/src/pbrt/util/color.h b/src/pbrt/util/color.h
index c04536901..5d10c9911 100644
--- a/src/pbrt/util/color.h
+++ b/src/pbrt/util/color.h
@@ -476,18 +476,18 @@ class GammaColorEncoding {
     pstd::array<Float, 1024> inverseLUT;
 };
 
-inline void ColorEncoding::ToLinear(pstd::span<const uint8_t> vin,
+PBRT_CPU_GPU inline void ColorEncoding::ToLinear(pstd::span<const uint8_t> vin,
                                     pstd::span<Float> vout) const {
     auto tolin = [&](auto ptr) { return ptr->ToLinear(vin, vout); };
     Dispatch(tolin);
 }
 
-inline Float ColorEncoding::ToFloatLinear(Float v) const {
+PBRT_CPU_GPU inline Float ColorEncoding::ToFloatLinear(Float v) const {
     auto tfl = [&](auto ptr) { return ptr->ToFloatLinear(v); };
     return Dispatch(tfl);
 }
 
-inline void ColorEncoding::FromLinear(pstd::span<const Float> vin,
+PBRT_CPU_GPU inline void ColorEncoding::FromLinear(pstd::span<const Float> vin,
                                       pstd::span<uint8_t> vout) const {
     auto fl = [&](auto ptr) { return ptr->FromLinear(vin, vout); };
     Dispatch(fl);
diff --git a/src/pbrt/util/colorspace.cpp b/src/pbrt/util/colorspace.cpp
index aa6d5e5db..27994690a 100644
--- a/src/pbrt/util/colorspace.cpp
+++ b/src/pbrt/util/colorspace.cpp
@@ -3,19 +3,15 @@
 // SPDX: Apache-2.0
 
 #include <pbrt/util/colorspace.h>
-
-#ifdef PBRT_BUILD_GPU_RENDERER
-#include <pbrt/gpu/util.h>
-#endif
 #include <pbrt/options.h>
 
 namespace pbrt {
 
 #ifdef PBRT_BUILD_GPU_RENDERER
-PBRT_CONST RGBColorSpace *RGBColorSpace_sRGB;
-PBRT_CONST RGBColorSpace *RGBColorSpace_DCI_P3;
-PBRT_CONST RGBColorSpace *RGBColorSpace_Rec2020;
-PBRT_CONST RGBColorSpace *RGBColorSpace_ACES2065_1;
+PBRT_GPU __constant__ RGBColorSpace *RGBColorSpace_sRGB;
+PBRT_GPU __constant__ RGBColorSpace *RGBColorSpace_DCI_P3;
+PBRT_GPU __constant__ RGBColorSpace *RGBColorSpace_Rec2020;
+PBRT_GPU __constant__ RGBColorSpace *RGBColorSpace_ACES2065_1;
 #endif
 
 // RGBColorSpace Method Definitions
@@ -40,7 +36,7 @@ SquareMatrix<3> ConvertRGBColorSpace(const RGBColorSpace &from, const RGBColorSp
     return to.RGBFromXYZ * from.XYZFromRGB;
 }
 
-RGBSigmoidPolynomial RGBColorSpace::ToRGBCoeffs(RGB rgb) const {
+PBRT_CPU_GPU RGBSigmoidPolynomial RGBColorSpace::ToRGBCoeffs(RGB rgb) const {
     DCHECK(rgb.r >= 0 && rgb.g >= 0 && rgb.b >= 0);
     return (*rgbToSpectrumTable)(ClampZero(rgb));
 }
diff --git a/src/pbrt/util/colorspace.h b/src/pbrt/util/colorspace.h
index 4fc026a34..c015a8f28 100644
--- a/src/pbrt/util/colorspace.h
+++ b/src/pbrt/util/colorspace.h
@@ -11,6 +11,9 @@
 #include <pbrt/util/math.h>
 #include <pbrt/util/spectrum.h>
 #include <pbrt/util/vecmath.h>
+#ifdef PBRT_BUILD_GPU_RENDERER
+#include <pbrt/gpu/util.h>
+#endif
 
 #include <string>
 
@@ -66,10 +69,10 @@ class RGBColorSpace {
 };
 
 #ifdef PBRT_BUILD_GPU_RENDERER
-extern PBRT_CONST RGBColorSpace *RGBColorSpace_sRGB;
-extern PBRT_CONST RGBColorSpace *RGBColorSpace_DCI_P3;
-extern PBRT_CONST RGBColorSpace *RGBColorSpace_Rec2020;
-extern PBRT_CONST RGBColorSpace *RGBColorSpace_ACES2065_1;
+extern PBRT_GPU __constant__ RGBColorSpace *RGBColorSpace_sRGB;
+extern PBRT_GPU __constant__ RGBColorSpace *RGBColorSpace_DCI_P3;
+extern PBRT_GPU __constant__ RGBColorSpace *RGBColorSpace_Rec2020;
+extern PBRT_GPU __constant__ RGBColorSpace *RGBColorSpace_ACES2065_1;
 #endif
 
 SquareMatrix<3> ConvertRGBColorSpace(const RGBColorSpace &from, const RGBColorSpace &to);
diff --git a/src/pbrt/util/float.h b/src/pbrt/util/float.h
index ab9e6447d..a672331ee 100644
--- a/src/pbrt/util/float.h
+++ b/src/pbrt/util/float.h
@@ -16,8 +16,14 @@
 #include <string>
 
 #if defined(PBRT_BUILD_GPU_RENDERER) && defined(PBRT_IS_GPU_CODE)
+#if defined(__HIPCC__)
+#include <hip/hip_runtime.h>
+#include <hip/math_functions.h>
+#include <hip/hip_fp16.h>
+#else
 #include <cuda_fp16.h>
 #endif
+#endif
 
 namespace pbrt {
 
@@ -57,7 +63,7 @@ template <typename T>
 inline PBRT_CPU_GPU typename std::enable_if_t<std::is_floating_point_v<T>, bool> IsNaN(
     T v) {
 #ifdef PBRT_IS_GPU_CODE
-    return isnan(v);
+    return isnan((float)v);
 #else
     return std::isnan(v);
 #endif
@@ -199,9 +205,9 @@ inline constexpr Float gamma(int n) {
 inline PBRT_CPU_GPU Float AddRoundUp(Float a, Float b) {
 #ifdef PBRT_IS_GPU_CODE
 #ifdef PBRT_FLOAT_AS_DOUBLE
-    return __dadd_ru(a, b);
+    return __dadd_rn(a, b);
 #else
-    return __fadd_ru(a, b);
+    return __fadd_rn(a, b);
 #endif
 #else  // CPU
     return NextFloatUp(a + b);
@@ -210,9 +216,9 @@ inline PBRT_CPU_GPU Float AddRoundUp(Float a, Float b) {
 inline PBRT_CPU_GPU Float AddRoundDown(Float a, Float b) {
 #ifdef PBRT_IS_GPU_CODE
 #ifdef PBRT_FLOAT_AS_DOUBLE
-    return __dadd_rd(a, b);
+    return __dadd_rn(a, b);
 #else
-    return __fadd_rd(a, b);
+    return __fadd_rn(a, b);
 #endif
 #else  // CPU
     return NextFloatDown(a + b);
@@ -229,9 +235,9 @@ inline PBRT_CPU_GPU Float SubRoundDown(Float a, Float b) {
 inline PBRT_CPU_GPU Float MulRoundUp(Float a, Float b) {
 #ifdef PBRT_IS_GPU_CODE
 #ifdef PBRT_FLOAT_AS_DOUBLE
-    return __dmul_ru(a, b);
+    return __dmul_rn(a, b);
 #else
-    return __fmul_ru(a, b);
+    return __fmul_rn(a, b);
 #endif
 #else  // CPU
     return NextFloatUp(a * b);
@@ -241,9 +247,9 @@ inline PBRT_CPU_GPU Float MulRoundUp(Float a, Float b) {
 inline PBRT_CPU_GPU Float MulRoundDown(Float a, Float b) {
 #ifdef PBRT_IS_GPU_CODE
 #ifdef PBRT_FLOAT_AS_DOUBLE
-    return __dmul_rd(a, b);
+    return __dmul_rn(a, b);
 #else
-    return __fmul_rd(a, b);
+    return __fmul_rn(a, b);
 #endif
 #else  // CPU
     return NextFloatDown(a * b);
@@ -253,9 +259,9 @@ inline PBRT_CPU_GPU Float MulRoundDown(Float a, Float b) {
 inline PBRT_CPU_GPU Float DivRoundUp(Float a, Float b) {
 #ifdef PBRT_IS_GPU_CODE
 #ifdef PBRT_FLOAT_AS_DOUBLE
-    return __ddiv_ru(a, b);
+    return __ddiv_rn(a, b);
 #else
-    return __fdiv_ru(a, b);
+    return __fdiv_rn(a, b);
 #endif
 #else  // CPU
     return NextFloatUp(a / b);
@@ -265,9 +271,9 @@ inline PBRT_CPU_GPU Float DivRoundUp(Float a, Float b) {
 inline PBRT_CPU_GPU Float DivRoundDown(Float a, Float b) {
 #ifdef PBRT_IS_GPU_CODE
 #ifdef PBRT_FLOAT_AS_DOUBLE
-    return __ddiv_rd(a, b);
+    return __ddiv_rn(a, b);
 #else
-    return __fdiv_rd(a, b);
+    return __fdiv_rn(a, b);
 #endif
 #else  // CPU
     return NextFloatDown(a / b);
@@ -277,9 +283,9 @@ inline PBRT_CPU_GPU Float DivRoundDown(Float a, Float b) {
 inline PBRT_CPU_GPU Float SqrtRoundUp(Float a) {
 #ifdef PBRT_IS_GPU_CODE
 #ifdef PBRT_FLOAT_AS_DOUBLE
-    return __dsqrt_ru(a);
+    return __dsqrt_rn(a);
 #else
-    return __fsqrt_ru(a);
+    return __fsqrt_rn(a);
 #endif
 #else  // CPU
     return NextFloatUp(std::sqrt(a));
@@ -289,9 +295,9 @@ inline PBRT_CPU_GPU Float SqrtRoundUp(Float a) {
 inline PBRT_CPU_GPU Float SqrtRoundDown(Float a) {
 #ifdef PBRT_IS_GPU_CODE
 #ifdef PBRT_FLOAT_AS_DOUBLE
-    return __dsqrt_rd(a);
+    return __dsqrt_rn(a);
 #else
-    return __fsqrt_rd(a);
+    return __fsqrt_rn(a);
 #endif
 #else  // CPU
     return std::max<Float>(0, NextFloatDown(std::sqrt(a)));
@@ -301,9 +307,9 @@ inline PBRT_CPU_GPU Float SqrtRoundDown(Float a) {
 inline PBRT_CPU_GPU Float FMARoundUp(Float a, Float b, Float c) {
 #ifdef PBRT_IS_GPU_CODE
 #ifdef PBRT_FLOAT_AS_DOUBLE
-    return __fma_ru(a, b, c);  // FIXME: what to do here?
+    return __fma_rn(a, b, c);  // FIXME: what to do here?
 #else
-    return __fma_ru(a, b, c);
+    return __fma_rn(a, b, c);
 #endif
 #else  // CPU
     return NextFloatUp(FMA(a, b, c));
@@ -313,9 +319,9 @@ inline PBRT_CPU_GPU Float FMARoundUp(Float a, Float b, Float c) {
 inline PBRT_CPU_GPU Float FMARoundDown(Float a, Float b, Float c) {
 #ifdef PBRT_IS_GPU_CODE
 #ifdef PBRT_FLOAT_AS_DOUBLE
-    return __fma_rd(a, b, c);  // FIXME: what to do here?
+    return __fma_rn(a, b, c);  // FIXME: what to do here?
 #else
-    return __fma_rd(a, b, c);
+    return __fma_rn(a, b, c);
 #endif
 #else  // CPU
     return NextFloatDown(FMA(a, b, c));
diff --git a/src/pbrt/util/hash.h b/src/pbrt/util/hash.h
index 1c53b3a7b..84a0163c0 100644
--- a/src/pbrt/util/hash.h
+++ b/src/pbrt/util/hash.h
@@ -27,7 +27,7 @@ PBRT_CPU_GPU inline uint64_t MurmurHash64A(const unsigned char *key, size_t len,
 
     while (key != end) {
         uint64_t k;
-        std::memcpy(&k, key, sizeof(uint64_t));
+        memcpy(&k, key, sizeof(uint64_t));
         key += 8;
 
         k *= m;
@@ -67,7 +67,7 @@ PBRT_CPU_GPU inline uint64_t MurmurHash64A(const unsigned char *key, size_t len,
 // http://zimbry.blogspot.ch/2011/09/better-bit-mixing-improving-on.html
 PBRT_CPU_GPU inline uint64_t MixBits(uint64_t v);
 
-inline uint64_t MixBits(uint64_t v) {
+PBRT_CPU_GPU inline uint64_t MixBits(uint64_t v) {
     v ^= (v >> 31);
     v *= 0x7fb5d329728ea185;
     v ^= (v >> 27);
diff --git a/src/pbrt/util/hip_aliases.h b/src/pbrt/util/hip_aliases.h
new file mode 100644
index 000000000..fb5e01601
--- /dev/null
+++ b/src/pbrt/util/hip_aliases.h
@@ -0,0 +1,105 @@
+#ifndef PBRT_UTIL_HIP_ALIASES_H
+#define PBRT_UTIL_HIP_ALIASES_H
+
+#include <hip/hip_runtime.h>
+#include <hip/hip_runtime_api.h>
+#include <hip/hip_gl_interop.h>
+
+#define cudaError_t hipError_t
+#define cudaSuccess hipSuccess
+#define cudaErrorNotReady hipErrorNotReady
+#define cudaGetLastError hipGetLastError
+#define cudaGetErrorString hipGetErrorString
+
+#define cudaGetDevice hipGetDevice
+#define cudaSetDevice hipSetDevice
+#define cudaGetDeviceCount hipGetDeviceCount
+#define cudaDeviceGetAttribute hipDeviceGetAttribute
+#define cudaDeviceProp hipDeviceProp_t
+#define cudaDeviceGetLimit hipDeviceGetLimit
+#define cudaDeviceSetLimit hipDeviceSetLimit
+#define cudaDeviceSetCacheConfig hipDeviceSetCacheConfig
+#define cudaLimitStackSize hipLimitStackSize
+#define cudaLimitPrintfFifoSize hipLimitPrintfFifoSize
+#define cudaFuncCachePreferL1 hipFuncCachePreferL1
+#define cudaGetDeviceProperties hipGetDeviceProperties
+#define cudaDevAttrKernelExecTimeout hipDeviceAttributeKernelExecTimeout
+#define cudaDevAttrConcurrentManagedAccess hipDeviceAttributeConcurrentManagedAccess
+#define cudaDriverGetVersion hipDriverGetVersion
+#define cudaRuntimeGetVersion hipRuntimeGetVersion
+
+#define cudaGraphicsMapResources hipGraphicsMapResources
+#define cudaGraphicsUnmapResources hipGraphicsUnmapResources
+#define cudaGraphicsResourceGetMappedPointer hipGraphicsResourceGetMappedPointer
+#define cudaGraphicsResource hipGraphicsResource 
+#define cudaGraphicsGLRegisterBuffer hipGraphicsGLRegisterBuffer
+#define cudaGraphicsMapResources hipGraphicsMapResources
+#define cudaGraphicsMapFlagsWriteDiscard hipGraphicsRegisterFlagsWriteDiscard
+
+#define cudaGLGetDevices hipGLGetDevices
+#define cudaGLDeviceListAll hipGLDeviceListAll
+
+#define CUdeviceptr hipDeviceptr_t
+#define cudaMalloc hipMalloc
+#define cudaMallocHost hipHostMalloc
+#define cudaMallocManaged hipMallocManaged
+#define cudaFree hipFree
+
+#define cudaMemcpy hipMemcpy
+#define cudaMemcpyAsync hipMemcpyAsync
+#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
+#define cudaMemcpyToSymbol hipMemcpyToSymbol
+#define cudaMemcpyFromSymbol hipMemcpyFromSymbol
+#define cudaMemPrefetchAsync hipMemPrefetchAsync
+#define cudaMemset hipMemset
+#define cudaMemAdvise hipMemAdvise
+#define cudaMemAdviseSetReadMostly hipMemAdviseSetReadMostly
+#define cudaMemAdviseSetPreferredLocation hipMemAdviseSetPreferredLocation
+
+#define cudaArray_t hipArray_t
+#define cudaMallocArray hipMallocArray
+#define cudaMemcpy2DToArray hipMemcpy2DToArray
+#define cudaMipmappedArray_t  hipMipmappedArray_t
+#define cudaMallocMipmappedArray hipMallocMipmappedArray
+#define cudaGetMipmappedArrayLevel hipGetMipmappedArrayLevel
+
+#define cudaExtent hipExtent
+#define make_cudaExtent make_hipExtent
+
+#define cudaTextureObject_t hipTextureObject_t
+#define cudaCreateTextureObject hipCreateTextureObject
+#define cudaChannelFormatDesc hipChannelFormatDesc
+#define cudaCreateChannelDesc hipCreateChannelDesc
+#define cudaChannelFormatKindUnsigned hipChannelFormatKindUnsigned
+#define cudaChannelFormatKindFloat hipChannelFormatKindFloat
+#define cudaFilterModePoint hipFilterModePoint
+#define cudaFilterModeLinear hipFilterModeLinear
+#define cudaTextureDesc hipTextureDesc
+#define cudaTextureAddressMode hipTextureAddressMode
+#define cudaTextureReadMode hipTextureReadMode
+#define cudaAddressModeWrap hipAddressModeWrap
+#define cudaAddressModeClamp hipAddressModeClamp
+#define cudaAddressModeBorder hipAddressModeBorder
+#define cudaReadModeNormalizedFloat hipReadModeNormalizedFloat
+#define cudaReadModeElementType hipReadModeElementType
+#define cudaResourceDesc hipResourceDesc
+#define cudaResourceTypeArray hipResourceTypeArray
+#define cudaResourceTypeMipmappedArray hipResourceTypeMipmappedArray
+
+#define cudaEvent_t hipEvent_t
+#define cudaEventCreate hipEventCreate
+#define cudaEventRecord hipEventRecord
+#define cudaEventElapsedTime hipEventElapsedTime
+#define cudaEventSynchronize hipEventSynchronize
+#define cudaEventQuery hipEventQuery
+
+#define CUstream hipStream_t
+#define cudaStream_t hipStream_t
+#define cudaStreamCreate hipStreamCreate
+#define cudaStreamSynchronize hipStreamSynchronize
+#define cudaDeviceSynchronize hipDeviceSynchronize
+
+#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
+
+#endif  // PBRT_UTIL_HIP_ALIASES_H
diff --git a/src/pbrt/util/image.h b/src/pbrt/util/image.h
index 6771eeb9a..a54eec228 100644
--- a/src/pbrt/util/image.h
+++ b/src/pbrt/util/image.h
@@ -422,7 +422,7 @@ class Image {
 };
 
 // Image Inline Method Definitions
-inline void Image::SetChannel(Point2i p, int c, Float value) {
+PBRT_CPU_GPU inline void Image::SetChannel(Point2i p, int c, Float value) {
     // CHECK(!IsNaN(value));
     if (IsNaN(value)) {
 #ifndef PBRT_IS_GPU_CODE
diff --git a/src/pbrt/util/log.cpp b/src/pbrt/util/log.cpp
index df730c028..2064f1a07 100644
--- a/src/pbrt/util/log.cpp
+++ b/src/pbrt/util/log.cpp
@@ -327,7 +327,7 @@ std::string ToString(LogLevel level) {
     }
 }
 
-void Log(LogLevel level, const char *file, int line, const char *s) {
+PBRT_CPU_GPU void Log(LogLevel level, const char *file, int line, const char *s) {
 #ifdef PBRT_IS_GPU_CODE
     auto strlen = [](const char *ptr) {
         int len = 0;
@@ -389,7 +389,7 @@ void Log(LogLevel level, const char *file, int line, const char *s) {
 #endif
 }
 
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
 // warning #1305-D: function declared with "noreturn" does return
 #ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
 #pragma nv_diag_suppress 1305
@@ -398,7 +398,7 @@ void Log(LogLevel level, const char *file, int line, const char *s) {
 #endif
 #endif
 
-void LogFatal(LogLevel level, const char *file, int line, const char *s) {
+PBRT_CPU_GPU void LogFatal(LogLevel level, const char *file, int line, const char *s) {
 #ifdef PBRT_IS_GPU_CODE
     Log(LogLevel::Fatal, file, line, s);
     __threadfence();
diff --git a/src/pbrt/util/log.h b/src/pbrt/util/log.h
index 0cde178d5..45b34a178 100644
--- a/src/pbrt/util/log.h
+++ b/src/pbrt/util/log.h
@@ -44,16 +44,17 @@ extern FILE *logFile;
 PBRT_CPU_GPU
 void Log(LogLevel level, const char *file, int line, const char *s);
 
-PBRT_CPU_GPU [[noreturn]] void LogFatal(LogLevel level, const char *file, int line,
-                                        const char *s);
+template <typename... Args>
+PBRT_CPU_GPU inline void LogFatal(LogLevel level, const char *file, int line,
+                                  const char *fmt, Args &&...args);
+
+#ifndef __HIPCC__
+PBRT_CPU_GPU void LogFatal(LogLevel level, const char *file, int line, const char *s);
 
 template <typename... Args>
 PBRT_CPU_GPU inline void Log(LogLevel level, const char *file, int line, const char *fmt,
                              Args &&...args);
-
-template <typename... Args>
-PBRT_CPU_GPU [[noreturn]] inline void LogFatal(LogLevel level, const char *file, int line,
-                                               const char *fmt, Args &&...args);
+#endif
 
 #define TO_STRING(x) TO_STRING2(x)
 #define TO_STRING2(x) #x
@@ -62,6 +63,23 @@ PBRT_CPU_GPU [[noreturn]] inline void LogFatal(LogLevel level, const char *file,
 
 extern __constant__ LogLevel LOGGING_LogLevelGPU;
 
+// printing may cause hang in the device code
+#ifdef __HIP_DEVICE_COMPILE__
+
+#define LOG_VERBOSE(...) \
+    do {                 \
+    } while (false) /* swallow semicolon */
+
+#define LOG_ERROR(...) \
+    do {               \
+    } while (false) /* swallow semicolon */
+
+#define LOG_FATAL(...) \
+    do {               \
+    } while (false) /* swallow semicolon */
+
+#else
+
 #define LOG_VERBOSE(...)                               \
     (pbrt::LogLevel::Verbose >= LOGGING_LogLevelGPU && \
      (pbrt::Log(LogLevel::Verbose, __FILE__, __LINE__, __VA_ARGS__), true))
@@ -73,6 +91,8 @@ extern __constant__ LogLevel LOGGING_LogLevelGPU;
 #define LOG_FATAL(...) \
     pbrt::LogFatal(pbrt::LogLevel::Fatal, __FILE__, __LINE__, __VA_ARGS__)
 
+#endif
+
 #else
 
 // Logging Macros
@@ -96,9 +116,9 @@ extern __constant__ LogLevel LOGGING_LogLevelGPU;
 namespace pbrt {
 
 template <typename... Args>
-inline void Log(LogLevel level, const char *file, int line, const char *fmt,
-                Args &&...args) {
-#ifdef PBRT_IS_GPU_CODE
+PBRT_CPU_GPU inline void Log(LogLevel level, const char *file, int line, const char *fmt,
+                             Args &&...args) {
+#if defined(PBRT_IS_GPU_CODE)
     Log(level, file, line, fmt);  // just the format string #yolo
 #else
     std::string s = StringPrintf(fmt, std::forward<Args>(args)...);
@@ -107,9 +127,9 @@ inline void Log(LogLevel level, const char *file, int line, const char *fmt,
 }
 
 template <typename... Args>
-inline void LogFatal(LogLevel level, const char *file, int line, const char *fmt,
-                     Args &&...args) {
-#ifdef PBRT_IS_GPU_CODE
+PBRT_CPU_GPU inline void LogFatal(LogLevel level, const char *file, int line,
+                                  const char *fmt, Args &&...args) {
+#if defined(PBRT_IS_GPU_CODE) || defined(__HIPCC__)
     LogFatal(level, file, line, fmt);  // just the format string #yolo
 #else
     std::string s = StringPrintf(fmt, std::forward<Args>(args)...);
diff --git a/src/pbrt/util/math.h b/src/pbrt/util/math.h
index 48f50ddc8..1568a8a42 100644
--- a/src/pbrt/util/math.h
+++ b/src/pbrt/util/math.h
@@ -25,7 +25,7 @@
 
 namespace pbrt {
 
-#ifdef PBRT_IS_GPU_CODE
+#if defined(PBRT_IS_GPU_CODE) && defined(__CUDACC__)
 
 #define ShadowEpsilon 0.0001f
 #define Pi Float(3.14159265358979323846)
diff --git a/src/pbrt/util/noise.cpp b/src/pbrt/util/noise.cpp
index 77373bea3..646c26800 100644
--- a/src/pbrt/util/noise.cpp
+++ b/src/pbrt/util/noise.cpp
@@ -53,7 +53,7 @@ static PBRT_CONST int NoisePerm[2 * NoisePermSize] = {
 };
 
 // Noise Function Definitions
-Float Noise(Float x, Float y, Float z) {
+PBRT_CPU_GPU Float Noise(Float x, Float y, Float z) {
     // Compute noise cell coordinates and offsets
     // Avoid overflow when computing deltas if the coordinates are too large to store in
     // int32s.
@@ -87,11 +87,11 @@ Float Noise(Float x, Float y, Float z) {
     return Lerp(wz, y0, y1);
 }
 
-Float Noise(Point3f p) {
+PBRT_CPU_GPU Float Noise(Point3f p) {
     return Noise(p.x, p.y, p.z);
 }
 
-inline Float Grad(int x, int y, int z, Float dx, Float dy, Float dz) {
+PBRT_CPU_GPU inline Float Grad(int x, int y, int z, Float dx, Float dy, Float dz) {
     int h = NoisePerm[NoisePerm[NoisePerm[x] + y] + z];
     h &= 15;
     Float u = h < 8 || h == 12 || h == 13 ? dx : dy;
@@ -99,11 +99,11 @@ inline Float Grad(int x, int y, int z, Float dx, Float dy, Float dz) {
     return ((h & 1) ? -u : u) + ((h & 2) ? -v : v);
 }
 
-inline Float NoiseWeight(Float t) {
+PBRT_CPU_GPU inline Float NoiseWeight(Float t) {
     return 6 * Pow<5>(t) - 15 * Pow<4>(t) + 10 * Pow<3>(t);
 }
 
-Vector3f DNoise(Point3f p) {
+PBRT_CPU_GPU Vector3f DNoise(Point3f p) {
     Float delta = .01f;
     Float n = Noise(p);
     Point3f noiseDelta(Noise(p + Vector3f(delta, 0, 0)), Noise(p + Vector3f(0, delta, 0)),
@@ -111,7 +111,8 @@ Vector3f DNoise(Point3f p) {
     return (noiseDelta - Point3f(n, n, n)) / delta;
 }
 
-Float FBm(Point3f p, Vector3f dpdx, Vector3f dpdy, Float omega, int maxOctaves) {
+PBRT_CPU_GPU Float FBm(Point3f p, Vector3f dpdx, Vector3f dpdy, Float omega,
+                       int maxOctaves) {
     // Compute number of octaves for antialiased FBm
     Float len2 = std::max(LengthSquared(dpdx), LengthSquared(dpdy));
     Float n = Clamp(-1 - Log2(len2) / 2, 0, maxOctaves);
@@ -130,7 +131,7 @@ Float FBm(Point3f p, Vector3f dpdx, Vector3f dpdy, Float omega, int maxOctaves)
     return sum;
 }
 
-Float Turbulence(Point3f p, Vector3f dpdx, Vector3f dpdy, Float omega, int maxOctaves) {
+PBRT_CPU_GPU Float Turbulence(Point3f p, Vector3f dpdx, Vector3f dpdy, Float omega, int maxOctaves) {
     // Compute number of octaves for antialiased FBm
     Float len2 = std::max(LengthSquared(dpdx), LengthSquared(dpdy));
     Float n = Clamp(-1 - Log2(len2) / 2, 0, maxOctaves);
diff --git a/src/pbrt/util/parallel.h b/src/pbrt/util/parallel.h
index 48bd6c00b..827d0d3f1 100644
--- a/src/pbrt/util/parallel.h
+++ b/src/pbrt/util/parallel.h
@@ -213,7 +213,7 @@ class AtomicDouble {
     void Add(double v) {
 #if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600)
         atomicAdd(&value, v);
-#elif defined(__CUDA_ARCH__)
+#elif defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
         uint64_t old = bits, assumed;
 
         do {
@@ -235,7 +235,7 @@ class AtomicDouble {
     // AtomicDouble Private Data
 #if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600)
     double value;
-#elif defined(__CUDA_ARCH__)
+#elif (defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__))
     uint64_t bits;
 #else
     std::atomic<uint64_t> bits;
diff --git a/src/pbrt/util/progressreporter.h b/src/pbrt/util/progressreporter.h
index f018f1b46..a4028fa84 100644
--- a/src/pbrt/util/progressreporter.h
+++ b/src/pbrt/util/progressreporter.h
@@ -16,7 +16,11 @@
 #include <thread>
 
 #ifdef PBRT_BUILD_GPU_RENDERER
+#if defined(__HIPCC__)
+#include <pbrt/util/hip_aliases.h>
+#else
 #include <cuda_runtime.h>
+#endif
 #include <vector>
 #endif
 
diff --git a/src/pbrt/util/pstd.h b/src/pbrt/util/pstd.h
index 2d6065fef..52c3e240b 100644
--- a/src/pbrt/util/pstd.h
+++ b/src/pbrt/util/pstd.h
@@ -60,7 +60,7 @@ class array<T, 0> {
     array() = default;
 
     PBRT_CPU_GPU
-    void fill(const T &v) { assert(!"should never be called"); }
+    void fill(const T &v) { CHECK(!"should never be called"); }
 
     PBRT_CPU_GPU
     bool operator==(const array<T, 0> &a) const { return true; }
@@ -81,13 +81,13 @@ class array<T, 0> {
 
     PBRT_CPU_GPU
     T &operator[](size_t i) {
-        assert(!"should never be called");
+        CHECK(!"should never be called");
         static T t;
         return t;
     }
     PBRT_CPU_GPU
     const T &operator[](size_t i) const {
-        assert(!"should never be called");
+        CHECK(!"should never be called");
         static T t;
         return t;
     }
@@ -253,7 +253,7 @@ class optional {
     bool has_value() const { return set; }
 
   private:
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
     // Work-around NVCC bug
     PBRT_CPU_GPU
     T *ptr() { return reinterpret_cast<T *>(&optionalValue); }
diff --git a/src/pbrt/util/sampling.h b/src/pbrt/util/sampling.h
index e5caf7831..f0f844c1f 100644
--- a/src/pbrt/util/sampling.h
+++ b/src/pbrt/util/sampling.h
@@ -1300,7 +1300,7 @@ class PiecewiseLinear2D {
   private:
     using FloatStorage = pstd::vector<float>;
 
-#if !defined(_MSC_VER) && !defined(__CUDACC__)
+#if !defined(_MSC_VER) && !(defined(__CUDACC__) || defined(__HIPCC__))
     static constexpr size_t ArraySize = Dimension;
 #else
     static constexpr size_t ArraySize = (Dimension != 0) ? Dimension : 1;
diff --git a/src/pbrt/util/scattering.cpp b/src/pbrt/util/scattering.cpp
index 0353d1a1f..5dde4610a 100644
--- a/src/pbrt/util/scattering.cpp
+++ b/src/pbrt/util/scattering.cpp
@@ -7,7 +7,7 @@
 namespace pbrt {
 
 // BSSRDF Utility Functions
-Float FresnelMoment1(Float eta) {
+PBRT_CPU_GPU Float FresnelMoment1(Float eta) {
     Float eta2 = eta * eta, eta3 = eta2 * eta, eta4 = eta3 * eta, eta5 = eta4 * eta;
     if (eta < 1)
         return 0.45966f - 1.73965f * eta + 3.37668f * eta2 - 3.904945 * eta3 +
@@ -17,7 +17,7 @@ Float FresnelMoment1(Float eta) {
                1.27198f * eta4 + 0.12746f * eta5;
 }
 
-Float FresnelMoment2(Float eta) {
+PBRT_CPU_GPU Float FresnelMoment2(Float eta) {
     Float eta2 = eta * eta, eta3 = eta2 * eta, eta4 = eta3 * eta, eta5 = eta4 * eta;
     if (eta < 1) {
         return 0.27614f - 0.87350f * eta + 1.12077f * eta2 - 0.65095f * eta3 +
diff --git a/src/pbrt/util/taggedptr.h b/src/pbrt/util/taggedptr.h
index a430d575e..d9fb067c7 100644
--- a/src/pbrt/util/taggedptr.h
+++ b/src/pbrt/util/taggedptr.h
@@ -743,9 +743,14 @@ class TaggedPointer {
     template <typename T>
     PBRT_CPU_GPU TaggedPointer(T *ptr) {
         uint64_t iptr = reinterpret_cast<uint64_t>(ptr);
-        DCHECK_EQ(iptr & ptrMask, iptr);
         constexpr unsigned int type = TypeIndex<T>();
+#if defined(PBRT_IS_WINDOWS) && defined(__HIPCC__)
+        uint64_t aptr = (iptr & tagMask) >> 5ull;
+        bits = (iptr & ptrMask) | aptr | ((uint64_t)type << (tagShift + 2ull));
+#else
+        DCHECK_EQ(iptr & ptrMask, iptr);
         bits = iptr | ((uint64_t)type << tagShift);
+#endif
     }
 
     PBRT_CPU_GPU
@@ -769,7 +774,13 @@ class TaggedPointer {
     }
 
     PBRT_CPU_GPU
-    unsigned int Tag() const { return ((bits & tagMask) >> tagShift); }
+    unsigned int Tag() const {
+#if defined(PBRT_IS_WINDOWS) && defined(__HIPCC__)
+        return ((bits & tagMask) >> (tagShift + 2ull));
+#else
+        return ((bits & tagMask) >> tagShift);
+#endif
+    }
 
     template <typename T>
     PBRT_CPU_GPU bool Is() const {
@@ -825,10 +836,42 @@ class TaggedPointer {
     bool operator!=(const TaggedPointer &tp) const { return bits != tp.bits; }
 
     PBRT_CPU_GPU
-    void *ptr() { return reinterpret_cast<void *>(bits & ptrMask); }
+    void *ptr() {
+#if defined(PBRT_IS_WINDOWS) && defined(__HIPCC__)
+        unsigned int aptr = (bits >> tagShift) & 3;
+        uint64_t iptr = bits & ptrMask;
+        // lld crashes on Windows
+        // iptr |= (uint64_t)aptr << 60;
+        if (aptr == 1)
+            iptr |= 1ull << 60;
+        if (aptr == 2)
+            iptr |= 2ull << 60;
+        if (aptr == 3)
+            iptr |= 3ull << 60;
+        return reinterpret_cast<void *>(iptr);
+#else
+        return reinterpret_cast<void *>(bits & ptrMask);
+#endif
+    }
 
     PBRT_CPU_GPU
-    const void *ptr() const { return reinterpret_cast<const void *>(bits & ptrMask); }
+    const void *ptr() const {
+#if defined(PBRT_IS_WINDOWS) && defined(__HIPCC__)
+        unsigned int aptr = (bits >> tagShift) & 3;
+        uint64_t iptr = bits & ptrMask;
+        // lld crashes on Windows
+        // iptr |= (uint64_t)aptr << 60;
+        if (aptr == 1)
+            iptr |= 1ull << 60;
+        if (aptr == 2)
+            iptr |= 2ull << 60;
+        if (aptr == 3)
+            iptr |= 3ull << 60;
+        return reinterpret_cast<const void *>(iptr);
+#else
+        return reinterpret_cast<const void *>(bits & ptrMask);
+#endif
+    }
 
     template <typename F>
     PBRT_CPU_GPU decltype(auto) Dispatch(F &&func) {
@@ -862,7 +905,12 @@ class TaggedPointer {
     static_assert(sizeof(uintptr_t) <= sizeof(uint64_t),
                   "Expected pointer size to be <= 64 bits");
     // TaggedPointer Private Members
+#if defined(PBRT_IS_WINDOWS) && defined(__HIPCC__)
+    // top two bits are already used by the system on Windows
+    static constexpr int tagShift = 55;
+#else
     static constexpr int tagShift = 57;
+#endif
     static constexpr int tagBits = 64 - tagShift;
     static constexpr uint64_t tagMask = ((1ull << tagBits) - 1) << tagShift;
     static constexpr uint64_t ptrMask = ~tagMask;
diff --git a/src/pbrt/util/vecmath_test.cpp b/src/pbrt/util/vecmath_test.cpp
index 4569c6e6f..8ad27e1bf 100644
--- a/src/pbrt/util/vecmath_test.cpp
+++ b/src/pbrt/util/vecmath_test.cpp
@@ -104,6 +104,7 @@ TEST(Point2, InvertBilinear) {
     }
 }
 
+#if !defined(__HIPCC__)
 TEST(Vector, AngleBetween) {
     EXPECT_EQ(0, AngleBetween(Vector3f(1, 0, 0), Vector3f(1, 0, 0)));
 
@@ -176,6 +177,7 @@ TEST(Vector, AngleBetween) {
     Float abet = AngleBetween(a, b);
     EXPECT_EQ(abet, precise) << StringPrintf("vs naive %f", naive);
 }
+#endif
 
 TEST(Vector, CoordinateSystem) {
     // Duff et al 2017 footnote 1
diff --git a/src/pbrt/wavefront/integrator.cpp b/src/pbrt/wavefront/integrator.cpp
index c97d995e8..6729a7be5 100644
--- a/src/pbrt/wavefront/integrator.cpp
+++ b/src/pbrt/wavefront/integrator.cpp
@@ -9,7 +9,11 @@
 #include <pbrt/film.h>
 #include <pbrt/filters.h>
 #ifdef PBRT_BUILD_GPU_RENDERER
+#if defined(__HIPCC__)
+#include <pbrt/gpu/hiprt/aggregate.h>
+#else
 #include <pbrt/gpu/optix/aggregate.h>
+#endif
 #include <pbrt/gpu/memory.h>
 #endif  // PBRT_BUILD_GPU_RENDERER
 #include <pbrt/lights.h>
@@ -36,8 +40,12 @@
 #include <map>
 
 #ifdef PBRT_BUILD_GPU_RENDERER
+#if defined(__HIPCC__)
+#include <pbrt/util/hip_aliases.h>
+#else
 #include <cuda.h>
 #include <cuda_runtime.h>
+#endif
 #endif  // PBRT_BUILD_GPU_RENDERER
 
 namespace pbrt {
@@ -154,13 +162,28 @@ WavefrontPathIntegrator::WavefrontPathIntegrator(
     filter = film.GetFilter();
     sampler = scene.GetSampler();
 
+    // Compute number of scanlines to render per pass
+    Vector2i resolution = film.PixelBounds().Diagonal();
+    // TODO: make this configurable. Base it on the amount of GPU memory?
+    int maxSamples = 1024 * 1024;
+    scanlinesPerPass = std::max(1, maxSamples / resolution.x);
+    int nPasses = (resolution.y + scanlinesPerPass - 1) / scanlinesPerPass;
+    scanlinesPerPass = (resolution.y + nPasses - 1) / nPasses;
+    maxQueueSize = resolution.x * scanlinesPerPass;
+
     if (Options->useGPU) {
 #ifdef PBRT_BUILD_GPU_RENDERER
         CUDATrackedMemoryResource *mr =
             dynamic_cast<CUDATrackedMemoryResource *>(memoryResource);
         CHECK(mr);
+#ifdef __HIPCC__
+        aggregate = new HiprtAggregate(scene, mr, textures, shapeIndexToAreaLights, media,
+                                       namedMaterials, materials, maxQueueSize);
+#else
         aggregate = new OptiXAggregate(scene, mr, textures, shapeIndexToAreaLights, media,
                                        namedMaterials, materials);
+#endif
+
 #else
         LOG_FATAL("Options->useGPU was set without PBRT_BUILD_GPU_RENDERER enabled");
 #endif
@@ -224,14 +247,6 @@ WavefrontPathIntegrator::WavefrontPathIntegrator(
     }
 #endif  // PBRT_BUILD_GPU_RENDERER
 
-    // Compute number of scanlines to render per pass
-    Vector2i resolution = film.PixelBounds().Diagonal();
-    // TODO: make this configurable. Base it on the amount of GPU memory?
-    int maxSamples = 1024 * 1024;
-    scanlinesPerPass = std::max(1, maxSamples / resolution.x);
-    int nPasses = (resolution.y + scanlinesPerPass - 1) / scanlinesPerPass;
-    scanlinesPerPass = (resolution.y + nPasses - 1) / nPasses;
-    maxQueueSize = resolution.x * scanlinesPerPass;
     LOG_VERBOSE("Will render in %d passes %d scanlines per pass\n", nPasses,
                 scanlinesPerPass);
 
diff --git a/src/pbrt/wavefront/intersect.h b/src/pbrt/wavefront/intersect.h
index 310173a1e..690aa10e8 100644
--- a/src/pbrt/wavefront/intersect.h
+++ b/src/pbrt/wavefront/intersect.h
@@ -54,7 +54,7 @@ inline PBRT_CPU_GPU void EnqueueWorkAfterIntersection(
         intr.mediumInterface ? *intr.mediumInterface : MediumInterface(rayMedium);
 
     if (rayMedium) {
-        assert(mediumSampleQueue);
+        CHECK(mediumSampleQueue);
         PBRT_DBG("Enqueuing into medium sample queue\n");
         mediumSampleQueue->Push(MediumSampleWorkItem{r.ray,
                                                      r.depth,
diff --git a/src/pbrt/wavefront/wavefront.cpp b/src/pbrt/wavefront/wavefront.cpp
index 7482fdb84..c5d86cc68 100644
--- a/src/pbrt/wavefront/wavefront.cpp
+++ b/src/pbrt/wavefront/wavefront.cpp
@@ -66,7 +66,14 @@ void RenderWavefront(BasicScene &scene) {
     integrator->camera.InitMetadata(&metadata);
     metadata.renderTimeSeconds = seconds;
     metadata.samplesPerPixel = integrator->sampler.SamplesPerPixel();
+    // multithreading somehow hangs in HIP
+#ifdef __HIPCC__
+    if (Options->useGPU) DisableThreadPool();
+#endif
     integrator->film.WriteImage(metadata);
+#ifdef __HIPCC__
+    if (Options->useGPU) ReenableThreadPool();
+#endif
 }
 
 }  // namespace pbrt
diff --git a/src/pbrt/wavefront/workqueue.h b/src/pbrt/wavefront/workqueue.h
index 2f0827e4d..10e81f5a9 100644
--- a/src/pbrt/wavefront/workqueue.h
+++ b/src/pbrt/wavefront/workqueue.h
@@ -33,6 +33,9 @@
 #include <cuda/atomic>
 #endif
 
+#elif defined(__HIPCC__)
+#define PBRT_USE_LEGACY_CUDA_ATOMICS
+
 #endif  // __CUDACC__
 
 namespace pbrt {