diff --git a/0001-MSVC-HIP.patch b/0001-MSVC-HIP.patch new file mode 100644 index 000000000..75dfb6092 --- /dev/null +++ b/0001-MSVC-HIP.patch @@ -0,0 +1,24 @@ +From 56933471af36147e1032fbbc7912ca0088797b78 Mon Sep 17 00:00:00 2001 +From: Aaryaman Vasishta +Date: Sun, 10 Dec 2023 18:40:22 +0900 +Subject: [PATCH] Fix BitScanreverse by including intrin.h for MSVC + +--- + common/compiler_msc.h | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/common/compiler_msc.h b/common/compiler_msc.h +index 18cfa12..2eafd55 100644 +--- a/common/compiler_msc.h ++++ b/common/compiler_msc.h +@@ -4,6 +4,7 @@ + + #include + #include /* for _byteswap_*() */ ++#include + + #define LIBEXPORT __declspec(dllexport) + +-- +2.33.0.windows.2 + diff --git a/CMakeLists.txt b/CMakeLists.txt index 6c6a50166..cf4120218 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,11 +11,17 @@ if (NOT DEFINED PBRT_SOURCE_CODE) set (PBRT_SOURCE_CODE ${CMAKE_SOURCE_DIR}) endif () +# HIP +list (APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) +set (AMDGPU_TARGETS "gfx1100" CACHE STRING "List of specific machine types for library to target") # User variable + # For sanitizers list (INSERT CMAKE_MODULE_PATH 0 "${PBRT_SOURCE_CODE}/cmake") # Configuration options +option (PBRT_HIPRT "Use HIPRT as backend" OFF) +option (PBRT_HIPRT_SDK "Use HIPRT SDK instead of HIPRT in HIP SDK" OFF) option (PBRT_FLOAT_AS_DOUBLE "Use 64-bit floats" OFF) option (PBRT_BUILD_NATIVE_EXECUTABLE "Build executable optimized for CPU architecture of system pbrt was built on" ON) option (PBRT_DBG_LOGGING "Enable (very verbose!) debug logging" OFF) @@ -23,6 +29,7 @@ option (PBRT_NVTX "Insert NVTX annotations for NVIDIA Profiling and Debugging To option (PBRT_NVML "Use NVML for GPU performance measurement" OFF) option (PBRT_USE_PREGENERATED_RGB_TO_SPECTRUM_TABLES "Use pregenerated rgbspectrum_*.cpp files rather than running rgb2spec_opt to generate them at build time" OFF) set (PBRT_OPTIX7_PATH $ENV{PBRT_OPTIX7_PATH} CACHE PATH "Path to OptiX 7 SDK") +set (PBRT_HIPRT_PATH $ENV{PBRT_HIPRT_PATH} CACHE PATH "Path to HIPRT SDK") set (PBRT_GPU_SHADER_MODEL "" CACHE STRING "") @@ -64,7 +71,7 @@ function (CHECK_EXT NAME DIR HASH) endfunction () check_ext ("OpenEXR" "openexr/OpenEXR" ca757f97033ae7cfcc6130f23c0e9ee86b108b07) -check_ext ("OpenVDB" "openvdb/nanovdb" 414bed84c2fc22e188eac7b611aa85c7edd7a5a9) +check_ext ("OpenVDB" "openvdb/nanovdb" 385516321b272c423e2a514b41bd21ce6ffdda72) check_ext ("Ptex" "ptex/src" 054047d02b9e06e690420b407114d2872435b953) check_ext ("double-conversion" "double-conversion/cmake" cc1f75a114aca8d2af69f73a5a959aecbab0e87a) check_ext ("filesystem" "filesystem/filesystem" c5f9de30142453eb3c6fe991e82dfc2583373116) @@ -86,7 +93,7 @@ find_package(OpenGL REQUIRED) set_property (GLOBAL PROPERTY USE_FOLDERS ON) -if (MSVC) +if (WIN32) list (APPEND PBRT_DEFINITIONS "PBRT_IS_MSVC" "_CRT_SECURE_NO_WARNINGS") list (APPEND PBRT_DEFINITIONS "PBRT_IS_MSVC" "_ENABLE_EXTENDED_ALIGNED_STORAGE") endif () @@ -155,207 +162,268 @@ target_compile_options ( add_library (pbrt_opt INTERFACE) ######################################### -## CUDA / OptiX +## CUDA / OptiX / HIP / HIPRT add_library (cuda_build_configuration INTERFACE) include (CheckLanguage) -check_language(CUDA) - -if (CMAKE_CUDA_COMPILER) - if (CUDA_VERSION_MAJOR LESS 11) - message (WARNING "pbrt-v4 requires CUDA version 11.0 or later but version ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR}\ -was found. GPU support is therefore disabled. If you have multiple versions\ -of CUDA installed, please update your PATH.") +# Function to get the short path +function(get_short_path input_path output_var) + # Create a command to get the short path using cmd.exe + execute_process( + COMMAND cmd /c for %A in ("${input_path}") do @echo %~sA + OUTPUT_VARIABLE short_path + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + # Set the result to the output variable + set(${output_var} "${short_path}" PARENT_SCOPE) +endfunction() + +if (PBRT_HIPRT) + find_package (hip REQUIRED) + set (PBRT_HIP_PATH $ENV{HIP_PATH}) + get_short_path("${PBRT_HIP_PATH}" PBRT_HIP_PATH) + string(REGEX REPLACE "[/\\]$" "" PBRT_HIP_PATH "${PBRT_HIP_PATH}") + + set (CUDA_COMPILE_LANGUAGE CXX) + + if (WIN32) + if (PBRT_HIPRT_SDK) + set (PBRT_HIPRT_BINARY_PATH ${PBRT_HIPRT_PATH}/hiprt/win) + else () + set (PBRT_HIPRT_PATH ${PBRT_HIP_PATH}/include) + set (PBRT_HIPRT_BINARY_PATH ${PBRT_HIP_PATH}/bin) + endif () + else () + set (PBRT_HIPRT_BINARY_PATH ${PBRT_HIPRT_PATH}/hiprt/linux64) + endif () + link_directories (${PBRT_HIPRT_BINARY_PATH}) + include_directories (${PBRT_HIPRT_PATH}) + list (APPEND PBRT_DEFINITIONS "PBRT_BUILD_GPU_RENDERER") + list (APPEND PBRT_DEFINITIONS "GTEST_HAS_SEH=0") + list (APPEND PBRT_CXX_FLAGS "-std=c++17") + list (APPEND PBRT_CXX_FLAGS "-fgpu-rdc") + list (APPEND PBRT_LINK_FLAGS "-fgpu-rdc") + if (WIN32) + list (APPEND PBRT_LINK_FLAGS "-fuse-ld=lld") + endif() + list (APPEND PBRT_LINK_FLAGS "--hip-link") + list (APPEND PBRT_LINK_FLAGS "-Xoffload-linker") + list (APPEND PBRT_LINK_FLAGS "--whole-archive") + set (PBRT_HIP_ENABLED ON) + file (GLOB PBRT_HIPRT_BITCODES ${PBRT_HIPRT_BINARY_PATH}/*.bc) + file (GLOB PBRT_HIPRT_FATBINS ${PBRT_HIPRT_BINARY_PATH}/*.hipfb) + file (COPY ${PBRT_HIPRT_BITCODES} DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) + file (COPY ${PBRT_HIPRT_FATBINS} DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) + get_filename_component(PBRT_HIPRT_BITCODE_FILENAME ${PBRT_HIPRT_BITCODES} NAME) + if (WIN32) + file (GLOB PBRT_HIPRT_DLLS ${PBRT_HIPRT_BINARY_PATH}/*.dll) + file (GLOB PBRT_HIPRT_LIB_NAME ${PBRT_HIPRT_BINARY_PATH}/*.lib) + file (COPY ${PBRT_HIPRT_DLLS} DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) + get_filename_component(PBRT_HIPRT_LIB_NAME ${PBRT_HIPRT_LIB_NAME} NAME_WE) else () - find_package (CUDA REQUIRED) + file (GLOB PBRT_HIPRT_LIB_NAME ${PBRT_HIPRT_BINARY_PATH}/*.so) + get_filename_component(PBRT_HIPRT_LIB_NAME ${PBRT_HIPRT_LIB_NAME} NAME_WE) + string(REGEX REPLACE "^lib" "" PBRT_HIPRT_LIB_NAME ${PBRT_HIPRT_LIB_NAME}) + endif () +else () + check_language(CUDA) - # This seems to be necessary starting with 3.17.1, but gives an error - # about 17 being an unsupported version earlier... - if (${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.17.0") - set (CMAKE_CUDA_STANDARD 17) - endif () + if (CMAKE_CUDA_COMPILER) + if (CUDA_VERSION_MAJOR LESS 11) + message (WARNING "pbrt-v4 requires CUDA version 11.0 or later but version ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR}\ + was found. GPU support is therefore disabled. If you have multiple versions\ + of CUDA installed, please update your PATH.") + else () + find_package (CUDA REQUIRED) - # https://github.com/VIAME/VIAME/blob/aa6c5f56a898b08e4da102c400b453e23952199c/CMakeLists.txt#L291 - if (NOT CUDA_VERSION_PATCH) - if (CUDA_NVCC_EXECUTABLE AND - CUDA_NVCC_EXECUTABLE STREQUAL CMAKE_CUDA_COMPILER AND - CMAKE_CUDA_COMPILER_VERSION MATCHES [=[([0-9]+)\.([0-9]+)\.([0-9]+)]=]) - set (CUDA_VERSION_PATCH "${CMAKE_MATCH_3}") - elseif (CUDA_NVCC_EXECUTABLE) - execute_process (COMMAND ${CUDA_NVCC_EXECUTABLE} "--version" OUTPUT_VARIABLE NOUT) - if (NOUT MATCHES [=[ V([0-9]+)\.([0-9]+)\.([0-9]+)]=]) - set (CUDA_VERSION_PATCH "${CMAKE_MATCH_3}") - endif () + # This seems to be necessary starting with 3.17.1, but gives an error + # about 17 being an unsupported version earlier... + if (${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.17.0") + set (CMAKE_CUDA_STANDARD 17) endif () - endif () - - message (STATUS "Found CUDA: ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR}.${CUDA_VERSION_PATCH}") - if (CUDA_VERSION_MAJOR EQUAL 11 AND CUDA_VERSION_MINOR EQUAL 3 AND CUDA_VERSION_PATCH LESS 109) - message (SEND_ERROR "Unfortunately, pbrt-v4 triggers an internal compiler error in CUDA 11.3.0. Please either use CUDA 11.0-11.2 or 11.3.1 or later.") - endif () - if ("${PBRT_OPTIX7_PATH}" STREQUAL "") - message (WARNING "Found CUDA but PBRT_OPTIX7_PATH is not set. Disabling GPU compilation.") - else () - enable_language (CUDA) - list (APPEND PBRT_DEFINITIONS "PBRT_BUILD_GPU_RENDERER") - if (PBRT_NVTX) - list (APPEND PBRT_DEFINITIONS "NVTX") - endif () - if (PBRT_NVML) - list (APPEND PBRT_DEFINITIONS "PBRT_USE_NVML") + # https://github.com/VIAME/VIAME/blob/aa6c5f56a898b08e4da102c400b453e23952199c/CMakeLists.txt#L291 + if (NOT CUDA_VERSION_PATCH) + if (CUDA_NVCC_EXECUTABLE AND + CUDA_NVCC_EXECUTABLE STREQUAL CMAKE_CUDA_COMPILER AND + CMAKE_CUDA_COMPILER_VERSION MATCHES [=[([0-9]+)\.([0-9]+)\.([0-9]+)]=]) + set (CUDA_VERSION_PATCH "${CMAKE_MATCH_3}") + elseif (CUDA_NVCC_EXECUTABLE) + execute_process (COMMAND ${CUDA_NVCC_EXECUTABLE} "--version" OUTPUT_VARIABLE NOUT) + if (NOUT MATCHES [=[ V([0-9]+)\.([0-9]+)\.([0-9]+)]=]) + set (CUDA_VERSION_PATCH "${CMAKE_MATCH_3}") + endif () endif () - set (PBRT_CUDA_ENABLED ON) - - # FIXME - include_directories (${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) # for regular c++ compiles - - # http://www.ssl.berkeley.edu/~jimm/grizzly_docs/SSL/opt/intel/cc/9.0/lib/locale/en_US/mcpcom.msg - target_compile_options ( - pbrt_warnings - INTERFACE - #"$<$:SHELL:-Xptxas --warn-on-double-precision-use>" - "$<$:SHELL:-Xcudafe --diag_suppress=partial_override>" - "$<$:SHELL:-Xcudafe --diag_suppress=virtual_function_decl_hidden>" - "$<$:SHELL:-Xcudafe --diag_suppress=integer_sign_change>" - "$<$:SHELL:-Xcudafe --diag_suppress=declared_but_not_referenced>" - # WAR invalid warnings about this with "if constexpr" - "$<$:SHELL:-Xcudafe --diag_suppress=implicit_return_from_non_void_function>" - ) - - # Willie hears yeh.. - string (APPEND CMAKE_CUDA_FLAGS " -Xnvlink -suppress-stack-size-warning") - - target_compile_options ( - cuda_build_configuration - INTERFACE - "$<$:--std=c++17;--use_fast_math;--expt-relaxed-constexpr;--extended-lambda;--forward-unknown-to-host-compiler>" - # The "$>>" part is to not add debugging symbols when generating PTX files for OptiX; see https://github.com/mmp/pbrt-v4/issues/69#issuecomment-715499748. - "$<$:$,$>>>,-G;-g,-lineinfo;-maxrregcount;128>>" - ) - - if (PBRT_GPU_SHADER_MODEL STREQUAL "") - # https://wagonhelm.github.io/articles/2018-03/detecting-cuda-capability-with-cmake - # Get CUDA compute capability - set (CHECK_CUDA_OUTPUT_EXE ${CMAKE_BINARY_DIR}/checkcuda) - if (MSVC) - execute_process (COMMAND ${CMAKE_CUDA_COMPILER} -lcuda ${PBRT_SOURCE_CODE}/cmake/checkcuda.cu -ccbin ${CMAKE_CXX_COMPILER} -o ${CHECK_CUDA_OUTPUT_EXE} - RESULT_VARIABLE BUILD_CHECK_CUDA_RETURN_CODE) - else () - execute_process (COMMAND ${CMAKE_CUDA_COMPILER} -lcuda ${PBRT_SOURCE_CODE}/cmake/checkcuda.cu -o ${CHECK_CUDA_OUTPUT_EXE} - RESULT_VARIABLE BUILD_CHECK_CUDA_RETURN_CODE) - endif () + endif () - if (NOT ${BUILD_CHECK_CUDA_RETURN_CODE} EQUAL 0) - message (SEND_ERROR "Was unable to build checkcuda, consider manually setting PBRT_GPU_SHADER_MODEL") - endif () + message (STATUS "Found CUDA: ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR}.${CUDA_VERSION_PATCH}") + if (CUDA_VERSION_MAJOR EQUAL 11 AND CUDA_VERSION_MINOR EQUAL 3 AND CUDA_VERSION_PATCH LESS 109) + message (SEND_ERROR "Unfortunately, pbrt-v4 triggers an internal compiler error in CUDA 11.3.0. Please either use CUDA 11.0-11.2 or 11.3.1 or later.") + endif () - execute_process (COMMAND ${CHECK_CUDA_OUTPUT_EXE} - RESULT_VARIABLE CUDA_RETURN_CODE - OUTPUT_VARIABLE CHECK_CUDA_OUTPUT) + if ("${PBRT_OPTIX7_PATH}" STREQUAL "") + message (WARNING "Found CUDA but PBRT_OPTIX7_PATH is not set. Disabling GPU compilation.") + else () + enable_language (CUDA) + list (APPEND PBRT_DEFINITIONS "PBRT_BUILD_GPU_RENDERER") + if (PBRT_NVTX) + list (APPEND PBRT_DEFINITIONS "NVTX") + endif () + if (PBRT_NVML) + list (APPEND PBRT_DEFINITIONS "PBRT_USE_NVML") + endif () + set (PBRT_CUDA_ENABLED ON) + + # FIXME + include_directories (${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) # for regular c++ compiles + + # http://www.ssl.berkeley.edu/~jimm/grizzly_docs/SSL/opt/intel/cc/9.0/lib/locale/en_US/mcpcom.msg + target_compile_options ( + pbrt_warnings + INTERFACE + #"$<$:SHELL:-Xptxas --warn-on-double-precision-use>" + "$<$:SHELL:-Xcudafe --diag_suppress=partial_override>" + "$<$:SHELL:-Xcudafe --diag_suppress=virtual_function_decl_hidden>" + "$<$:SHELL:-Xcudafe --diag_suppress=integer_sign_change>" + "$<$:SHELL:-Xcudafe --diag_suppress=declared_but_not_referenced>" + # WAR invalid warnings about this with "if constexpr" + "$<$:SHELL:-Xcudafe --diag_suppress=implicit_return_from_non_void_function>" + ) - if (NOT ${CUDA_RETURN_CODE} EQUAL 0) - message (SEND_ERROR ${CHECK_CUDA_OUTPUT}) - else () - set(ARCH "${CHECK_CUDA_OUTPUT}") - message (STATUS "Detected CUDA Architecture: ${ARCH}") - string (APPEND CMAKE_CUDA_FLAGS " --gpu-architecture=${ARCH}") - endif () - else () - set(ARCH "${PBRT_GPU_SHADER_MODEL}") - message (STATUS "Specified CUDA Architecture: ${ARCH}") - string (APPEND CMAKE_CUDA_FLAGS " --gpu-architecture=${ARCH}") - endif () + # Willie hears yeh.. + string (APPEND CMAKE_CUDA_FLAGS " -Xnvlink -suppress-stack-size-warning") - set (PBRT_CUDA_LIB cuda) - # optix - # FIXME - include_directories (${PBRT_OPTIX7_PATH}/include) - - # find CUDA's bin2c executable - get_filename_component (cuda_compiler_bin "${CMAKE_CUDA_COMPILER}" DIRECTORY) - find_program (BIN2C - NAMES bin2c - PATHS ${cuda_compiler_bin} - DOC "Path to the CUDA SDK bin2c executable." - NO_DEFAULT_PATH) - if (NOT BIN2C) - message (FATAL_ERROR - "bin2c not found:\n" - " CMAKE_CUDA_COMPILER='${CMAKE_CUDA_COMPILER}'\n" - " cuda_compiler_bin='${cuda_compiler_bin}'\n" - ) - endif () + target_compile_options ( + cuda_build_configuration + INTERFACE + "$<$:--std=c++17;--use_fast_math;--expt-relaxed-constexpr;--extended-lambda;--forward-unknown-to-host-compiler>" + # The "$>>" part is to not add debugging symbols when generating PTX files for OptiX; see https://github.com/mmp/pbrt-v4/issues/69#issuecomment-715499748. + "$<$:$,$>>>,-G;-g,-lineinfo;-maxrregcount;128>>" + ) - # this macro defines cmake rules that execute the following four steps: - # 1) compile the given cuda file ${cuda_file} to an intermediary PTX file - # 2) use the 'bin2c' tool (that comes with CUDA) to - # create a second intermediary (.c-)file which defines a const string variable - # (named '${c_var_name}') whose (constant) value is the PTX output - # from the previous step. - # 3) compile the given .c file to an intermediary object file (why thus has - # that PTX string 'embedded' as a global constant. - # 4) assign the name of the intermediary .o file to the cmake variable - # 'output_var', which can then be added to cmake targets. - macro (cuda_compile_and_embed output_var cuda_file lib_name) - add_library ("${lib_name}" OBJECT "${cuda_file}") - set_property (TARGET "${lib_name}" PROPERTY CUDA_PTX_COMPILATION ON) - - # disable "extern declaration... is treated as a static definition" warning - if (CUDA_VERSION_MAJOR EQUAL 11 AND CUDA_VERSION_MINOR LESS 2) - target_compile_options ("${lib_name}" PRIVATE - -Xcudafe=--display_error_number -Xcudafe=--diag_suppress=3089) + if (PBRT_GPU_SHADER_MODEL STREQUAL "") + # https://wagonhelm.github.io/articles/2018-03/detecting-cuda-capability-with-cmake + # Get CUDA compute capability + set (CHECK_CUDA_OUTPUT_EXE ${CMAKE_BINARY_DIR}/checkcuda) + if (MSVC) + execute_process (COMMAND ${CMAKE_CUDA_COMPILER} -lcuda ${PBRT_SOURCE_CODE}/cmake/checkcuda.cu -ccbin ${CMAKE_CXX_COMPILER} -o ${CHECK_CUDA_OUTPUT_EXE} + RESULT_VARIABLE BUILD_CHECK_CUDA_RETURN_CODE) + else () + execute_process (COMMAND ${CMAKE_CUDA_COMPILER} -lcuda ${PBRT_SOURCE_CODE}/cmake/checkcuda.cu -o ${CHECK_CUDA_OUTPUT_EXE} + RESULT_VARIABLE BUILD_CHECK_CUDA_RETURN_CODE) + endif () + + if (NOT ${BUILD_CHECK_CUDA_RETURN_CODE} EQUAL 0) + message (SEND_ERROR "Was unable to build checkcuda, consider manually setting PBRT_GPU_SHADER_MODEL") + endif () + + execute_process (COMMAND ${CHECK_CUDA_OUTPUT_EXE} + RESULT_VARIABLE CUDA_RETURN_CODE + OUTPUT_VARIABLE CHECK_CUDA_OUTPUT) + + if (NOT ${CUDA_RETURN_CODE} EQUAL 0) + message (SEND_ERROR ${CHECK_CUDA_OUTPUT}) + else () + set(ARCH "${CHECK_CUDA_OUTPUT}") + message (STATUS "Detected CUDA Architecture: ${ARCH}") + string (APPEND CMAKE_CUDA_FLAGS " --gpu-architecture=${ARCH}") + endif () else () - target_compile_options ("${lib_name}" PRIVATE - -Xcudafe=--display_error_number -Xcudafe=--diag_suppress=20044) + set(ARCH "${PBRT_GPU_SHADER_MODEL}") + message (STATUS "Specified CUDA Architecture: ${ARCH}") + string (APPEND CMAKE_CUDA_FLAGS " --gpu-architecture=${ARCH}") endif () - # CUDA integration in Visual Studio seems broken as even if "Use - # Host Preprocessor Definitions" is checked, the host preprocessor - # definitions are still not used when compiling device code. - # To work around that, define the macros using --define-macro to - # avoid CMake identifying those as macros and using the proper (but - # broken) way of specifying them. - if (${CMAKE_GENERATOR} MATCHES "^Visual Studio") - # As PBRT_DEBUG_BUILD is specified globally as a definition, we need to - # manually add it due to the bug mentioned earlier and due to it - # not being found in PBRT_DEFINITIONS. - if (CMAKE_BUILD_TYPE MATCHES Debug) - set (cuda_definitions "--define-macro=PBRT_DEBUG_BUILD") - endif () - foreach (arg ${PBRT_DEFINITIONS}) - list (APPEND cuda_definitions "--define-macro=${arg}") - endforeach () - target_compile_options ("${lib_name}" PRIVATE ${cuda_definitions}) - else () - target_compile_definitions ("${lib_name}" PRIVATE ${PBRT_DEFINITIONS}) + set (PBRT_CUDA_LIB cuda) + # optix + # FIXME + include_directories (${PBRT_OPTIX7_PATH}/include) + + # find CUDA's bin2c executable + get_filename_component (cuda_compiler_bin "${CMAKE_CUDA_COMPILER}" DIRECTORY) + find_program (BIN2C + NAMES bin2c + PATHS ${cuda_compiler_bin} + DOC "Path to the CUDA SDK bin2c executable." + NO_DEFAULT_PATH) + if (NOT BIN2C) + message (FATAL_ERROR + "bin2c not found:\n" + " CMAKE_CUDA_COMPILER='${CMAKE_CUDA_COMPILER}'\n" + " cuda_compiler_bin='${cuda_compiler_bin}'\n" + ) endif () - target_include_directories ("${lib_name}" PRIVATE src ${CMAKE_BINARY_DIR}) - target_include_directories ("${lib_name}" SYSTEM PRIVATE ${NANOVDB_INCLUDE}) - target_link_libraries ("${lib_name}" PRIVATE cuda_build_configuration pbrt_opt pbrt_warnings) - add_dependencies ("${lib_name}" pbrt_soa_generated) - set (c_var_name ${output_var}) - set (embedded_file ${cuda_file}.ptx_embedded.c) - add_custom_command ( - OUTPUT "${embedded_file}" - COMMAND ${CMAKE_COMMAND} - "-DBIN_TO_C_COMMAND=${BIN2C}" - "-DOBJECTS=$" - "-DVAR_NAME=${c_var_name}" - "-DOUTPUT=${embedded_file}" - -P ${CMAKE_CURRENT_SOURCE_DIR}/cmake/bin2c_wrapper.cmake - VERBATIM - DEPENDS "${lib_name}" $ - COMMENT "Embedding PTX generated from ${cuda_file}" - ) - set (${output_var} ${embedded_file}) - endmacro () - endif () - endif() -else () - message (STATUS "CUDA not found") + + # this macro defines cmake rules that execute the following four steps: + # 1) compile the given cuda file ${cuda_file} to an intermediary PTX file + # 2) use the 'bin2c' tool (that comes with CUDA) to + # create a second intermediary (.c-)file which defines a const string variable + # (named '${c_var_name}') whose (constant) value is the PTX output + # from the previous step. + # 3) compile the given .c file to an intermediary object file (why thus has + # that PTX string 'embedded' as a global constant. + # 4) assign the name of the intermediary .o file to the cmake variable + # 'output_var', which can then be added to cmake targets. + macro (cuda_compile_and_embed output_var cuda_file lib_name) + add_library ("${lib_name}" OBJECT "${cuda_file}") + set_property (TARGET "${lib_name}" PROPERTY CUDA_PTX_COMPILATION ON) + + # disable "extern declaration... is treated as a static definition" warning + if (CUDA_VERSION_MAJOR EQUAL 11 AND CUDA_VERSION_MINOR LESS 2) + target_compile_options ("${lib_name}" PRIVATE + -Xcudafe=--display_error_number -Xcudafe=--diag_suppress=3089) + else () + target_compile_options ("${lib_name}" PRIVATE + -Xcudafe=--display_error_number -Xcudafe=--diag_suppress=20044) + endif () + + # CUDA integration in Visual Studio seems broken as even if "Use + # Host Preprocessor Definitions" is checked, the host preprocessor + # definitions are still not used when compiling device code. + # To work around that, define the macros using --define-macro to + # avoid CMake identifying those as macros and using the proper (but + # broken) way of specifying them. + if (${CMAKE_GENERATOR} MATCHES "^Visual Studio") + # As PBRT_DEBUG_BUILD is specified globally as a definition, we need to + # manually add it due to the bug mentioned earlier and due to it + # not being found in PBRT_DEFINITIONS. + if (CMAKE_BUILD_TYPE MATCHES Debug) + set (cuda_definitions "--define-macro=PBRT_DEBUG_BUILD") + endif () + foreach (arg ${PBRT_DEFINITIONS}) + list (APPEND cuda_definitions "--define-macro=${arg}") + endforeach () + target_compile_options ("${lib_name}" PRIVATE ${cuda_definitions}) + else () + target_compile_definitions ("${lib_name}" PRIVATE ${PBRT_DEFINITIONS}) + endif () + target_include_directories ("${lib_name}" PRIVATE src ${CMAKE_BINARY_DIR}) + target_include_directories ("${lib_name}" SYSTEM PRIVATE ${NANOVDB_INCLUDE}) + target_link_libraries ("${lib_name}" PRIVATE cuda_build_configuration pbrt_opt pbrt_warnings) + add_dependencies ("${lib_name}" pbrt_soa_generated) + set (c_var_name ${output_var}) + set (embedded_file ${cuda_file}.ptx_embedded.c) + add_custom_command ( + OUTPUT "${embedded_file}" + COMMAND ${CMAKE_COMMAND} + "-DBIN_TO_C_COMMAND=${BIN2C}" + "-DOBJECTS=$" + "-DVAR_NAME=${c_var_name}" + "-DOUTPUT=${embedded_file}" + -P ${CMAKE_CURRENT_SOURCE_DIR}/cmake/bin2c_wrapper.cmake + VERBATIM + DEPENDS "${lib_name}" $ + COMMENT "Embedding PTX generated from ${cuda_file}" + ) + set (${output_var} ${embedded_file}) + endmacro () + endif () + endif() + else () + message (STATUS "CUDA not found") + endif () endif () ########################################################################### @@ -363,10 +431,12 @@ endif () include (CheckCXXCompilerFlag) -check_cxx_compiler_flag ("-march=native" COMPILER_SUPPORTS_MARCH_NATIVE) -if (COMPILER_SUPPORTS_MARCH_NATIVE AND PBRT_BUILD_NATIVE_EXECUTABLE) - target_compile_options (pbrt_opt INTERFACE - "$<$:SHELL:-Xcompiler >-march=native") +if (NOT PBRT_HIP_ENABLED) + check_cxx_compiler_flag ("-march=native" COMPILER_SUPPORTS_MARCH_NATIVE) + if (COMPILER_SUPPORTS_MARCH_NATIVE AND PBRT_BUILD_NATIVE_EXECUTABLE) + target_compile_options (pbrt_opt INTERFACE + "$<$:SHELL:-Xcompiler >-march=native") + endif () endif () if (CMAKE_CXX_COMPILER_ID STREQUAL "Intel") @@ -692,85 +762,72 @@ SET (PBRT_UTIL_SOURCE_HEADERS src/pbrt/util/vecmath.h ) -if (PBRT_CUDA_ENABLED) - set (PBRT_GPU_SOURCE - src/pbrt/gpu/optix/aggregate.cpp - src/pbrt/gpu/optix/denoiser.cpp - src/pbrt/gpu/memory.cpp - src/pbrt/gpu/util.cpp - ) - set (PBRT_GPU_SOURCE_HEADERS - src/pbrt/gpu/optix/aggregate.h - src/pbrt/gpu/cudagl.h - src/pbrt/gpu/optix/denoiser.h - src/pbrt/gpu/memory.h - src/pbrt/gpu/optix/optix.h - src/pbrt/gpu/util.h - ) - - set_source_files_properties ( - src/pbrt/bsdf.cpp - src/pbrt/bssrdf.cpp - src/pbrt/bxdfs.cpp - src/pbrt/cameras.cpp - src/pbrt/film.cpp - src/pbrt/filters.cpp -# src/pbrt/genscene.cpp - src/pbrt/interaction.cpp - src/pbrt/lights.cpp - src/pbrt/lightsamplers.cpp - src/pbrt/materials.cpp -# src/pbrt/media.cpp - src/pbrt/options.cpp -# src/pbrt/paramdict.cpp -# src/pbrt/parser.cpp - src/pbrt/pbrt.cpp - src/pbrt/samplers.cpp - src/pbrt/shapes.cpp - src/pbrt/textures.cpp - - src/pbrt/util/bluenoise.cpp - src/pbrt/util/check.cpp - src/pbrt/util/color.cpp - src/pbrt/util/colorspace.cpp - src/pbrt/util/error.cpp -# src/pbrt/util/file.cpp -# src/pbrt/util/float.cpp -# src/pbrt/util/image.cpp - src/pbrt/util/log.cpp -# src/pbrt/util/loopsubdiv.cpp - src/pbrt/util/lowdiscrepancy.cpp - src/pbrt/util/math.cpp -# src/pbrt/util/memory.cpp - src/pbrt/util/mesh.cpp -# src/pbrt/util/mipmap.cpp - src/pbrt/util/noise.cpp -# src/pbrt/util/parallel.cpp - src/pbrt/util/pmj02tables.cpp - src/pbrt/util/primes.cpp -# src/pbrt/util/print.cpp -# src/pbrt/util/progressreporter.cpp - src/pbrt/util/pstd.cpp - src/pbrt/util/rng.cpp - src/pbrt/util/sampling.cpp - src/pbrt/util/scattering.cpp - src/pbrt/util/sobolmatrices.cpp - src/pbrt/util/spectrum.cpp - src/pbrt/util/stats.cpp -# src/pbrt/util/stbimage.cpp -# src/pbrt/util/string.cpp - src/pbrt/util/transform.cpp - src/pbrt/util/vecmath.cpp +if (PBRT_CUDA_ENABLED OR PBRT_HIP_ENABLED) + if (PBRT_HIP_ENABLED) + set (PBRT_GPU_SOURCE src/pbrt/gpu/hiprt/aggregate.cpp src/pbrt/gpu/memory.cpp src/pbrt/gpu/util.cpp) + set (PBRT_GPU_SOURCE_HEADERS src/pbrt/gpu/hiprt/aggregate.h src/pbrt/gpu/common.h src/pbrt/gpu/cudagl.h src/pbrt/gpu/memory.h src/pbrt/gpu/util.h) + else () + set (PBRT_GPU_SOURCE src/pbrt/gpu/optix/aggregate.cpp src/pbrt/gpu/optix/denoiser.cpp src/pbrt/gpu/memory.cpp src/pbrt/gpu/util.cpp) + set (PBRT_GPU_SOURCE_HEADERS src/pbrt/gpu/optix/aggregate.h src/pbrt/gpu/optix/denoiser.h src/pbrt/gpu/common.h src/pbrt/gpu/cudagl.h src/pbrt/gpu/memory.h src/pbrt/gpu/util.h) + endif () +set(PBRT_GPU_SOURCE_ALL + src/pbrt/bsdf.cpp + src/pbrt/bssrdf.cpp + src/pbrt/bxdfs.cpp + src/pbrt/cameras.cpp + src/pbrt/film.cpp + src/pbrt/filters.cpp + # src/pbrt/genscene.cpp + src/pbrt/interaction.cpp + src/pbrt/lights.cpp + src/pbrt/lightsamplers.cpp + src/pbrt/materials.cpp + # src/pbrt/media.cpp + src/pbrt/options.cpp + # src/pbrt/paramdict.cpp src/pbrt/parser.cpp + src/pbrt/pbrt.cpp + src/pbrt/samplers.cpp + src/pbrt/shapes.cpp + src/pbrt/textures.cpp + src/pbrt/util/bluenoise.cpp + src/pbrt/util/check.cpp + src/pbrt/util/color.cpp + src/pbrt/util/colorspace.cpp + src/pbrt/util/error.cpp + # src/pbrt/util/file.cpp src/pbrt/util/float.cpp src/pbrt/util/image.cpp + src/pbrt/util/log.cpp + # src/pbrt/util/loopsubdiv.cpp + src/pbrt/util/lowdiscrepancy.cpp + src/pbrt/util/math.cpp + # src/pbrt/util/memory.cpp + src/pbrt/util/mesh.cpp + # src/pbrt/util/mipmap.cpp + src/pbrt/util/noise.cpp + # src/pbrt/util/parallel.cpp + src/pbrt/util/pmj02tables.cpp + src/pbrt/util/primes.cpp + # src/pbrt/util/print.cpp src/pbrt/util/progressreporter.cpp + src/pbrt/util/pstd.cpp + src/pbrt/util/rng.cpp + src/pbrt/util/sampling.cpp + src/pbrt/util/scattering.cpp + src/pbrt/util/sobolmatrices.cpp + src/pbrt/util/spectrum.cpp + src/pbrt/util/stats.cpp + # src/pbrt/util/stbimage.cpp src/pbrt/util/string.cpp + src/pbrt/util/transform.cpp + src/pbrt/util/vecmath.cpp ${PBRT_WAVEFRONT_SOURCE} ${PBRT_GPU_SOURCE} + src/pbrt/cmd/pspec_gpu.cpp) - src/pbrt/cmd/pspec_gpu.cpp - - PROPERTIES LANGUAGE CUDA - ) - - cuda_compile_and_embed (PBRT_EMBEDDED_PTX src/pbrt/gpu/optix/optix.cu optix.cu) + if(PBRT_HIP_ENABLED) + set_source_files_properties(${PBRT_GPU_SOURCE_ALL} PROPERTIES LANGUAGE CXX) + else() + set_source_files_properties(${PBRT_GPU_SOURCE_ALL} PROPERTIES LANGUAGE CUDA) + cuda_compile_and_embed(PBRT_EMBEDDED_PTX src/pbrt/gpu/optix/optix.cu optix.cu) + endif() endif () source_group ("Source Files" FILES ${PBRT_SOURCE}) @@ -781,7 +838,7 @@ source_group ("Source Files/util" FILES ${PBRT_UTIL_SOURCE}) source_group ("Header Files/util" FILES ${PBRT_UTIL_SOURCE_HEADERS}) source_group ("Source Files/wavefront" FILES ${PBRT_WAVEFRONT_SOURCE}) source_group ("Header Files/wavefront" FILES ${PBRT_WAVEFRONT_SOURCE_HEADERS}) -if (PBRT_CUDA_ENABLED) +if (PBRT_CUDA_ENABLED OR PBRT_HIP_ENABLED) source_group ("Source Files/gpu" FILES ${PBRT_GPU_SOURCE}) source_group ("Header Files/gpu" FILES ${PBRT_GPU_SOURCE_HEADERS}) endif () @@ -875,7 +932,12 @@ endif () target_compile_options (pbrt_lib PUBLIC ${PBRT_CXX_FLAGS}) -target_link_libraries (pbrt_lib PRIVATE pbrt_warnings pbrt_opt $<$:cuda_build_configuration>) +if (PBRT_HIP_ENABLED) + target_link_libraries (pbrt_lib PUBLIC hip::device) + target_link_libraries (pbrt_lib PRIVATE pbrt_warnings pbrt_opt) +else () + target_link_libraries (pbrt_lib PRIVATE pbrt_warnings pbrt_opt $<$:cuda_build_configuration>) +endif() add_sanitizers (pbrt_lib) @@ -910,6 +972,84 @@ if (PBRT_CUDA_ENABLED) endif () endif () +if (PBRT_HIP_ENABLED) + list (APPEND ALL_PBRT_LIBS ${PBRT_HIPRT_LIB_NAME}) + if (CMAKE_SYSTEM_NAME STREQUAL Windows) + set(HIPRT_BUILD_CMD "call ${PBRT_HIP_PATH}/bin/hipcc.bat") + string (APPEND HIPRT_BUILD_CMD " -DPBRT_IS_WINDOWS") + string (APPEND HIPRT_BUILD_CMD " -DNOMINMAX") + string (APPEND HIPRT_BUILD_CMD " -DPBRT_IS_MSVC") + string (APPEND HIPRT_BUILD_CMD " -D_CRT_SECURE_NO_WARNINGS") + string (APPEND HIPRT_BUILD_CMD " -D_ENABLE_EXTENDED_ALIGNED_STORAGE") + set(HIPRT_LINK_CMD "call ${PBRT_HIP_PATH}/bin/clang++.exe") + elseif (CMAKE_SYSTEM_NAME STREQUAL Linux) + set(HIPRT_BUILD_CMD "hipcc") + string (APPEND HIPRT_BUILD_CMD " -DPBRT_IS_LINUX") + set(HIPRT_LINK_CMD "/opt/rocm/bin/amdclang++" ) + endif () + foreach (TARGET IN LISTS AMDGPU_TARGETS) + message (STATUS "Current item: ${item}") + string (APPEND HIPRT_BUILD_CMD " --offload-arch=") + string (APPEND HIPRT_BUILD_CMD ${TARGET}) + string (APPEND HIPRT_LINK_CMD " --offload-arch=") + string (APPEND HIPRT_LINK_CMD ${TARGET}) + endforeach () + string (APPEND HIPRT_BUILD_CMD " -I.") + string (APPEND HIPRT_BUILD_CMD " -I../src") + string (APPEND HIPRT_BUILD_CMD " -I${PBRT_HIPRT_PATH}") + string (APPEND HIPRT_BUILD_CMD " -I../src/ext/openvdb/nanovdb") + string (APPEND HIPRT_BUILD_CMD " -x hip") + string (APPEND HIPRT_BUILD_CMD " ../src/pbrt/gpu/hiprt/hiprt.cu") + string (APPEND HIPRT_BUILD_CMD " ../src/pbrt/util/sobolmatrices.cpp") + string (APPEND HIPRT_BUILD_CMD " ../src/pbrt/util/primes.cpp") + string (APPEND HIPRT_BUILD_CMD " ../src/pbrt/options.cpp") + string (APPEND HIPRT_BUILD_CMD " ../src/pbrt/shapes.cpp") + string (APPEND HIPRT_BUILD_CMD " -O3") + string (APPEND HIPRT_BUILD_CMD " -ffast-math") + string (APPEND HIPRT_BUILD_CMD " -std=c++17") + string (APPEND HIPRT_BUILD_CMD " --cuda-device-only") + string (APPEND HIPRT_BUILD_CMD " -fgpu-rdc -c") + string (APPEND HIPRT_BUILD_CMD " --gpu-bundle-output -c -emit-llvm") + string (APPEND HIPRT_BUILD_CMD " -DWARP_THREADS=32 ") + string (APPEND HIPRT_BUILD_CMD " -DPBRT_BUILD_GPU_RENDERER") + string (APPEND HIPRT_BUILD_CMD " -DPBRT_RESTRICT=__restrict__ ") + string (APPEND HIPRT_BUILD_CMD " -DBLOCK_SIZE=64") + string (APPEND HIPRT_BUILD_CMD " -DSHARED_STACK_SIZE=16") + string (APPEND HIPRT_LINK_CMD " -o hiprt.hipfb") + string (APPEND HIPRT_LINK_CMD " -fgpu-rdc") + string (APPEND HIPRT_LINK_CMD " --hip-link") + string (APPEND HIPRT_LINK_CMD " -Xoffload-linker") + string (APPEND HIPRT_LINK_CMD " --whole-archive") + string (APPEND HIPRT_LINK_CMD " --cuda-device-only") + string (APPEND HIPRT_LINK_CMD " ${PBRT_HIPRT_BITCODE_FILENAME}") + string (APPEND HIPRT_LINK_CMD " hiprt-hip-amdgcn-amd-amdhsa.bc") + string (APPEND HIPRT_LINK_CMD " primes-hip-amdgcn-amd-amdhsa.bc") + string (APPEND HIPRT_LINK_CMD " sobolmatrices-hip-amdgcn-amd-amdhsa.bc") + string (APPEND HIPRT_LINK_CMD " options-hip-amdgcn-amd-amdhsa.bc") + string (APPEND HIPRT_LINK_CMD " shapes-hip-amdgcn-amd-amdhsa.bc") + if (CMAKE_SYSTEM_NAME STREQUAL Windows) + add_custom_target ( + libdeflate_msvc_patch + COMMAND cd ${CMAKE_CURRENT_SOURCE_DIR}/src/ext/libdeflate && git apply ${CMAKE_CURRENT_SOURCE_DIR}/0001-MSVC-HIP.patch > nul 2> nul & exit 0 + ) + add_dependencies (pbrt_lib libdeflate_msvc_patch) + message (STATUS "Applying patch to libdeflate to support _BitScanReverse for Windows HIP build.") + file (WRITE "${CMAKE_CURRENT_BINARY_DIR}/hiprt.bat" "${HIPRT_BUILD_CMD}\n${HIPRT_LINK_CMD}\n") + add_custom_target ( + hiprt_kernels ALL + DEPENDS pbrt_lib + COMMAND ${CMAKE_CURRENT_BINARY_DIR}/hiprt.bat + COMMENT "Compiling HIPRT kernels...") + elseif (CMAKE_SYSTEM_NAME STREQUAL Linux) + file (WRITE "${CMAKE_CURRENT_BINARY_DIR}/hiprt.sh" "${HIPRT_BUILD_CMD}\n${HIPRT_LINK_CMD}\n") + add_custom_target ( + hiprt_kernels ALL + DEPENDS pbrt_lib + COMMAND chmod 777 ${CMAKE_CURRENT_BINARY_DIR}/hiprt.sh && ${CMAKE_CURRENT_BINARY_DIR}/hiprt.sh + COMMENT "Compiling HIPRT kernels...") + endif () +endif () + if (WIN32) list (APPEND ALL_PBRT_LIBS "dbghelp" "wsock32" "ws2_32") endif () @@ -956,6 +1096,7 @@ add_executable (pbrt::pbrt_exe ALIAS pbrt_exe) target_compile_definitions (pbrt_exe PRIVATE ${PBRT_DEFINITIONS}) target_compile_options (pbrt_exe PRIVATE ${PBRT_CXX_FLAGS}) +target_link_options (pbrt_exe PRIVATE ${PBRT_LINK_FLAGS}) target_include_directories (pbrt_exe PRIVATE src src/ext) target_link_libraries (pbrt_exe PRIVATE ${ALL_PBRT_LIBS} pbrt_opt pbrt_warnings) @@ -976,6 +1117,7 @@ set_property (TARGET sky_lib PROPERTY FOLDER "ext") target_compile_definitions (imgtool PRIVATE ${PBRT_DEFINITIONS}) target_compile_options (imgtool PRIVATE ${PBRT_CXX_FLAGS}) +target_link_options (imgtool PRIVATE ${PBRT_LINK_FLAGS}) target_include_directories (imgtool PRIVATE src src/ext ${FLIP_INCLUDE}) target_link_libraries (imgtool PRIVATE ${ALL_PBRT_LIBS} pbrt_opt pbrt_warnings sky_lib flip_lib) @@ -991,6 +1133,7 @@ add_executable (pbrt::pspec ALIAS pspec) target_compile_definitions (pspec PRIVATE ${PBRT_DEFINITIONS}) target_compile_options (pspec PRIVATE ${PBRT_CXX_FLAGS}) +target_link_options (pspec PRIVATE ${PBRT_LINK_FLAGS}) target_include_directories (pspec PRIVATE src src/ext) target_link_libraries (pspec PRIVATE ${ALL_PBRT_LIBS} pbrt_warnings) @@ -1006,6 +1149,7 @@ add_executable (pbrt::plytool ALIAS plytool) target_compile_definitions (plytool PRIVATE ${PBRT_DEFINITIONS}) target_compile_options (plytool PUBLIC ${PBRT_CXX_FLAGS}) +target_link_options (plytool PRIVATE ${PBRT_LINK_FLAGS}) target_include_directories (plytool PUBLIC src src/ext) target_link_libraries (plytool PRIVATE ${ALL_PBRT_LIBS} pbrt_warnings pbrt_opt) @@ -1025,6 +1169,7 @@ target_compile_definitions (nanovdb2pbrt PRIVATE ${PBRT_DEFINITIONS}) target_compile_options (nanovdb2pbrt PUBLIC ${PBRT_CXX_FLAGS}) target_include_directories (nanovdb2pbrt PUBLIC src src/ext) target_link_libraries (nanovdb2pbrt PRIVATE ${ALL_PBRT_LIBS} pbrt_warnings pbrt_opt) +target_link_options (nanovdb2pbrt PRIVATE ${PBRT_LINK_FLAGS}) set_target_properties (nanovdb2pbrt PROPERTIES OUTPUT_NAME nanovdb2pbrt) @@ -1087,7 +1232,8 @@ add_executable (pbrt_test src/pbrt/cmd/pbrt_test.cpp ${PBRT_TEST_SOURCE}) target_link_libraries (pbrt_test PRIVATE ${ALL_PBRT_LIBS} pbrt_opt pbrt_warnings) target_compile_definitions (pbrt_test PRIVATE ${PBRT_DEFINITIONS}) target_include_directories (pbrt_test PRIVATE src src/ext ${DOUBLE_CONVERSION_INCLUDE}) -target_compile_options(pbrt_test PUBLIC ${PBRT_CXX_FLAGS}) +target_compile_options (pbrt_test PUBLIC ${PBRT_CXX_FLAGS}) +target_link_options (pbrt_test PRIVATE ${PBRT_LINK_FLAGS}) add_sanitizers (pbrt_test) diff --git a/LICENSE.txt b/LICENSE.txt index d64569567..37965801b 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,3 +1,7 @@ +All of the modifications made in commit ID 14582eebabcf1ca33323e6b6d6757a49d2306dbf are covered by the MIT license (below). +The rest would be covered by the existing Apache license. + +--------------------------------------------------------- Apache License Version 2.0, January 2004 @@ -200,3 +204,28 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. + + +--------------------------------------------------------- + +The MIT License (MIT) + +Copyright (c) 2022-2024 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/README.md b/README.md index 0454dda15..d1d822297 100644 --- a/README.md +++ b/README.md @@ -228,3 +228,33 @@ is straightforward: ```bash $ imgtool denoise-optix noisy.exr --outfile denoised.exr ``` + +Instructions to build the HIP port +-------- + +Linux: +* Install [ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/native-install/ubuntu.html) +* Download and extract [HIPRT](https://gpuopen.com/hiprt/) + * Note that HIP is very sensitive to the difference between versions of HIPRT and the application on Linux + * If you encounter linking error, we recommend to compile HIPRT by yourself +* `cmake -DCMAKE_C_COMPILER=/opt/rocm/llvm/bin/clang -DCMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++ -DPBRT_HIPRT=ON -DPBRT_HIPRT_PATH=~/hiprtSdk ..` +* `make` or `make pbrt_exe` + +Windows: +* Download and install [HIP SDK](https://www.amd.com/en/developer/resources/rocm-hub/hip-sdk.html) + * Make sure that you checked HIPRT to be installed +* Download and install [Strawberry Perl](https://strawberryperl.com/) +* Run `x64 Native Tools Command Prompt for VS 2022` as administrator +* `set CC=clang` +* `set CXX=clang++` +* `mkdir build` and `cd build` +* `cmake -G "Ninja" -DCMAKE_BUILD_TYPE=Release -DPBRT_HIPRT=ON ..` +* `cmake --build . --config=Release` + +Example: `./pbrt --gpu ~/pbrt-v4-scenes/killeroos/killeroo-simple.pbrt` +Add `--interactive` for the interactive mode. + +GPU architecture and ROCm version +-------- + +The instructions above assume the `gfx1100` architecture. You can specify other architectures via the CMake `AMDGPU_TARGETS` variable. Note that PBRT should be compiled with the same version as HIPRT binaries (e.g., using ROCm 6.0 and `hiprt02004_6.0_amd_lib_linux.bc`). \ No newline at end of file diff --git a/src/ext/flip/flip.cpp b/src/ext/flip/flip.cpp index 0eedeed44..ba0f18541 100644 --- a/src/ext/flip/flip.cpp +++ b/src/ext/flip/flip.cpp @@ -50,6 +50,10 @@ #include #include +#ifndef M_PI +#define M_PI 3.14159265358979323846f +#endif + namespace flip_detail { class histogram diff --git a/src/ext/openvdb b/src/ext/openvdb index 414bed84c..385516321 160000 --- a/src/ext/openvdb +++ b/src/ext/openvdb @@ -1 +1 @@ -Subproject commit 414bed84c2fc22e188eac7b611aa85c7edd7a5a9 +Subproject commit 385516321b272c423e2a514b41bd21ce6ffdda72 diff --git a/src/pbrt/base/medium.h b/src/pbrt/base/medium.h index 698b04816..30500d006 100644 --- a/src/pbrt/base/medium.h +++ b/src/pbrt/base/medium.h @@ -88,7 +88,6 @@ class Medium std::string ToString() const; - PBRT_CPU_GPU bool IsEmissive() const; PBRT_CPU_GPU diff --git a/src/pbrt/cmd/imgtool.cpp b/src/pbrt/cmd/imgtool.cpp index a812fc7bb..e2c14dbf0 100644 --- a/src/pbrt/cmd/imgtool.cpp +++ b/src/pbrt/cmd/imgtool.cpp @@ -7,9 +7,9 @@ #include #include #ifdef PBRT_BUILD_GPU_RENDERER -#ifndef __HIP_PLATFORM_AMD__ +#ifdef __NVCC__ #include -#endif // __HIP_PLATFORM_AMD__ +#endif // __NVCC__ #include #endif // PBRT_BUILD_GPU_RENDERER #include @@ -2220,7 +2220,7 @@ int makeequiarea(std::vector args) { return 0; } -#ifdef PBRT_BUILD_GPU_RENDERER +#ifdef __NVCC__ int denoise_optix(std::vector args) { std::string inFilename, outFilename; @@ -2335,7 +2335,7 @@ int denoise_optix(std::vector args) { return 0; } -#endif // PBRT_BUILD_GPU_RENDERER +#endif // __NVCC__ int main(int argc, char *argv[]) { PBRTOptions opt; @@ -2362,10 +2362,10 @@ int main(int argc, char *argv[]) { return convert(args); else if (cmd == "diff") return diff(args); -#ifdef PBRT_BUILD_GPU_RENDERER +#ifdef __NVCC__ else if (cmd == "denoise-optix") return denoise_optix(args); -#endif // PBRT_BUILD_GPU_RENDERER +#endif // __NVCC__ else if (cmd == "error") return error(args); else if (cmd == "falsecolor") diff --git a/src/pbrt/cmd/pbrt_test.cpp b/src/pbrt/cmd/pbrt_test.cpp index 4bd7b23d3..3789dae8b 100644 --- a/src/pbrt/cmd/pbrt_test.cpp +++ b/src/pbrt/cmd/pbrt_test.cpp @@ -2,6 +2,7 @@ // The pbrt source code is licensed under the Apache License, Version 2.0. // SPDX: Apache-2.0 +#include #include #include @@ -9,7 +10,6 @@ #include #include -#include #include using namespace pbrt; diff --git a/src/pbrt/cmd/pspec_gpu.cpp b/src/pbrt/cmd/pspec_gpu.cpp index ce5b1a1f3..5cc0ffa00 100644 --- a/src/pbrt/cmd/pspec_gpu.cpp +++ b/src/pbrt/cmd/pspec_gpu.cpp @@ -16,8 +16,12 @@ #include #include +#if defined(__HIPCC__) +#include +#else #include #include +#endif #include diff --git a/src/pbrt/gpu/common.h b/src/pbrt/gpu/common.h new file mode 100644 index 000000000..7451d958d --- /dev/null +++ b/src/pbrt/gpu/common.h @@ -0,0 +1,111 @@ +#ifndef PBRT_GPU_COMMON_H +#define PBRT_GPU_COMMON_H + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(__HIPCC__) +#include +#include +#else +#include +#endif + +namespace pbrt { + +class TriangleMesh; +class BilinearPatchMesh; + +struct TriangleMeshRecord { + const TriangleMesh *mesh; + Material material; + FloatTexture alphaTexture; + pstd::span areaLights; + MediumInterface *mediumInterface; +}; + +struct BilinearMeshRecord { + const BilinearPatchMesh *mesh; + Material material; + FloatTexture alphaTexture; + pstd::span areaLights; + MediumInterface *mediumInterface; +}; + +struct QuadricRecord { + Shape shape; + Material material; + FloatTexture alphaTexture; + Light areaLight; + MediumInterface *mediumInterface; +}; + +#if defined(__HIP_PLATFORM_AMD__) +static constexpr size_t HitgroupAlignment = 16u; + +struct alignas(HitgroupAlignment) HitgroupRecord { + PBRT_CPU_GPU HitgroupRecord() {} + PBRT_CPU_GPU HitgroupRecord(const HitgroupRecord &r) { + memcpy(this, &r, sizeof(HitgroupRecord)); + } + PBRT_CPU_GPU HitgroupRecord &operator=(const HitgroupRecord &r) { + if (this != &r) + memcpy(this, &r, sizeof(HitgroupRecord)); + return *this; + } + + union { + TriangleMeshRecord triRec; + BilinearMeshRecord blpRec; + QuadricRecord quadricRec; + }; + enum { TriangleMesh, BilinearMesh, Quadric } type; +}; +#endif + +struct RayIntersectParameters { +#if defined(__HIPCC__) + hiprtScene traversable; +#else + OptixTraversableHandle traversable; +#endif + + const RayQueue *rayQueue; + + // Closest hit + RayQueue *nextRayQueue; + EscapedRayQueue *escapedRayQueue; + HitAreaLightQueue *hitAreaLightQueue; + MaterialEvalQueue *basicEvalMaterialQueue, *universalEvalMaterialQueue; + MediumSampleQueue *mediumSampleQueue; + + // Shadow rays + ShadowRayQueue *shadowRayQueue; + SOA pixelSampleState; + + // Subsurface scattering... + SubsurfaceScatterQueue *subsurfaceScatterQueue; + +#if defined(__HIPCC__) + // Stack buffers + hiprtGlobalStackBuffer globalStackBuffer; + hiprtGlobalStackBuffer globalInstanceStackBuffer; + // Custom function table + hiprtFuncTable funcTable; + // Hitgroup records + HitgroupRecord *hgRecords; + // Offsets for hitgroup records + uint32_t *offsets; +#endif +}; +} // namespace pbrt + +#endif // PBRT_GPU_COMMON_H diff --git a/src/pbrt/gpu/cudagl.h b/src/pbrt/gpu/cudagl.h index 2695844f2..d04dd6351 100644 --- a/src/pbrt/gpu/cudagl.h +++ b/src/pbrt/gpu/cudagl.h @@ -34,9 +34,13 @@ #include +#if defined(__HIPCC__) +#include +#else #include #include #include +#endif #define GL_CHECK(call) \ do { \ @@ -370,6 +374,15 @@ CUDAOutputBuffer::CUDAOutputBuffer(int32_t width, int32_t height) nullptr, GL_STREAM_DRAW)); GL_CHECK(glBindBuffer(GL_ARRAY_BUFFER, 0u)); +#ifdef __HIPCC__ + uint32_t num_gl_devices = 0; + + int glDevice; + cudaGLGetDevices(&num_gl_devices, &glDevice, 1, cudaGLDeviceListAll); + + if (glDevice != current_device) + LOG_FATAL("Multi-GPU not supported with GL interop yet"); +#endif CUDA_CHECK(cudaGraphicsGLRegisterBuffer(&m_cuda_gfx_resource, m_pbo, cudaGraphicsMapFlagsWriteDiscard)); diff --git a/src/pbrt/gpu/hiprt/aggregate.cpp b/src/pbrt/gpu/hiprt/aggregate.cpp new file mode 100644 index 000000000..d99f2fa96 --- /dev/null +++ b/src/pbrt/gpu/hiprt/aggregate.cpp @@ -0,0 +1,1499 @@ +// pbrt is Copyright(c) 1998-2020 Matt Pharr, Wenzel Jakob, and Greg Humphreys. +// The pbrt source code is licensed under the Apache License, Version 2.0. +// SPDX: Apache-2.0 + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#define HIPRT_CHECK(EXPR) \ + do { \ + hiprtError res = EXPR; \ + if (res != hiprtSuccess) \ + LOG_FATAL("HIPRT call " #EXPR " failed with code %d", int(res)); \ + } while (false) /* eat semicolon */ + +namespace pbrt { + +STAT_MEMORY_COUNTER("Memory/Acceleration structures", gpuBVHBytes); +STAT_COUNTER("Geometry/Triangles added from displacement mapping", displacedTrisDelta); +STAT_COUNTER("Geometry/Curves", nCurves); +STAT_COUNTER("Geometry/Bilinear patches created for diced curves", nBLPsForCurves); + +static Material getMaterial(const ShapeSceneEntity &shape, + const std::map &namedMaterials, + const std::vector &materials) { + if (!shape.materialName.empty()) { + auto iter = namedMaterials.find(shape.materialName); + if (iter == namedMaterials.end()) + ErrorExit(&shape.loc, "%s: material not defined", shape.materialName); + return iter->second; + } else { + CHECK_NE(shape.materialIndex, -1); + return materials[shape.materialIndex]; + } +} + +static FloatTexture getAlphaTexture( + const ShapeSceneEntity &shape, + const std::map &floatTextures, Allocator alloc) { + FloatTexture alphaTexture; + + std::string alphaTexName = shape.parameters.GetTexture("alpha"); + if (alphaTexName.empty()) { + if (Float alpha = shape.parameters.GetOneFloat("alpha", 1.f); alpha < 1.f) + alphaTexture = alloc.new_object(alpha); + else + return nullptr; + } else { + auto iter = floatTextures.find(alphaTexName); + if (iter == floatTextures.end()) + ErrorExit(&shape.loc, "%s: alpha texture not defined.", alphaTexName); + + alphaTexture = iter->second; + } + + if (!BasicTextureEvaluator().CanEvaluate({alphaTexture}, {})) { + // It would be nice to just use the UniversalTextureEvaluator (maybe + // always), but optix complains "Error: Found call graph recursion"... + Warning(&shape.loc, + "%s: alpha texture too complex for BasicTextureEvaluator " + "(need fallback path). Ignoring for now.", + alphaTexName); + alphaTexture = nullptr; + } + + return alphaTexture; +} + +static MediumInterface *getMediumInterface(const ShapeSceneEntity &shape, + const std::map &media, + Allocator alloc) { + if (shape.insideMedium.empty() && shape.outsideMedium.empty()) + return nullptr; + + auto getMedium = [&](const std::string &name) -> Medium { + if (name.empty()) + return nullptr; + + auto iter = media.find(name); + if (iter == media.end()) + ErrorExit(&shape.loc, "%s: medium not defined", name); + return iter->second; + }; + + return alloc.new_object(getMedium(shape.insideMedium), + getMedium(shape.outsideMedium)); +} + +HiprtAggregate::HiprtAggregate( + const BasicScene &scene, CUDATrackedMemoryResource *memoryResource, + NamedTextures &textures, + const std::map *> &shapeIndexToAreaLights, + const std::map &media, + const std::map &namedMaterials, + const std::vector &materials, int maxQueueSize) + : memoryResource(memoryResource), cudaStream(nullptr), scene(nullptr) { + hipCtx_t cudaContext; + CUDA_CHECK(hipCtxGetCurrent(&cudaContext)); + CHECK(cudaContext != nullptr); + +#if defined(PBRT_IS_WINDOWS) + // On Windows, it is unfortunately necessary to disable + // multithreading here. The issue is that GPU managed memory can + // only be accessed by one of the CPU or the GPU at a time; the + // program crashes if this is restriction is violated. Thus, it's + // bad news if we are simultaneously, say, reading PLY files on + // the CPU and storing them in managed memory while an OptiX + // kernel is running on the GPU to build a BVH... (Issue #164). + if (Options->useGPU) + DisableThreadPool(); +#endif // PBRT_IS_WINDOWS + + ThreadLocal threadCUDAStreams([]() { + hipStream_t stream; + hipStreamCreate(&stream); + return stream; + }); + + paramsPool.resize(256); // should be plenty + for (ParamBufferState &ps : paramsPool) { + void *ptr; + CUDA_CHECK(hipMalloc(&ptr, sizeof(RayIntersectParameters))); + ps.ptr = (hipDeviceptr_t)ptr; + CUDA_CHECK(hipEventCreate(&ps.finishedEvent)); + CUDA_CHECK(hipHostMalloc(&ps.hostPtr, sizeof(RayIntersectParameters))); + } + + // Create HIPRT context + LOG_VERBOSE("Starting HIPRT initialization"); + + int current_device; + CUDA_CHECK(hipGetDevice(¤t_device)); + + hiprtContextCreationInput ctxInput; + ctxInput.ctxt = cudaContext; + ctxInput.device = current_device; + ctxInput.deviceType = hiprtDeviceAMD; + HIPRT_CHECK(hiprtCreateContext(HIPRT_API_VERSION, ctxInput, context)); + +#ifdef NDEBUG + hiprtSetLogLevel(hiprtLogLevelError); +#endif + LOG_VERBOSE("HIPRT version %d.%d.%x successfully initialized", HIPRT_MAJOR_VERSION, + HIPRT_MINOR_VERSION, HIPRT_PATCH_VERSION); + + // HIPRT module + module = compileHiprtModule(context); + + LOG_VERBOSE("Finished HIPRT initialization"); + + // Note: do not delete the pointers in threadBufferResources, since doing + // so would cause the memory they manage to be freed. + ThreadLocal threadAllocators([memoryResource]() { + pstd::pmr::monotonic_buffer_resource *resource = + new pstd::pmr::monotonic_buffer_resource(1024 * 1024, memoryResource); + return Allocator(resource); + }); + + /////////////////////////////////////////////////////////////////////////// + // Build top-level acceleration structures for non-instanced shapes + LOG_VERBOSE("Starting to create shapes and acceleration structures"); + for (const auto &shape : scene.shapes) + if (shape.name != "sphere" && shape.name != "cylinder" && shape.name != "disk" && + shape.name != "trianglemesh" && shape.name != "plymesh" && + shape.name != "loopsubdiv" && shape.name != "bilinearmesh" && + shape.name != "curve") + ErrorExit(&shape.loc, "%s: unknown shape", shape.name); + + LOG_VERBOSE("Starting to read PLY meshes"); + std::map plyMeshes = + PreparePLYMeshes(scene.shapes, textures.floatTextures); + LOG_VERBOSE("Finished reading PLY meshes"); + + LOG_VERBOSE("Starting to build geometries (BLAS)"); + geomGroups.resize(scene.instanceDefinitions.size() + 1); + + geomGroups[0].triGeomContainer = buildBVHForTriangles( + scene.shapes, plyMeshes, context, textures.floatTextures, namedMaterials, + materials, media, shapeIndexToAreaLights, threadAllocators, threadCUDAStreams); + + geomGroups[0].blpGeomContainer = buildBVHForBLPs( + scene.shapes, context, textures.floatTextures, namedMaterials, materials, media, + shapeIndexToAreaLights, threadAllocators, threadCUDAStreams); + + geomGroups[0].quadricGeomContainer = buildBVHForQuadrics( + scene.shapes, context, textures.floatTextures, namedMaterials, materials, media, + shapeIndexToAreaLights, threadAllocators, threadCUDAStreams); + LOG_VERBOSE("Finished building geometries (BLAS)"); + + /////////////////////////////////////////////////////////////////////////// + // Create instanced geometries + LOG_VERBOSE("Starting to build instanced geometries", + scene.instanceDefinitions.size()); + + InstanceContainer instanceContainer; + instanceContainer.appendRecords(geomGroups[0].triGeomContainer); + instanceContainer.appendRecords(geomGroups[0].blpGeomContainer); + instanceContainer.appendRecords(geomGroups[0].quadricGeomContainer); + + std::vector allInstanceNames; + for (const auto &def : scene.instanceDefinitions) + allInstanceNames.push_back(def.first); + + std::unordered_map instanceMap; + for (int i = 0; i < scene.instanceDefinitions.size(); ++i) { + InternedString name = allInstanceNames[i]; + auto iter = scene.instanceDefinitions.find(name); + CHECK(iter != scene.instanceDefinitions.end()); + const auto &def = *iter; + + if (!def.second->animatedShapes.empty()) + Warning("Ignoring %d animated shapes in instance \"%s\".", + def.second->animatedShapes.size(), def.first); + + std::map meshes = + PreparePLYMeshes(def.second->shapes, textures.floatTextures); + + GeometryGroup &geomGroup = geomGroups[i + 1]; + + geomGroup.triGeomContainer = buildBVHForTriangles( + def.second->shapes, meshes, context, textures.floatTextures, namedMaterials, + materials, media, {}, threadAllocators, threadCUDAStreams); + + geomGroup.blpGeomContainer = buildBVHForBLPs( + def.second->shapes, context, textures.floatTextures, namedMaterials, + materials, media, {}, threadAllocators, threadCUDAStreams); + + geomGroup.quadricGeomContainer = buildBVHForQuadrics( + def.second->shapes, context, textures.floatTextures, namedMaterials, + materials, media, {}, threadAllocators, threadCUDAStreams); + + instanceContainer.appendRecords(geomGroup.triGeomContainer); + instanceContainer.appendRecords(geomGroup.blpGeomContainer); + instanceContainer.appendRecords(geomGroup.quadricGeomContainer); + + meshes.clear(); + instanceMap[def.first] = geomGroup; + } + LOG_VERBOSE("Finished building instanced geometries"); + + /////////////////////////////////////////////////////////////////////////// + // Instancing + LOG_VERBOSE("Starting to build scene (TLAS)"); + hiprtFrameMatrix identity{}; + for (size_t i = 0; i < 3; ++i) + identity.matrix[i][i] = 1.0f; + + instanceContainer.insertInstance(geomGroups[0].triGeomContainer, identity); + instanceContainer.insertInstance(geomGroups[0].blpGeomContainer, identity); + instanceContainer.insertInstance(geomGroups[0].quadricGeomContainer, identity); + size_t nNoInstancedGeoms = instanceContainer.instances.size(); + + for (size_t i = 0; i < scene.instances.size(); ++i) { + const auto &sceneInstance = scene.instances[i]; + auto iter = instanceMap.find(sceneInstance.name); + + if (iter != instanceMap.end()) { + SquareMatrix<4> O2WTransfrom = sceneInstance.renderFromInstance->GetMatrix(); + SquareMatrix<4> W2OTransfrom = Inverse(O2WTransfrom).value(); + + hiprtFrameMatrix transform; + for (int j = 0; j < 3; ++j) + for (int k = 0; k < 4; ++k) + transform.matrix[j][k] = O2WTransfrom[j][k]; + + GeometryGroup &geomGroup = iter->second; + instanceContainer.insertInstance(geomGroup.triGeomContainer, transform); + instanceContainer.insertInstance(geomGroup.blpGeomContainer, transform); + instanceContainer.insertInstance(geomGroup.quadricGeomContainer, transform); + } + } + + this->scene = + buildBVHForInstances(instanceContainer.instances, instanceContainer.transforms, + context, threadAllocators, threadCUDAStreams); + LOG_VERBOSE("Finished building scene (TLAS)"); + + /////////////////////////////////////////////////////////////////////////// + // Bounds + hiprtFloat3 aabbMin, aabbMax; + HIPRT_CHECK(hiprtExportSceneAabb(context, this->scene, aabbMin, aabbMax)); + bounds.pMin = {aabbMin.x, aabbMin.y, aabbMin.z}; + bounds.pMax = {aabbMax.x, aabbMax.y, aabbMax.z}; + + /////////////////////////////////////////////////////////////////////////// + // Copy hitgroup records and offsets + hipStream_t buildStream = threadCUDAStreams.Get(); + CUDA_CHECK(hipMalloc(&hgRecords, + instanceContainer.hgRecords.size() * sizeof(HitgroupRecord))); + CUDA_CHECK(hipMemcpyAsync((void *)hgRecords, instanceContainer.hgRecords.data(), + sizeof(HitgroupRecord) * instanceContainer.hgRecords.size(), + hipMemcpyHostToDevice, buildStream)); + CUDA_CHECK(hipMalloc(&offsets, instanceContainer.offsets.size() * sizeof(uint32_t))); + CUDA_CHECK(hipMemcpyAsync((void *)offsets, instanceContainer.offsets.data(), + sizeof(uint32_t) * instanceContainer.offsets.size(), + hipMemcpyHostToDevice, buildStream)); + + /////////////////////////////////////////////////////////////////////////// + // Create stack buffers + constexpr uint32_t StackSize = 64; + hiprtGlobalStackBufferInput stackBufferInput{hiprtStackTypeGlobal, + hiprtStackEntryTypeInteger, StackSize, + (uint32_t)maxQueueSize}; + HIPRT_CHECK( + hiprtCreateGlobalStackBuffer(context, stackBufferInput, globalStackBuffer)); + + constexpr uint32_t InstanceStackSize = 1; + hiprtGlobalStackBufferInput instanceStackBufferInput{ + hiprtStackTypeGlobal, hiprtStackEntryTypeInstance, InstanceStackSize, + (uint32_t)maxQueueSize}; + HIPRT_CHECK(hiprtCreateGlobalStackBuffer(context, instanceStackBufferInput, + globalInstanceStackBuffer)); + +#if defined(PBRT_IS_WINDOWS) + if (Options->useGPU) + ReenableThreadPool(); +#endif // PBRT_IS_WINDOWS +} + +void HiprtAggregate::IntersectClosest(int maxRays, const RayQueue *rayQueue, + EscapedRayQueue *escapedRayQueue, + HitAreaLightQueue *hitAreaLightQueue, + MaterialEvalQueue *basicEvalMaterialQueue, + MaterialEvalQueue *universalEvalMaterialQueue, + MediumSampleQueue *mediumSampleQueue, + RayQueue *nextRayQueue) const { + std::pair events = + GetProfilerEvents("Trace closest hit rays"); + + hipEventRecord(events.first); + + if (scene) { + RayIntersectParameters params; + params.traversable = scene; + params.rayQueue = rayQueue; + params.nextRayQueue = nextRayQueue; + params.escapedRayQueue = escapedRayQueue; + params.hitAreaLightQueue = hitAreaLightQueue; + params.basicEvalMaterialQueue = basicEvalMaterialQueue; + params.universalEvalMaterialQueue = universalEvalMaterialQueue; + params.mediumSampleQueue = mediumSampleQueue; + params.globalStackBuffer = globalStackBuffer; + params.globalInstanceStackBuffer = globalInstanceStackBuffer; + params.funcTable = module.funcTable; + params.hgRecords = hgRecords; + params.offsets = offsets; + + ParamBufferState &pbs = getParamBuffer(params); + +#ifndef NDEBUG + LOG_VERBOSE("Launching intersect closest"); +#endif + + hipDeviceptr_t paramsPtr; + size_t paramsSize; + CUDA_CHECK( + hipModuleGetGlobal(¶msPtr, ¶msSize, module.hipModule, "paramBuffer")); + CUDA_CHECK(hipMemcpyAsync((void *)paramsPtr, (const void *)pbs.ptr, + sizeof(params), hipMemcpyDeviceToDevice, cudaStream)); + + hiprtLaunch(module.closestFunction, maxRays, 1, BlockSize, 1, nullptr, + cudaStream); + CUDA_CHECK(hipEventRecord(pbs.finishedEvent)); + +#ifndef NDEBUG + CUDA_CHECK(hipDeviceSynchronize()); + LOG_VERBOSE("Post-sync triangle intersect closest"); +#endif + } + + hipEventRecord(events.second); +}; + +void HiprtAggregate::IntersectShadow(int maxRays, ShadowRayQueue *shadowRayQueue, + SOA *pixelSampleState) const { + std::pair events = GetProfilerEvents("Trace shadow rays"); + + hipEventRecord(events.first); + + if (scene) { + RayIntersectParameters params; + params.traversable = scene; + params.shadowRayQueue = shadowRayQueue; + params.pixelSampleState = *pixelSampleState; + params.globalStackBuffer = globalStackBuffer; + params.globalInstanceStackBuffer = globalInstanceStackBuffer; + params.funcTable = module.funcTable; + params.hgRecords = hgRecords; + params.offsets = offsets; + + ParamBufferState &pbs = getParamBuffer(params); + +#ifndef NDEBUG + LOG_VERBOSE("Launching intersect shadow"); +#endif + + hipDeviceptr_t paramsPtr; + size_t paramsSize; + CUDA_CHECK( + hipModuleGetGlobal(¶msPtr, ¶msSize, module.hipModule, "paramBuffer")); + CUDA_CHECK(hipMemcpyAsync((void *)paramsPtr, (const void *)pbs.ptr, + sizeof(params), hipMemcpyDeviceToDevice, cudaStream)); + + hiprtLaunch(module.shadowFunction, maxRays, 1, BlockSize, 1, nullptr, cudaStream); + CUDA_CHECK(hipEventRecord(pbs.finishedEvent)); + +#ifndef NDEBUG + CUDA_CHECK(hipDeviceSynchronize()); + LOG_VERBOSE("Post-sync intersect shadow"); +#endif + } + + hipEventRecord(events.second); +} + +void HiprtAggregate::IntersectShadowTr(int maxRays, ShadowRayQueue *shadowRayQueue, + SOA *pixelSampleState) const { + std::pair events = + GetProfilerEvents("Tracing shadow Tr rays"); + + hipEventRecord(events.first); + + if (scene) { + RayIntersectParameters params; + params.traversable = scene; + params.shadowRayQueue = shadowRayQueue; + params.pixelSampleState = *pixelSampleState; + params.globalStackBuffer = globalStackBuffer; + params.globalInstanceStackBuffer = globalInstanceStackBuffer; + params.funcTable = module.funcTable; + params.hgRecords = hgRecords; + params.offsets = offsets; + + ParamBufferState &pbs = getParamBuffer(params); + +#ifndef NDEBUG + LOG_VERBOSE("Launching intersect shadow Tr"); +#endif + hipDeviceptr_t paramsPtr; + size_t paramsSize; + CUDA_CHECK( + hipModuleGetGlobal(¶msPtr, ¶msSize, module.hipModule, "paramBuffer")); + CUDA_CHECK(hipMemcpyAsync((void *)paramsPtr, (const void *)pbs.ptr, + sizeof(params), hipMemcpyDeviceToDevice, cudaStream)); + + hiprtLaunch(module.shadowTrFunction, maxRays, 1, BlockSize, 1, nullptr, + cudaStream); + CUDA_CHECK(hipEventRecord(pbs.finishedEvent)); + +#ifndef NDEBUG + CUDA_CHECK(hipDeviceSynchronize()); + LOG_VERBOSE("Post-sync intersect shadow Tr"); +#endif + } + + hipEventRecord(events.second); +} + +void HiprtAggregate::IntersectOneRandom( + int maxRays, SubsurfaceScatterQueue *subsurfaceScatterQueue) const { + std::pair events = + GetProfilerEvents("Tracing subsurface scattering probe rays"); + + hipEventRecord(events.first); + + if (scene) { + RayIntersectParameters params; + params.traversable = scene; + params.subsurfaceScatterQueue = subsurfaceScatterQueue; + params.globalStackBuffer = globalStackBuffer; + params.globalInstanceStackBuffer = globalInstanceStackBuffer; + params.funcTable = module.funcTable; + params.hgRecords = hgRecords; + params.offsets = offsets; + + ParamBufferState &pbs = getParamBuffer(params); + +#ifndef NDEBUG + LOG_VERBOSE("Launching intersect random"); +#endif + hipDeviceptr_t paramsPtr; + size_t paramsSize; + CUDA_CHECK( + hipModuleGetGlobal(¶msPtr, ¶msSize, module.hipModule, "paramBuffer")); + CUDA_CHECK(hipMemcpyAsync((void *)paramsPtr, (const void *)pbs.ptr, + sizeof(params), hipMemcpyDeviceToDevice, cudaStream)); + + hiprtLaunch(module.oneRandomFunction, maxRays, 1, BlockSize, 1, nullptr, + cudaStream); + CUDA_CHECK(hipEventRecord(pbs.finishedEvent)); + +#ifndef NDEBUG + CUDA_CHECK(hipDeviceSynchronize()); + LOG_VERBOSE("Post-sync intersect random"); +#endif + } + + hipEventRecord(events.second); +} + +std::map HiprtAggregate::PreparePLYMeshes( + const std::vector &shapes, + const std::map &floatTextures) { + std::map plyMeshes; + std::mutex mutex; + ParallelFor(0, shapes.size(), [&](int64_t i) { + const auto &shape = shapes[i]; + if (shape.name != "plymesh") + return; + + std::string filename = + ResolveFilename(shape.parameters.GetOneString("filename", "")); + if (filename.empty()) + ErrorExit(&shape.loc, "plymesh: \"filename\" must be provided."); + TriQuadMesh plyMesh = TriQuadMesh::ReadPLY(filename); // todo: alloc + if (!plyMesh.triIndices.empty() || !plyMesh.quadIndices.empty()) { + plyMesh.ConvertToOnlyTriangles(); + + Float edgeLength = + shape.parameters.GetOneFloat("displacement.edgelength", 1.f); + edgeLength *= Options->displacementEdgeScale; + + std::string displacementTexName = shape.parameters.GetTexture("displacement"); + if (!displacementTexName.empty()) { + auto iter = floatTextures.find(displacementTexName); + if (iter == floatTextures.end()) + ErrorExit(&shape.loc, "%s: no such texture defined.", + displacementTexName); + FloatTexture displacement = iter->second; + + LOG_VERBOSE("Starting to displace mesh \"%s\" with \"%s\"", filename, + displacementTexName); + + size_t origNumTris = plyMesh.triIndices.size() / 3; + + plyMesh = plyMesh.Displace( + [&](Point3f v0, Point3f v1) { + v0 = (*shape.renderFromObject)(v0); + v1 = (*shape.renderFromObject)(v1); + return Distance(v0, v1); + }, + edgeLength, + [&](Point3f *pCPU, const Normal3f *nCPU, const Point2f *uvCPU, + int nVertices) { + Point3f *p; + Normal3f *n; + Point2f *uv; + CUDA_CHECK(hipMallocManaged(&p, nVertices * sizeof(Point3f))); + CUDA_CHECK(hipMallocManaged(&n, nVertices * sizeof(Normal3f))); + CUDA_CHECK(hipMallocManaged(&uv, nVertices * sizeof(Point2f))); + + std::memcpy(p, pCPU, nVertices * sizeof(Point3f)); + std::memcpy(n, nCPU, nVertices * sizeof(Normal3f)); + std::memcpy(uv, uvCPU, nVertices * sizeof(Point2f)); + + GPUParallelFor( + "Evaluate Displacement", nVertices, [=] PBRT_GPU(int i) { + TextureEvalContext ctx; + ctx.p = p[i]; + ctx.uv = uv[i]; + Float d = UniversalTextureEvaluator()(displacement, ctx); + p[i] += Vector3f(d * n[i]); + }); + GPUWait(); + + std::memcpy(pCPU, p, nVertices * sizeof(Point3f)); + + CUDA_CHECK(hipFree(p)); + CUDA_CHECK(hipFree(n)); + CUDA_CHECK(hipFree(uv)); + }, + &shape.loc); + + displacedTrisDelta += plyMesh.triIndices.size() / 3 - origNumTris; + + LOG_VERBOSE("Finished displacing mesh \"%s\" with \"%s\" -> %d tris", + filename, displacementTexName, plyMesh.triIndices.size() / 3); + } + } + + std::lock_guard lock(mutex); + plyMeshes[i] = std::move(plyMesh); + }); + + return plyMeshes; +} + +HiprtAggregate::ParamBufferState &HiprtAggregate::getParamBuffer( + const RayIntersectParameters ¶ms) const { + CHECK(nextParamOffset < paramsPool.size()); + + ParamBufferState &pbs = paramsPool[nextParamOffset]; + if (++nextParamOffset == paramsPool.size()) + nextParamOffset = 0; + if (!pbs.used) + pbs.used = true; + else + CUDA_CHECK(hipEventSynchronize(pbs.finishedEvent)); + + // Copy to host-side pinned memory + memcpy(pbs.hostPtr, ¶ms, sizeof(params)); + CUDA_CHECK(hipMemcpyAsync((void *)pbs.ptr, pbs.hostPtr, sizeof(params), + hipMemcpyHostToDevice, cudaStream)); + + return pbs; +} + +BilinearPatchMesh *HiprtAggregate::diceCurveToBLP(const ShapeSceneEntity &shape, + int nDiceU, int nDiceV, + Allocator alloc) { + CHECK_EQ(shape.name, "curve"); + const ParameterDictionary ¶meters = shape.parameters; + const FileLoc *loc = &shape.loc; + + ++nCurves; + + // Extract parameters; the following ~90 lines of code are, + // unfortunately, copied from Curve::Create. We would like to avoid + // the overhead of splitting the curve and creating Curve objects, so + // here we go.. + Float width = parameters.GetOneFloat("width", 1.f); + Float width0 = parameters.GetOneFloat("width0", width); + Float width1 = parameters.GetOneFloat("width1", width); + + int degree = parameters.GetOneInt("degree", 3); + if (degree != 2 && degree != 3) { + Error(loc, "Invalid degree %d: only degree 2 and 3 curves are supported.", + degree); + return {}; + } + + std::string basis = parameters.GetOneString("basis", "bezier"); + if (basis != "bezier" && basis != "bspline") { + Error(loc, + "Invalid basis \"%s\": only \"bezier\" and \"bspline\" are " + "supported.", + basis); + return {}; + } + + int nSegments; + std::vector cp = parameters.GetPoint3fArray("P"); + bool bezierBasis = (basis == "bezier"); + if (bezierBasis) { + // After the first segment, which uses degree+1 control points, + // subsequent segments reuse the last control point of the previous + // one and then use degree more control points. + if (((cp.size() - 1 - degree) % degree) != 0) { + Error(loc, + "Invalid number of control points %d: for the degree %d " + "Bezier basis %d + n * %d are required, for n >= 0.", + (int)cp.size(), degree, degree + 1, degree); + return {}; + } + nSegments = (cp.size() - 1) / degree; + } else { + if (cp.size() < degree + 1) { + Error(loc, + "Invalid number of control points %d: for the degree %d " + "b-spline basis, must have >= %d.", + int(cp.size()), degree, degree + 1); + return {}; + } + nSegments = cp.size() - degree; + } + + CurveType type; + std::string curveType = parameters.GetOneString("type", "flat"); + if (curveType == "flat") + type = CurveType::Flat; + else if (curveType == "ribbon") + type = CurveType::Ribbon; + else if (curveType == "cylinder") + type = CurveType::Cylinder; + else { + Error(loc, R"(Unknown curve type "%s". Using "cylinder".)", curveType); + type = CurveType::Cylinder; + } + + std::vector n = parameters.GetNormal3fArray("N"); + if (!n.empty()) { + if (type != CurveType::Ribbon) { + Warning("Curve normals are only used with \"ribbon\" type curves."); + n = {}; + } else if (n.size() != nSegments + 1) { + Error(loc, + "Invalid number of normals %d: must provide %d normals for " + "ribbon curves with %d segments.", + int(n.size()), nSegments + 1, nSegments); + return {}; + } + for (Normal3f &nn : n) + Normalize(nn); + } else if (type == CurveType::Ribbon) { + Error(loc, "Must provide normals \"N\" at curve endpoints with ribbon " + "curves."); + return {}; + } + + // Start dicing... + std::vector blpIndices; + std::vector blpP; + std::vector blpN; + std::vector blpUV; + + int lastCPOffset = -1; + pstd::array segCpBezier; + + for (int i = 0; i <= nDiceU; ++i) { + Float u = Float(i) / Float(nDiceU); + Float width = Lerp(u, width0, width1); + + int segmentIndex = int(u * nSegments); + if (segmentIndex == nSegments) // u == 1... + --segmentIndex; + + // Compute offset into original control points for current u + int cpOffset; + if (bezierBasis) + cpOffset = segmentIndex * degree; + else + // Uniform b-spline. + cpOffset = segmentIndex; + + if (cpOffset != lastCPOffset) { + // update segCpBezier + if (bezierBasis) { + if (degree == 2) { + // Elevate to degree 3. + segCpBezier = ElevateQuadraticBezierToCubic( + pstd::MakeConstSpan(cp).subspan(cpOffset, 3)); + } else { + // All set. + for (int i = 0; i < 4; ++i) + segCpBezier[i] = cp[cpOffset + i]; + } + } else { + // Uniform b-spline. + if (degree == 2) { + pstd::array bezCp = QuadraticBSplineToBezier( + pstd::MakeConstSpan(cp).subspan(cpOffset, 3)); + segCpBezier = + ElevateQuadraticBezierToCubic(pstd::MakeConstSpan(bezCp)); + } else { + segCpBezier = CubicBSplineToBezier( + pstd::MakeConstSpan(cp).subspan(cpOffset, 4)); + } + } + lastCPOffset = cpOffset; + } + + Float uSeg = (u * nSegments) - segmentIndex; + DCHECK(uSeg >= 0 && uSeg <= 1); + + Vector3f dpdu; + Point3f p = EvaluateCubicBezier(segCpBezier, uSeg, &dpdu); + + switch (type) { + case CurveType::Ribbon: { + Float normalAngle = AngleBetween(n[segmentIndex], n[segmentIndex + 1]); + Float invSinNormalAngle = 1 / std::sin(normalAngle); + + Normal3f nu; + if (normalAngle == 0) + nu = n[segmentIndex]; + else { + Float sin0 = std::sin((1 - uSeg) * normalAngle) * invSinNormalAngle; + Float sin1 = std::sin(uSeg * normalAngle) * invSinNormalAngle; + nu = sin0 * n[segmentIndex] + sin1 * n[segmentIndex + 1]; + } + Vector3f dpdv = Normalize(Cross(nu, dpdu)) * width; + + blpP.push_back(p - dpdv / 2); + blpP.push_back(p + dpdv / 2); + blpUV.push_back(Point2f(u, 0)); + blpUV.push_back(Point2f(u, 1)); + + if (i > 0) { + blpIndices.push_back(2 * (i - 1)); + blpIndices.push_back(2 * (i - 1) + 1); + blpIndices.push_back(2 * i); + blpIndices.push_back(2 * i + 1); + } + break; + } + case CurveType::Flat: + case CurveType::Cylinder: { + Vector3f ortho[2]; + CoordinateSystem(Normalize(dpdu), &ortho[0], &ortho[1]); + ortho[0] *= width / 2; + ortho[1] *= width / 2; + + // Repeat the first/last vertex so we can assign different + // texture coordinates... + for (int v = 0; v <= nDiceV; ++v) { + Float angle = Float(v) / nDiceV * 2 * Pi; + blpP.push_back(p + ortho[0] * std::cos(angle) + + ortho[1] * std::sin(angle)); + blpN.push_back(Normal3f(Normalize(blpP.back() - p))); + blpUV.push_back(Point2f(u, Float(v) / nDiceV)); + } + + if (i > 0) { + for (int v = 0; v < nDiceV; ++v) { + // Indexing is funny due to doubled-up last vertex + blpIndices.push_back((nDiceV + 1) * (i - 1) + v); + blpIndices.push_back((nDiceV + 1) * (i - 1) + v + 1); + blpIndices.push_back((nDiceV + 1) * i + v); + blpIndices.push_back((nDiceV + 1) * i + v + 1); + } + } + break; + } + } + } + + nBLPsForCurves += blpIndices.size() / 4; + + return alloc.new_object( + *shape.renderFromObject, shape.reverseOrientation, blpIndices, blpP, blpN, blpUV, + std::vector(), nullptr, alloc); +} + +HiprtAggregate::GeometryContainer HiprtAggregate::buildBVHForTriangles( + const std::vector &shapes, + const std::map &plyMeshes, hiprtContext context, + const std::map &floatTextures, + const std::map &namedMaterials, + const std::vector &materials, const std::map &media, + const std::map *> &shapeIndexToAreaLights, + ThreadLocal &threadAllocators, + ThreadLocal &threadCUDAStreams) { + // Count how many of the shapes are triangle meshes + std::vector meshIndexToShapeIndex; + for (size_t i = 0; i < shapes.size(); ++i) { + const auto &shape = shapes[i]; + if (shape.name == "trianglemesh" || shape.name == "plymesh" || + shape.name == "loopsubdiv") + meshIndexToShapeIndex.push_back(i); + } + + size_t nMeshes = meshIndexToShapeIndex.size(); + if (nMeshes == 0) + return {}; + + LOG_VERBOSE("Building triangle BLAS"); + + std::vector meshes(nMeshes, nullptr); + ParallelFor(0, nMeshes, [&](int64_t meshIndex) { + Allocator alloc = threadAllocators.Get(); + size_t shapeIndex = meshIndexToShapeIndex[meshIndex]; + const auto &shape = shapes[shapeIndex]; + + TriangleMesh *mesh = nullptr; + if (shape.name == "trianglemesh") { + mesh = Triangle::CreateMesh(shape.renderFromObject, shape.reverseOrientation, + shape.parameters, &shape.loc, alloc); + CHECK(mesh != nullptr); + } else if (shape.name == "loopsubdiv") { + // Copied from pbrt/shapes.cpp... :-p + int nLevels = shape.parameters.GetOneInt("levels", 3); + std::vector vertexIndices = shape.parameters.GetIntArray("indices"); + if (vertexIndices.empty()) + ErrorExit(&shape.loc, "Vertex indices \"indices\" not " + "provided for LoopSubdiv shape."); + + std::vector P = shape.parameters.GetPoint3fArray("P"); + if (P.empty()) + ErrorExit(&shape.loc, "Vertex positions \"P\" not provided " + "for LoopSubdiv shape."); + + // don't actually use this for now... + std::string scheme = shape.parameters.GetOneString("scheme", "loop"); + + mesh = LoopSubdivide(shape.renderFromObject, shape.reverseOrientation, + nLevels, vertexIndices, P, alloc); + CHECK(mesh != nullptr); + } else if (shape.name == "plymesh") { + auto plyIter = plyMeshes.find(shapeIndex); + CHECK(plyIter != plyMeshes.end()); + const TriQuadMesh &plyMesh = plyIter->second; + + if (!plyMesh.quadIndices.empty() && shape.lightIndex != -1) { +#if 0 + // If you'd like to know what they are... + for (int i = 0; i < plyMesh.quadIndices.size(); ++i) + Printf("%s\n", plyMesh.p[plyMesh.quadIndices[i]]); +#endif + // This would be nice to fix, but it involves some + // plumbing and it's a rare case. The underlying issue + // is that when we create AreaLights for emissive + // shapes earlier, we're not expecting this.. + std::string filename = + ResolveFilename(shape.parameters.GetOneString("filename", "")); + ErrorExit(&shape.loc, + "%s: PLY file being used as an area light has quads--" + "this is currently unsupported. Please replace them with " + "\"bilinearmesh\" " + "shapes as a workaround. (Sorry!).", + filename); + } + + mesh = alloc.new_object( + *shape.renderFromObject, shape.reverseOrientation, plyMesh.triIndices, + plyMesh.p, std::vector(), plyMesh.n, plyMesh.uv, + plyMesh.faceIndices, alloc); + } else + LOG_FATAL("Logic error in GPUAggregate::buildBVHForTriangles()"); + + meshes[meshIndex] = mesh; + }); + + GeometryContainer geomContainer(nMeshes); + + ParallelFor(0, nMeshes, [&](int64_t startIndex, int64_t endIndex) { + Allocator alloc = threadAllocators.Get(); + + for (int meshIndex = startIndex; meshIndex < endIndex; ++meshIndex) { + TriangleMesh *mesh = meshes[meshIndex]; + size_t shapeIndex = meshIndexToShapeIndex[meshIndex]; + const auto &shape = shapes[shapeIndex]; + FloatTexture alphaTexture = getAlphaTexture(shape, floatTextures, alloc); + Material material = getMaterial(shape, namedMaterials, materials); + + // Do this here, after the alpha texture has been consumed. + shape.parameters.ReportUnused(); + + HitgroupRecord &hgRecord = geomContainer.hgRecords[meshIndex]; + hgRecord.type = HitgroupRecord::TriangleMesh; + hgRecord.triRec.mesh = mesh; + hgRecord.triRec.material = material; + hgRecord.triRec.alphaTexture = alphaTexture; + hgRecord.triRec.areaLights = {}; + if (shape.lightIndex != -1) { + if (!material) + Warning(&shape.loc, "Ignoring area light specification for shape " + "with \"interface\" material."); + else { + // Note: this will hit if we try to have an instance as an area + // light. + auto iter = shapeIndexToAreaLights.find(shapeIndex); + CHECK(iter != shapeIndexToAreaLights.end()); + CHECK_EQ(iter->second->size(), mesh->nTriangles); + hgRecord.triRec.areaLights = pstd::MakeSpan(*iter->second); + } + } + hgRecord.triRec.mediumInterface = getMediumInterface(shape, media, alloc); + } + }); + + hiprtBuildOptions options; + options.buildFlags = hiprtBuildFlagBitPreferBalancedBuild; + options.batchBuildMaxPrimCount = 512u; + + size_t nTris = 0; + size_t nVerts = 0; + std::vector triOffsets; + std::vector vertsOffsets; + for (size_t meshIndex = 0; meshIndex < nMeshes; ++meshIndex) { + TriangleMesh *mesh = meshes[meshIndex]; + triOffsets.push_back(nTris); + vertsOffsets.push_back(nVerts); + nTris += mesh->nTriangles; + nVerts += mesh->nVertices; + } + + hiprtInt3 *tris; + CUDA_CHECK(hipMalloc(&tris, sizeof(hiprtInt3) * nTris)); + + hiprtFloat3 *verts; + CUDA_CHECK(hipMalloc(&verts, sizeof(hiprtFloat3) * nVerts)); + + std::vector triBuffer(nTris); + std::vector vertBuffer(nVerts); + + std::vector geomInputs(nMeshes); + std::vector geoms(nMeshes); + std::vector geomAddrs(nMeshes); + ParallelFor(0, nMeshes, [&](int64_t meshIndex) { + TriangleMesh *mesh = meshes[meshIndex]; + size_t triOffset = triOffsets[meshIndex]; + size_t vertOffset = vertsOffsets[meshIndex]; + + hiprtTriangleMeshPrimitive prim; + prim.triangleCount = mesh->nTriangles; + prim.triangleStride = sizeof(hiprtInt3); + prim.triangleIndices = tris + triOffset; + prim.vertexCount = mesh->nVertices; + prim.vertexStride = sizeof(hiprtFloat3); + prim.vertices = verts + vertOffset; + + hiprtGeometryBuildInput &geomInput = geomInputs[meshIndex]; + geomInput.type = hiprtPrimitiveTypeTriangleMesh; + geomInput.primitive.triangleMesh = prim; + geomInput.geomType = 0; + + geomAddrs[meshIndex] = &geoms[meshIndex]; + + std::memcpy(&triBuffer[triOffset], mesh->vertexIndices, + sizeof(hiprtInt3) * mesh->nTriangles); + std::memcpy(&vertBuffer[vertOffset], mesh->p, + sizeof(hiprtFloat3) * mesh->nVertices); + }); + + hipStream_t buildStream = threadCUDAStreams.Get(); + CUDA_CHECK(hipMemcpyAsync(tris, triBuffer.data(), sizeof(hiprtInt3) * nTris, + hipMemcpyHostToDevice, buildStream)); + CUDA_CHECK(hipMemcpyAsync(verts, vertBuffer.data(), sizeof(hiprtFloat3) * nVerts, + hipMemcpyHostToDevice, buildStream)); + + size_t geomTempSize; + HIPRT_CHECK(hiprtGetGeometriesBuildTemporaryBufferSize( + context, nMeshes, geomInputs.data(), options, geomTempSize)); + + hiprtDevicePtr tempBuffer; + CUDA_CHECK(hipMalloc(&tempBuffer, geomTempSize)); + + HIPRT_CHECK(hiprtCreateGeometries(context, nMeshes, geomInputs.data(), options, + geomAddrs.data())); + + LOG_VERBOSE("Starting to build triangle mesh geometries"); + HIPRT_CHECK(hiprtBuildGeometries(context, hiprtBuildOperationBuild, nMeshes, + geomInputs.data(), options, tempBuffer, buildStream, + geoms.data())); + LOG_VERBOSE("Finished building triangle mesh geometries"); + + CUDA_CHECK(hipFree(tris)); + CUDA_CHECK(hipFree(verts)); + CUDA_CHECK(hipFree(tempBuffer)); + + hiprtFrameMatrix identity{}; + for (size_t i = 0; i < 3; ++i) + identity.matrix[i][i] = 1.0f; + + std::vector instances(nMeshes); + std::vector transforms(nMeshes); + ParallelFor(0, nMeshes, [&](int64_t meshIndex) { + instances[meshIndex].type = hiprtInstanceTypeGeometry; + instances[meshIndex].geometry = geoms[meshIndex]; + transforms[meshIndex] = identity; + }); + + geomContainer.scene = buildBVHForInstances(instances, transforms, context, + threadAllocators, threadCUDAStreams); + + return geomContainer; +} + +HiprtAggregate::GeometryContainer HiprtAggregate::buildBVHForBLPs( + const std::vector &shapes, hiprtContext context, + const std::map &floatTextures, + const std::map &namedMaterials, + const std::vector &materials, const std::map &media, + const std::map *> &shapeIndexToAreaLights, + ThreadLocal &threadAllocators, + ThreadLocal &threadCUDAStreams) { + // Count how many BLP meshes there are in shapes + std::vector meshIndexToShapeIndex; + for (size_t i = 0; i < shapes.size(); ++i) { + const auto &shape = shapes[i]; + if (shape.name == "bilinearmesh" || shape.name == "curve") + meshIndexToShapeIndex.push_back(i); + } + + size_t nMeshes = meshIndexToShapeIndex.size(); + if (nMeshes == 0) + return {}; + + LOG_VERBOSE("Building bilinear patch BLAS"); + + // Create meshes + std::vector meshes(nMeshes, nullptr); + ParallelFor(0, nMeshes, [&](int64_t meshIndex) { + Allocator alloc = threadAllocators.Get(); + size_t shapeIndex = meshIndexToShapeIndex[meshIndex]; + const auto &shape = shapes[shapeIndex]; + + if (shape.name == "bilinearmesh") { + BilinearPatchMesh *mesh = BilinearPatch::CreateMesh( + shape.renderFromObject, shape.reverseOrientation, shape.parameters, + &shape.loc, alloc); + meshes[meshIndex] = mesh; + } else if (shape.name == "curve") { + BilinearPatchMesh *curveMesh = + diceCurveToBLP(shape, 5 /* nseg */, 5 /* nvert */, alloc); + if (curveMesh) { + meshes[meshIndex] = curveMesh; + } + } + }); + + GeometryContainer geomContainer(nMeshes); + + ParallelFor(0, nMeshes, [&](int64_t startIndex, int64_t endIndex) { + Allocator alloc = threadAllocators.Get(); + + for (int meshIndex = startIndex; meshIndex < endIndex; ++meshIndex) { + BilinearPatchMesh *mesh = meshes[meshIndex]; + size_t shapeIndex = meshIndexToShapeIndex[meshIndex]; + const auto &shape = shapes[shapeIndex]; + Material material = getMaterial(shape, namedMaterials, materials); + FloatTexture alphaTexture = getAlphaTexture(shape, floatTextures, alloc); + + // After "alpha" has been consumed... + shape.parameters.ReportUnused(); + + HitgroupRecord &hgRecord = geomContainer.hgRecords[meshIndex]; + hgRecord.type = HitgroupRecord::BilinearMesh; + hgRecord.blpRec.mesh = mesh; + hgRecord.blpRec.material = material; + hgRecord.blpRec.alphaTexture = alphaTexture; + hgRecord.blpRec.areaLights = {}; + if (shape.lightIndex != -1) { + if (!material) + Warning(&shape.loc, + "Ignoring area light specification for shape with " + "\"interface\" material."); + else { + auto iter = shapeIndexToAreaLights.find(shapeIndex); + // Note: this will hit if we try to have an instance as an area + // light. + CHECK(iter != shapeIndexToAreaLights.end()); + CHECK_EQ(iter->second->size(), mesh->nPatches); + hgRecord.blpRec.areaLights = pstd::MakeSpan(*iter->second); + } + } + hgRecord.blpRec.mediumInterface = getMediumInterface(shape, media, alloc); + } + }); + + hiprtBuildOptions options; + options.buildFlags = hiprtBuildFlagBitPreferBalancedBuild; + options.batchBuildMaxPrimCount = 512u; + + size_t nAabbs = 0; + std::vector aabbOffsets; + for (size_t meshIndex = 0; meshIndex < nMeshes; ++meshIndex) { + BilinearPatchMesh *mesh = meshes[meshIndex]; + aabbOffsets.push_back(nAabbs); + nAabbs += mesh->nPatches; + } + + hiprtFloat4 *aabbs; + CUDA_CHECK(hipMalloc(&aabbs, sizeof(hiprtFloat4) * 2 * nAabbs)); + + std::vector aabbBuffer(2 * nAabbs); + + std::vector geomInputs(nMeshes); + std::vector geoms(nMeshes); + std::vector geomAddrs(nMeshes); + ParallelFor(0, nMeshes, [&](int64_t meshIndex) { + BilinearPatchMesh *mesh = meshes[meshIndex]; + size_t aabbIndex = aabbOffsets[meshIndex]; + + hiprtAABBListPrimitive prim; + prim.aabbCount = mesh->nPatches; + prim.aabbStride = 2 * sizeof(hiprtFloat4); + prim.aabbs = aabbs + 2 * aabbIndex; + + hiprtGeometryBuildInput &geomInput = geomInputs[meshIndex]; + geomInput.type = hiprtPrimitiveTypeAABBList; + geomInput.primitive.aabbList = prim; + geomInput.geomType = 1; + + geomAddrs[meshIndex] = &geoms[meshIndex]; + + for (int patchIndex = 0; patchIndex < mesh->nPatches; ++patchIndex) { + Bounds3f patchBounds; + for (int i = 0; i < 4; ++i) + patchBounds = + Union(patchBounds, mesh->p[mesh->vertexIndices[4 * patchIndex + i]]); + hiprtFloat4 aabbMin = + hiprtFloat4{float(patchBounds.pMin.x), float(patchBounds.pMin.y), + float(patchBounds.pMin.z), 0.0f}; + hiprtFloat4 aabbMax = + hiprtFloat4{float(patchBounds.pMax.x), float(patchBounds.pMax.y), + float(patchBounds.pMax.z), 0.0f}; + aabbBuffer[2 * aabbIndex + 0] = aabbMin; + aabbBuffer[2 * aabbIndex + 1] = aabbMax; + ++aabbIndex; + } + }); + + hipStream_t buildStream = threadCUDAStreams.Get(); + CUDA_CHECK(hipMemcpyAsync(aabbs, aabbBuffer.data(), sizeof(hiprtFloat4) * 2 * nAabbs, + hipMemcpyHostToDevice, buildStream)); + + size_t geomTempSize; + HIPRT_CHECK(hiprtGetGeometriesBuildTemporaryBufferSize( + context, nMeshes, geomInputs.data(), options, geomTempSize)); + + hiprtDevicePtr tempBuffer; + CUDA_CHECK(hipMalloc(&tempBuffer, geomTempSize)); + + HIPRT_CHECK(hiprtCreateGeometries(context, nMeshes, geomInputs.data(), options, + geomAddrs.data())); + + LOG_VERBOSE("Starting to build bilinear patch mesh geometries"); + HIPRT_CHECK(hiprtBuildGeometries(context, hiprtBuildOperationBuild, nMeshes, + geomInputs.data(), options, tempBuffer, buildStream, + geoms.data())); + LOG_VERBOSE("Finished building bilinear patch mesh geometries"); + + CUDA_CHECK(hipFree(aabbs)); + CUDA_CHECK(hipFree(tempBuffer)); + + hiprtFrameMatrix identity{}; + for (size_t i = 0; i < 3; ++i) + identity.matrix[i][i] = 1.0f; + + std::vector instances(nMeshes); + std::vector transforms(nMeshes); + ParallelFor(0, nMeshes, [&](int64_t meshIndex) { + instances[meshIndex].type = hiprtInstanceTypeGeometry; + instances[meshIndex].geometry = geoms[meshIndex]; + transforms[meshIndex] = identity; + }); + + geomContainer.scene = buildBVHForInstances(instances, transforms, context, + threadAllocators, threadCUDAStreams); + + return geomContainer; +} + +HiprtAggregate::GeometryContainer HiprtAggregate::buildBVHForQuadrics( + const std::vector &shapes, hiprtContext context, + const std::map &floatTextures, + const std::map &namedMaterials, + const std::vector &materials, const std::map &media, + const std::map *> &shapeIndexToAreaLights, + ThreadLocal &threadAllocators, + ThreadLocal &threadCUDAStreams) { + int nQuadrics = 0; + for (size_t shapeIndex = 0; shapeIndex < shapes.size(); ++shapeIndex) { + const auto &s = shapes[shapeIndex]; + if (s.name == "sphere" || s.name == "cylinder" || s.name == "disk") + ++nQuadrics; + } + + if (nQuadrics == 0) + return {}; + + LOG_VERBOSE("Building quadric BLAS"); + + hiprtBuildOptions options; + options.buildFlags = hiprtBuildFlagBitPreferBalancedBuild; + options.batchBuildMaxPrimCount = 512u; + + Allocator alloc = threadAllocators.Get(); + hiprtFloat4 *aabbs; + CUDA_CHECK(hipMalloc(&aabbs, sizeof(hiprtFloat4) * 2 * nQuadrics)); + + std::vector aabbBuffer(2 * nQuadrics); + + int quadricIndex = 0; + GeometryContainer geomContainer(nQuadrics); + for (size_t shapeIndex = 0; shapeIndex < shapes.size(); ++shapeIndex) { + const auto &s = shapes[shapeIndex]; + if (s.name != "sphere" && s.name != "cylinder" && s.name != "disk") + continue; + + pstd::vector shapes = Shape::Create( + s.name, s.renderFromObject, s.objectFromRender, s.reverseOrientation, + s.parameters, floatTextures, &s.loc, alloc); + if (shapes.empty()) + continue; + CHECK_EQ(1, shapes.size()); + Shape shape = shapes[0]; + + Bounds3f shapeBounds = shape.Bounds(); + hiprtFloat4 aabbMin = + hiprtFloat4{float(shapeBounds.pMin.x), float(shapeBounds.pMin.y), + float(shapeBounds.pMin.z), 0.0f}; + hiprtFloat4 aabbMax = + hiprtFloat4{float(shapeBounds.pMax.x), float(shapeBounds.pMax.y), + float(shapeBounds.pMax.z), 0.0f}; + aabbBuffer[2 * quadricIndex + 0] = aabbMin; + aabbBuffer[2 * quadricIndex + 1] = aabbMax; + + // Find alpha texture, if present. + Material material = getMaterial(s, namedMaterials, materials); + FloatTexture alphaTexture = getAlphaTexture(s, floatTextures, alloc); + + // Once again, after any alpha texture is created... + s.parameters.ReportUnused(); + + HitgroupRecord &hgRecord = geomContainer.hgRecords[quadricIndex]; + hgRecord.type = HitgroupRecord::Quadric; + hgRecord.quadricRec.shape = shape; + hgRecord.quadricRec.material = material; + hgRecord.quadricRec.alphaTexture = alphaTexture; + hgRecord.quadricRec.areaLight = nullptr; + if (s.lightIndex != -1) { + if (!material) + Warning(&s.loc, "Ignoring area light specification for shape with " + "\"interface\" material."); + else { + auto iter = shapeIndexToAreaLights.find(shapeIndex); + // Note: this will hit if we try to have an instance as an area + // light. + CHECK(iter != shapeIndexToAreaLights.end()); + CHECK_EQ(iter->second->size(), 1); + hgRecord.quadricRec.areaLight = (*iter->second)[0]; + } + } + hgRecord.quadricRec.mediumInterface = getMediumInterface(s, media, alloc); + + ++quadricIndex; + } + nQuadrics = quadricIndex; + geomContainer.resize(nQuadrics); + + std::vector geomInputs(nQuadrics); + std::vector geoms(nQuadrics); + std::vector geomAddrs(nQuadrics); + ParallelFor(0, nQuadrics, [&](int64_t quadricIndex) { + hiprtAABBListPrimitive prim; + prim.aabbCount = 1; + prim.aabbStride = 2 * sizeof(hiprtFloat4); + prim.aabbs = aabbs + 2 * quadricIndex; + + hiprtGeometryBuildInput &geomInput = geomInputs[quadricIndex]; + geomInput.type = hiprtPrimitiveTypeAABBList; + geomInput.primitive.aabbList = prim; + geomInput.geomType = 2; + + geomAddrs[quadricIndex] = &geoms[quadricIndex]; + }); + + hipStream_t buildStream = threadCUDAStreams.Get(); + CUDA_CHECK(hipMemcpyAsync(aabbs, aabbBuffer.data(), + sizeof(hiprtFloat4) * 2 * nQuadrics, hipMemcpyHostToDevice, + buildStream)); + + size_t geomTempSize; + HIPRT_CHECK(hiprtGetGeometriesBuildTemporaryBufferSize( + context, nQuadrics, geomInputs.data(), options, geomTempSize)); + + hiprtDevicePtr tempBuffer; + CUDA_CHECK(hipMalloc(&tempBuffer, geomTempSize)); + + HIPRT_CHECK(hiprtCreateGeometries(context, nQuadrics, geomInputs.data(), options, + geomAddrs.data())); + + LOG_VERBOSE("Starting to build quadric geometries"); + HIPRT_CHECK(hiprtBuildGeometries(context, hiprtBuildOperationBuild, nQuadrics, + geomInputs.data(), options, tempBuffer, buildStream, + geoms.data())); + LOG_VERBOSE("Finished building quadric geometries"); + + CUDA_CHECK(hipFree(aabbs)); + CUDA_CHECK(hipFree(tempBuffer)); + + hiprtFrameMatrix identity{}; + for (size_t i = 0; i < 3; ++i) + identity.matrix[i][i] = 1.0f; + + std::vector instances(nQuadrics); + std::vector transforms(nQuadrics); + ParallelFor(0, nQuadrics, [&](int64_t quadricIndex) { + instances[quadricIndex].type = hiprtInstanceTypeGeometry; + instances[quadricIndex].geometry = geoms[quadricIndex]; + transforms[quadricIndex] = identity; + }); + + geomContainer.scene = buildBVHForInstances(instances, transforms, context, + threadAllocators, threadCUDAStreams); + + return geomContainer; +} + +hiprtScene HiprtAggregate::buildBVHForInstances( + const std::vector &instances, + const std::vector &transforms, hiprtContext context, + ThreadLocal &threadAllocators, + ThreadLocal &threadCUDAStreams) { + CHECK(transforms.size() == instances.size()); + hiprtBuildOptions options; + options.buildFlags = hiprtBuildFlagBitPreferBalancedBuild; + options.batchBuildMaxPrimCount = 512u; + + hiprtSceneBuildInput sceneInput; + sceneInput.instanceCount = instances.size(); + sceneInput.frameCount = instances.size(); + sceneInput.instanceFrames = nullptr; + sceneInput.instanceMasks = nullptr; + sceneInput.instanceTransformHeaders = nullptr; + sceneInput.instances = nullptr; + sceneInput.frameType = hiprtFrameTypeMatrix; + + size_t sceneTempSize; + HIPRT_CHECK(hiprtGetSceneBuildTemporaryBufferSize(context, sceneInput, options, + sceneTempSize)); + + hiprtDevicePtr tempBuffer; + CUDA_CHECK(hipMalloc(&tempBuffer, sceneTempSize)); + CUDA_CHECK( + hipMalloc(&sceneInput.instances, sizeof(hiprtInstance) * instances.size())); + CUDA_CHECK(hipMalloc(&sceneInput.instanceFrames, + sizeof(hiprtFrameMatrix) * transforms.size())); + + hipStream_t buildStream = threadCUDAStreams.Get(); + CUDA_CHECK(hipMemcpyAsync((void *)sceneInput.instances, instances.data(), + sizeof(hiprtInstance) * instances.size(), + hipMemcpyHostToDevice, buildStream)); + CUDA_CHECK(hipMemcpyAsync((void *)sceneInput.instanceFrames, transforms.data(), + sizeof(hiprtFrameMatrix) * transforms.size(), + hipMemcpyHostToDevice, buildStream)); + + hiprtScene scene; + HIPRT_CHECK(hiprtCreateScene(context, sceneInput, options, scene)); + + LOG_VERBOSE("Started to build scene"); + HIPRT_CHECK(hiprtBuildScene(context, hiprtBuildOperationBuild, sceneInput, options, + tempBuffer, buildStream, scene)); + LOG_VERBOSE("Finished building scene"); + + CUDA_CHECK(hipFree(tempBuffer)); + CUDA_CHECK(hipFree(sceneInput.instances)); + CUDA_CHECK(hipFree(sceneInput.instanceFrames)); + + return scene; +} + +static void loadFile(const std::string& path, std::vector &dst) { + std::fstream f(path, std::ios::binary | std::ios::in); + if (f.is_open()) { + size_t sizeFile; + f.seekg(0, std::fstream::end); + size_t size = sizeFile = (size_t)f.tellg(); + dst.resize(size); + f.seekg(0, std::fstream::beg); + f.read(dst.data(), size); + f.close(); + } +} + +HiprtAggregate::Module HiprtAggregate::compileHiprtModule(hiprtContext context) { + const std::string path = "../src/pbrt/gpu/hiprt/hiprt.cu"; + const std::string closestFunction = "__raygen__findClosest"; + const std::string shadowFunction = "__raygen__shadow"; + const std::string shadowTrFunction = "__raygen__shadow_Tr"; + const std::string oneRandomFunction = "__raygen__randomHit"; + const std::string binFilename = "hiprt.hipfb"; + + std::vector binary; + loadFile(binFilename, binary); + + Module module; + CUDA_CHECK(hipModuleLoadData(&module.hipModule, binary.data())); + + HIPRT_CHECK(hiprtCreateFuncTable(context, 3, 1, module.funcTable)); + + CUDA_CHECK(hipModuleGetFunction(&module.closestFunction, module.hipModule, + closestFunction.c_str())); + CUDA_CHECK(hipModuleGetFunction(&module.shadowFunction, module.hipModule, + shadowFunction.c_str())); + CUDA_CHECK(hipModuleGetFunction(&module.shadowTrFunction, module.hipModule, + shadowTrFunction.c_str())); + CUDA_CHECK(hipModuleGetFunction(&module.oneRandomFunction, module.hipModule, + oneRandomFunction.c_str())); + + return module; +} + +void HiprtAggregate::hiprtLaunch(hipFunction_t func, int nx, int ny, int tx, int ty, + void **args, hipStream_t cudaStream, + size_t sharedMemoryBytes) { + int3 tpb = {tx, ty, 1}; + int3 nb; + nb.x = (nx + tpb.x - 1) / tpb.x; + nb.y = (ny + tpb.y - 1) / tpb.y; + CUDA_CHECK(hipModuleLaunchKernel(func, nb.x, nb.y, 1, tpb.x, tpb.y, 1, + sharedMemoryBytes, cudaStream, args, 0)); +} + +} // namespace pbrt diff --git a/src/pbrt/gpu/hiprt/aggregate.h b/src/pbrt/gpu/hiprt/aggregate.h new file mode 100644 index 000000000..e9fdc61bc --- /dev/null +++ b/src/pbrt/gpu/hiprt/aggregate.h @@ -0,0 +1,191 @@ +#ifndef PBRT_HIPRT_AGGREGATE_H +#define PBRT_HIPRT_AGGREGATE_H + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +namespace pbrt { + +class HiprtAggregate : public WavefrontAggregate { + public: + HiprtAggregate(const BasicScene &scene, CUDATrackedMemoryResource *memoryResource, + NamedTextures &textures, + const std::map *> &shapeIndexToAreaLights, + const std::map &media, + const std::map &namedMaterials, + const std::vector &materials, int maxQueueSize); + + Bounds3f Bounds() const { return bounds; } + + void IntersectClosest(int maxRays, const RayQueue *rayQueue, + EscapedRayQueue *escapedRayQueue, + HitAreaLightQueue *hitAreaLightQueue, + MaterialEvalQueue *basicEvalMaterialQueue, + MaterialEvalQueue *universalEvalMaterialQueue, + MediumSampleQueue *mediumSampleQueue, + RayQueue *nextRayQueue) const; + + void IntersectShadow(int maxRays, ShadowRayQueue *shadowRayQueue, + SOA *pixelSampleState) const; + + void IntersectShadowTr(int maxRays, ShadowRayQueue *shadowRayQueue, + SOA *pixelSampleState) const; + + void IntersectOneRandom(int maxRays, + SubsurfaceScatterQueue *subsurfaceScatterQueue) const; + + // WAR: The enclosing parent function ("PreparePLYMeshes") for an + // extended __device__ lambda cannot have private or protected access + // within its class, so it's public... + static std::map PreparePLYMeshes( + const std::vector &shapes, + const std::map &floatTextures); + + private: + static constexpr size_t BlockSize = 64u; + + struct ParamBufferState { + bool used = false; + hipEvent_t finishedEvent; + hipDeviceptr_t ptr = 0; + void *hostPtr = nullptr; + }; + + mutable std::vector paramsPool; + mutable size_t nextParamOffset = 0; + + struct Module { + hipModule_t hipModule; + hipFunction_t closestFunction; + hipFunction_t shadowFunction; + hipFunction_t shadowTrFunction; + hipFunction_t oneRandomFunction; + hiprtFuncTable funcTable; + }; + + struct GeometryContainer { + GeometryContainer() = default; + GeometryContainer(size_t size) : hgRecords(size) {} + + void resize(size_t size) { + hgRecords.resize(size); + } + + uint32_t offset; + hiprtScene scene; + std::vector hgRecords; + }; + + struct GeometryGroup { + GeometryContainer triGeomContainer; + GeometryContainer blpGeomContainer; + GeometryContainer quadricGeomContainer; + }; + + struct InstanceContainer { + void appendRecords(GeometryContainer &g) { + g.offset = hgRecords.size(); + hgRecords.insert(hgRecords.end(), g.hgRecords.begin(), g.hgRecords.end()); + } + + void insertInstance(GeometryContainer &g, const hiprtFrameMatrix& m) { + if (g.scene != nullptr) { + hiprtInstance instance; + instance.type = hiprtInstanceTypeScene; + instance.scene = g.scene; + instances.push_back(instance); + transforms.push_back(m); + offsets.push_back(g.offset); + } + } + + std::vector offsets; + std::vector transforms; + std::vector instances; + std::vector hgRecords; + }; + + ParamBufferState &getParamBuffer(const RayIntersectParameters &) const; + + static BilinearPatchMesh *diceCurveToBLP(const ShapeSceneEntity &shape, int nDiceU, + int nDiceV, Allocator alloc); + + static GeometryContainer buildBVHForTriangles( + const std::vector &shapes, + const std::map &plyMeshes, hiprtContext context, + const std::map &floatTextures, + const std::map &namedMaterials, + const std::vector &materials, + const std::map &media, + const std::map *> &shapeIndexToAreaLights, + ThreadLocal &threadAllocators, + ThreadLocal &threadCUDAStreams); + + static GeometryContainer buildBVHForBLPs( + const std::vector &shapes, hiprtContext context, + const std::map &floatTextures, + const std::map &namedMaterials, + const std::vector &materials, + const std::map &media, + const std::map *> &shapeIndexToAreaLights, + ThreadLocal &threadAllocators, + ThreadLocal &threadCUDAStreams); + + static GeometryContainer buildBVHForQuadrics( + const std::vector &shapes, hiprtContext context, + const std::map &floatTextures, + const std::map &namedMaterials, + const std::vector &materials, + const std::map &media, + const std::map *> &shapeIndexToAreaLights, + ThreadLocal &threadAllocators, + ThreadLocal &threadCUDAStreams); + + static hiprtScene buildBVHForInstances( + const std::vector &instances, + const std::vector &transforms, hiprtContext context, + ThreadLocal &threadAllocators, + ThreadLocal &threadCUDAStreams); + + static Module compileHiprtModule(hiprtContext context); + + static void hiprtLaunch(hipFunction_t func, int nx, int ny, void **args, + hipStream_t cudaStream, size_t sharedMemoryBytes = 0); + + static void hiprtLaunch(hipFunction_t func, int nx, int ny, int tx, int ty, + void **args, hipStream_t cudaStream, + size_t sharedMemoryBytes = 0); + Module module; + hiprtContext context; + + hiprtScene scene; + hiprtGlobalStackBuffer globalStackBuffer; + hiprtGlobalStackBuffer globalInstanceStackBuffer; + std::vector geomGroups; + HitgroupRecord *hgRecords; + uint32_t *offsets; + + Bounds3f bounds; + CUDATrackedMemoryResource *memoryResource; + hipStream_t cudaStream; +}; + +} // namespace pbrt + +#endif // PBRT_HIPRT_AGGREGATE_H diff --git a/src/pbrt/gpu/hiprt/hiprt.cu b/src/pbrt/gpu/hiprt/hiprt.cu new file mode 100644 index 000000000..c713ae14f --- /dev/null +++ b/src/pbrt/gpu/hiprt/hiprt.cu @@ -0,0 +1,609 @@ +#include +#include + +__device__ bool __filter__alphaKilled(const hiprtRay &ray, const void *data, + void *payload, const hiprtHit &hit); +__device__ bool __intersection__bilinearPatch(const hiprtRay &ray, const void *data, + void *payload, hiprtHit &hit); +__device__ bool __intersection__quadric(const hiprtRay &ray, const void *data, + void *payload, hiprtHit &hit); + +HIPRT_DEVICE bool intersectFunc(uint32_t geomType, uint32_t rayType, + const hiprtFuncTableHeader &tableHeader, + const hiprtRay &ray, void *payload, hiprtHit &hit) { + const uint32_t index = tableHeader.numGeomTypes * rayType + geomType; + const void *data = tableHeader.funcDataSets[index].intersectFuncData; + switch (index) { + case 1: { + return __intersection__bilinearPatch(ray, data, payload, hit); + } + case 2: { + return __intersection__quadric(ray, data, payload, hit); + } + default: { + return false; + } + } +} + +HIPRT_DEVICE bool filterFunc(uint32_t geomType, uint32_t rayType, + const hiprtFuncTableHeader &tableHeader, const hiprtRay &ray, + void *payload, const hiprtHit &hit) { + const uint32_t index = tableHeader.numGeomTypes * rayType + geomType; + const void *data = tableHeader.funcDataSets[index].filterFuncData; + switch (index) { + case 0: { + return __filter__alphaKilled(ray, data, payload, hit); + } + default: { + return false; + } + } +} + +// pbrt is Copyright(c) 1998-2020 Matt Pharr, Wenzel Jakob, and Greg Humphreys. +// The pbrt source code is licensed under the Apache License, Version 2.0. +// SPDX: Apache-2.0 + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// Make various functions visible to HIPRT, which doesn't get to link +// shader code with the HIP code in the main executable... +#include +#include +#include +#include +#include +#include + +#include + +using namespace pbrt; + +alignas(alignof(RayIntersectParameters)) __constant__ + unsigned char paramBuffer[sizeof(RayIntersectParameters)]; +#define params (*(RayIntersectParameters *)paramBuffer) + +/////////////////////////////////////////////////////////////////////////// +// Utility functions + +template +__device__ inline hiprtHit Trace(hiprtScene scene, Ray ray, Float tMax, uint32_t &missed, + void *payload) { + hiprtRay hiprtRay; + hiprtRay.origin = make_float3(ray.o.x, ray.o.y, ray.o.z); + hiprtRay.direction = make_float3(ray.d.x, ray.d.y, ray.d.z); + hiprtRay.minT = 1e-7f; + hiprtRay.maxT = tMax; + + __shared__ int sharedStackCache[SHARED_STACK_SIZE * BLOCK_SIZE]; + hiprtSharedStackBuffer sharedStackBuffer{SHARED_STACK_SIZE, sharedStackCache}; + hiprtGlobalStack stack(params.globalStackBuffer, sharedStackBuffer); + + hiprtSharedStackBuffer sharedInstanceStackBuffer{}; + hiprtGlobalInstanceStack instanceStack(params.globalInstanceStackBuffer, + sharedInstanceStackBuffer); + + hiprtHit hit; + if constexpr (!AnyHit) { + hiprtSceneTraversalClosestCustomStack + tr(scene, hiprtRay, stack, instanceStack, hiprtFullRayMask, + hiprtTraversalHintDefault, payload, params.funcTable, 0, ray.time); + hit = tr.getNextHit(); + } else { + hiprtSceneTraversalAnyHitCustomStack + tr(scene, hiprtRay, stack, instanceStack, hiprtFullRayMask, + hiprtTraversalHintDefault, payload, params.funcTable, 0, ray.time); + hit = tr.getNextHit(); + if (hit.t == 0.0f) + hit.primID = hiprtInvalidValue; + } + + missed = uint32_t(hit.primID == hiprtInvalidValue); + return hit; +} + +static __device__ uint32_t recordIndex(const hiprtHit &hit) { + return hit.instanceIDs[1] + params.offsets[hit.instanceIDs[0]]; +} + +static __forceinline__ __device__ Transform getWorldFromInstance(const hiprtHit &hit) { + hiprtFrameMatrix hiprtWorldFromObjM = + hiprtGetObjectToWorldFrameMatrix(params.traversable, hit.instanceIDs, 0.0f); + hiprtFrameMatrix hiprtObjFromWorldM = + hiprtGetWorldToObjectFrameMatrix(params.traversable, hit.instanceIDs, 0.0f); + SquareMatrix<4> worldFromObjM, objFromWorldM; + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 4; ++j) { + worldFromObjM[i][j] = hiprtWorldFromObjM.matrix[i][j]; + objFromWorldM[i][j] = hiprtObjFromWorldM.matrix[i][j]; + } + } + return Transform(worldFromObjM, objFromWorldM); +} + +static __forceinline__ __device__ SurfaceInteraction +getTriangleIntersection(const Ray &rayWorld, const hiprtHit &hit) { + const TriangleMeshRecord &rec = params.hgRecords[recordIndex(hit)].triRec; + + float b1 = hit.uv.x; + float b2 = hit.uv.y; + float b0 = 1 - b1 - b2; + + float3 rd = make_float3(rayWorld.d.x, rayWorld.d.y, rayWorld.d.z); + Vector3f wo = -Vector3f(rd.x, rd.y, rd.z); + + Transform worldFromInstance = getWorldFromInstance(hit); + + Float time = rayWorld.time; + wo = worldFromInstance.ApplyInverse(wo); + + TriangleIntersection ti{b0, b1, b2, hit.t}; + SurfaceInteraction intr = + Triangle::InteractionFromIntersection(rec.mesh, hit.primID, ti, time, wo); + return worldFromInstance(intr); +} + +static __forceinline__ __device__ SurfaceInteraction +getBilinearPatchIntersection(const Ray &rayWorld, const hiprtHit &hit, Point2f uv) { + BilinearMeshRecord &rec = params.hgRecords[recordIndex(hit)].blpRec; + + float3 rd = make_float3(rayWorld.d.x, rayWorld.d.y, rayWorld.d.z); + Vector3f wo = -Vector3f(rd.x, rd.y, rd.z); + + return BilinearPatch::InteractionFromIntersection(rec.mesh, hit.primID, uv, + rayWorld.time, wo); +} + +static __device__ inline SurfaceInteraction getQuadricIntersection( + const Ray &rayWorld, const hiprtHit &hit, const QuadricIntersection &si) { + QuadricRecord &rec = params.hgRecords[recordIndex(hit)].quadricRec; + + float3 rd = make_float3(rayWorld.d.x, rayWorld.d.y, rayWorld.d.z); + Vector3f wo = -Vector3f(rd.x, rd.y, rd.z); + Float time = rayWorld.time; + + SurfaceInteraction intr; + if (const Sphere *sphere = rec.shape.CastOrNullptr()) + intr = sphere->InteractionFromIntersection(si, wo, time); + else if (const Cylinder *cylinder = rec.shape.CastOrNullptr()) + intr = cylinder->InteractionFromIntersection(si, wo, time); + else if (const Disk *disk = rec.shape.CastOrNullptr()) + intr = disk->InteractionFromIntersection(si, wo, time); + else + CHECK(!"unexpected quadric"); + + return intr; +} + +/////////////////////////////////////////////////////////////////////////// +// Intersection and filter functions + +__device__ bool __filter__alphaKilled(const hiprtRay &ray, const void *data, + void *payload, const hiprtHit &hit) { + const TriangleMeshRecord &rec = params.hgRecords[recordIndex(hit)].triRec; + if (!rec.alphaTexture) + return false; + + Ray rayWorld = *(Ray *)payload; + + SurfaceInteraction intr = getTriangleIntersection(rayWorld, hit); + + BasicTextureEvaluator eval; + Float alpha = eval(rec.alphaTexture, intr); + if (alpha >= 1) + return false; + if (alpha <= 0) + return true; + else { + float3 o = make_float3(rayWorld.o.x, rayWorld.o.y, rayWorld.o.z); + float3 d = make_float3(rayWorld.d.x, rayWorld.d.y, rayWorld.d.z); + Float u = HashFloat(o, d); + return u > alpha; + } +} + +__device__ bool __intersection__bilinearPatch(const hiprtRay &ray_, const void *data, + void *payload, hiprtHit &hit) { + BilinearMeshRecord &rec = params.hgRecords[recordIndex(hit)].blpRec; + + float3 org = ray_.origin; + float3 dir = ray_.direction; + Float tMax = ray_.maxT; + Ray ray(Point3f(org.x, org.y, org.z), Vector3f(dir.x, dir.y, dir.z)); + Ray rayWorld = *(Ray *)payload; + + int vertexIndex = 4 * hit.primID; + Point3f p00 = rec.mesh->p[rec.mesh->vertexIndices[vertexIndex]]; + Point3f p10 = rec.mesh->p[rec.mesh->vertexIndices[vertexIndex + 1]]; + Point3f p01 = rec.mesh->p[rec.mesh->vertexIndices[vertexIndex + 2]]; + Point3f p11 = rec.mesh->p[rec.mesh->vertexIndices[vertexIndex + 3]]; + pstd::optional isect = + IntersectBilinearPatch(ray, tMax, p00, p10, p01, p11); + + if (!isect) + return false; + + if (rec.alphaTexture) { + SurfaceInteraction intr = getBilinearPatchIntersection(rayWorld, hit, isect->uv); + BasicTextureEvaluator eval; + Float alpha = eval(rec.alphaTexture, intr); + if (alpha < 1) { + if (alpha == 0) + // No hit + return false; + + float3 o = make_float3(rayWorld.o.x, rayWorld.o.y, rayWorld.o.z); + float3 d = make_float3(rayWorld.d.x, rayWorld.d.y, rayWorld.d.z); + Float u = HashFloat(o, d); + if (u > alpha) + // no hit + return false; + } + } + + hit.t = isect->t; + hit.uv.x = isect->uv[0]; + hit.uv.y = isect->uv[1]; + + return true; +} + +__device__ bool __intersection__quadric(const hiprtRay &ray_, const void *data, + void *payload, hiprtHit &hit) { + QuadricRecord &rec = params.hgRecords[recordIndex(hit)].quadricRec; + + float3 org = ray_.origin; + float3 dir = ray_.direction; + Float tMax = ray_.maxT; + Ray ray(Point3f(org.x, org.y, org.z), Vector3f(dir.x, dir.y, dir.z)); + pstd::optional isect; + + if (const Sphere *sphere = rec.shape.CastOrNullptr()) + isect = sphere->BasicIntersect(ray, tMax); + else if (const Cylinder *cylinder = rec.shape.CastOrNullptr()) + isect = cylinder->BasicIntersect(ray, tMax); + else if (const Disk *disk = rec.shape.CastOrNullptr()) + isect = disk->BasicIntersect(ray, tMax); + + if (!isect) + return false; + + Ray rayWorld = *(Ray *)payload; + + if (rec.alphaTexture) { + SurfaceInteraction intr = getQuadricIntersection(rayWorld, hit, *isect); + + BasicTextureEvaluator eval; + Float alpha = eval(rec.alphaTexture, intr); + if (alpha < 1) { + if (alpha == 0) + // No hit + return false; + + float3 o = make_float3(rayWorld.o.x, rayWorld.o.y, rayWorld.o.z); + float3 d = make_float3(rayWorld.d.x, rayWorld.d.y, rayWorld.d.z); + Float u = HashFloat(o.x, o.y, o.z, d.x, d.y, d.z); + if (u > alpha) + // no hit + return false; + } + } + + hit.t = isect->tHit; + hit.normal = make_float3(isect->pObj.x, isect->pObj.y, isect->pObj.z); + hit.uv.x = isect->phi; + + return true; +} + +/////////////////////////////////////////////////////////////////////////// +// Closest hit + +struct ClosestHitContext { + ClosestHitContext() = default; + __device__ ClosestHitContext(Medium rayMedium, bool shadowRay) + : rayMedium(rayMedium), shadowRay(shadowRay) {} + + Medium rayMedium; + bool shadowRay; + + // out + Point3fi piHit; + Normal3f nHit; + Material material; + MediumInterface mediumInterface; + + __device__ Ray SpawnRayTo(const Point3f &p) const { + Interaction intr(piHit, nHit); + intr.mediumInterface = &mediumInterface; + return intr.SpawnRayTo(p); + } +}; + +static __forceinline__ __device__ void ProcessClosestIntersection( + SurfaceInteraction intr, const hiprtHit &hit, ClosestHitContext &ctx) { + int rayIndex = blockIdx.x * blockDim.x + threadIdx.x; + + Medium rayMedium = ctx.rayMedium; + + if (intr.mediumInterface) + ctx.mediumInterface = *intr.mediumInterface; + else + ctx.mediumInterface = MediumInterface(rayMedium); + + ctx.piHit = intr.pi; + ctx.nHit = intr.n; + ctx.material = intr.material; + + if (ctx.shadowRay) + return; + + // We only have the ray queue (and it only makes sense to access) for + // regular closest hit rays. + RayWorkItem r = (*params.rayQueue)[rayIndex]; + + EnqueueWorkAfterIntersection(r, rayMedium, hit.t, intr, params.mediumSampleQueue, + params.nextRayQueue, params.hitAreaLightQueue, + params.basicEvalMaterialQueue, + params.universalEvalMaterialQueue); +} +__device__ void closesthitTriangle(const Ray &rayWorld, const hiprtHit &hit, + ClosestHitContext &ctx) { + const TriangleMeshRecord &rec = params.hgRecords[recordIndex(hit)].triRec; + + SurfaceInteraction intr = getTriangleIntersection(rayWorld, hit); + + if (rec.mediumInterface && rec.mediumInterface->IsMediumTransition()) + intr.mediumInterface = rec.mediumInterface; + intr.material = rec.material; + if (!rec.areaLights.empty()) + intr.areaLight = rec.areaLights[hit.primID]; + + ProcessClosestIntersection(intr, hit, ctx); +} + +__device__ void closesthitBilinearPatch(const Ray &rayWorld, const hiprtHit &hit, + ClosestHitContext &ctx) { + BilinearMeshRecord &rec = params.hgRecords[recordIndex(hit)].blpRec; + + Point2f uv(hit.uv.x, hit.uv.y); + + SurfaceInteraction intr = getBilinearPatchIntersection(rayWorld, hit, uv); + if (rec.mediumInterface && rec.mediumInterface->IsMediumTransition()) + intr.mediumInterface = rec.mediumInterface; + intr.material = rec.material; + if (!rec.areaLights.empty()) + intr.areaLight = rec.areaLights[hit.primID]; + + Transform worldFromInstance = getWorldFromInstance(hit); + intr = worldFromInstance(intr); + + ProcessClosestIntersection(intr, hit, ctx); +} + +__device__ void closesthitQuadric(const Ray &rayWorld, const hiprtHit &hit, + ClosestHitContext &ctx) { + QuadricRecord &rec = params.hgRecords[recordIndex(hit)].quadricRec; + QuadricIntersection qi; + qi.pObj = Point3f(hit.normal.x, hit.normal.y, hit.normal.z); + qi.phi = hit.uv.x; + + SurfaceInteraction intr = getQuadricIntersection(rayWorld, hit, qi); + if (rec.mediumInterface && rec.mediumInterface->IsMediumTransition()) + intr.mediumInterface = rec.mediumInterface; + intr.material = rec.material; + if (rec.areaLight) + intr.areaLight = rec.areaLight; + + Transform worldFromInstance = getWorldFromInstance(hit); + intr = worldFromInstance(intr); + + ProcessClosestIntersection(intr, hit, ctx); +} + +extern "C" __global__ void __raygen__findClosest() { + int rayIndex = blockIdx.x * blockDim.x + threadIdx.x; + if (rayIndex >= params.rayQueue->Size()) + return; + + RayWorkItem r = (*params.rayQueue)[rayIndex]; + Ray ray = r.ray; + Float tMax = 1e30f; + + PBRT_DBG("ray o %f %f %f dir %f %f %f tmax %f\n", ray.o.x, ray.o.y, ray.o.z, ray.d.x, + ray.d.y, ray.d.z, tMax); + + uint32_t missed = 0; + hiprtHit hit = Trace(params.traversable, ray, tMax, missed, &ray); + + if (missed) { + EnqueueWorkAfterMiss(r, params.mediumSampleQueue, params.escapedRayQueue); + return; + } + + ClosestHitContext ctx(ray.medium, false); + if (params.hgRecords[recordIndex(hit)].type == HitgroupRecord::TriangleMesh) + closesthitTriangle(ray, hit, ctx); + else if (params.hgRecords[recordIndex(hit)].type == HitgroupRecord::BilinearMesh) + closesthitBilinearPatch(ray, hit, ctx); + else if (params.hgRecords[recordIndex(hit)].type == HitgroupRecord::Quadric) + closesthitQuadric(ray, hit, ctx); + else + CHECK(!"unexpected primitive type"); +} + +/////////////////////////////////////////////////////////////////////////// +// Shadow rays + +extern "C" __global__ void __raygen__shadow() { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= params.shadowRayQueue->Size()) + return; + + ShadowRayWorkItem sr = (*params.shadowRayQueue)[index]; + PBRT_DBG("Tracing shadow ray index %d o %f %f %f d %f %f %f\n", index, sr.ray.o.x, + sr.ray.o.y, sr.ray.o.z, sr.ray.d.x, sr.ray.d.y, sr.ray.d.z); + + uint32_t missed = 0; + Trace(params.traversable, sr.ray, sr.tMax, missed, &sr.ray); + + RecordShadowRayResult(sr, ¶ms.pixelSampleState, !missed); +} + +extern "C" __global__ void __raygen__shadow_Tr() { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= params.shadowRayQueue->Size()) + return; + + ShadowRayWorkItem sr = (*params.shadowRayQueue)[index]; + + ClosestHitContext ctx; + + TraceTransmittance( + sr, ¶ms.pixelSampleState, + [&](Ray ray, Float tMax) -> TransmittanceTraceResult { + uint32_t missed = 0; + hiprtHit hit = Trace(params.traversable, ray, tMax, missed, + &ray); // the closest hit is actually used + + ctx = ClosestHitContext(ray.medium, true); + + if (!missed) { + if (params.hgRecords[recordIndex(hit)].type == + HitgroupRecord::TriangleMesh) + closesthitTriangle(ray, hit, ctx); + else if (params.hgRecords[recordIndex(hit)].type == + HitgroupRecord::BilinearMesh) + closesthitBilinearPatch(ray, hit, ctx); + else if (params.hgRecords[recordIndex(hit)].type == + HitgroupRecord::Quadric) + closesthitQuadric(ray, hit, ctx); + else + CHECK(!"unexpected primitive type"); + } + + return TransmittanceTraceResult{!missed, Point3f(ctx.piHit), ctx.material}; + }, + [&](Point3f p) -> Ray { return ctx.SpawnRayTo(p); }); +} + +/////////////////////////////////////////////////////////////////////////// +// Random hit (for subsurface scattering) + +struct RandomHitPayload { + WeightedReservoirSampler wrs; + Material material; + pstd::optional intr; +}; + +__device__ void closesthitRandomHitTriangle(const Ray &rayWorld, const hiprtHit &hit, + RandomHitPayload *p) { + const TriangleMeshRecord &rec = params.hgRecords[recordIndex(hit)].triRec; + + PBRT_DBG("Anyhit triangle for random hit: rec.material %p params.materials %p\n", + rec.material.ptr(), p->material.ptr()); + + SurfaceInteraction intr = getTriangleIntersection(rayWorld, hit); + p->intr = intr; + + if (rec.material == p->material) + p->wrs.Add([&] __device__() { return intr; }, 1.f); +} + +__device__ void closesthitRandomHitBilinearPatch(const Ray &rayWorld, const hiprtHit &hit, + RandomHitPayload *p) { + BilinearMeshRecord &rec = params.hgRecords[recordIndex(hit)].blpRec; + + PBRT_DBG("Anyhit blp for random hit: rec.material %p params.materials %p\n", + rec.material.ptr(), p->material.ptr()); + + Point2f uv(hit.uv.x, hit.uv.y); + SurfaceInteraction intr = getBilinearPatchIntersection(rayWorld, hit, uv); + p->intr = intr; + + if (rec.material == p->material) + p->wrs.Add([&] __device__() { return intr; }, 1.f); +} + +__device__ void closesthitRandomHitQuadric(const Ray &rayWorld, const hiprtHit &hit, + RandomHitPayload *p) { + QuadricRecord &rec = params.hgRecords[recordIndex(hit)].quadricRec; + + PBRT_DBG("Anyhit quadric for random hit: rec.material %p params.materials %p\n", + rec.material.ptr(), p->material.ptr()); + + QuadricIntersection qi; + qi.pObj = Point3f(hit.normal.x, hit.normal.y, hit.normal.z); + qi.phi = hit.uv.x; + + SurfaceInteraction intr = getQuadricIntersection(rayWorld, hit, qi); + p->intr = intr; + + if (rec.material == p->material) + p->wrs.Add([&] __device__() { return intr; }, 1.f); +} + +extern "C" __global__ void __raygen__randomHit() { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= params.subsurfaceScatterQueue->Size()) + return; + + SubsurfaceScatterWorkItem s = (*params.subsurfaceScatterQueue)[index]; + + Ray ray(s.p0, s.p1 - s.p0); + + RandomHitPayload payload; + payload.wrs.Seed(Hash(s.p0, s.p1)); + payload.material = s.material; + + PBRT_DBG("Randomhit raygen ray.o %f %f %f ray.d %f %f %f\n", ray.o.x, ray.o.y, + ray.o.z, ray.d.x, ray.d.y, ray.d.z); + + int depth = 0; + while (LengthSquared(ray.d) > 0 && ++depth < 100) { + uint32_t missed = 0; + hiprtHit hit = + Trace(params.traversable, ray, 1.f /* tMax */, missed, &ray); + + if (!missed) { + if (params.hgRecords[recordIndex(hit)].type == HitgroupRecord::TriangleMesh) + closesthitRandomHitTriangle(ray, hit, &payload); + else if (params.hgRecords[recordIndex(hit)].type == + HitgroupRecord::BilinearMesh) + closesthitRandomHitBilinearPatch(ray, hit, &payload); + else if (params.hgRecords[recordIndex(hit)].type == HitgroupRecord::Quadric) + closesthitRandomHitQuadric(ray, hit, &payload); + else + CHECK(!"unexpected primitive type"); + } + + if (payload.intr) { + ray = payload.intr->SpawnRayTo(s.p1); + payload.intr.reset(); + } else + break; + } + + if (payload.wrs.HasSample() && + payload.wrs.WeightSum() > 0) { // TODO: latter check shouldn't be needed... + const SubsurfaceInteraction &si = payload.wrs.GetSample(); + + params.subsurfaceScatterQueue->reservoirPDF[index] = + payload.wrs.SampleProbability(); + params.subsurfaceScatterQueue->ssi[index] = payload.wrs.GetSample(); + } else + params.subsurfaceScatterQueue->reservoirPDF[index] = 0; +} diff --git a/src/pbrt/gpu/memory.cpp b/src/pbrt/gpu/memory.cpp index 0cec0a9ab..da3601884 100644 --- a/src/pbrt/gpu/memory.cpp +++ b/src/pbrt/gpu/memory.cpp @@ -8,8 +8,12 @@ #include #include +#if defined(__HIPCC__) +#include +#else #include #include +#endif namespace pbrt { diff --git a/src/pbrt/gpu/util.cpp b/src/pbrt/gpu/util.cpp index aa89707c7..b0a631d8e 100644 --- a/src/pbrt/gpu/util.cpp +++ b/src/pbrt/gpu/util.cpp @@ -76,21 +76,27 @@ void GPUInit() { #endif CUDA_CHECK(cudaSetDevice(device)); +// there was a bug in HIP stating unsupported while actually supported +// not sure whether has been fixed +#ifdef __NVCC__ int hasUnifiedAddressing; CUDA_CHECK(cudaDeviceGetAttribute(&hasUnifiedAddressing, cudaDevAttrUnifiedAddressing, device)); if (!hasUnifiedAddressing) LOG_FATAL("The selected GPU device (%d) does not support unified addressing.", device); +#endif CUDA_CHECK(cudaDeviceSetLimit(cudaLimitStackSize, 8192)); size_t stackSize; CUDA_CHECK(cudaDeviceGetLimit(&stackSize, cudaLimitStackSize)); LOG_VERBOSE("Reset stack size to %d", stackSize); +#ifdef __NVCC__ CUDA_CHECK(cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 32 * 1024 * 1024)); CUDA_CHECK(cudaDeviceSetCacheConfig(cudaFuncCachePreferL1)); +#endif #ifdef NVTX #ifdef PBRT_IS_WINDOWS diff --git a/src/pbrt/gpu/util.h b/src/pbrt/gpu/util.h index 72b8cc4cc..98f782aa6 100644 --- a/src/pbrt/gpu/util.h +++ b/src/pbrt/gpu/util.h @@ -18,8 +18,12 @@ #include #include +#if defined(__HIPCC__) +#include +#else #include #include +#endif #ifdef NVTX #ifdef UNICODE @@ -38,6 +42,7 @@ LOG_FATAL("CUDA error: %s", cudaGetErrorString(error)); \ } else /* eat semicolon */ +#ifdef __NVCC__ // only used in denoiser.cpp #define CU_CHECK(EXPR) \ do { \ CUresult result = EXPR; \ @@ -47,6 +52,7 @@ LOG_FATAL("CUDA error: %s", str); \ } \ } while (false) /* eat semicolon */ +#endif namespace pbrt { @@ -64,15 +70,20 @@ inline int GetBlockSize(const char *description, F kernel) { return iter->second; int minGridSize, blockSize; +// this API is not reliable in HIP sometimes returning even negative values +#ifdef __HIPCC__ + blockSize = 64; +#else CUDA_CHECK( cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, kernel, 0, 0)); +#endif kernelBlockSizes[index] = blockSize; LOG_VERBOSE("[%s]: block size %d", description, blockSize); return blockSize; } -#ifdef __NVCC__ +#if defined(__NVCC__) || defined(__HIPCC__) template __global__ void Kernel(F func, int nItems) { int tid = blockIdx.x * blockDim.x + threadIdx.x; @@ -113,7 +124,7 @@ void GPUParallelFor(const char *description, int nItems, F func) { #endif } -#endif // __NVCC__ +#endif // __NVCC__ || __HIPCC__ // GPU Synchronization Function Declarations void GPUWait(); diff --git a/src/pbrt/media.h b/src/pbrt/media.h index 8a5677a74..d31d02aa2 100644 --- a/src/pbrt/media.h +++ b/src/pbrt/media.h @@ -24,9 +24,6 @@ #include #include #include -#if defined(PBRT_BUILD_GPU_RENDERER) && defined(__NVCC__) -#include -#endif // PBRT_BUILD_GPU_RENDERER #include #include diff --git a/src/pbrt/options.h b/src/pbrt/options.h index 56f8e67be..397ef9710 100644 --- a/src/pbrt/options.h +++ b/src/pbrt/options.h @@ -62,9 +62,9 @@ struct PBRTOptions : BasicPBRTOptions { extern PBRTOptions *Options; #if defined(PBRT_BUILD_GPU_RENDERER) -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIPCC__) extern __constant__ BasicPBRTOptions OptionsGPU; -#endif // __CUDACC__ +#endif // __CUDACC__ || __HIPCC__ void CopyOptionsToGPU(); #endif // PBRT_BUILD_GPU_RENDERER diff --git a/src/pbrt/pbrt.h b/src/pbrt/pbrt.h index 8207a4800..e52297032 100644 --- a/src/pbrt/pbrt.h +++ b/src/pbrt/pbrt.h @@ -18,11 +18,11 @@ #endif // PBRT_IS_WINDOWS // GPU Macro Definitions -#if defined(__CUDA_ARCH__) +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) #define PBRT_IS_GPU_CODE #endif -#if defined(PBRT_BUILD_GPU_RENDERER) && defined(__CUDACC__) +#if defined(PBRT_BUILD_GPU_RENDERER) && (defined(__CUDACC__) || defined(__HIPCC__)) #ifndef PBRT_NOINLINE #define PBRT_NOINLINE __attribute__((noinline)) #endif diff --git a/src/pbrt/shapes.h b/src/pbrt/shapes.h index d775b6469..f74df4ba9 100644 --- a/src/pbrt/shapes.h +++ b/src/pbrt/shapes.h @@ -812,7 +812,7 @@ inline Cylinder::Cylinder(const Transform *renderFromObject, phiMax(Radians(Clamp(phiMax, 0, 360))) {} // Triangle Declarations -#if defined(PBRT_BUILD_GPU_RENDERER) && defined(__CUDACC__) +#if defined(PBRT_BUILD_GPU_RENDERER) && (defined(__HIPCC__) || defined(__CUDACC__)) extern PBRT_GPU pstd::vector *allTriangleMeshesGPU; #endif @@ -1264,7 +1264,7 @@ class Curve { }; // BilinearPatch Declarations -#if defined(PBRT_BUILD_GPU_RENDERER) && defined(__CUDACC__) +#if defined(PBRT_BUILD_GPU_RENDERER) && (defined(__HIPCC__) || defined(__CUDACC__)) extern PBRT_GPU pstd::vector *allBilinearMeshesGPU; #endif diff --git a/src/pbrt/textures.cpp b/src/pbrt/textures.cpp index 48a0aeb2b..d21c72f28 100644 --- a/src/pbrt/textures.cpp +++ b/src/pbrt/textures.cpp @@ -180,7 +180,7 @@ std::string SpectrumBilerpTexture::ToString() const { } // CheckerboardTexture Function Definitions -Float Checkerboard(TextureEvalContext ctx, TextureMapping2D map2D, +PBRT_CPU_GPU Float Checkerboard(TextureEvalContext ctx, TextureMapping2D map2D, TextureMapping3D map3D) { // Define 1D checkerboard filtered integral functions auto d = [](Float x) { @@ -285,7 +285,7 @@ std::string SpectrumCheckerboardTexture::ToString() const { } // InsidePolkaDot Function Definition -bool InsidePolkaDot(Point2f st) { +PBRT_CPU_GPU bool InsidePolkaDot(Point2f st) { // Compute cell indices (_sCell_,_tCell_ for dots int sCell = pstd::floor(st[0] + .5f), tCell = pstd::floor(st[1] + .5f); @@ -356,10 +356,10 @@ std::string FBmTexture::ToString() const { } // SpectrumImageTexture Method Definitions -SampledSpectrum SpectrumImageTexture::Evaluate(TextureEvalContext ctx, +PBRT_CPU_GPU SampledSpectrum SpectrumImageTexture::Evaluate(TextureEvalContext ctx, SampledWavelengths lambda) const { #ifdef PBRT_IS_GPU_CODE - assert(!"Should not be called in GPU code"); + CHECK(!"Should not be called in GPU code"); return SampledSpectrum(0); #else // Apply texture mapping and flip $t$ coordinate for image texture lookup @@ -477,7 +477,7 @@ SpectrumImageTexture *SpectrumImageTexture::Create( } // MarbleTexture Method Definitions -SampledSpectrum MarbleTexture::Evaluate(TextureEvalContext ctx, +PBRT_CPU_GPU SampledSpectrum MarbleTexture::Evaluate(TextureEvalContext ctx, SampledWavelengths lambda) const { TexCoord3D c = mapping.Map(ctx); c.p *= scale; @@ -582,7 +582,6 @@ SpectrumDirectionMixTexture *SpectrumDirectionMixTexture::Create( parameters.GetSpectrumTexture("tex2", one, spectrumType, alloc), dir); } -static std::mutex ptexMutex; static Ptex::PtexCache *cache; STAT_COUNTER("Texture/Ptex lookups", nLookups); @@ -601,7 +600,8 @@ struct : public PtexErrorHandler { PtexTextureBase::PtexTextureBase(const std::string &filename, ColorEncoding encoding, Float scale) : filename(filename), encoding(encoding), scale(scale) { - ptexMutex.lock(); + std::mutex mutex; + mutex.lock(); if (!cache) { int maxFiles = 100; size_t maxMem = 1ull << 32; // 4GB @@ -611,7 +611,7 @@ PtexTextureBase::PtexTextureBase(const std::string &filename, ColorEncoding enco &errorHandler); // TODO? cache->setSearchPath(...); } - ptexMutex.unlock(); + mutex.unlock(); // Issue an error if the texture doesn't exist or has an unsupported // number of channels. @@ -695,7 +695,7 @@ std::string SpectrumPtexTexture::ToString() const { return StringPrintf("[ SpectrumPtexTexture %s ]", BaseToString()); } -Float FloatPtexTexture::Evaluate(TextureEvalContext ctx) const { +PBRT_CPU_GPU Float FloatPtexTexture::Evaluate(TextureEvalContext ctx) const { #ifdef PBRT_IS_GPU_CODE LOG_FATAL("Ptex not supported with GPU renderer"); return 0; @@ -709,7 +709,7 @@ Float FloatPtexTexture::Evaluate(TextureEvalContext ctx) const { #endif } -SampledSpectrum SpectrumPtexTexture::Evaluate(TextureEvalContext ctx, +PBRT_CPU_GPU SampledSpectrum SpectrumPtexTexture::Evaluate(TextureEvalContext ctx, SampledWavelengths lambda) const { #ifdef PBRT_IS_GPU_CODE LOG_FATAL("Ptex not supported with GPU renderer"); @@ -989,15 +989,23 @@ WrinkledTexture *WrinkledTexture::Create(const Transform &renderFromTexture, #if defined(PBRT_BUILD_GPU_RENDERER) +// mipmaps were not fully supported in HIP +// we use just the base resolution for now +#ifdef __HIPCC__ +using TextureArray = cudaArray_t; +#else +using TextureArray = cudaMipmappedArray_t; +#endif + struct LuminanceTextureCacheItem { - cudaMipmappedArray_t mipArray; + TextureArray texArray; cudaTextureReadMode readMode; int nMIPMapLevels; bool originallySingleChannel; }; struct RGBTextureCacheItem { - cudaMipmappedArray_t mipArray; + TextureArray texArray; cudaTextureReadMode readMode; int nMIPMapLevels; const RGBColorSpace *colorSpace; @@ -1009,10 +1017,10 @@ static std::map rgbTextureCache; STAT_MEMORY_COUNTER("Memory/ImageTextures", gpuImageTextureBytes); -static cudaMipmappedArray_t createSingleChannelTextureArray( +static TextureArray createSingleChannelTextureArray( const Image &image, const RGBColorSpace *colorSpace, int *nMIPMapLevels) { CHECK_EQ(1, image.NChannels()); - cudaMipmappedArray_t mipArray; + TextureArray texArray; cudaChannelFormatDesc channelDesc; switch (image.Format()) { @@ -1036,13 +1044,37 @@ static cudaMipmappedArray_t createSingleChannelTextureArray( const Image &baseImage = mipmap.GetLevel(0); cudaExtent extent = make_cudaExtent(baseImage.Resolution().x, baseImage.Resolution().y, 0); - CUDA_CHECK(cudaMallocMipmappedArray(&mipArray, &channelDesc, extent, mipmap.Levels(), + +#ifdef __HIPCC__ + int pitch; + switch (image.Format()) { + case PixelFormat::U256: + pitch = baseImage.Resolution().x * sizeof(uint8_t); + break; + case PixelFormat::Half: + pitch = baseImage.Resolution().x * sizeof(Half); + break; + case PixelFormat::Float: + pitch = baseImage.Resolution().x * sizeof(float); + break; + default: + LOG_FATAL("Unhandled PixelFormat"); + } + + gpuImageTextureBytes += pitch * baseImage.Resolution().y; + + CUDA_CHECK(cudaMallocArray(&texArray, &channelDesc, extent.width, extent.height)); + CUDA_CHECK(cudaMemcpy2DToArray(texArray, /* offset */ 0, 0, + baseImage.RawPointer({0, 0}), pitch, pitch, + baseImage.Resolution().y, cudaMemcpyHostToDevice)); +#else + CUDA_CHECK(cudaMallocMipmappedArray(&texArray, &channelDesc, extent, mipmap.Levels(), 0 /* flags */)); for (int level = 0; level < mipmap.Levels(); ++level) { const Image &levelImage = mipmap.GetLevel(level); cudaArray_t levelArray; - CUDA_CHECK(cudaGetMipmappedArrayLevel(&levelArray, mipArray, level)); + CUDA_CHECK(cudaGetMipmappedArrayLevel(&levelArray, texArray, level)); int pitch; switch (image.Format()) { @@ -1065,8 +1097,9 @@ static cudaMipmappedArray_t createSingleChannelTextureArray( levelArray, /* offset */ 0, 0, levelImage.RawPointer({0, 0}), pitch, pitch, levelImage.Resolution().y, cudaMemcpyHostToDevice)); } +#endif - return mipArray; + return texArray; } static cudaTextureAddressMode convertAddressMode(const std::string &mode) { @@ -1107,7 +1140,7 @@ GPUSpectrumImageTexture *GPUSpectrumImageTexture::Create( ColorEncoding encoding = ColorEncoding::Get(encodingString, alloc); // These have to be initialized one way or another in the below - cudaMipmappedArray_t mipArray; + TextureArray texArray; int nMIPMapLevels = 0; cudaTextureReadMode readMode; const RGBColorSpace *colorSpace = nullptr; @@ -1117,7 +1150,7 @@ GPUSpectrumImageTexture *GPUSpectrumImageTexture::Create( auto rgbIter = rgbTextureCache.find(filename); if (rgbIter != rgbTextureCache.end()) { LOG_VERBOSE("Found %s in RGB tex array cache!", filename); - mipArray = rgbIter->second.mipArray; + texArray = rgbIter->second.texArray; readMode = rgbIter->second.readMode; nMIPMapLevels = rgbIter->second.nMIPMapLevels; colorSpace = rgbIter->second.colorSpace; @@ -1128,7 +1161,7 @@ GPUSpectrumImageTexture *GPUSpectrumImageTexture::Create( // GPUFloatImageTexture converted it to single channel if (lumIter != lumTextureCache.end() && lumIter->second.originallySingleChannel) { LOG_VERBOSE("Found %s in luminance tex array cache!", filename); - mipArray = lumIter->second.mipArray; + texArray = lumIter->second.texArray; readMode = lumIter->second.readMode; nMIPMapLevels = lumIter->second.nMIPMapLevels; colorSpace = RGBColorSpace::sRGB; @@ -1162,14 +1195,35 @@ GPUSpectrumImageTexture *GPUSpectrumImageTexture::Create( cudaExtent extent = make_cudaExtent(baseImage.Resolution().x, baseImage.Resolution().y, 0); - CUDA_CHECK(cudaMallocMipmappedArray(&mipArray, &channelDesc, +#ifdef __HIPCC__ + std::vector rgba(4 * baseImage.Resolution().x * + baseImage.Resolution().y); + size_t offset = 0; + for (int y = 0; y < baseImage.Resolution().y; ++y) + for (int x = 0; x < baseImage.Resolution().x; ++x) { + for (int c = 0; c < 3; ++c) + rgba[offset++] = + ((uint8_t *)baseImage.RawPointer({x, y}))[c]; + rgba[offset++] = 255; + } + + int pitch = baseImage.Resolution().x * 4 * sizeof(uint8_t); + gpuImageTextureBytes += pitch * baseImage.Resolution().y; + + CUDA_CHECK(cudaMallocArray(&texArray, &channelDesc, extent.width, + extent.height)); + CUDA_CHECK(cudaMemcpy2DToArray( + texArray, /* offset */ 0, 0, rgba.data(), pitch, pitch, + baseImage.Resolution().y, cudaMemcpyHostToDevice)); +#else + CUDA_CHECK(cudaMallocMipmappedArray(&texArray, &channelDesc, extent, mipmap.Levels(), 0 /* flags */)); for (int level = 0; level < mipmap.Levels(); ++level) { const Image &levelImage = mipmap.GetLevel(level); cudaArray_t levelArray; CUDA_CHECK( - cudaGetMipmappedArrayLevel(&levelArray, mipArray, level)); + cudaGetMipmappedArrayLevel(&levelArray, texArray, level)); std::vector rgba(4 * levelImage.Resolution().x * levelImage.Resolution().y); @@ -1190,6 +1244,7 @@ GPUSpectrumImageTexture *GPUSpectrumImageTexture::Create( /* offset */ 0, 0, rgba.data(), pitch, pitch, levelImage.Resolution().y, cudaMemcpyHostToDevice)); } +#endif break; } case PixelFormat::Half: { @@ -1198,7 +1253,28 @@ GPUSpectrumImageTexture *GPUSpectrumImageTexture::Create( cudaExtent extent = make_cudaExtent(baseImage.Resolution().x, baseImage.Resolution().y, 0); - CUDA_CHECK(cudaMallocMipmappedArray(&mipArray, &channelDesc, +#ifdef __HIPCC__ + std::vector rgba(4 * baseImage.Resolution().x * + baseImage.Resolution().y); + size_t offset = 0; + for (int y = 0; y < baseImage.Resolution().y; ++y) + for (int x = 0; x < baseImage.Resolution().x; ++x) { + for (int c = 0; c < 3; ++c) + rgba[offset++] = + Half(baseImage.GetChannel({x, y}, c)); + rgba[offset++] = Half(1.f); + } + + int pitch = baseImage.Resolution().x * 4 * sizeof(Half); + gpuImageTextureBytes += pitch * baseImage.Resolution().y; + + CUDA_CHECK(cudaMallocArray(&texArray, &channelDesc, extent.width, + extent.height)); + CUDA_CHECK(cudaMemcpy2DToArray( + texArray, /* offset */ 0, 0, rgba.data(), pitch, pitch, + baseImage.Resolution().y, cudaMemcpyHostToDevice)); +#else + CUDA_CHECK(cudaMallocMipmappedArray(&texArray, &channelDesc, extent, mipmap.Levels(), 0 /* flags */)); @@ -1206,7 +1282,7 @@ GPUSpectrumImageTexture *GPUSpectrumImageTexture::Create( const Image &levelImage = mipmap.GetLevel(level); cudaArray_t levelArray; CUDA_CHECK( - cudaGetMipmappedArrayLevel(&levelArray, mipArray, level)); + cudaGetMipmappedArrayLevel(&levelArray, texArray, level)); std::vector rgba(4 * levelImage.Resolution().x * levelImage.Resolution().y); @@ -1228,6 +1304,7 @@ GPUSpectrumImageTexture *GPUSpectrumImageTexture::Create( /* offset */ 0, 0, rgba.data(), pitch, pitch, levelImage.Resolution().y, cudaMemcpyHostToDevice)); } +#endif break; } case PixelFormat::Float: { @@ -1236,7 +1313,28 @@ GPUSpectrumImageTexture *GPUSpectrumImageTexture::Create( cudaExtent extent = make_cudaExtent(baseImage.Resolution().x, baseImage.Resolution().y, 0); - CUDA_CHECK(cudaMallocMipmappedArray(&mipArray, &channelDesc, +#ifdef __HIPCC__ + std::vector rgba(4 * baseImage.Resolution().x * + baseImage.Resolution().y); + + size_t offset = 0; + for (int y = 0; y < baseImage.Resolution().y; ++y) + for (int x = 0; x < baseImage.Resolution().x; ++x) { + for (int c = 0; c < 3; ++c) + rgba[offset++] = baseImage.GetChannel({x, y}, c); + rgba[offset++] = 1.f; + } + + int pitch = baseImage.Resolution().x * 4 * sizeof(float); + gpuImageTextureBytes += pitch * baseImage.Resolution().y; + + CUDA_CHECK(cudaMallocArray(&texArray, &channelDesc, extent.width, + extent.height)); + CUDA_CHECK(cudaMemcpy2DToArray( + texArray, /* offset */ 0, 0, rgba.data(), pitch, pitch, + baseImage.Resolution().y, cudaMemcpyHostToDevice)); +#else + CUDA_CHECK(cudaMallocMipmappedArray(&texArray, &channelDesc, extent, mipmap.Levels(), 0 /* flags */)); @@ -1244,7 +1342,7 @@ GPUSpectrumImageTexture *GPUSpectrumImageTexture::Create( const Image &levelImage = mipmap.GetLevel(level); cudaArray_t levelArray; CUDA_CHECK( - cudaGetMipmappedArrayLevel(&levelArray, mipArray, level)); + cudaGetMipmappedArrayLevel(&levelArray, texArray, level)); std::vector rgba(4 * levelImage.Resolution().x * levelImage.Resolution().y); @@ -1265,6 +1363,7 @@ GPUSpectrumImageTexture *GPUSpectrumImageTexture::Create( /* offset */ 0, 0, rgba.data(), pitch, pitch, levelImage.Resolution().y, cudaMemcpyHostToDevice)); } +#endif break; } default: @@ -1273,15 +1372,15 @@ GPUSpectrumImageTexture *GPUSpectrumImageTexture::Create( textureCacheMutex.lock(); rgbTextureCache[filename] = RGBTextureCacheItem{ - mipArray, readMode, nMIPMapLevels, colorSpace}; + texArray, readMode, nMIPMapLevels, colorSpace}; textureCacheMutex.unlock(); } else if (image.NChannels() == 1) { - mipArray = createSingleChannelTextureArray(image, colorSpace, + texArray = createSingleChannelTextureArray(image, colorSpace, &nMIPMapLevels); textureCacheMutex.lock(); lumTextureCache[filename] = LuminanceTextureCacheItem{ - mipArray, readMode, nMIPMapLevels, true}; + texArray, readMode, nMIPMapLevels, true}; textureCacheMutex.unlock(); isSingleChannel = true; } else { @@ -1293,8 +1392,13 @@ GPUSpectrumImageTexture *GPUSpectrumImageTexture::Create( } cudaResourceDesc resDesc = {}; +#ifdef __HIPCC__ + resDesc.resType = cudaResourceTypeArray; + resDesc.res.array.array = texArray; +#else resDesc.resType = cudaResourceTypeMipmappedArray; - resDesc.res.mipmap.mipmap = mipArray; + resDesc.res.mipmap.mipmap = texArray; +#endif cudaTextureDesc texDesc = {}; texDesc.addressMode[0] = convertAddressMode(wrapString); @@ -1302,6 +1406,7 @@ GPUSpectrumImageTexture *GPUSpectrumImageTexture::Create( texDesc.filterMode = filter == "point" ? cudaFilterModePoint : cudaFilterModeLinear; texDesc.readMode = readMode; texDesc.normalizedCoords = 1; +#ifndef __HIPCC__ texDesc.maxAnisotropy = Clamp(maxAniso, 1, 16); texDesc.maxMipmapLevelClamp = nMIPMapLevels - 1; texDesc.minMipmapLevelClamp = 0; @@ -1309,6 +1414,7 @@ GPUSpectrumImageTexture *GPUSpectrumImageTexture::Create( (filter == "trilinear" || filter == "ewa" || filter == "EWA") ? cudaFilterModeLinear : cudaFilterModePoint; +#endif texDesc.borderColor[0] = texDesc.borderColor[1] = texDesc.borderColor[2] = texDesc.borderColor[3] = 0.f; texDesc.sRGB = 1; @@ -1356,7 +1462,7 @@ GPUFloatImageTexture *GPUFloatImageTexture::Create( std::string encodingString = parameters.GetOneString("encoding", defaultEncoding); ColorEncoding encoding = ColorEncoding::Get(encodingString, alloc); - cudaMipmappedArray_t mipArray; + TextureArray texArray; int nMIPMapLevels = 0; cudaTextureReadMode readMode; @@ -1364,7 +1470,7 @@ GPUFloatImageTexture *GPUFloatImageTexture::Create( auto iter = lumTextureCache.find(filename); if (iter != lumTextureCache.end()) { LOG_VERBOSE("Found %s in luminance tex array cache!", filename); - mipArray = iter->second.mipArray; + texArray = iter->second.texArray; readMode = iter->second.readMode; nMIPMapLevels = iter->second.nMIPMapLevels; textureCacheMutex.unlock(); @@ -1407,19 +1513,24 @@ GPUFloatImageTexture *GPUFloatImageTexture::Create( image.NChannels()); } - mipArray = createSingleChannelTextureArray(image, colorSpace, &nMIPMapLevels); + texArray = createSingleChannelTextureArray(image, colorSpace, &nMIPMapLevels); readMode = (image.Format() == PixelFormat::U256) ? cudaReadModeNormalizedFloat : cudaReadModeElementType; textureCacheMutex.lock(); lumTextureCache[filename] = - LuminanceTextureCacheItem{mipArray, readMode, nMIPMapLevels, !convertedImage}; + LuminanceTextureCacheItem{texArray, readMode, nMIPMapLevels, !convertedImage}; textureCacheMutex.unlock(); } cudaResourceDesc resDesc = {}; +#ifdef __HIPCC__ + resDesc.resType = cudaResourceTypeArray; + resDesc.res.array.array = texArray; +#else resDesc.resType = cudaResourceTypeMipmappedArray; - resDesc.res.mipmap.mipmap = mipArray; + resDesc.res.mipmap.mipmap = texArray; +#endif cudaTextureDesc texDesc = {}; texDesc.addressMode[0] = convertAddressMode(wrapString); @@ -1427,6 +1538,7 @@ GPUFloatImageTexture *GPUFloatImageTexture::Create( texDesc.filterMode = filter == "point" ? cudaFilterModePoint : cudaFilterModeLinear; texDesc.readMode = readMode; texDesc.normalizedCoords = 1; +#ifndef __HIPCC__ texDesc.maxAnisotropy = Clamp(maxAniso, 1, 16); texDesc.maxMipmapLevelClamp = nMIPMapLevels - 1; texDesc.minMipmapLevelClamp = 0; @@ -1434,6 +1546,7 @@ GPUFloatImageTexture *GPUFloatImageTexture::Create( (filter == "trilinear" || filter == "ewa" || filter == "EWA") ? cudaFilterModeLinear : cudaFilterModePoint; +#endif texDesc.borderColor[0] = texDesc.borderColor[1] = texDesc.borderColor[2] = texDesc.borderColor[3] = 0.f; texDesc.sRGB = 1; @@ -1562,11 +1675,11 @@ SpectrumTexture SpectrumTexture::Create(const std::string &name, } // UniversalTextureEvaluator Method Definitions -Float UniversalTextureEvaluator::operator()(FloatTexture tex, TextureEvalContext ctx) { +PBRT_CPU_GPU Float UniversalTextureEvaluator::operator()(FloatTexture tex, TextureEvalContext ctx) { return tex.Evaluate(ctx); } -SampledSpectrum UniversalTextureEvaluator::operator()(SpectrumTexture tex, +PBRT_CPU_GPU SampledSpectrum UniversalTextureEvaluator::operator()(SpectrumTexture tex, TextureEvalContext ctx, SampledWavelengths lambda) { return tex.Evaluate(ctx, lambda); diff --git a/src/pbrt/textures.h b/src/pbrt/textures.h index 4a074e967..14b0bb433 100644 --- a/src/pbrt/textures.h +++ b/src/pbrt/textures.h @@ -24,6 +24,10 @@ #include #include +#if defined(__HIPCC__) +#include +#endif + namespace pbrt { // TextureEvalContext Definition @@ -220,7 +224,7 @@ class TextureMapping2D : public TaggedPointerMap(ctx); }; return Dispatch(map); } @@ -260,7 +264,7 @@ class TextureMapping3D : public TaggedPointer { TexCoord3D Map(TextureEvalContext ctx) const; }; -inline TexCoord3D TextureMapping3D::Map(TextureEvalContext ctx) const { +PBRT_CPU_GPU inline TexCoord3D TextureMapping3D::Map(TextureEvalContext ctx) const { auto map = [&](auto ptr) { return ptr->Map(ctx); }; return Dispatch(map); } @@ -578,7 +582,7 @@ class FloatImageTexture : public ImageTextureBase { PBRT_CPU_GPU Float Evaluate(TextureEvalContext ctx) const { #ifdef PBRT_IS_GPU_CODE - assert(!"Should not be called in GPU code"); + CHECK(!"Should not be called in GPU code"); return 0; #else TexCoord2D c = mapping.Map(ctx); @@ -624,7 +628,7 @@ class SpectrumImageTexture : public ImageTextureBase { SpectrumType spectrumType; }; -#if defined(PBRT_BUILD_GPU_RENDERER) && defined(__NVCC__) +#if defined(PBRT_BUILD_GPU_RENDERER) && (defined(__NVCC__) || defined(__HIPCC__)) class GPUSpectrumImageTexture { public: GPUSpectrumImageTexture(std::string filename, TextureMapping2D mapping, @@ -732,7 +736,7 @@ class GPUFloatImageTexture { bool invert; }; -#else // PBRT_BUILD_GPU_RENDERER && __NVCC__ +#else // PBRT_BUILD_GPU_RENDERER && (__NVCC__ || __HIPCC__) class GPUSpectrumImageTexture { public: @@ -769,7 +773,7 @@ class GPUFloatImageTexture { std::string ToString() const { return "GPUFloatImageTexture"; } }; -#endif // PBRT_BUILD_GPU_RENDERER && __NVCC__ +#endif // PBRT_BUILD_GPU_RENDERER && (__NVCC__ || __HIPCC__) // MarbleTexture Definition class MarbleTexture { @@ -1125,13 +1129,13 @@ class WrinkledTexture { Float omega; }; -inline Float FloatTexture::Evaluate(TextureEvalContext ctx) const { +PBRT_CPU_GPU inline Float FloatTexture::Evaluate(TextureEvalContext ctx) const { auto eval = [&](auto ptr) { return ptr->Evaluate(ctx); }; return Dispatch(eval); } -inline SampledSpectrum SpectrumTexture::Evaluate(TextureEvalContext ctx, - SampledWavelengths lambda) const { +PBRT_CPU_GPU inline SampledSpectrum SpectrumTexture::Evaluate( + TextureEvalContext ctx, SampledWavelengths lambda) const { auto eval = [&](auto ptr) { return ptr->Evaluate(ctx, lambda); }; return Dispatch(eval); } diff --git a/src/pbrt/util/check.h b/src/pbrt/util/check.h index e2b513a2e..2128504b0 100644 --- a/src/pbrt/util/check.h +++ b/src/pbrt/util/check.h @@ -18,10 +18,19 @@ namespace pbrt { void PrintStackTrace(); +#define EMPTY_CHECK \ + do { \ + } while (false) /* swallow semicolon */ + #ifdef PBRT_IS_GPU_CODE +#ifdef __HIP_DEVICE_COMPILE__ +#define CHECK(x) EMPTY_CHECK +#define CHECK_IMPL(a, b, op) EMPTY_CHECK +#else #define CHECK(x) assert(x) #define CHECK_IMPL(a, b, op) assert((a)op(b)) +#endif #define CHECK_EQ(a, b) CHECK_IMPL(a, b, ==) #define CHECK_NE(a, b) CHECK_IMPL(a, b, !=) @@ -56,7 +65,7 @@ void PrintStackTrace(); #ifdef PBRT_DEBUG_BUILD -#define DCHECK(x) (CHECK(x)) +#define DCHECK(x) CHECK(x) #define DCHECK_EQ(a, b) CHECK_EQ(a, b) #define DCHECK_NE(a, b) CHECK_NE(a, b) #define DCHECK_GT(a, b) CHECK_GT(a, b) @@ -66,10 +75,6 @@ void PrintStackTrace(); #else -#define EMPTY_CHECK \ - do { \ - } while (false) /* swallow semicolon */ - // Use an empty check (rather than expanding the macros to nothing) to swallow the // semicolon at the end, and avoid empty if-statements. #define DCHECK(x) EMPTY_CHECK diff --git a/src/pbrt/util/color.h b/src/pbrt/util/color.h index c04536901..5d10c9911 100644 --- a/src/pbrt/util/color.h +++ b/src/pbrt/util/color.h @@ -476,18 +476,18 @@ class GammaColorEncoding { pstd::array inverseLUT; }; -inline void ColorEncoding::ToLinear(pstd::span vin, +PBRT_CPU_GPU inline void ColorEncoding::ToLinear(pstd::span vin, pstd::span vout) const { auto tolin = [&](auto ptr) { return ptr->ToLinear(vin, vout); }; Dispatch(tolin); } -inline Float ColorEncoding::ToFloatLinear(Float v) const { +PBRT_CPU_GPU inline Float ColorEncoding::ToFloatLinear(Float v) const { auto tfl = [&](auto ptr) { return ptr->ToFloatLinear(v); }; return Dispatch(tfl); } -inline void ColorEncoding::FromLinear(pstd::span vin, +PBRT_CPU_GPU inline void ColorEncoding::FromLinear(pstd::span vin, pstd::span vout) const { auto fl = [&](auto ptr) { return ptr->FromLinear(vin, vout); }; Dispatch(fl); diff --git a/src/pbrt/util/colorspace.cpp b/src/pbrt/util/colorspace.cpp index aa6d5e5db..27994690a 100644 --- a/src/pbrt/util/colorspace.cpp +++ b/src/pbrt/util/colorspace.cpp @@ -3,19 +3,15 @@ // SPDX: Apache-2.0 #include - -#ifdef PBRT_BUILD_GPU_RENDERER -#include -#endif #include namespace pbrt { #ifdef PBRT_BUILD_GPU_RENDERER -PBRT_CONST RGBColorSpace *RGBColorSpace_sRGB; -PBRT_CONST RGBColorSpace *RGBColorSpace_DCI_P3; -PBRT_CONST RGBColorSpace *RGBColorSpace_Rec2020; -PBRT_CONST RGBColorSpace *RGBColorSpace_ACES2065_1; +PBRT_GPU __constant__ RGBColorSpace *RGBColorSpace_sRGB; +PBRT_GPU __constant__ RGBColorSpace *RGBColorSpace_DCI_P3; +PBRT_GPU __constant__ RGBColorSpace *RGBColorSpace_Rec2020; +PBRT_GPU __constant__ RGBColorSpace *RGBColorSpace_ACES2065_1; #endif // RGBColorSpace Method Definitions @@ -40,7 +36,7 @@ SquareMatrix<3> ConvertRGBColorSpace(const RGBColorSpace &from, const RGBColorSp return to.RGBFromXYZ * from.XYZFromRGB; } -RGBSigmoidPolynomial RGBColorSpace::ToRGBCoeffs(RGB rgb) const { +PBRT_CPU_GPU RGBSigmoidPolynomial RGBColorSpace::ToRGBCoeffs(RGB rgb) const { DCHECK(rgb.r >= 0 && rgb.g >= 0 && rgb.b >= 0); return (*rgbToSpectrumTable)(ClampZero(rgb)); } diff --git a/src/pbrt/util/colorspace.h b/src/pbrt/util/colorspace.h index 4fc026a34..c015a8f28 100644 --- a/src/pbrt/util/colorspace.h +++ b/src/pbrt/util/colorspace.h @@ -11,6 +11,9 @@ #include #include #include +#ifdef PBRT_BUILD_GPU_RENDERER +#include +#endif #include @@ -66,10 +69,10 @@ class RGBColorSpace { }; #ifdef PBRT_BUILD_GPU_RENDERER -extern PBRT_CONST RGBColorSpace *RGBColorSpace_sRGB; -extern PBRT_CONST RGBColorSpace *RGBColorSpace_DCI_P3; -extern PBRT_CONST RGBColorSpace *RGBColorSpace_Rec2020; -extern PBRT_CONST RGBColorSpace *RGBColorSpace_ACES2065_1; +extern PBRT_GPU __constant__ RGBColorSpace *RGBColorSpace_sRGB; +extern PBRT_GPU __constant__ RGBColorSpace *RGBColorSpace_DCI_P3; +extern PBRT_GPU __constant__ RGBColorSpace *RGBColorSpace_Rec2020; +extern PBRT_GPU __constant__ RGBColorSpace *RGBColorSpace_ACES2065_1; #endif SquareMatrix<3> ConvertRGBColorSpace(const RGBColorSpace &from, const RGBColorSpace &to); diff --git a/src/pbrt/util/float.h b/src/pbrt/util/float.h index ab9e6447d..a672331ee 100644 --- a/src/pbrt/util/float.h +++ b/src/pbrt/util/float.h @@ -16,8 +16,14 @@ #include #if defined(PBRT_BUILD_GPU_RENDERER) && defined(PBRT_IS_GPU_CODE) +#if defined(__HIPCC__) +#include +#include +#include +#else #include #endif +#endif namespace pbrt { @@ -57,7 +63,7 @@ template inline PBRT_CPU_GPU typename std::enable_if_t, bool> IsNaN( T v) { #ifdef PBRT_IS_GPU_CODE - return isnan(v); + return isnan((float)v); #else return std::isnan(v); #endif @@ -199,9 +205,9 @@ inline constexpr Float gamma(int n) { inline PBRT_CPU_GPU Float AddRoundUp(Float a, Float b) { #ifdef PBRT_IS_GPU_CODE #ifdef PBRT_FLOAT_AS_DOUBLE - return __dadd_ru(a, b); + return __dadd_rn(a, b); #else - return __fadd_ru(a, b); + return __fadd_rn(a, b); #endif #else // CPU return NextFloatUp(a + b); @@ -210,9 +216,9 @@ inline PBRT_CPU_GPU Float AddRoundUp(Float a, Float b) { inline PBRT_CPU_GPU Float AddRoundDown(Float a, Float b) { #ifdef PBRT_IS_GPU_CODE #ifdef PBRT_FLOAT_AS_DOUBLE - return __dadd_rd(a, b); + return __dadd_rn(a, b); #else - return __fadd_rd(a, b); + return __fadd_rn(a, b); #endif #else // CPU return NextFloatDown(a + b); @@ -229,9 +235,9 @@ inline PBRT_CPU_GPU Float SubRoundDown(Float a, Float b) { inline PBRT_CPU_GPU Float MulRoundUp(Float a, Float b) { #ifdef PBRT_IS_GPU_CODE #ifdef PBRT_FLOAT_AS_DOUBLE - return __dmul_ru(a, b); + return __dmul_rn(a, b); #else - return __fmul_ru(a, b); + return __fmul_rn(a, b); #endif #else // CPU return NextFloatUp(a * b); @@ -241,9 +247,9 @@ inline PBRT_CPU_GPU Float MulRoundUp(Float a, Float b) { inline PBRT_CPU_GPU Float MulRoundDown(Float a, Float b) { #ifdef PBRT_IS_GPU_CODE #ifdef PBRT_FLOAT_AS_DOUBLE - return __dmul_rd(a, b); + return __dmul_rn(a, b); #else - return __fmul_rd(a, b); + return __fmul_rn(a, b); #endif #else // CPU return NextFloatDown(a * b); @@ -253,9 +259,9 @@ inline PBRT_CPU_GPU Float MulRoundDown(Float a, Float b) { inline PBRT_CPU_GPU Float DivRoundUp(Float a, Float b) { #ifdef PBRT_IS_GPU_CODE #ifdef PBRT_FLOAT_AS_DOUBLE - return __ddiv_ru(a, b); + return __ddiv_rn(a, b); #else - return __fdiv_ru(a, b); + return __fdiv_rn(a, b); #endif #else // CPU return NextFloatUp(a / b); @@ -265,9 +271,9 @@ inline PBRT_CPU_GPU Float DivRoundUp(Float a, Float b) { inline PBRT_CPU_GPU Float DivRoundDown(Float a, Float b) { #ifdef PBRT_IS_GPU_CODE #ifdef PBRT_FLOAT_AS_DOUBLE - return __ddiv_rd(a, b); + return __ddiv_rn(a, b); #else - return __fdiv_rd(a, b); + return __fdiv_rn(a, b); #endif #else // CPU return NextFloatDown(a / b); @@ -277,9 +283,9 @@ inline PBRT_CPU_GPU Float DivRoundDown(Float a, Float b) { inline PBRT_CPU_GPU Float SqrtRoundUp(Float a) { #ifdef PBRT_IS_GPU_CODE #ifdef PBRT_FLOAT_AS_DOUBLE - return __dsqrt_ru(a); + return __dsqrt_rn(a); #else - return __fsqrt_ru(a); + return __fsqrt_rn(a); #endif #else // CPU return NextFloatUp(std::sqrt(a)); @@ -289,9 +295,9 @@ inline PBRT_CPU_GPU Float SqrtRoundUp(Float a) { inline PBRT_CPU_GPU Float SqrtRoundDown(Float a) { #ifdef PBRT_IS_GPU_CODE #ifdef PBRT_FLOAT_AS_DOUBLE - return __dsqrt_rd(a); + return __dsqrt_rn(a); #else - return __fsqrt_rd(a); + return __fsqrt_rn(a); #endif #else // CPU return std::max(0, NextFloatDown(std::sqrt(a))); @@ -301,9 +307,9 @@ inline PBRT_CPU_GPU Float SqrtRoundDown(Float a) { inline PBRT_CPU_GPU Float FMARoundUp(Float a, Float b, Float c) { #ifdef PBRT_IS_GPU_CODE #ifdef PBRT_FLOAT_AS_DOUBLE - return __fma_ru(a, b, c); // FIXME: what to do here? + return __fma_rn(a, b, c); // FIXME: what to do here? #else - return __fma_ru(a, b, c); + return __fma_rn(a, b, c); #endif #else // CPU return NextFloatUp(FMA(a, b, c)); @@ -313,9 +319,9 @@ inline PBRT_CPU_GPU Float FMARoundUp(Float a, Float b, Float c) { inline PBRT_CPU_GPU Float FMARoundDown(Float a, Float b, Float c) { #ifdef PBRT_IS_GPU_CODE #ifdef PBRT_FLOAT_AS_DOUBLE - return __fma_rd(a, b, c); // FIXME: what to do here? + return __fma_rn(a, b, c); // FIXME: what to do here? #else - return __fma_rd(a, b, c); + return __fma_rn(a, b, c); #endif #else // CPU return NextFloatDown(FMA(a, b, c)); diff --git a/src/pbrt/util/hash.h b/src/pbrt/util/hash.h index 1c53b3a7b..84a0163c0 100644 --- a/src/pbrt/util/hash.h +++ b/src/pbrt/util/hash.h @@ -27,7 +27,7 @@ PBRT_CPU_GPU inline uint64_t MurmurHash64A(const unsigned char *key, size_t len, while (key != end) { uint64_t k; - std::memcpy(&k, key, sizeof(uint64_t)); + memcpy(&k, key, sizeof(uint64_t)); key += 8; k *= m; @@ -67,7 +67,7 @@ PBRT_CPU_GPU inline uint64_t MurmurHash64A(const unsigned char *key, size_t len, // http://zimbry.blogspot.ch/2011/09/better-bit-mixing-improving-on.html PBRT_CPU_GPU inline uint64_t MixBits(uint64_t v); -inline uint64_t MixBits(uint64_t v) { +PBRT_CPU_GPU inline uint64_t MixBits(uint64_t v) { v ^= (v >> 31); v *= 0x7fb5d329728ea185; v ^= (v >> 27); diff --git a/src/pbrt/util/hip_aliases.h b/src/pbrt/util/hip_aliases.h new file mode 100644 index 000000000..fb5e01601 --- /dev/null +++ b/src/pbrt/util/hip_aliases.h @@ -0,0 +1,105 @@ +#ifndef PBRT_UTIL_HIP_ALIASES_H +#define PBRT_UTIL_HIP_ALIASES_H + +#include +#include +#include + +#define cudaError_t hipError_t +#define cudaSuccess hipSuccess +#define cudaErrorNotReady hipErrorNotReady +#define cudaGetLastError hipGetLastError +#define cudaGetErrorString hipGetErrorString + +#define cudaGetDevice hipGetDevice +#define cudaSetDevice hipSetDevice +#define cudaGetDeviceCount hipGetDeviceCount +#define cudaDeviceGetAttribute hipDeviceGetAttribute +#define cudaDeviceProp hipDeviceProp_t +#define cudaDeviceGetLimit hipDeviceGetLimit +#define cudaDeviceSetLimit hipDeviceSetLimit +#define cudaDeviceSetCacheConfig hipDeviceSetCacheConfig +#define cudaLimitStackSize hipLimitStackSize +#define cudaLimitPrintfFifoSize hipLimitPrintfFifoSize +#define cudaFuncCachePreferL1 hipFuncCachePreferL1 +#define cudaGetDeviceProperties hipGetDeviceProperties +#define cudaDevAttrKernelExecTimeout hipDeviceAttributeKernelExecTimeout +#define cudaDevAttrConcurrentManagedAccess hipDeviceAttributeConcurrentManagedAccess +#define cudaDriverGetVersion hipDriverGetVersion +#define cudaRuntimeGetVersion hipRuntimeGetVersion + +#define cudaGraphicsMapResources hipGraphicsMapResources +#define cudaGraphicsUnmapResources hipGraphicsUnmapResources +#define cudaGraphicsResourceGetMappedPointer hipGraphicsResourceGetMappedPointer +#define cudaGraphicsResource hipGraphicsResource +#define cudaGraphicsGLRegisterBuffer hipGraphicsGLRegisterBuffer +#define cudaGraphicsMapResources hipGraphicsMapResources +#define cudaGraphicsMapFlagsWriteDiscard hipGraphicsRegisterFlagsWriteDiscard + +#define cudaGLGetDevices hipGLGetDevices +#define cudaGLDeviceListAll hipGLDeviceListAll + +#define CUdeviceptr hipDeviceptr_t +#define cudaMalloc hipMalloc +#define cudaMallocHost hipHostMalloc +#define cudaMallocManaged hipMallocManaged +#define cudaFree hipFree + +#define cudaMemcpy hipMemcpy +#define cudaMemcpyAsync hipMemcpyAsync +#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost +#define cudaMemcpyHostToDevice hipMemcpyHostToDevice +#define cudaMemcpyToSymbol hipMemcpyToSymbol +#define cudaMemcpyFromSymbol hipMemcpyFromSymbol +#define cudaMemPrefetchAsync hipMemPrefetchAsync +#define cudaMemset hipMemset +#define cudaMemAdvise hipMemAdvise +#define cudaMemAdviseSetReadMostly hipMemAdviseSetReadMostly +#define cudaMemAdviseSetPreferredLocation hipMemAdviseSetPreferredLocation + +#define cudaArray_t hipArray_t +#define cudaMallocArray hipMallocArray +#define cudaMemcpy2DToArray hipMemcpy2DToArray +#define cudaMipmappedArray_t hipMipmappedArray_t +#define cudaMallocMipmappedArray hipMallocMipmappedArray +#define cudaGetMipmappedArrayLevel hipGetMipmappedArrayLevel + +#define cudaExtent hipExtent +#define make_cudaExtent make_hipExtent + +#define cudaTextureObject_t hipTextureObject_t +#define cudaCreateTextureObject hipCreateTextureObject +#define cudaChannelFormatDesc hipChannelFormatDesc +#define cudaCreateChannelDesc hipCreateChannelDesc +#define cudaChannelFormatKindUnsigned hipChannelFormatKindUnsigned +#define cudaChannelFormatKindFloat hipChannelFormatKindFloat +#define cudaFilterModePoint hipFilterModePoint +#define cudaFilterModeLinear hipFilterModeLinear +#define cudaTextureDesc hipTextureDesc +#define cudaTextureAddressMode hipTextureAddressMode +#define cudaTextureReadMode hipTextureReadMode +#define cudaAddressModeWrap hipAddressModeWrap +#define cudaAddressModeClamp hipAddressModeClamp +#define cudaAddressModeBorder hipAddressModeBorder +#define cudaReadModeNormalizedFloat hipReadModeNormalizedFloat +#define cudaReadModeElementType hipReadModeElementType +#define cudaResourceDesc hipResourceDesc +#define cudaResourceTypeArray hipResourceTypeArray +#define cudaResourceTypeMipmappedArray hipResourceTypeMipmappedArray + +#define cudaEvent_t hipEvent_t +#define cudaEventCreate hipEventCreate +#define cudaEventRecord hipEventRecord +#define cudaEventElapsedTime hipEventElapsedTime +#define cudaEventSynchronize hipEventSynchronize +#define cudaEventQuery hipEventQuery + +#define CUstream hipStream_t +#define cudaStream_t hipStream_t +#define cudaStreamCreate hipStreamCreate +#define cudaStreamSynchronize hipStreamSynchronize +#define cudaDeviceSynchronize hipDeviceSynchronize + +#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize + +#endif // PBRT_UTIL_HIP_ALIASES_H diff --git a/src/pbrt/util/image.h b/src/pbrt/util/image.h index 6771eeb9a..a54eec228 100644 --- a/src/pbrt/util/image.h +++ b/src/pbrt/util/image.h @@ -422,7 +422,7 @@ class Image { }; // Image Inline Method Definitions -inline void Image::SetChannel(Point2i p, int c, Float value) { +PBRT_CPU_GPU inline void Image::SetChannel(Point2i p, int c, Float value) { // CHECK(!IsNaN(value)); if (IsNaN(value)) { #ifndef PBRT_IS_GPU_CODE diff --git a/src/pbrt/util/log.cpp b/src/pbrt/util/log.cpp index df730c028..2064f1a07 100644 --- a/src/pbrt/util/log.cpp +++ b/src/pbrt/util/log.cpp @@ -327,7 +327,7 @@ std::string ToString(LogLevel level) { } } -void Log(LogLevel level, const char *file, int line, const char *s) { +PBRT_CPU_GPU void Log(LogLevel level, const char *file, int line, const char *s) { #ifdef PBRT_IS_GPU_CODE auto strlen = [](const char *ptr) { int len = 0; @@ -389,7 +389,7 @@ void Log(LogLevel level, const char *file, int line, const char *s) { #endif } -#ifdef __NVCC__ +#if defined(__NVCC__) || defined(__HIPCC__) // warning #1305-D: function declared with "noreturn" does return #ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ #pragma nv_diag_suppress 1305 @@ -398,7 +398,7 @@ void Log(LogLevel level, const char *file, int line, const char *s) { #endif #endif -void LogFatal(LogLevel level, const char *file, int line, const char *s) { +PBRT_CPU_GPU void LogFatal(LogLevel level, const char *file, int line, const char *s) { #ifdef PBRT_IS_GPU_CODE Log(LogLevel::Fatal, file, line, s); __threadfence(); diff --git a/src/pbrt/util/log.h b/src/pbrt/util/log.h index 0cde178d5..45b34a178 100644 --- a/src/pbrt/util/log.h +++ b/src/pbrt/util/log.h @@ -44,16 +44,17 @@ extern FILE *logFile; PBRT_CPU_GPU void Log(LogLevel level, const char *file, int line, const char *s); -PBRT_CPU_GPU [[noreturn]] void LogFatal(LogLevel level, const char *file, int line, - const char *s); +template +PBRT_CPU_GPU inline void LogFatal(LogLevel level, const char *file, int line, + const char *fmt, Args &&...args); + +#ifndef __HIPCC__ +PBRT_CPU_GPU void LogFatal(LogLevel level, const char *file, int line, const char *s); template PBRT_CPU_GPU inline void Log(LogLevel level, const char *file, int line, const char *fmt, Args &&...args); - -template -PBRT_CPU_GPU [[noreturn]] inline void LogFatal(LogLevel level, const char *file, int line, - const char *fmt, Args &&...args); +#endif #define TO_STRING(x) TO_STRING2(x) #define TO_STRING2(x) #x @@ -62,6 +63,23 @@ PBRT_CPU_GPU [[noreturn]] inline void LogFatal(LogLevel level, const char *file, extern __constant__ LogLevel LOGGING_LogLevelGPU; +// printing may cause hang in the device code +#ifdef __HIP_DEVICE_COMPILE__ + +#define LOG_VERBOSE(...) \ + do { \ + } while (false) /* swallow semicolon */ + +#define LOG_ERROR(...) \ + do { \ + } while (false) /* swallow semicolon */ + +#define LOG_FATAL(...) \ + do { \ + } while (false) /* swallow semicolon */ + +#else + #define LOG_VERBOSE(...) \ (pbrt::LogLevel::Verbose >= LOGGING_LogLevelGPU && \ (pbrt::Log(LogLevel::Verbose, __FILE__, __LINE__, __VA_ARGS__), true)) @@ -73,6 +91,8 @@ extern __constant__ LogLevel LOGGING_LogLevelGPU; #define LOG_FATAL(...) \ pbrt::LogFatal(pbrt::LogLevel::Fatal, __FILE__, __LINE__, __VA_ARGS__) +#endif + #else // Logging Macros @@ -96,9 +116,9 @@ extern __constant__ LogLevel LOGGING_LogLevelGPU; namespace pbrt { template -inline void Log(LogLevel level, const char *file, int line, const char *fmt, - Args &&...args) { -#ifdef PBRT_IS_GPU_CODE +PBRT_CPU_GPU inline void Log(LogLevel level, const char *file, int line, const char *fmt, + Args &&...args) { +#if defined(PBRT_IS_GPU_CODE) Log(level, file, line, fmt); // just the format string #yolo #else std::string s = StringPrintf(fmt, std::forward(args)...); @@ -107,9 +127,9 @@ inline void Log(LogLevel level, const char *file, int line, const char *fmt, } template -inline void LogFatal(LogLevel level, const char *file, int line, const char *fmt, - Args &&...args) { -#ifdef PBRT_IS_GPU_CODE +PBRT_CPU_GPU inline void LogFatal(LogLevel level, const char *file, int line, + const char *fmt, Args &&...args) { +#if defined(PBRT_IS_GPU_CODE) || defined(__HIPCC__) LogFatal(level, file, line, fmt); // just the format string #yolo #else std::string s = StringPrintf(fmt, std::forward(args)...); diff --git a/src/pbrt/util/math.h b/src/pbrt/util/math.h index 48f50ddc8..1568a8a42 100644 --- a/src/pbrt/util/math.h +++ b/src/pbrt/util/math.h @@ -25,7 +25,7 @@ namespace pbrt { -#ifdef PBRT_IS_GPU_CODE +#if defined(PBRT_IS_GPU_CODE) && defined(__CUDACC__) #define ShadowEpsilon 0.0001f #define Pi Float(3.14159265358979323846) diff --git a/src/pbrt/util/noise.cpp b/src/pbrt/util/noise.cpp index 77373bea3..646c26800 100644 --- a/src/pbrt/util/noise.cpp +++ b/src/pbrt/util/noise.cpp @@ -53,7 +53,7 @@ static PBRT_CONST int NoisePerm[2 * NoisePermSize] = { }; // Noise Function Definitions -Float Noise(Float x, Float y, Float z) { +PBRT_CPU_GPU Float Noise(Float x, Float y, Float z) { // Compute noise cell coordinates and offsets // Avoid overflow when computing deltas if the coordinates are too large to store in // int32s. @@ -87,11 +87,11 @@ Float Noise(Float x, Float y, Float z) { return Lerp(wz, y0, y1); } -Float Noise(Point3f p) { +PBRT_CPU_GPU Float Noise(Point3f p) { return Noise(p.x, p.y, p.z); } -inline Float Grad(int x, int y, int z, Float dx, Float dy, Float dz) { +PBRT_CPU_GPU inline Float Grad(int x, int y, int z, Float dx, Float dy, Float dz) { int h = NoisePerm[NoisePerm[NoisePerm[x] + y] + z]; h &= 15; Float u = h < 8 || h == 12 || h == 13 ? dx : dy; @@ -99,11 +99,11 @@ inline Float Grad(int x, int y, int z, Float dx, Float dy, Float dz) { return ((h & 1) ? -u : u) + ((h & 2) ? -v : v); } -inline Float NoiseWeight(Float t) { +PBRT_CPU_GPU inline Float NoiseWeight(Float t) { return 6 * Pow<5>(t) - 15 * Pow<4>(t) + 10 * Pow<3>(t); } -Vector3f DNoise(Point3f p) { +PBRT_CPU_GPU Vector3f DNoise(Point3f p) { Float delta = .01f; Float n = Noise(p); Point3f noiseDelta(Noise(p + Vector3f(delta, 0, 0)), Noise(p + Vector3f(0, delta, 0)), @@ -111,7 +111,8 @@ Vector3f DNoise(Point3f p) { return (noiseDelta - Point3f(n, n, n)) / delta; } -Float FBm(Point3f p, Vector3f dpdx, Vector3f dpdy, Float omega, int maxOctaves) { +PBRT_CPU_GPU Float FBm(Point3f p, Vector3f dpdx, Vector3f dpdy, Float omega, + int maxOctaves) { // Compute number of octaves for antialiased FBm Float len2 = std::max(LengthSquared(dpdx), LengthSquared(dpdy)); Float n = Clamp(-1 - Log2(len2) / 2, 0, maxOctaves); @@ -130,7 +131,7 @@ Float FBm(Point3f p, Vector3f dpdx, Vector3f dpdy, Float omega, int maxOctaves) return sum; } -Float Turbulence(Point3f p, Vector3f dpdx, Vector3f dpdy, Float omega, int maxOctaves) { +PBRT_CPU_GPU Float Turbulence(Point3f p, Vector3f dpdx, Vector3f dpdy, Float omega, int maxOctaves) { // Compute number of octaves for antialiased FBm Float len2 = std::max(LengthSquared(dpdx), LengthSquared(dpdy)); Float n = Clamp(-1 - Log2(len2) / 2, 0, maxOctaves); diff --git a/src/pbrt/util/parallel.h b/src/pbrt/util/parallel.h index 48bd6c00b..827d0d3f1 100644 --- a/src/pbrt/util/parallel.h +++ b/src/pbrt/util/parallel.h @@ -213,7 +213,7 @@ class AtomicDouble { void Add(double v) { #if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) atomicAdd(&value, v); -#elif defined(__CUDA_ARCH__) +#elif defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) uint64_t old = bits, assumed; do { @@ -235,7 +235,7 @@ class AtomicDouble { // AtomicDouble Private Data #if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) double value; -#elif defined(__CUDA_ARCH__) +#elif (defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)) uint64_t bits; #else std::atomic bits; diff --git a/src/pbrt/util/progressreporter.h b/src/pbrt/util/progressreporter.h index f018f1b46..a4028fa84 100644 --- a/src/pbrt/util/progressreporter.h +++ b/src/pbrt/util/progressreporter.h @@ -16,7 +16,11 @@ #include #ifdef PBRT_BUILD_GPU_RENDERER +#if defined(__HIPCC__) +#include +#else #include +#endif #include #endif diff --git a/src/pbrt/util/pstd.h b/src/pbrt/util/pstd.h index 2d6065fef..52c3e240b 100644 --- a/src/pbrt/util/pstd.h +++ b/src/pbrt/util/pstd.h @@ -60,7 +60,7 @@ class array { array() = default; PBRT_CPU_GPU - void fill(const T &v) { assert(!"should never be called"); } + void fill(const T &v) { CHECK(!"should never be called"); } PBRT_CPU_GPU bool operator==(const array &a) const { return true; } @@ -81,13 +81,13 @@ class array { PBRT_CPU_GPU T &operator[](size_t i) { - assert(!"should never be called"); + CHECK(!"should never be called"); static T t; return t; } PBRT_CPU_GPU const T &operator[](size_t i) const { - assert(!"should never be called"); + CHECK(!"should never be called"); static T t; return t; } @@ -253,7 +253,7 @@ class optional { bool has_value() const { return set; } private: -#ifdef __NVCC__ +#if defined(__NVCC__) || defined(__HIPCC__) // Work-around NVCC bug PBRT_CPU_GPU T *ptr() { return reinterpret_cast(&optionalValue); } diff --git a/src/pbrt/util/sampling.h b/src/pbrt/util/sampling.h index e5caf7831..f0f844c1f 100644 --- a/src/pbrt/util/sampling.h +++ b/src/pbrt/util/sampling.h @@ -1300,7 +1300,7 @@ class PiecewiseLinear2D { private: using FloatStorage = pstd::vector; -#if !defined(_MSC_VER) && !defined(__CUDACC__) +#if !defined(_MSC_VER) && !(defined(__CUDACC__) || defined(__HIPCC__)) static constexpr size_t ArraySize = Dimension; #else static constexpr size_t ArraySize = (Dimension != 0) ? Dimension : 1; diff --git a/src/pbrt/util/scattering.cpp b/src/pbrt/util/scattering.cpp index 0353d1a1f..5dde4610a 100644 --- a/src/pbrt/util/scattering.cpp +++ b/src/pbrt/util/scattering.cpp @@ -7,7 +7,7 @@ namespace pbrt { // BSSRDF Utility Functions -Float FresnelMoment1(Float eta) { +PBRT_CPU_GPU Float FresnelMoment1(Float eta) { Float eta2 = eta * eta, eta3 = eta2 * eta, eta4 = eta3 * eta, eta5 = eta4 * eta; if (eta < 1) return 0.45966f - 1.73965f * eta + 3.37668f * eta2 - 3.904945 * eta3 + @@ -17,7 +17,7 @@ Float FresnelMoment1(Float eta) { 1.27198f * eta4 + 0.12746f * eta5; } -Float FresnelMoment2(Float eta) { +PBRT_CPU_GPU Float FresnelMoment2(Float eta) { Float eta2 = eta * eta, eta3 = eta2 * eta, eta4 = eta3 * eta, eta5 = eta4 * eta; if (eta < 1) { return 0.27614f - 0.87350f * eta + 1.12077f * eta2 - 0.65095f * eta3 + diff --git a/src/pbrt/util/taggedptr.h b/src/pbrt/util/taggedptr.h index a430d575e..d9fb067c7 100644 --- a/src/pbrt/util/taggedptr.h +++ b/src/pbrt/util/taggedptr.h @@ -743,9 +743,14 @@ class TaggedPointer { template PBRT_CPU_GPU TaggedPointer(T *ptr) { uint64_t iptr = reinterpret_cast(ptr); - DCHECK_EQ(iptr & ptrMask, iptr); constexpr unsigned int type = TypeIndex(); +#if defined(PBRT_IS_WINDOWS) && defined(__HIPCC__) + uint64_t aptr = (iptr & tagMask) >> 5ull; + bits = (iptr & ptrMask) | aptr | ((uint64_t)type << (tagShift + 2ull)); +#else + DCHECK_EQ(iptr & ptrMask, iptr); bits = iptr | ((uint64_t)type << tagShift); +#endif } PBRT_CPU_GPU @@ -769,7 +774,13 @@ class TaggedPointer { } PBRT_CPU_GPU - unsigned int Tag() const { return ((bits & tagMask) >> tagShift); } + unsigned int Tag() const { +#if defined(PBRT_IS_WINDOWS) && defined(__HIPCC__) + return ((bits & tagMask) >> (tagShift + 2ull)); +#else + return ((bits & tagMask) >> tagShift); +#endif + } template PBRT_CPU_GPU bool Is() const { @@ -825,10 +836,42 @@ class TaggedPointer { bool operator!=(const TaggedPointer &tp) const { return bits != tp.bits; } PBRT_CPU_GPU - void *ptr() { return reinterpret_cast(bits & ptrMask); } + void *ptr() { +#if defined(PBRT_IS_WINDOWS) && defined(__HIPCC__) + unsigned int aptr = (bits >> tagShift) & 3; + uint64_t iptr = bits & ptrMask; + // lld crashes on Windows + // iptr |= (uint64_t)aptr << 60; + if (aptr == 1) + iptr |= 1ull << 60; + if (aptr == 2) + iptr |= 2ull << 60; + if (aptr == 3) + iptr |= 3ull << 60; + return reinterpret_cast(iptr); +#else + return reinterpret_cast(bits & ptrMask); +#endif + } PBRT_CPU_GPU - const void *ptr() const { return reinterpret_cast(bits & ptrMask); } + const void *ptr() const { +#if defined(PBRT_IS_WINDOWS) && defined(__HIPCC__) + unsigned int aptr = (bits >> tagShift) & 3; + uint64_t iptr = bits & ptrMask; + // lld crashes on Windows + // iptr |= (uint64_t)aptr << 60; + if (aptr == 1) + iptr |= 1ull << 60; + if (aptr == 2) + iptr |= 2ull << 60; + if (aptr == 3) + iptr |= 3ull << 60; + return reinterpret_cast(iptr); +#else + return reinterpret_cast(bits & ptrMask); +#endif + } template PBRT_CPU_GPU decltype(auto) Dispatch(F &&func) { @@ -862,7 +905,12 @@ class TaggedPointer { static_assert(sizeof(uintptr_t) <= sizeof(uint64_t), "Expected pointer size to be <= 64 bits"); // TaggedPointer Private Members +#if defined(PBRT_IS_WINDOWS) && defined(__HIPCC__) + // top two bits are already used by the system on Windows + static constexpr int tagShift = 55; +#else static constexpr int tagShift = 57; +#endif static constexpr int tagBits = 64 - tagShift; static constexpr uint64_t tagMask = ((1ull << tagBits) - 1) << tagShift; static constexpr uint64_t ptrMask = ~tagMask; diff --git a/src/pbrt/util/vecmath_test.cpp b/src/pbrt/util/vecmath_test.cpp index 4569c6e6f..8ad27e1bf 100644 --- a/src/pbrt/util/vecmath_test.cpp +++ b/src/pbrt/util/vecmath_test.cpp @@ -104,6 +104,7 @@ TEST(Point2, InvertBilinear) { } } +#if !defined(__HIPCC__) TEST(Vector, AngleBetween) { EXPECT_EQ(0, AngleBetween(Vector3f(1, 0, 0), Vector3f(1, 0, 0))); @@ -176,6 +177,7 @@ TEST(Vector, AngleBetween) { Float abet = AngleBetween(a, b); EXPECT_EQ(abet, precise) << StringPrintf("vs naive %f", naive); } +#endif TEST(Vector, CoordinateSystem) { // Duff et al 2017 footnote 1 diff --git a/src/pbrt/wavefront/integrator.cpp b/src/pbrt/wavefront/integrator.cpp index c97d995e8..6729a7be5 100644 --- a/src/pbrt/wavefront/integrator.cpp +++ b/src/pbrt/wavefront/integrator.cpp @@ -9,7 +9,11 @@ #include #include #ifdef PBRT_BUILD_GPU_RENDERER +#if defined(__HIPCC__) +#include +#else #include +#endif #include #endif // PBRT_BUILD_GPU_RENDERER #include @@ -36,8 +40,12 @@ #include #ifdef PBRT_BUILD_GPU_RENDERER +#if defined(__HIPCC__) +#include +#else #include #include +#endif #endif // PBRT_BUILD_GPU_RENDERER namespace pbrt { @@ -154,13 +162,28 @@ WavefrontPathIntegrator::WavefrontPathIntegrator( filter = film.GetFilter(); sampler = scene.GetSampler(); + // Compute number of scanlines to render per pass + Vector2i resolution = film.PixelBounds().Diagonal(); + // TODO: make this configurable. Base it on the amount of GPU memory? + int maxSamples = 1024 * 1024; + scanlinesPerPass = std::max(1, maxSamples / resolution.x); + int nPasses = (resolution.y + scanlinesPerPass - 1) / scanlinesPerPass; + scanlinesPerPass = (resolution.y + nPasses - 1) / nPasses; + maxQueueSize = resolution.x * scanlinesPerPass; + if (Options->useGPU) { #ifdef PBRT_BUILD_GPU_RENDERER CUDATrackedMemoryResource *mr = dynamic_cast(memoryResource); CHECK(mr); +#ifdef __HIPCC__ + aggregate = new HiprtAggregate(scene, mr, textures, shapeIndexToAreaLights, media, + namedMaterials, materials, maxQueueSize); +#else aggregate = new OptiXAggregate(scene, mr, textures, shapeIndexToAreaLights, media, namedMaterials, materials); +#endif + #else LOG_FATAL("Options->useGPU was set without PBRT_BUILD_GPU_RENDERER enabled"); #endif @@ -224,14 +247,6 @@ WavefrontPathIntegrator::WavefrontPathIntegrator( } #endif // PBRT_BUILD_GPU_RENDERER - // Compute number of scanlines to render per pass - Vector2i resolution = film.PixelBounds().Diagonal(); - // TODO: make this configurable. Base it on the amount of GPU memory? - int maxSamples = 1024 * 1024; - scanlinesPerPass = std::max(1, maxSamples / resolution.x); - int nPasses = (resolution.y + scanlinesPerPass - 1) / scanlinesPerPass; - scanlinesPerPass = (resolution.y + nPasses - 1) / nPasses; - maxQueueSize = resolution.x * scanlinesPerPass; LOG_VERBOSE("Will render in %d passes %d scanlines per pass\n", nPasses, scanlinesPerPass); diff --git a/src/pbrt/wavefront/intersect.h b/src/pbrt/wavefront/intersect.h index 310173a1e..690aa10e8 100644 --- a/src/pbrt/wavefront/intersect.h +++ b/src/pbrt/wavefront/intersect.h @@ -54,7 +54,7 @@ inline PBRT_CPU_GPU void EnqueueWorkAfterIntersection( intr.mediumInterface ? *intr.mediumInterface : MediumInterface(rayMedium); if (rayMedium) { - assert(mediumSampleQueue); + CHECK(mediumSampleQueue); PBRT_DBG("Enqueuing into medium sample queue\n"); mediumSampleQueue->Push(MediumSampleWorkItem{r.ray, r.depth, diff --git a/src/pbrt/wavefront/wavefront.cpp b/src/pbrt/wavefront/wavefront.cpp index 7482fdb84..c5d86cc68 100644 --- a/src/pbrt/wavefront/wavefront.cpp +++ b/src/pbrt/wavefront/wavefront.cpp @@ -66,7 +66,14 @@ void RenderWavefront(BasicScene &scene) { integrator->camera.InitMetadata(&metadata); metadata.renderTimeSeconds = seconds; metadata.samplesPerPixel = integrator->sampler.SamplesPerPixel(); + // multithreading somehow hangs in HIP +#ifdef __HIPCC__ + if (Options->useGPU) DisableThreadPool(); +#endif integrator->film.WriteImage(metadata); +#ifdef __HIPCC__ + if (Options->useGPU) ReenableThreadPool(); +#endif } } // namespace pbrt diff --git a/src/pbrt/wavefront/workqueue.h b/src/pbrt/wavefront/workqueue.h index 2f0827e4d..10e81f5a9 100644 --- a/src/pbrt/wavefront/workqueue.h +++ b/src/pbrt/wavefront/workqueue.h @@ -33,6 +33,9 @@ #include #endif +#elif defined(__HIPCC__) +#define PBRT_USE_LEGACY_CUDA_ATOMICS + #endif // __CUDACC__ namespace pbrt {