Skip to content

Commit 8930bc9

Browse files
committed
Prefix scan for various data types with inclusive/exclusive option
This commit improves the existing ``jit_scan()`` function with support for various data types: - int32/uint32 - uint64 - float - double The user can now also specify whether the scan should be inclusive or exclusive. Finally, the commit adds comments to facilitate future modifications of this code.
1 parent 8ecaaf2 commit 8930bc9

16 files changed

+10810
-3640
lines changed

include/drjit-core/array.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -364,7 +364,7 @@ Array empty(size_t size) {
364364
: AllocType::HostAsync,
365365
byte_size);
366366
return Array::steal(
367-
jit_var_map_mem(Array::Backend, Array::Type, ptr, size, 1));
367+
jit_var_mem_map(Array::Backend, Array::Type, ptr, size, 1));
368368
}
369369

370370
template <typename Array>

include/drjit-core/jit.h

+31-17
Original file line numberDiff line numberDiff line change
@@ -1592,30 +1592,44 @@ extern JIT_EXPORT void jit_memcpy_async(JIT_ENUM JitBackend backend, void *dst,
15921592
*/
15931593
extern JIT_EXPORT void jit_reduce(JIT_ENUM JitBackend backend, JIT_ENUM VarType type,
15941594
JIT_ENUM ReduceOp rtype,
1595-
const void *ptr, uint32_t size, void *out);
1595+
const void *in, uint32_t size, void *out);
15961596

1597-
/**
1598-
* \brief Perform an exclusive scan / prefix sum over an unsigned 32 bit integer
1599-
* array
1597+
/** \brief Compute a prefix sum over the given input array
1598+
*
1599+
* Both exclusive and inclusive variants are supported. If desired, the scan
1600+
* can be performed in-place (i.e., <tt>out == in</tt>). The operation runs
1601+
* asynchronously.
1602+
*
1603+
* The operation is currently implemented for the following numeric types:
1604+
* ``VarType::Int32``, ``VarType::UInt32``, ``VarType::UInt64``,
1605+
* ``VarType::Float32``, and ``VarType::Float64``.
16001606
*
1601-
* If desired, the scan can be performed in-place (i.e. <tt>in == out</tt>).
1602-
* Note that the CUDA implementation will round up \c size to the maximum of
1603-
* the following three values for performance reasons:
1607+
* Note that the CUDA implementation may round \c size to the maximum of the
1608+
* following three values for performance and implementation-related reasons
1609+
* (the prefix sum uses a tree-based parallelization scheme).
16041610
*
1605-
* - the value 4,
1611+
* - the value 4
16061612
* - the next highest power of two (when size <= 4096),
16071613
* - the next highest multiple of 2K (when size > 4096),
16081614
*
16091615
* For this reason, the supplied memory regions must be sufficiently large
1610-
* to avoid both out-of-bounds reads and writes. This is not an issue for
1611-
* memory obtained using \ref jit_malloc(), which internally rounds
1612-
* allocations to the next largest power of two and enforces a 64 byte minimum
1613-
* allocation size.
1614-
*
1615-
* Runs asynchronously.
1616-
*/
1617-
extern JIT_EXPORT void jit_scan_u32(JIT_ENUM JitBackend backend, const uint32_t *in,
1618-
uint32_t size, uint32_t *out);
1616+
* to avoid out-of-bounds reads and writes. This is not an issue for memory
1617+
* obtained using \ref jit_malloc(), which internally rounds allocations to the
1618+
* next largest power of two and enforces a 64 byte minimum allocation size.
1619+
*
1620+
* The CUDA backend implementation for *large* numeric types (double precision
1621+
* floats, 64 bit integers) has the following technical limitation: when
1622+
* reducing 64-bit integers, their values must be smaller than 2**62. When
1623+
* reducing double precision arrays, the two least significant mantissa bits
1624+
* are clamped to zero when forwarding the prefix from one 512-wide block to
1625+
* the next (at a very minor loss in accuracy). The reason is that the
1626+
* operation requires two status bits to coordinate the prefix and status of
1627+
* each 512-wide block, and those must each fit into a single 64 bit value
1628+
* (128-bit writes aren't guaranteed to be atomic).
1629+
*/
1630+
extern JIT_EXPORT void jit_scan(JIT_ENUM JitBackend backend,
1631+
JIT_ENUM VarType type, int exclusive,
1632+
const void *in, uint32_t size, void *out);
16191633

16201634
/**
16211635
* \brief Compress a mask into a list of nonzero indices

resources/Makefile

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
COMPUTE_CAPABILITY=compute_70
22
CUDA_VER=10.2
3-
NVCC=/usr/local/cuda-$(CUDA_VER)/bin/nvcc -m64 --ptx --expt-relaxed-constexpr
3+
NVCC=/usr/local/cuda-$(CUDA_VER)/bin/nvcc -m64 --ptx --expt-relaxed-constexpr -std=c++14
44

55
all: kernels.h
66

resources/common.h

+4
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@
55
#include <limits>
66

77
#define KERNEL extern "C" __global__
8+
#define DEVICE __device__
9+
#define FINLINE __forceinline__
10+
#define WARP_SIZE 32
11+
#define FULL_MASK 0xffffffff
812

913
template <typename T> struct SharedMemory {
1014
__device__ inline static T *get() {

resources/compress.cuh

+10
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,16 @@
1010

1111
#include "common.h"
1212

13+
DEVICE FINLINE void store_cg(uint64_t *ptr, uint64_t val) {
14+
asm volatile("st.cg.u64 [%0], %1;" : : "l"(ptr), "l"(val));
15+
}
16+
17+
DEVICE FINLINE uint64_t load_cg(uint64_t *ptr) {
18+
uint64_t retval;
19+
asm volatile("ld.cg.u64 %0, [%1];" : "=l"(retval) : "l"(ptr));
20+
return retval;
21+
}
22+
1323
KERNEL void compress_small(const uint8_t *in, uint32_t *out, uint32_t size, uint32_t *count_out) {
1424
uint32_t *shared = SharedMemory<uint32_t>::get();
1525

0 commit comments

Comments
 (0)