@@ -1592,30 +1592,44 @@ extern JIT_EXPORT void jit_memcpy_async(JIT_ENUM JitBackend backend, void *dst,
1592
1592
*/
1593
1593
extern JIT_EXPORT void jit_reduce (JIT_ENUM JitBackend backend , JIT_ENUM VarType type ,
1594
1594
JIT_ENUM ReduceOp rtype ,
1595
- const void * ptr , uint32_t size , void * out );
1595
+ const void * in , uint32_t size , void * out );
1596
1596
1597
- /**
1598
- * \brief Perform an exclusive scan / prefix sum over an unsigned 32 bit integer
1599
- * array
1597
+ /** \brief Compute a prefix sum over the given input array
1598
+ *
1599
+ * Both exclusive and inclusive variants are supported. If desired, the scan
1600
+ * can be performed in-place (i.e., <tt>out == in</tt>). The operation runs
1601
+ * asynchronously.
1602
+ *
1603
+ * The operation is currently implemented for the following numeric types:
1604
+ * ``VarType::Int32``, ``VarType::UInt32``, ``VarType::UInt64``,
1605
+ * ``VarType::Float32``, and ``VarType::Float64``.
1600
1606
*
1601
- * If desired, the scan can be performed in-place (i.e. <tt>in == out</tt>).
1602
- * Note that the CUDA implementation will round up \c size to the maximum of
1603
- * the following three values for performance reasons:
1607
+ * Note that the CUDA implementation may round \c size to the maximum of the
1608
+ * following three values for performance and implementation-related reasons
1609
+ * (the prefix sum uses a tree-based parallelization scheme).
1604
1610
*
1605
- * - the value 4,
1611
+ * - the value 4
1606
1612
* - the next highest power of two (when size <= 4096),
1607
1613
* - the next highest multiple of 2K (when size > 4096),
1608
1614
*
1609
1615
* For this reason, the supplied memory regions must be sufficiently large
1610
- * to avoid both out-of-bounds reads and writes. This is not an issue for
1611
- * memory obtained using \ref jit_malloc(), which internally rounds
1612
- * allocations to the next largest power of two and enforces a 64 byte minimum
1613
- * allocation size.
1614
- *
1615
- * Runs asynchronously.
1616
- */
1617
- extern JIT_EXPORT void jit_scan_u32 (JIT_ENUM JitBackend backend , const uint32_t * in ,
1618
- uint32_t size , uint32_t * out );
1616
+ * to avoid out-of-bounds reads and writes. This is not an issue for memory
1617
+ * obtained using \ref jit_malloc(), which internally rounds allocations to the
1618
+ * next largest power of two and enforces a 64 byte minimum allocation size.
1619
+ *
1620
+ * The CUDA backend implementation for *large* numeric types (double precision
1621
+ * floats, 64 bit integers) has the following technical limitation: when
1622
+ * reducing 64-bit integers, their values must be smaller than 2**62. When
1623
+ * reducing double precision arrays, the two least significant mantissa bits
1624
+ * are clamped to zero when forwarding the prefix from one 512-wide block to
1625
+ * the next (at a very minor loss in accuracy). The reason is that the
1626
+ * operation requires two status bits to coordinate the prefix and status of
1627
+ * each 512-wide block, and those must each fit into a single 64 bit value
1628
+ * (128-bit writes aren't guaranteed to be atomic).
1629
+ */
1630
+ extern JIT_EXPORT void jit_scan (JIT_ENUM JitBackend backend ,
1631
+ JIT_ENUM VarType type , int exclusive ,
1632
+ const void * in , uint32_t size , void * out );
1619
1633
1620
1634
/**
1621
1635
* \brief Compress a mask into a list of nonzero indices
0 commit comments