1.91. Reverse Cumulative Sum

Calculates the reverse cumulative sum along a specified dimension by first reversing the data order along that dimension, performing the cumulative sum, and then reversing the result back to the original order.
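For reference, the operation described above can be written directly with PyTorch tensor ops. A minimal Python sketch (the names x and dim are placeholders, not part of the original code):

import torch

def reverse_cumsum_reference(x: torch.Tensor, dim: int = -1) -> torch.Tensor:
    # Reverse along dim, take the cumulative sum, then reverse back.
    return x.flip(dim).cumsum(dim).flip(dim)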

Progress: Step 7 of 7
Reward: 2.25x

Optimization Timeline

Step 7 Details

Reasoning

Our current implementation already achieves over a 2.2x speedup compared to the native PyTorch flip/cumsum/flip sequence by combining a chunk-based per-thread scan with an efficient two-level parallel scan: a warp-level inclusive scan using __shfl_up_sync, followed by a block-level combination of per-warp totals through shared memory. In this last revision we also added the __ldg intrinsic so that read-only loads from global memory go through the read-only cache. There is little room left for improvement without overcomplicating the kernel: each block computes the reverse cumulative sum of one row of the 2D tensor in a highly parallel manner, with a single __syncthreads() and low memory latency. These changes help because they reduce per-thread overhead, exploit warp-level parallelism, and keep memory accesses fast via the read-only cache. The final updated code follows.

#include <torch/extension.h>
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h>
#include <cuda_runtime.h>

// Optimized kernel to compute the reverse cumulative sum for each row in a 2D tensor.
// Uses a two-level scan: a warp-level inclusive scan using __shfl_up_sync followed by a block-level reduction.
// The __ldg intrinsic is used for fast, cached loading of input values.
extern "C" __global__ void reverse_cumsum_optimized(const float* __restrict__ input,
                                                     float* __restrict__ output,
                                                     int L) {
    // Each block processes one row.
    int row = blockIdx.x;
    const float* row_in = input + row * L;
    float* row_out = output + row * L;

    const int T = blockDim.x;  // Total threads per block.
    int tid = threadIdx.x;

    // Calculate per-thread chunk boundaries (ceiling division).
    int nPerThread = (L + T - 1) / T;
    int start = tid * nPerThread;
    int end = start + nPerThread;
    if (end > L) end = L;
    int count = end - start;

    // Local array to hold this thread's chunk of the reverse cumulative sum.
    // Assumes count <= 32, i.e. L <= 32 * blockDim.x (8192 elements for 256 threads).
    float local[32];
    if (start < L && count > 0) {
        int rev_index = L - 1 - start;
        local[0] = __ldg(&row_in[rev_index]);
        #pragma unroll
        for (int i = 1; i < count; i++) {
            rev_index = L - 1 - (start + i);
            local[i] = local[i - 1] + __ldg(&row_in[rev_index]);
        }
    }
    float thread_sum = (count > 0) ? local[count - 1] : 0.0f;

    // Perform a warp-level inclusive scan using warp shuffle intrinsics.
    unsigned int lane = tid & 31;         // Lane index within the warp.
    unsigned int warpId = tid >> 5;         // Warp ID (tid / 32).
    float val = thread_sum;
    #pragma unroll
    for (int offset = 1; offset < 32; offset *= 2) {
        float n = __shfl_up_sync(0xffffffff, val, offset);
        if (lane >= offset)
            val += n;
    }
    float warp_inclusive = val;
    // Compute the exclusive scan value for the thread within the warp.
    float thread_exclusive = (lane == 0) ? 0.0f : warp_inclusive - thread_sum;

    // Use shared memory to gather warp totals.
    __shared__ float warpTotals[32];  // Supports up to 32 warps per block.
    if (lane == 31) {
        warpTotals[warpId] = warp_inclusive;
    }
    __syncthreads();

    // Compute the block-level offset by summing warp totals of preceding warps.
    float warp_prefix = 0.0f;
    if (warpId > 0) {
        #pragma unroll
        for (int i = 0; i < warpId; i++) {
            warp_prefix += warpTotals[i];
        }
    }
    float final_offset = warp_prefix + thread_exclusive;

    // Add the computed offset to each element of the local cumulative sum.
    #pragma unroll
    for (int i = 0; i < count; i++) {
        local[i] += final_offset;
    }

    // Write the computed reverse cumulative sum back to global memory in the correct order.
    #pragma unroll
    for (int i = 0; i < count; i++) {
        int orig_index = L - 1 - (start + i);
        row_out[orig_index] = local[i];
    }
}

torch::Tensor reverse_cumsum(torch::Tensor input, int rows, int L) {
    TORCH_CHECK(input.is_cuda(), "input must be a CUDA tensor");
    TORCH_CHECK(input.scalar_type() == torch::kFloat32, "input must be float32");
    TORCH_CHECK(input.is_contiguous(), "input must be contiguous");
    // Run the kernel on the device that owns the input tensor.
    c10::cuda::CUDAGuard device_guard(input.device());
    auto output = torch::empty({rows, L}, input.options());
    
    // Use 256 threads per block.
    int threads = 256;
    dim3 grid(rows);
    dim3 block(threads);
    // The kernel's warpTotals array is statically allocated in shared memory,
    // so no dynamic shared memory needs to be requested at launch time.
    reverse_cumsum_optimized<<<grid, block, 0, c10::cuda::getCurrentCUDAStream()>>>(
        input.data_ptr<float>(), output.data_ptr<float>(), L);
    
    return output;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("reverse_cumsum", &reverse_cumsum, "Optimized reverse cumulative sum for 2D tensors along dim 1 using warp-level scan and __ldg");
}