1.48. Mean Reduction Operation
Computes the arithmetic mean of an input tensor along a specified dimension, producing an output with that dimension removed.
Optimization Timeline
Step 2 Details
Reasoning
We can further optimize the mean reduction kernel by unrolling the loop over the reduction dimension when its size is divisible by 4. Viewing the input tensor as [outer, reduction, inner], each thread computes one output element by summing reduction elements spaced inner apart. When the reduction size R is divisible by 4, unrolling the loop in groups of 4 reduces loop overhead and improves instruction-level parallelism. The Python module remains essentially the same, since it only needs to load our custom operator. We use c10::cuda::CUDAGuard to enforce correct device usage and c10::cuda::getCurrentCUDAStream() to launch the kernel on the right stream. These changes genuinely cut computation time by reducing loop iterations and per-iteration overhead, rather than working around the computation.
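Before the CUDA source, the [outer, reduction, inner] decomposition can be sanity-checked in plain PyTorch. This is a toy sketch; the shapes are illustrative and not from the original:

import torch

# Toy check of the [outer, reduction, inner] view the kernel relies on:
# reducing a contiguous [2, 3, 4, 5] tensor over dim=1 gives
# outer = 2, reduction = 3, inner = 4 * 5 = 20.
x = torch.randn(2, 3, 4, 5)
flat = x.reshape(2, 3, 20)              # [outer, reduction, inner]
ref = x.mean(dim=1).reshape(2, 20)      # PyTorch reference, flattened
assert torch.allclose(flat.mean(dim=1), ref)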
#include <torch/extension.h>
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h>
#include <cuda.h>
#include <cuda_runtime.h>
// CUDA kernel to compute mean reduction along a specified dimension.
// We view the input tensor as a 3D tensor with shape [outer, reduction, inner],
// where outer is the product of the dimensions before the reduction dimension,
// reduction is the size of the reduction dimension, and inner is the product
// of the dimensions after it. Each thread computes one output element.
__global__ void mean_reduce_kernel(const float* __restrict__ input,
                                   float* __restrict__ output,
                                   int outer, int reduction, int inner) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int total = outer * inner;
    if (tid < total) {
        int i = tid / inner;  // index in the outer dimension
        int j = tid % inner;  // index in the inner dimension
        float sum = 0.0f;
        int base = i * reduction * inner;
        // If reduction is divisible by 4, unroll the loop by a factor of 4.
        if ((reduction & 3) == 0) {
            int r4 = reduction >> 2;  // reduction / 4
            for (int k = 0; k < r4; k++) {
                int idx = base + (k * 4) * inner + j;
                sum += input[idx] +
                       input[idx + inner] +
                       input[idx + 2 * inner] +
                       input[idx + 3 * inner];
            }
        } else {
            for (int k = 0; k < reduction; k++) {
                sum += input[base + k * inner + j];
            }
        }
        output[tid] = sum / static_cast<float>(reduction);
    }
}
torch::Tensor mean_reduce(torch::Tensor x, int dim) {
    // Ensure the kernel runs on the device that holds x.
    c10::cuda::CUDAGuard device_guard(x.device());
    TORCH_CHECK(x.is_cuda(), "Input must be a CUDA tensor");
    TORCH_CHECK(x.scalar_type() == at::kFloat, "Input must be float32");
    x = x.contiguous();  // the indexing below assumes a contiguous layout
    int dims = x.dim();
    if (dim < 0) dim += dims;  // support negative dims, PyTorch-style
    TORCH_CHECK(dim >= 0 && dim < dims, "Reduction dimension out of range");
    // Compute outer, reduction, and inner sizes.
    int outer = 1;
    for (int i = 0; i < dim; i++) {
        outer *= x.size(i);
    }
    int reduction = x.size(dim);
    int inner = 1;
    for (int i = dim + 1; i < dims; i++) {
        inner *= x.size(i);
    }
    // Build the output shape by removing dimension 'dim'.
    std::vector<int64_t> out_shape;
    for (int i = 0; i < dims; i++) {
        if (i != dim) {
            out_shape.push_back(x.size(i));
        }
    }
    auto output = torch::empty(out_shape, x.options());
    int total = outer * inner;
    int threads = 256;
    int blocks = (total + threads - 1) / threads;
    // Launch the kernel on the current CUDA stream.
    mean_reduce_kernel<<<blocks, threads, 0, c10::cuda::getCurrentCUDAStream()>>>(
        x.data_ptr<float>(),
        output.data_ptr<float>(),
        outer,
        reduction,
        inner);
    return output;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("mean_reduce", &mean_reduce,
          "Mean reduction along a specified dimension (CUDA) with loop unrolling");
}