1.20. LeakyReLU Activation

Calculates the Leaky ReLU activation on an input tensor by scaling negative input values with a configurable negative slope.
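
Element-wise, LeakyReLU(x) = x for x >= 0 and negative_slope * x for x < 0. For example, with PyTorch's default negative_slope = 0.01, an input of -2.0 maps to -0.02 while 3.0 passes through unchanged.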

Progress: Step 2 of 2
Reward: 1.20x

Optimization Timeline: Step 2 Details

Reasoning

We can optimize the LeakyReLU activation further by vectorizing the computation when possible. If the tensor is contiguous, its total number of elements is divisible by 4, and its data pointer is 16-byte aligned, we can reinterpret the input and output pointers as float4 pointers and process 4 elements per thread. Each thread then issues one 128-bit load and one 128-bit store instead of four 32-bit ones, so a warp moves 512 bytes per memory instruction rather than 128; the accesses remain fully coalesced while the instruction count and the number of threads launched drop by a factor of four. If these conditions are not met, we fall back to the element-wise (scalar) implementation; the checks live in the forward function of the CUDA C++ code below. On large tensors this reduces computation time without compromising correctness.

#include <torch/extension.h>
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h>
#include <cuda_runtime.h>
#include <cstdint>

// Scalar kernel for LeakyReLU activation.
__global__ void leaky_relu_kernel_scalar(const float* __restrict__ input,
                                           float* __restrict__ output,
                                           float negative_slope,
                                           int numel) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < numel) {
        float x = input[idx];
        output[idx] = (x >= 0.0f) ? x : negative_slope * x;
    }
}

// Vectorized kernel for LeakyReLU activation using float4.
__global__ void leaky_relu_kernel_vec(const float4* __restrict__ input,
                                        float4* __restrict__ output,
                                        float negative_slope,
                                        int numVec) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < numVec) {
        float4 in_val = input[idx];
        float4 out_val;
        out_val.x = (in_val.x >= 0.0f) ? in_val.x : negative_slope * in_val.x;
        out_val.y = (in_val.y >= 0.0f) ? in_val.y : negative_slope * in_val.y;
        out_val.z = (in_val.z >= 0.0f) ? in_val.z : negative_slope * in_val.z;
        out_val.w = (in_val.w >= 0.0f) ? in_val.w : negative_slope * in_val.w;
        output[idx] = out_val;
    }
}

// Custom CUDA forward function that applies the LeakyReLU activation.
torch::Tensor forward(torch::Tensor input, float negative_slope) {
    TORCH_CHECK(input.is_cuda(), "input must be a CUDA tensor");
    TORCH_CHECK(input.scalar_type() == torch::kFloat, "input must be a float32 tensor");

    // Ensure operations are on the correct CUDA device.
    c10::cuda::CUDAGuard device_guard(input.device());

    int numel = input.numel();
    int threads = 256;
    torch::Tensor output;

    // Check if we can use vectorized processing: the tensor must be contiguous,
    // its number of elements divisible by 4, and its data pointer 16-byte
    // aligned so that float4 loads and stores are legal.
    bool aligned = reinterpret_cast<std::uintptr_t>(input.data_ptr<float>()) % 16 == 0;
    if (input.is_contiguous() && (numel % 4 == 0) && aligned) {
        // Allocate output tensor with the same shape as the input. The fresh
        // allocation from the caching allocator is at least 16-byte aligned.
        output = torch::empty_like(input);
        int numVec = numel / 4;
        int blocks = (numVec + threads - 1) / threads;
        // Reinterpret pointers as float4 pointers.
        const float4* input_vec = reinterpret_cast<const float4*>(input.data_ptr<float>());
        float4* output_vec = reinterpret_cast<float4*>(output.data_ptr<float>());
        leaky_relu_kernel_vec<<<blocks, threads, 0, c10::cuda::getCurrentCUDAStream()>>>(
            input_vec, output_vec, negative_slope, numVec);
    } else {
        // Scalar fallback: use a contiguous copy (a no-op for contiguous inputs)
        // so that flat indexing over the storage matches the logical element order.
        auto input_contig = input.contiguous();
        output = torch::empty_like(input_contig);
        int blocks = (numel + threads - 1) / threads;
        leaky_relu_kernel_scalar<<<blocks, threads, 0, c10::cuda::getCurrentCUDAStream()>>>(
            input_contig.data_ptr<float>(), output.data_ptr<float>(), negative_slope, numel);
    }

    return output;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &forward, "Custom LeakyReLU activation using CUDA with vectorized optimization");
}
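
For reference, one way to build and exercise the extension might look like the following Python sketch. The file name leaky_relu_ext.cu and the module name leaky_relu_ext are illustrative assumptions, not part of the code above; the sketch compares the custom kernel against torch.nn.functional.leaky_relu on both code paths.

import torch
import torch.nn.functional as F
from torch.utils.cpp_extension import load

# Compile the CUDA source above (file name is a placeholder).
ext = load(name="leaky_relu_ext", sources=["leaky_relu_ext.cu"], verbose=True)

# numel divisible by 4 and contiguous: exercises the vectorized float4 path.
x = torch.randn(16, 1024, 1024, device="cuda")
y = ext.forward(x, 0.01)
assert torch.allclose(y, F.leaky_relu(x, negative_slope=0.01))

# numel not divisible by 4: exercises the scalar fallback.
x_odd = torch.randn(4097, device="cuda")
assert torch.allclose(ext.forward(x_odd, 0.01), F.leaky_relu(x_odd, negative_slope=0.01))

Both assertions should pass on any CUDA-capable device if the build succeeds.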