1.32. HardTanh Activation Operation

Calculates the HardTanh activation by clamping input values to the range [-1, 1].
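Element-wise, the operation is HardTanh(x) = max(-1, min(1, x)): values below -1 saturate to -1, values above 1 saturate to 1, and everything in between passes through unchanged.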

Progress: Step 5 of 5
Reward: 1.47x

Optimization Timeline

Step 5 Details

Reasoning

We can further improve the kernel by using the __ldg intrinsic to route loads through the GPU's read-only data cache, which can reduce global-memory latency when reading the input tensor. In the scalar kernel, the plain load becomes __ldg(&input[idx]); in the vectorized kernel, we likewise apply __ldg to the float4 pointer. The change is non-invasive: the Python code and the overall algorithm are unchanged, and only the way data is loaded is optimized. Because the input is read-only, routing its loads through the read-only cache can reduce load latency and speed up kernel execution without altering the output.
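Concretely, the change in the scalar path is just the load expression; a minimal before/after sketch (the full kernels follow below):

// Before: plain global load through the normal cache path.
float x = input[idx];

// After: __ldg routes the load through the read-only data cache.
// Safe here because input is const __restrict__ and never written by the kernel.
float x = __ldg(&input[idx]);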

#include <torch/extension.h>
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h>
#include <cuda.h>
#include <cuda_runtime.h>

// Scalar HardTanh kernel using grid-stride loop, with __ldg for read-only caching.
__global__ void hardtanh_kernel(const float* __restrict__ input,
                                float* __restrict__ output,
                                int numel) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = gridDim.x * blockDim.x;
    for (; idx < numel; idx += stride) {
        // Use __ldg for faster read access.
        float x = __ldg(&input[idx]);
        float y = fmaxf(-1.f, fminf(1.f, x));
        output[idx] = y;
    }
}

// Vectorized HardTanh kernel using float4 and grid-stride loop, with __ldg for caching.
__global__ void hardtanh_vec_kernel(const float* __restrict__ input,
                                    float* __restrict__ output,
                                    int vec_size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = gridDim.x * blockDim.x;
    const float4* in_vec = reinterpret_cast<const float4*>(input);
    float4* out_vec = reinterpret_cast<float4*>(output);
    for (; idx < vec_size; idx += stride) {
        // Use __ldg for vectorized load.
        float4 in_val = __ldg(&in_vec[idx]);
        float4 out_val;
        out_val.x = fmaxf(-1.f, fminf(1.f, in_val.x));
        out_val.y = fmaxf(-1.f, fminf(1.f, in_val.y));
        out_val.z = fmaxf(-1.f, fminf(1.f, in_val.z));
        out_val.w = fmaxf(-1.f, fminf(1.f, in_val.w));
        out_vec[idx] = out_val;
    }
}

torch::Tensor forward(torch::Tensor input) {
    TORCH_CHECK(input.is_cuda(), "input must be a CUDA tensor");
    TORCH_CHECK(input.scalar_type() == torch::kFloat32, "input must be float32");
    TORCH_CHECK(input.is_contiguous(), "input must be contiguous");

    // Use CUDAGuard to ensure kernel launches on the correct device.
    c10::cuda::CUDAGuard device_guard(input.device());

    auto output = torch::empty_like(input);
    int numel = static_cast<int>(input.numel());
    int threads = 256;
    
    // Use vectorized kernel if the total number of elements is divisible by 4 and input is 16-byte aligned.
    if ((numel % 4 == 0) && ((reinterpret_cast<uintptr_t>(input.data_ptr<float>()) % 16) == 0)) {
        int vec_size = numel / 4;
        int blocks = (vec_size + threads - 1) / threads;
        hardtanh_vec_kernel<<<blocks, threads, 0, c10::cuda::getCurrentCUDAStream()>>>(
            input.data_ptr<float>(),
            output.data_ptr<float>(),
            vec_size
        );
    } else {
        int blocks = (numel + threads - 1) / threads;
        hardtanh_kernel<<<blocks, threads, 0, c10::cuda::getCurrentCUDAStream()>>>(
            input.data_ptr<float>(),
            output.data_ptr<float>(),
            numel
        );
    }
    
    return output;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &forward, "HardTanh activation (CUDA) with vectorized grid-stride loop and __ldg optimization");
}