1.30. Softsign Activation

Calculates the softsign activation on input data by dividing each element by (1 + its absolute value), effectively transforming the input into a bounded non-linear representation.

Progress

Step 4 of 4
Reward
2.52x

Optimization Timeline

Step 4 Details

Reasoning

We may further optimize the softsign activation by ensuring that our vectorized kernel takes full advantage of the GPU's read-only cache by using __ldg in every load. In our revised vectorized kernel, each thread processes 4 contiguous elements. Instead of a plain float4 load (which may or may not use the read-only cache), we load each scalar element with __ldg and then compute the softsign. For the element-wise kernel, we also use __ldg when loading each element. These changes help improve memory throughput and reduce latency. The Python module remains largely unchanged.

#include <torch/extension.h>

#include <c10/cuda/CUDAException.h>
#include <c10/cuda/CUDAGuard.h>

#include <cuda_runtime.h>

#include <climits>
#include <cmath>

// Elementwise kernel for Softsign, using __ldg for read-only memory access.
// Scalar Softsign kernel: out[i] = in[i] / (1 + |in[i]|), one thread per element.
// Loads route through the read-only data cache via __ldg.
// Launch: 1D grid/block; threads beyond total_elements exit early.
__global__ void softsign_kernel(const float* __restrict__ input,
                                float* __restrict__ output,
                                int total_elements) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= total_elements) {
        return;
    }
    const float v = __ldg(input + i);
    // Reciprocal-then-multiply, matching the vectorized kernel's arithmetic.
    const float denom_inv = 1.0f / (1.0f + fabsf(v));
    output[i] = v * denom_inv;
}

// Vectorized kernel for Softsign using manual scalar loads via __ldg.
// Each thread processes 4 contiguous float elements.
// Vectorized Softsign kernel: each thread handles 4 consecutive floats,
// loading each scalar through the read-only cache with __ldg.
// Precondition: the total element count is a multiple of 4 (host enforces this).
// Launch: 1D grid/block over total_vec = numel / 4 work items.
__global__ void softsign_vec_kernel(const float* __restrict__ input,
                                    float* __restrict__ output,
                                    int total_vec) {
    const int vec_id = blockIdx.x * blockDim.x + threadIdx.x;
    if (vec_id >= total_vec) {
        return;
    }
    const int offset = vec_id * 4;
#pragma unroll
    for (int k = 0; k < 4; ++k) {
        const float v = __ldg(input + offset + k);
        output[offset + k] = v * (1.0f / (1.0f + fabsf(v)));
    }
}

// Host entry point: applies Softsign (x / (1 + |x|)) elementwise on a CUDA
// float32 tensor and returns a new tensor of the same shape.
//
// Validates device/dtype up front and forces contiguity: both kernels index
// the raw data linearly, so a non-contiguous tensor would be transformed
// incorrectly without the .contiguous() call.
torch::Tensor softsign(torch::Tensor input) {
    TORCH_CHECK(input.is_cuda(), "softsign: input must be a CUDA tensor");
    TORCH_CHECK(input.scalar_type() == torch::kFloat32,
                "softsign: input must be float32");

    // Ensure we operate on the correct CUDA device.
    c10::cuda::CUDAGuard device_guard(input.device());

    // Kernels assume a dense linear layout; copy only if needed.
    auto x = input.contiguous();
    auto output = torch::empty_like(x);

    const int64_t numel = x.numel();
    if (numel == 0) {
        return output;  // avoid launching with an empty (invalid) grid
    }
    // Kernel indices are 32-bit; reject tensors too large to address.
    TORCH_CHECK(numel <= INT_MAX, "softsign: tensor too large for int32 indexing");
    const int total_elements = static_cast<int>(numel);

    const int threads = 256;

    // Use the vectorized kernel if the total number of elements is divisible by 4.
    if (total_elements % 4 == 0) {
        const int total_vec = total_elements / 4;
        const int blocks = (total_vec + threads - 1) / threads;
        softsign_vec_kernel<<<blocks, threads, 0, c10::cuda::getCurrentCUDAStream()>>>(
            x.data_ptr<float>(), output.data_ptr<float>(), total_vec);
    } else {
        const int blocks = (total_elements + threads - 1) / threads;
        softsign_kernel<<<blocks, threads, 0, c10::cuda::getCurrentCUDAStream()>>>(
            x.data_ptr<float>(), output.data_ptr<float>(), total_elements);
    }
    // Surface launch-configuration errors immediately instead of at the next sync.
    C10_CUDA_KERNEL_LAUNCH_CHECK();

    return output;
}

// Python bindings: exposes softsign() as the module's single entry point.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("softsign", &softsign, "Apply Softsign activation (CUDA)");
}