1.30. Softsign Activation
Calculates the softsign activation on input data by dividing each element by (1 + its absolute value), effectively transforming the input into a bounded non-linear representation.
Progress
Optimization Timeline
Step 4 Details
Reasoning
We may further optimize the softsign activation by ensuring that our vectorized kernel takes full advantage of the GPU's read-only cache by using __ldg in every load. In our revised vectorized kernel, each thread processes 4 contiguous elements. Instead of a plain float4 load (which may or may not use the read-only cache), we load each scalar element with __ldg and then compute the softsign. For the element-wise kernel, we also use __ldg when loading each element. These changes help improve memory throughput and reduce latency. The Python module remains largely unchanged.
#include <torch/extension.h>

#include <c10/cuda/CUDAException.h>
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h>
#include <cuda_runtime.h>

#include <cmath>
#include <limits>
// Scalar Softsign kernel: out[i] = in[i] / (1 + |in[i]|), one thread per element.
// Grid layout: 1-D grid of 1-D blocks; launched with ceil(total_elements / blockDim.x)
// blocks, so the bounds guard handles the ragged tail. Reads go through the
// read-only data cache via __ldg.
__global__ void softsign_kernel(const float* __restrict__ input,
                                float* __restrict__ output,
                                int total_elements) {
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= total_elements) return;
    const float v = __ldg(input + gid);
    // Reciprocal-then-multiply (rather than a direct divide) matches the
    // original operation order exactly.
    const float recip = 1.0f / (1.0f + fabsf(v));
    output[gid] = v * recip;
}
// Softsign kernel, 4 elements per thread: out[i] = in[i] / (1 + |in[i]|).
// Precondition: total element count is exactly 4 * total_vec (the host wrapper
// only selects this kernel when numel % 4 == 0). Each scalar is loaded through
// the read-only cache with __ldg; the lane loop is fully unrolled so the
// compiled code matches four straight-line scalar loads/stores.
__global__ void softsign_vec_kernel(const float* __restrict__ input,
                                    float* __restrict__ output,
                                    int total_vec) {
    const int vec_id = blockIdx.x * blockDim.x + threadIdx.x;
    if (vec_id >= total_vec) return;
    const int offset = vec_id * 4;
#pragma unroll
    for (int lane = 0; lane < 4; ++lane) {
        const float v = __ldg(input + offset + lane);
        const float recip = 1.0f / (1.0f + fabsf(v));
        output[offset + lane] = v * recip;
    }
}
// Host wrapper: applies Softsign (x / (1 + |x|)) elementwise on a CUDA tensor.
//
// Args:
//   input: a contiguous float32 CUDA tensor of any shape.
// Returns:
//   a new tensor of the same shape/dtype/device holding the activated values.
// Raises (via TORCH_CHECK):
//   if the tensor is not CUDA, not float32, not contiguous, or too large for
//   the kernels' 32-bit indexing.
torch::Tensor softsign(torch::Tensor input) {
    // The kernels read raw float32 memory linearly; a non-contiguous or
    // non-float tensor would silently produce wrong results, so validate up
    // front instead.
    TORCH_CHECK(input.is_cuda(), "softsign: input must be a CUDA tensor");
    TORCH_CHECK(input.scalar_type() == torch::kFloat32,
                "softsign: input must be float32, got ", input.scalar_type());
    TORCH_CHECK(input.is_contiguous(), "softsign: input must be contiguous");

    // Ensure we operate on the tensor's device (multi-GPU safety).
    c10::cuda::CUDAGuard device_guard(input.device());
    auto output = torch::empty_like(input);

    // numel() is int64_t; the kernels index with 32-bit ints, so guard
    // against silent truncation rather than corrupting large tensors.
    const int64_t numel = input.numel();
    TORCH_CHECK(numel <= std::numeric_limits<int>::max(),
                "softsign: tensor too large for 32-bit indexing (", numel, " elements)");
    const int total_elements = static_cast<int>(numel);
    if (total_elements == 0) {
        return output;  // avoid an invalid 0-block kernel launch
    }

    const int threads = 256;
    // Use the 4-elements-per-thread kernel whenever the count divides evenly;
    // otherwise fall back to the plain elementwise kernel.
    if (total_elements % 4 == 0) {
        const int total_vec = total_elements / 4;
        const int blocks = (total_vec + threads - 1) / threads;
        softsign_vec_kernel<<<blocks, threads, 0, c10::cuda::getCurrentCUDAStream()>>>(
            input.data_ptr<float>(), output.data_ptr<float>(), total_vec);
    } else {
        const int blocks = (total_elements + threads - 1) / threads;
        softsign_kernel<<<blocks, threads, 0, c10::cuda::getCurrentCUDAStream()>>>(
            input.data_ptr<float>(), output.data_ptr<float>(), total_elements);
    }
    // Surface launch-configuration errors immediately instead of at the next
    // unrelated synchronizing call.
    C10_CUDA_KERNEL_LAUNCH_CHECK();
    return output;
}
// Python bindings: exposes the host wrapper as `module.softsign(tensor) -> tensor`.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("softsign", &softsign, "Apply Softsign activation (CUDA)");
}