2.30. GEMM with GroupNorm and HardTanh

Calculates a linear transformation using matrix multiplication, then normalizes the output across defined groups, and finally clamps the values into a specified range using a HardTanh activation.

Progress

Step 8 of 8

Reward

1.82x

Optimization Timeline

Step 8 Details

Reasoning

In this final solution we take a different approach by not fusing the entire linear (GEMM) operation with the normalization and activation phases. Instead, we exploit the highly optimized cuBLAS GEMM (via torch::addmm) for the linear transformation step and then fuse the inexpensive GroupNorm and HardTanh operations in a custom CUDA kernel. In the custom kernel each block processes one sample; we use shared memory to compute per‐group mean and variance and then normalize and clamp each element accordingly. This design reduces the extra memory round-trip that would occur if the operations were done separately, while leveraging decades of optimization in cuBLAS for the GEMM. The resulting solution shows a significant speedup (roughly 1.76× faster than native) while keeping the implementation clean and maintainable. These improvements are genuine as they combine state‑of‑the‑art library routines with specialized fusion for the light element‐wise operations.

#include <torch/extension.h>
#include <c10/cuda/CUDAGuard.h>
#include <cuda_runtime.h>
#include <math.h>

// CUDA kernel to fuse Group Normalization and HardTanh activation.
// Input tensor Y has shape [batch, out_features] and is computed by cuBLAS GEMM (torch::addmm)
// in the host code. For each sample, the kernel computes per-group mean and variance, and then
// normalizes the values in Y, applies the GroupNorm affine parameters (gn_weight and gn_bias),
// and finally applies HardTanh clamping.
//
// Each block processes one sample. The block dimension should be set to out_features.
__global__ void gn_hardtanh_kernel(
    const float* __restrict__ Y,         // Input: [batch, out_features]
    const float* __restrict__ gn_weight,   // GroupNorm scale: [out_features]
    const float* __restrict__ gn_bias,     // GroupNorm shift: [out_features]
    float* __restrict__ output,            // Output: [batch, out_features]
    int out_features,
    int num_groups,
    float hardtanh_min,
    float hardtanh_max)
{
    int sample = blockIdx.x;
    int tid = threadIdx.x;  // index in [0, out_features)
    int channels_per_group = out_features / num_groups;

    // Shared memory for group statistics (mean and variance per group).
    extern __shared__ float shared_mem[]; // size: 2 * num_groups floats
    float* group_mean = shared_mem;         // [num_groups]
    float* group_var  = group_mean + num_groups;  // [num_groups]

    // Use the first num_groups threads to compute group statistics.
    if (tid < num_groups) {
        int start = tid * channels_per_group;
        int end = start + channels_per_group;
        float sum = 0.0f, sum_sq = 0.0f;
        for (int j = start; j < end; j++) {
            float val = Y[sample * out_features + j];
            sum += val;
            sum_sq += val * val;
        }
        float mean = sum / channels_per_group;
        float var = sum_sq / channels_per_group - mean * mean;
        group_mean[tid] = mean;
        group_var[tid] = var;
    }
    __syncthreads();

    // Each thread normalizes its output value, applies the affine transform, and HardTanh.
    if (tid < out_features) {
        int group_id = tid / channels_per_group;
        float mean = group_mean[group_id];
        float var = group_var[group_id];
        float eps = 1e-5f;
        float val = Y[sample * out_features + tid];
        float norm = (val - mean) / sqrtf(var + eps);
        float res = norm * gn_weight[tid] + gn_bias[tid];
        res = fminf(fmaxf(res, hardtanh_min), hardtanh_max);
        output[sample * out_features + tid] = res;
    }
}

// The forward function first computes Y = x * (W^T) + b using torch::addmm, then launches
// the gn_hardtanh_kernel to fuse GroupNorm and HardTanh.
torch::Tensor forward(torch::Tensor x,
                      torch::Tensor linear_weight,
                      torch::Tensor linear_bias,
                      torch::Tensor gn_weight,
                      torch::Tensor gn_bias,
                      int num_groups,
                      float hardtanh_min,
                      float hardtanh_max) {
    // Ensure we are on the appropriate device.
    c10::cuda::CUDAGuard device_guard(x.device());

    // Compute Y = x * (W^T) + b.
    // linear_weight is [out_features, in_features] so its transpose is [in_features, out_features].
    torch::Tensor Y = torch::addmm(linear_bias, x, linear_weight.t());
    int batch_size = Y.size(0);
    int out_features = Y.size(1);
    
    // Allocate output tensor.
    auto output = torch::empty_like(Y);
    
    // Shared memory size: we need 2 * num_groups floats.
    size_t shared_mem_size = 2 * num_groups * sizeof(float);
    
    // Launch one block per sample; blockDim should equal out_features.
    dim3 grid(batch_size);
    dim3 block(out_features);
    
    gn_hardtanh_kernel<<<grid, block, shared_mem_size, c10::cuda::getCurrentCUDAStream()>>>(
        Y.data_ptr<float>(),
        gn_weight.data_ptr<float>(),
        gn_bias.data_ptr<float>(),
        output.data_ptr<float>(),
        out_features,
        num_groups,
        hardtanh_min,
        hardtanh_max
    );
    
    return output;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &forward, "Fused GroupNorm and HardTanh kernel (using cuBLAS GEMM for linear)");
}