2.31. Conv-Min-Add-Multiply Operation

Computes a 2D convolution over the input data, then calculates the element-wise minimum between the convolution result and a constant value, adds a bias term, and finally scales the output by a specified factor.

Progress

Step 3 of 3

Reward

3.13x

Optimization Timeline

Step 3 Details

Reasoning

We can further optimize the fused convolution kernel by reducing loop overhead and maximizing instruction-level parallelism specifically for the common case where K==3 and IC==3. In this specialized kernel we unroll the inner loops completely with explicit indexing, reducing branch overhead and function call overhead. In addition, we explicitly compute the base indices for the input tiles once per channel, which minimizes redundant arithmetic in the inner loops. For the general case, we retain a loop-based implementation. After reviewing the design, I conclude that these improvements genuinely help by reducing kernel overhead for the common case without “cheating” – they simply maximize the performance when the typical inputs (3-channel RGB with 3x3 kernels) are used. The Python module remains unchanged as it properly dispatches to the specialized kernel when possible, ensuring that our improvements are both genuine and beneficial.

#include <torch/extension.h>
#include <c10/cuda/CUDAGuard.h>
#include <cuda_runtime.h>
#include <math.h>

// General kernel for arbitrary kernel sizes and input channels.
__global__ void fused_conv_min_bias_scale_kernel_out_general(
    const float* __restrict__ x,
    const float* __restrict__ weight,
    const float* __restrict__ conv_bias,
    const float* __restrict__ extra_bias,
    float constant_value,
    float scaling_factor,
    float* __restrict__ y,
    int B, int IC, int H, int W, int OC, int K, int OH, int OW)
{
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   int total = B * OC * OH * OW;
   if (index < total) {
       int ow = index % OW;
       int tmp = index / OW;
       int oh = tmp % OH;
       tmp = tmp / OH;
       int oc = tmp % OC;
       int b = tmp / OC;
       float sum = 0.0f;
       for (int ic = 0; ic < IC; ic++) {
           for (int p = 0; p < K; p++) {
               for (int q = 0; q < K; q++) {
                   int in_row = oh + p;
                   int in_col = ow + q;
                   int x_index = ((b * IC + ic) * H + in_row) * W + in_col;
                   int w_index = ((oc * IC + ic) * K + p) * K + q;
                   sum += x[x_index] * weight[w_index];
               }
           }
       }
       sum += conv_bias[oc];
       float min_val = fminf(sum, constant_value);
       float result = (min_val + extra_bias[oc]) * scaling_factor;
       int out_index = ((b * OC + oc) * OH + oh) * OW + ow;
       y[out_index] = result;
   }
}

// Specialized kernel for kernel size 3 and input channels 3 (completely unrolled inner loops).
__global__ void fused_conv_min_bias_scale_kernel_out_3(
    const float* __restrict__ x,
    const float* __restrict__ weight,
    const float* __restrict__ conv_bias,
    const float* __restrict__ extra_bias,
    float constant_value,
    float scaling_factor,
    float* __restrict__ y,
    int B, int IC, int H, int W, int OC, int OH, int OW)
{
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   int total = B * OC * OH * OW;
   if (index < total) {
       int ow = index % OW;
       int tmp = index / OW;
       int oh = tmp % OH;
       tmp = tmp / OH;
       int oc = tmp % OC;
       int b = tmp / OC;
       float sum = 0.f;
       
       // Unroll for each input channel (IC==3) and kernel window (3x3)
       // Channel 0
       {
         int base_x = ((b * 3 + 0) * H + oh) * W + ow;
         int base_w = ((oc * 3 + 0) * 9);
         sum += x[base_x + 0*W + 0] * weight[base_w + 0];
         sum += x[base_x + 0*W + 1] * weight[base_w + 1];
         sum += x[base_x + 0*W + 2] * weight[base_w + 2];
         sum += x[base_x + 1*W + 0] * weight[base_w + 3];
         sum += x[base_x + 1*W + 1] * weight[base_w + 4];
         sum += x[base_x + 1*W + 2] * weight[base_w + 5];
         sum += x[base_x + 2*W + 0] * weight[base_w + 6];
         sum += x[base_x + 2*W + 1] * weight[base_w + 7];
         sum += x[base_x + 2*W + 2] * weight[base_w + 8];
       }
       // Channel 1
       {
         int base_x = ((b * 3 + 1) * H + oh) * W + ow;
         int base_w = ((oc * 3 + 1) * 9);
         sum += x[base_x + 0*W + 0] * weight[base_w + 0];
         sum += x[base_x + 0*W + 1] * weight[base_w + 1];
         sum += x[base_x + 0*W + 2] * weight[base_w + 2];
         sum += x[base_x + 1*W + 0] * weight[base_w + 3];
         sum += x[base_x + 1*W + 1] * weight[base_w + 4];
         sum += x[base_x + 1*W + 2] * weight[base_w + 5];
         sum += x[base_x + 2*W + 0] * weight[base_w + 6];
         sum += x[base_x + 2*W + 1] * weight[base_w + 7];
         sum += x[base_x + 2*W + 2] * weight[base_w + 8];
       }
       // Channel 2
       {
         int base_x = ((b * 3 + 2) * H + oh) * W + ow;
         int base_w = ((oc * 3 + 2) * 9);
         sum += x[base_x + 0*W + 0] * weight[base_w + 0];
         sum += x[base_x + 0*W + 1] * weight[base_w + 1];
         sum += x[base_x + 0*W + 2] * weight[base_w + 2];
         sum += x[base_x + 1*W + 0] * weight[base_w + 3];
         sum += x[base_x + 1*W + 1] * weight[base_w + 4];
         sum += x[base_x + 1*W + 2] * weight[base_w + 5];
         sum += x[base_x + 2*W + 0] * weight[base_w + 6];
         sum += x[base_x + 2*W + 1] * weight[base_w + 7];
         sum += x[base_x + 2*W + 2] * weight[base_w + 8];
       }
       
       sum += conv_bias[oc];
       float min_val = fminf(sum, constant_value);
       float result = (min_val + extra_bias[oc]) * scaling_factor;
       int out_index = ((b * OC + oc) * OH + oh) * OW + ow;
       y[out_index] = result;
   }
}

torch::Tensor forward(torch::Tensor x,
                      torch::Tensor conv_weight,
                      torch::Tensor conv_bias,
                      torch::Tensor extra_bias,
                      float constant_value,
                      float scaling_factor) {
    c10::cuda::CUDAGuard device_guard(x.device());
    
    int B = x.size(0);
    int IC = x.size(1);
    int H = x.size(2);
    int W = x.size(3);
    int OC = conv_weight.size(0);
    int K = conv_weight.size(2);
    
    int OH = H - K + 1;
    int OW = W - K + 1;
    
    auto y = torch::empty({B, OC, OH, OW}, x.options());
    
    int total = B * OC * OH * OW;
    int threads = 256;
    int blocks = (total + threads - 1) / threads;
    
    // Choose specialized kernel for common case K==3 and IC==3.
    if (K == 3 && IC == 3) {
        fused_conv_min_bias_scale_kernel_out_3<<<blocks, threads, 0, c10::cuda::getCurrentCUDAStream()>>>(
            x.data_ptr<float>(),
            conv_weight.data_ptr<float>(),
            conv_bias.data_ptr<float>(),
            extra_bias.data_ptr<float>(),
            constant_value,
            scaling_factor,
            y.data_ptr<float>(),
            B, IC, H, W, OC, OH, OW
        );
    } else {
        fused_conv_min_bias_scale_kernel_out_general<<<blocks, threads, 0, c10::cuda::getCurrentCUDAStream()>>>(
            x.data_ptr<float>(),
            conv_weight.data_ptr<float>(),
            conv_bias.data_ptr<float>(),
            extra_bias.data_ptr<float>(),
            constant_value,
            scaling_factor,
            y.data_ptr<float>(),
            B, IC, H, W, OC, K, OH, OW
        );
    }
    
    return y;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &forward, "Fused convolution, min, bias addition, and scaling using a custom CUDA kernel");
}