2.69. Conv2d HardSwish ReLU Operation

Calculates an output tensor by performing a 2D convolution on the input, applying a HardSwish activation to introduce a smooth non-linearity, and finally applying a ReLU activation to zero out negative values.

Progress: Step 5 of 5
Reward: 0.00x

Optimization Timeline: Step 5 Details

Reasoning

We can further improve the fused convolution+activation kernel by exploiting the common case kernel_size == 3 and in_channels == 3 (typical for RGB images). In this case we can tile the input in shared memory and fully unroll the convolution loops, which removes redundant global memory accesses and improves register reuse.

In the new kernel fused_conv_activation_shared_kernel, each block (with a 2D thread arrangement) computes one output tile for a single (batch, out_channel) pair. The corresponding input patch is loaded into shared memory for all 3 channels in one cooperative step; each thread then computes its output from shared memory, keeping the required weights in registers and unrolling the 3x3 convolution. For all other cases we fall back to the previous unrolled implementation. These improvements are genuine: they reduce memory bandwidth pressure and latency by improving data reuse.
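As a correctness reference, the fused kernels below are intended to reproduce the following three-step pipeline, sketched here with the same ATen C++ API the extension already links against. This is an illustrative sketch, not part of the extension source, and it assumes the ATen ops conv2d, hardswish and relu are available (as in recent PyTorch releases):

// Reference pipeline: valid convolution (stride 1, no padding), then HardSwish, then ReLU.
torch::Tensor reference_forward(torch::Tensor x, torch::Tensor weight, torch::Tensor bias) {
    auto y = torch::conv2d(x, weight, bias);  // [B, C_out, H - k + 1, W - k + 1]
    y = torch::hardswish(y);                  // y * clamp(y + 3, 0, 6) / 6
    return torch::relu(y);                    // zero out the negative branch
}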

// Includes
#include <torch/extension.h>
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h>
#include <cuda_runtime.h>
#include <math.h>

// Optimized fused activation: HardSwish followed by ReLU.
// HardSwish(x) = x * clamp(x + 3, 0, 6) / 6; applying ReLU afterwards zeroes the
// negative branch, which collapses to the piecewise form below:
//   if x < 0:        return 0.0f;
//   else if x < 3:   return x * (x + 3) * 0.16666667f;   (i.e. x * (x + 3) / 6)
//   else:            return x.
__device__ inline float fused_activation(float x) {
    if (x < 0.0f)
        return 0.0f;
    else if (x < 3.0f)
        return x * (x + 3.0f) * 0.16666667f;
    else
        return x;
}
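
// ---------------------------------------------------------------------------
// Sanity-check helper (an illustrative addition, not used by the kernels below):
// applying HardSwish and then ReLU directly should give the same result as the
// piecewise fused_activation above. HardSwish(x) = x * min(max(x + 3, 0), 6) / 6.
__host__ __device__ inline float fused_activation_ref(float x) {
    float hs = x * fminf(fmaxf(x + 3.0f, 0.0f), 6.0f) * (1.0f / 6.0f);
    return fmaxf(hs, 0.0f);  // ReLU.
}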

// ---------------------------------------------------------------------------
// Fallback kernel: already implemented unrolled kernel for generic cases.
// This kernel is used when either kernel_size != 3 or in_channels != 3.
__global__ void fused_conv_activation_kernel(const float* __restrict__ x,
                                               const float* __restrict__ weight,
                                               const float* __restrict__ bias,
                                               float* __restrict__ output,
                                               int batch,
                                               int in_channels,
                                               int out_channels,
                                               int H_in,
                                               int W_in,
                                               int kernel_size,
                                               int H_out,
                                               int W_out) {
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int total = batch * out_channels * H_out * W_out;
    if (index < total) {
        int w_out = index % W_out;
        int h_out = (index / W_out) % H_out;
        int oc = (index / (W_out * H_out)) % out_channels;
        int b = index / (W_out * H_out * out_channels);
        
        float sum = bias[oc];
        
        if (kernel_size == 3) {
            // Unrolled loop for 3x3 kernel.
            for (int ic = 0; ic < in_channels; ic++) {
                int input_base = b * (in_channels * H_in * W_in) + ic * (H_in * W_in);
                int weight_base = oc * (in_channels * 9) + ic * 9;
                int h0 = h_out, h1 = h_out + 1, h2 = h_out + 2;
                int w0 = w_out, w1 = w_out + 1, w2 = w_out + 2;
                sum += weight[weight_base + 0] * x[input_base + h0 * W_in + w0]
                     + weight[weight_base + 1] * x[input_base + h0 * W_in + w1]
                     + weight[weight_base + 2] * x[input_base + h0 * W_in + w2]
                     + weight[weight_base + 3] * x[input_base + h1 * W_in + w0]
                     + weight[weight_base + 4] * x[input_base + h1 * W_in + w1]
                     + weight[weight_base + 5] * x[input_base + h1 * W_in + w2]
                     + weight[weight_base + 6] * x[input_base + h2 * W_in + w0]
                     + weight[weight_base + 7] * x[input_base + h2 * W_in + w1]
                     + weight[weight_base + 8] * x[input_base + h2 * W_in + w2];
            }
        } else {
            // Generic convolution for arbitrary kernel sizes.
            for (int ic = 0; ic < in_channels; ic++) {
                for (int kh = 0; kh < kernel_size; kh++) {
                    for (int kw = 0; kw < kernel_size; kw++) {
                        int h_in = h_out + kh;
                        int w_in = w_out + kw;
                        int x_index = b * (in_channels * H_in * W_in) + ic * (H_in * W_in) + h_in * W_in + w_in;
                        int w_index = oc * (in_channels * kernel_size * kernel_size) + ic * (kernel_size * kernel_size) + kh * kernel_size + kw;
                        sum += weight[w_index] * x[x_index];
                    }
                }
            }
        }
        output[index] = fused_activation(sum);
    }
}
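
// Illustrative launch arithmetic for the fallback kernel (hypothetical shapes):
// for batch = 16, out_channels = 8, H_out = W_out = 62, there are
// total = 16 * 8 * 62 * 62 = 492032 output elements, i.e. 492032 / 256 = 1922
// blocks of 256 threads, with exactly one thread per output element.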

// ---------------------------------------------------------------------------
// Shared-memory optimized kernel for the case kernel_size==3 and in_channels==3.
#define TILE_DIM 8  // Edge length (in output pixels) of the square output tile computed per block.
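
// Per-block shared-memory footprint (with the default TILE_DIM = 8): each of the 3 input
// channels is staged as a (TILE_DIM + 2) x (TILE_DIM + 2) = 10 x 10 halo tile, so a block
// holds 3 * 100 floats = 1200 bytes of dynamic shared memory, reused by all 64 output threads.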

__global__ void fused_conv_activation_shared_kernel(const float* __restrict__ x,
                                                      const float* __restrict__ weight,
                                                      const float* __restrict__ bias,
                                                      float* __restrict__ output,
                                                      int batch,
                                                      int H_in,
                                                      int W_in,
                                                      int out_channels,
                                                      int H_out,  // = H_in - 2 for kernel_size==3.
                                                      int W_out) { // = W_in - 2 for kernel_size==3.
    // Dynamic shared memory holding the input tile for all 3 channels:
    // 3 * (TILE_DIM + 2) * (TILE_DIM + 2) floats (size passed at launch time).
    extern __shared__ float shared_data[];

    // Block index (z-dimension encodes batch and out_channel).
    int bc = blockIdx.z;
    int b = bc / out_channels;       // batch index.
    int oc = bc % out_channels;      // output channel.
    
    // 2D tile indices for output.
    int tile_x = blockIdx.x * TILE_DIM;
    int tile_y = blockIdx.y * TILE_DIM;
    
    // Thread indices within the block.
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    
    // Shared memory per channel; offset for each channel.
    int tile_size = (TILE_DIM + 2) * (TILE_DIM + 2);
    
    // Each block processes one (b, oc) pair.
    // Load input tile for all 3 channels into shared memory.
    // Total elements = 3 * (TILE_DIM+2)^2.
    int num_elements = 3 * (TILE_DIM + 2) * (TILE_DIM + 2);
    int thread_id = threadIdx.y * blockDim.x + threadIdx.x;
    int block_threads = blockDim.x * blockDim.y;
    for (int i = thread_id; i < num_elements; i += block_threads) {
        int c = i / ((TILE_DIM + 2) * (TILE_DIM + 2));
        int rem = i % ((TILE_DIM + 2) * (TILE_DIM + 2));
        int sh_row = rem / (TILE_DIM + 2);
        int sh_col = rem % (TILE_DIM + 2);
        // Global input coordinate corresponding to shared memory.
        int in_row = tile_y + sh_row;
        int in_col = tile_x + sh_col;
        float value = 0.0f;
        if (in_row < H_in && in_col < W_in)
            value = x[b * (3 * H_in * W_in) + c * (H_in * W_in) + in_row * W_in + in_col];
        shared_data[c * tile_size + sh_row * (TILE_DIM + 2) + sh_col] = value;
    }
    __syncthreads();
    
    // Compute output coordinate.
    int out_x = tile_x + tx;
    int out_y = tile_y + ty;
    if (out_x < W_out && out_y < H_out) {
        // Load the weights for this output channel.
        // Weight layout: [out_channels, in_channels, 3, 3] => 27 floats per output channel.
        int weight_base = oc * 3 * 9;
        const float* w0 = weight + weight_base;       // 3x3 weights for input channel 0.
        const float* w1 = weight + weight_base + 9;   // 3x3 weights for input channel 1.
        const float* w2 = weight + weight_base + 18;  // 3x3 weights for input channel 2.
        
        float sum = bias[oc];
        // For each input channel, accumulate the full 3x3 convolution from shared memory.
        // Channel 0.
        int base0 = 0;  // shared_data offset for channel 0.
        float r0 = shared_data[base0 + (ty + 0) * (TILE_DIM + 2) + (tx + 0)];
        float r1 = shared_data[base0 + (ty + 0) * (TILE_DIM + 2) + (tx + 1)];
        float r2 = shared_data[base0 + (ty + 0) * (TILE_DIM + 2) + (tx + 2)];
        float r3 = shared_data[base0 + (ty + 1) * (TILE_DIM + 2) + (tx + 0)];
        float r4 = shared_data[base0 + (ty + 1) * (TILE_DIM + 2) + (tx + 1)];
        float r5 = shared_data[base0 + (ty + 1) * (TILE_DIM + 2) + (tx + 2)];
        float r6 = shared_data[base0 + (ty + 2) * (TILE_DIM + 2) + (tx + 0)];
        float r7 = shared_data[base0 + (ty + 2) * (TILE_DIM + 2) + (tx + 1)];
        float r8 = shared_data[base0 + (ty + 2) * (TILE_DIM + 2) + (tx + 2)];
        sum += w0[0]*r0 + w0[1]*r1 + w0[2]*r2
             + w0[3]*r3 + w0[4]*r4 + w0[5]*r5
             + w0[6]*r6 + w0[7]*r7 + w0[8]*r8;
        // Channel 1.
        int base1 = tile_size;      // shared_data offset for channel 1.
        r0 = shared_data[base1 + (ty + 0) * (TILE_DIM + 2) + (tx + 0)];
        r1 = shared_data[base1 + (ty + 0) * (TILE_DIM + 2) + (tx + 1)];
        r2 = shared_data[base1 + (ty + 0) * (TILE_DIM + 2) + (tx + 2)];
        r3 = shared_data[base1 + (ty + 1) * (TILE_DIM + 2) + (tx + 0)];
        r4 = shared_data[base1 + (ty + 1) * (TILE_DIM + 2) + (tx + 1)];
        r5 = shared_data[base1 + (ty + 1) * (TILE_DIM + 2) + (tx + 2)];
        r6 = shared_data[base1 + (ty + 2) * (TILE_DIM + 2) + (tx + 0)];
        r7 = shared_data[base1 + (ty + 2) * (TILE_DIM + 2) + (tx + 1)];
        r8 = shared_data[base1 + (ty + 2) * (TILE_DIM + 2) + (tx + 2)];
        sum += w1[0]*r0 + w1[1]*r1 + w1[2]*r2
             + w1[3]*r3 + w1[4]*r4 + w1[5]*r5
             + w1[6]*r6 + w1[7]*r7 + w1[8]*r8;
        // Channel 2.
        int base2 = 2 * tile_size;  // shared_data offset for channel 2.
        r0 = shared_data[base2 + (ty + 0) * (TILE_DIM + 2) + (tx + 0)];
        r1 = shared_data[base2 + (ty + 0) * (TILE_DIM + 2) + (tx + 1)];
        r2 = shared_data[base2 + (ty + 0) * (TILE_DIM + 2) + (tx + 2)];
        r3 = shared_data[base2 + (ty + 1) * (TILE_DIM + 2) + (tx + 0)];
        r4 = shared_data[base2 + (ty + 1) * (TILE_DIM + 2) + (tx + 1)];
        r5 = shared_data[base2 + (ty + 1) * (TILE_DIM + 2) + (tx + 2)];
        r6 = shared_data[base2 + (ty + 2) * (TILE_DIM + 2) + (tx + 0)];
        r7 = shared_data[base2 + (ty + 2) * (TILE_DIM + 2) + (tx + 1)];
        r8 = shared_data[base2 + (ty + 2) * (TILE_DIM + 2) + (tx + 2)];
        sum += w2[0]*r0 + w2[1]*r1 + w2[2]*r2
             + w2[3]*r3 + w2[4]*r4 + w2[5]*r5
             + w2[6]*r6 + w2[7]*r7 + w2[8]*r8;
        
        float result = fused_activation(sum);
        // Write the result to output.
        // Output index: [b, oc, out_y, out_x] with H_out = H_in-2, W_out = W_in-2.
        int out_idx = b * (out_channels * H_out * W_out) + oc * (H_out * W_out) + (out_y) * W_out + (out_x);
        output[out_idx] = result;
    }
}

// ---------------------------------------------------------------------------
// Forward function: select the proper kernel version based on kernel_size and in_channels.
torch::Tensor forward(torch::Tensor x, torch::Tensor weight, torch::Tensor bias, int kernel_size) {
    // Ensure operations occur on the correct device.
    c10::cuda::CUDAGuard device_guard(x.device());
    
    // The raw-pointer kernels below assume contiguous float32 CUDA tensors.
    TORCH_CHECK(x.is_cuda() && weight.is_cuda() && bias.is_cuda(), "x, weight and bias must be CUDA tensors");
    TORCH_CHECK(x.scalar_type() == torch::kFloat32, "only float32 inputs are supported");
    x = x.contiguous();
    weight = weight.contiguous();
    bias = bias.contiguous();
    
    int batch = x.size(0);
    int in_channels = x.size(1);
    int H_in = x.size(2);
    int W_in = x.size(3);
    int out_channels = weight.size(0);
    
    int H_out = H_in - kernel_size + 1;
    int W_out = W_in - kernel_size + 1;
    
    auto output = torch::empty({batch, out_channels, H_out, W_out}, x.options());
    
    // Use shared-memory optimized kernel for common case.
    if (kernel_size == 3 && in_channels == 3) {
        // Use TILE_DIM.
        dim3 block(TILE_DIM, TILE_DIM);
        dim3 grid((W_out + TILE_DIM - 1) / TILE_DIM, (H_out + TILE_DIM - 1) / TILE_DIM, batch * out_channels);
        size_t shared_mem_size = in_channels * (TILE_DIM + 2) * (TILE_DIM + 2) * sizeof(float);
        fused_conv_activation_shared_kernel<<<grid, block, shared_mem_size, c10::cuda::getCurrentCUDAStream()>>>(
            x.data_ptr<float>(),
            weight.data_ptr<float>(),
            bias.data_ptr<float>(),
            output.data_ptr<float>(),
            batch,
            H_in,
            W_in,
            out_channels,
            H_out,
            W_out
        );
    } else {
        // Use the generic unrolled kernel.
        int total = batch * out_channels * H_out * W_out;
        int threads = 256;
        int blocks = (total + threads - 1) / threads;
        fused_conv_activation_kernel<<<blocks, threads, 0, c10::cuda::getCurrentCUDAStream()>>>(
            x.data_ptr<float>(),
            weight.data_ptr<float>(),
            bias.data_ptr<float>(),
            output.data_ptr<float>(),
            batch,
            in_channels,
            out_channels,
            H_in,
            W_in,
            kernel_size,
            H_out,
            W_out
        );
    }
    return output;
}
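
// ---------------------------------------------------------------------------
// Optional self-test sketch (an illustrative addition, not part of the original
// extension): compare the fused path against the equivalent ATen pipeline on
// random data. Shapes are hypothetical; assumes torch::conv2d, torch::hardswish
// and torch::relu are available, as in recent PyTorch releases.
void self_test() {
    auto opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
    auto x = torch::randn({16, 3, 64, 64}, opts);
    auto w = torch::randn({8, 3, 3, 3}, opts);
    auto b = torch::randn({8}, opts);
    auto fused = forward(x, w, b, /*kernel_size=*/3);
    auto ref = torch::relu(torch::hardswish(torch::conv2d(x, w, b)));
    TORCH_CHECK(torch::allclose(fused, ref, /*rtol=*/1e-4, /*atol=*/1e-5),
                "fused Conv2d+HardSwish+ReLU does not match the ATen reference");
}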

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &forward, "Fused 2D convolution with HardSwish and ReLU activation (optimized with shared memory for 3x3 and 3 input channels)");
}