2.100. 3D Transposed Convolution with Clamping and Division
Performs a 3D transposed convolution on the input, clamps the result so it does not fall below a specified minimum value, and then scales the output by dividing it by a constant divisor.
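For reference, the operation corresponds to the following PyTorch sketch; the functional form and argument names are illustrative assumptions, not code taken from the original model.

import torch
import torch.nn.functional as F

def reference_op(x, weight, bias, min_value, divisor):
    # 3D transposed convolution, then clamp from below, then divide by a constant.
    y = F.conv_transpose3d(x, weight, bias)
    return torch.clamp(y, min=min_value) / divisor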
Optimization Timeline
Step 3 Details
Reasoning
We can further optimize by vectorizing the kernel: if the total number of elements is divisible by 4 and the buffers are 16-byte aligned, the memory can be reinterpreted as an array of float4. Each thread then processes four floats at once, applying the clamp (via fmaxf) and multiplying by a precomputed reciprocal of the divisor, which avoids a per-element division. This reduces the number of threads launched and improves memory throughput. The Python module remains essentially unchanged except for an updated description. In the CUDA code, we add a vectorized kernel, compute the reciprocal (inv_divisor) once on the host, and check that input.numel() is divisible by 4 and that both the input and output pointers are 16-byte aligned before dispatching the vectorized kernel. These changes help by reducing per-element arithmetic and the number of memory transactions.
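Only the CUDA source is reproduced here; a minimal sketch of what the Python wrapper module plausibly looks like follows, assuming the compiled extension is importable as fused_clamp_div and exposes the forward function defined below (the module name, class name, and constructor arguments are assumptions).

import torch
import torch.nn as nn
import fused_clamp_div  # compiled CUDA extension (name is an assumption)

class ConvTranspose3dClampDivide(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, min_value, divisor):
        super().__init__()
        self.conv_t = nn.ConvTranspose3d(in_channels, out_channels, kernel_size)
        self.min_value = float(min_value)
        self.divisor = float(divisor)

    def forward(self, x):
        x = self.conv_t(x)
        # Fused clamp + divide in a single kernel launch.
        return fused_clamp_div.forward(x.contiguous(), self.min_value, self.divisor)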
#include <torch/extension.h>
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h>
#include <cuda_runtime.h>
#include <stdint.h>
// Scalar kernel: For each element, compute y = max(x, min_value) * inv_divisor.
__global__ void fused_clamp_div_kernel(const float* __restrict__ input,
                                       float* __restrict__ output,
                                       int total,
                                       float min_value,
                                       float inv_divisor) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < total) {
        float val = input[idx];
        output[idx] = fmaxf(val, min_value) * inv_divisor;
    }
}
// Vectorized kernel: Process four floats at a time using float4.
__global__ void fused_clamp_div_kernel_vec(const float4* __restrict__ input,
                                           float4* __restrict__ output,
                                           int total_vec,  // element count in units of float4
                                           float min_value,
                                           float inv_divisor) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < total_vec) {
        float4 v = input[idx];
        v.x = fmaxf(v.x, min_value) * inv_divisor;
        v.y = fmaxf(v.y, min_value) * inv_divisor;
        v.z = fmaxf(v.z, min_value) * inv_divisor;
        v.w = fmaxf(v.w, min_value) * inv_divisor;
        output[idx] = v;
    }
}
torch::Tensor forward(torch::Tensor input, float min_value, float divisor) {
    // Ensure we are on the correct CUDA device.
    c10::cuda::CUDAGuard device_guard(input.device());
    TORCH_CHECK(input.is_cuda(), "input must be a CUDA tensor");
    TORCH_CHECK(input.scalar_type() == torch::kFloat32, "input must be float32");
    // The kernels index a dense buffer, so make the input contiguous.
    input = input.contiguous();
    // Allocate the output tensor with the same shape and options as the input.
    auto output = torch::empty_like(input);
    int total = static_cast<int>(input.numel());
    // Precompute the reciprocal once so the kernels multiply instead of divide.
    float inv_divisor = 1.0f / divisor;
    int threads = 256;
    // Use the vectorized kernel when the element count is divisible by 4 and
    // both the input and output pointers are 16-byte aligned.
    if ((total % 4 == 0) &&
        ((reinterpret_cast<uintptr_t>(input.data_ptr<float>()) % 16) == 0) &&
        ((reinterpret_cast<uintptr_t>(output.data_ptr<float>()) % 16) == 0)) {
        int total_vec = total / 4;
        int blocks = (total_vec + threads - 1) / threads;
        const float4* input_vec = reinterpret_cast<const float4*>(input.data_ptr<float>());
        float4* output_vec = reinterpret_cast<float4*>(output.data_ptr<float>());
        fused_clamp_div_kernel_vec<<<blocks, threads, 0, c10::cuda::getCurrentCUDAStream()>>>(
            input_vec, output_vec, total_vec, min_value, inv_divisor);
    } else {
        int blocks = (total + threads - 1) / threads;
        fused_clamp_div_kernel<<<blocks, threads, 0, c10::cuda::getCurrentCUDAStream()>>>(
            input.data_ptr<float>(), output.data_ptr<float>(), total, min_value, inv_divisor);
    }
    return output;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &forward, "Fused clamp and division operation with vectorized path when possible");
}
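One way to build and exercise the extension is shown below, assuming the source above is saved as fused_clamp_div.cu; the file name, module name, and test sizes are assumptions.

import torch
from torch.utils.cpp_extension import load

# JIT-compile the CUDA source above.
fused = load(name="fused_clamp_div", sources=["fused_clamp_div.cu"], verbose=True)

x = torch.randn(16, 8, 12, 12, 12, device="cuda")  # e.g. the transposed-convolution output
y = fused.forward(x, 0.0, 2.0)                      # clamp at 0.0, then divide by 2.0
assert torch.allclose(y, torch.clamp(x, min=0.0) / 2.0)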