1.74. 1D Dilated Transposed Convolution
Calculates a transposed 1D convolution on an input tensor using an asymmetric kernel. The operation supports configurable stride, padding, and dilation to control how the input is upsampled.
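For reference, the output length follows L_out = (L_in - 1)*stride - 2*padding + dilation*(kernel_size - 1) + 1, the same formula the host code below uses. A minimal PyTorch sketch that confirms the shape (the concrete sizes here are illustrative assumptions, not the benchmark configuration):

import torch

# Illustrative sizes (assumptions, not the benchmark configuration).
N, in_channels, out_channels = 2, 4, 8
L_in, kernel_size, stride, padding, dilation = 10, 3, 2, 1, 2

# Expected output length from the transposed-convolution formula: here 21.
L_out = (L_in - 1) * stride - 2 * padding + dilation * (kernel_size - 1) + 1

conv = torch.nn.ConvTranspose1d(in_channels, out_channels, kernel_size,
                                stride=stride, padding=padding,
                                dilation=dilation, bias=False)
y = conv(torch.randn(N, in_channels, L_in))
assert y.shape == (N, out_channels, L_out)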
Optimization Timeline
Step 2 Details
Reasoning
In our previous implementation, the transposed 1D convolution kernel was generic and handled all cases by checking, for every output element, whether (p + padding - k*dilation) is divisible by stride. In many practical cases, especially when stride equals 1, this check is unnecessary: the formula simplifies, so we can drop the modulo operation and compute the input index directly. This specialization reduces instruction overhead and improves performance. We therefore add a new kernel, conv_transpose1d_kernel_stride1, optimized for the stride == 1 case. For each output element it computes t = p + padding - k*dilation and, if t is in bounds, accumulates the products over the input channels and kernel positions. We also place a #pragma unroll on the inner loop over kernel positions, since kernel_size is small, which further aids performance. In the host function (forward), we dispatch the specialized kernel when stride == 1 and fall back to the generic kernel otherwise. The Python module remains largely unchanged. This is a genuine, targeted optimization: by bypassing unnecessary checks and unrolling the small inner loop, we reduce computational overhead and improve throughput.
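The index mapping the two kernels implement is y[n, oc, p] = sum over ic and k of x[n, ic, t] * w[ic, oc, k] with t = (p + padding - k*dilation) / stride, where a term contributes only if the division is exact and t is in range. The pure-Python reference below is only a sketch of that accumulation for a single output element; the names mirror the CUDA code, and the stride == 1 branch shows why the divisibility check can be dropped.

def output_element(x, w, n, oc, p, stride, padding, dilation):
    # x: nested lists [N][IC][L_in], w: nested lists [IC][OC][K]; reference sketch only.
    IC, L_in = len(x[n]), len(x[n][0])
    K = len(w[0][0])
    acc = 0.0
    for ic in range(IC):
        for k in range(K):
            value = p + padding - k * dilation
            if stride == 1:
                t = value                 # direct index, no modulo needed
            else:
                if value % stride != 0:
                    continue              # no input sample maps to this (p, k) pair
                t = value // stride
            if 0 <= t < L_in:
                acc += x[n][ic][t] * w[ic][oc][k]
    return acc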
#include <torch/extension.h>
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h>
#include <c10/cuda/CUDAException.h>
#include <cuda.h>
#include <cuda_runtime.h>
// Kernel for generic transposed 1D convolution handling arbitrary stride.
__global__ void conv_transpose1d_kernel_generic(const float* __restrict__ x,
                                                const float* __restrict__ weight,
                                                float* __restrict__ y,
                                                int N, int IC, int L_in,
                                                int OC, int kernel_size,
                                                int stride, int padding, int dilation,
                                                int L_out) {
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int total = N * OC * L_out;
    for (; index < total; index += blockDim.x * gridDim.x) {
        int p = index % L_out;
        int tmp = index / L_out;
        int oc = tmp % OC;
        int n = tmp / OC;
        float sum = 0.0f;
        for (int ic = 0; ic < IC; ic++) {
            #pragma unroll
            for (int k = 0; k < kernel_size; k++) {
                int value = p + padding - k * dilation;
                if (value % stride == 0) {
                    int t = value / stride;
                    if (t >= 0 && t < L_in) {
                        int x_index = n * (IC * L_in) + ic * L_in + t;
                        int w_index = ic * (OC * kernel_size) + oc * kernel_size + k;
                        sum += x[x_index] * weight[w_index];
                    }
                }
            }
        }
        y[index] = sum;
    }
}
// Specialized kernel for transposed 1D convolution when stride == 1.
__global__ void conv_transpose1d_kernel_stride1(const float* __restrict__ x,
                                                const float* __restrict__ weight,
                                                float* __restrict__ y,
                                                int N, int IC, int L_in,
                                                int OC, int kernel_size,
                                                int padding, int dilation,
                                                int L_out) {
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int total = N * OC * L_out;
    for (; index < total; index += blockDim.x * gridDim.x) {
        int p = index % L_out;
        int tmp = index / L_out;
        int oc = tmp % OC;
        int n = tmp / OC;
        float sum = 0.0f;
        for (int ic = 0; ic < IC; ic++) {
            #pragma unroll
            for (int k = 0; k < kernel_size; k++) {
                // For stride == 1, the input index is computed directly (no divisibility check).
                int t = p + padding - k * dilation;
                if (t >= 0 && t < L_in) {
                    int x_index = n * (IC * L_in) + ic * L_in + t;
                    int w_index = ic * (OC * kernel_size) + oc * kernel_size + k;
                    sum += x[x_index] * weight[w_index];
                }
            }
        }
        y[index] = sum;
    }
}
torch::Tensor forward(torch::Tensor x, torch::Tensor weight,
                      int stride, int padding, int dilation, int kernel_size,
                      int in_channels, int out_channels) {
    // The kernels assume contiguous float32 CUDA tensors.
    TORCH_CHECK(x.is_cuda() && weight.is_cuda(), "x and weight must be CUDA tensors");
    TORCH_CHECK(x.is_contiguous() && weight.is_contiguous(), "x and weight must be contiguous");
    // Ensure all operations run on the input's device.
    c10::cuda::CUDAGuard device_guard(x.device());
    // x shape: [N, IC, L_in]
    int N = x.size(0);
    int IC = x.size(1);
    int L_in = x.size(2);
    TORCH_CHECK(IC == in_channels, "in_channels does not match x.size(1)");
    // Compute L_out = (L_in - 1)*stride - 2*padding + dilation*(kernel_size - 1) + 1.
    int L_out = (L_in - 1) * stride - 2 * padding + dilation * (kernel_size - 1) + 1;
    // Allocate output y of shape [N, out_channels, L_out].
    auto y = torch::empty({N, out_channels, L_out}, x.options());
    int total = N * out_channels * L_out;
    int threads = 256;
    int blocks = (total + threads - 1) / threads;
    if (stride == 1) {
        // Use the specialized kernel for stride == 1.
        conv_transpose1d_kernel_stride1<<<blocks, threads, 0, c10::cuda::getCurrentCUDAStream()>>>(
            x.data_ptr<float>(),
            weight.data_ptr<float>(),
            y.data_ptr<float>(),
            N, in_channels, L_in,
            out_channels, kernel_size,
            padding, dilation,
            L_out
        );
    } else {
        // Use the generic kernel.
        conv_transpose1d_kernel_generic<<<blocks, threads, 0, c10::cuda::getCurrentCUDAStream()>>>(
            x.data_ptr<float>(),
            weight.data_ptr<float>(),
            y.data_ptr<float>(),
            N, in_channels, L_in,
            out_channels, kernel_size,
            stride, padding, dilation,
            L_out
        );
    }
    C10_CUDA_KERNEL_LAUNCH_CHECK();
    return y;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &forward, "Custom transposed 1D convolution with dilation using a CUDA kernel");
}
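One possible way to build and sanity-check the extension is with torch.utils.cpp_extension.load and a comparison against torch.nn.functional.conv_transpose1d. The file name, module name, and sizes below are illustrative assumptions; the weight uses ConvTranspose1d's [in_channels, out_channels, kernel_size] layout, which matches the indexing in the kernels above.

import torch
import torch.nn.functional as F
from torch.utils.cpp_extension import load

# File and module names are assumptions for this sketch.
ext = load(name="conv_transpose1d_dilated",
           sources=["conv_transpose1d_dilated.cu"])

N, IC, OC, L_in = 2, 4, 8, 32
K, stride, padding, dilation = 3, 1, 1, 2

x = torch.randn(N, IC, L_in, device="cuda")
w = torch.randn(IC, OC, K, device="cuda")  # ConvTranspose1d weight layout

y = ext.forward(x, w, stride, padding, dilation, K, IC, OC)
y_ref = F.conv_transpose1d(x, w, bias=None, stride=stride,
                           padding=padding, dilation=dilation)
print(torch.allclose(y, y_ref, atol=1e-4))

With stride = 1 this exercises the specialized kernel; any other stride exercises the generic path.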