1.44. 1D Average Pooling
Calculates a one-dimensional average pooling operation by computing the mean of values within a sliding window, using configurable kernel size, stride, and padding to aggregate portions of the input sequence.
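For concreteness, the semantics the CUDA kernel must reproduce can be stated as a plain CPU loop. The sketch below is only a reference illustration, assuming float input in [batch, channels, length] layout and PyTorch's default count_include_pad=True behavior (the window sum is always divided by kernel_size); the name avg_pool1d_reference is introduced here for illustration and is not part of the extension.

#include <vector>

// Hypothetical CPU reference for 1D average pooling with count_include_pad = true:
// each window sum is divided by kernel_size even when part of the window falls
// into the zero padding.
std::vector<float> avg_pool1d_reference(const std::vector<float>& in,
                                        int batch, int channels, int length,
                                        int kernel_size, int stride, int padding) {
    int out_length = (length + 2 * padding - kernel_size) / stride + 1;
    std::vector<float> out(static_cast<size_t>(batch) * channels * out_length, 0.0f);
    for (int b = 0; b < batch; ++b) {
        for (int c = 0; c < channels; ++c) {
            for (int i = 0; i < out_length; ++i) {
                float sum = 0.0f;
                for (int k = 0; k < kernel_size; ++k) {
                    int idx = i * stride - padding + k;  // position in the unpadded input
                    if (idx >= 0 && idx < length) {
                        sum += in[(static_cast<size_t>(b) * channels + c) * length + idx];
                    }
                }
                out[(static_cast<size_t>(b) * channels + c) * out_length + i] = sum / kernel_size;
            }
        }
    }
    return out;
}

The output length follows the usual formula floor((input_length + 2*padding - kernel_size) / stride) + 1, matching the computation in the forward function below.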
Step 3 Details
Reasoning
We can further optimize the 1D average pooling kernel by reducing division overhead and by using the read-only __ldg intrinsic in the fully-valid-window branch. In most cases the pooling window lies entirely within the input bounds, so we precompute the reciprocal of kernel_size once and multiply by it, avoiding a division for every output element. In that valid branch we also fetch input values with __ldg, which routes the loads through the read-only data cache. For windows that overlap the padded boundary we still check each element individually. These changes reduce per-thread arithmetic and improve memory throughput; the speedup comes from eliminating redundant computation and using hardware-friendly memory access, not from bypassing the operation itself.
#include <torch/extension.h>
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h>
#include <cuda.h>
#include <cuda_runtime.h>
// Optimized CUDA kernel for 1D average pooling.
__global__ void avg_pool1d_kernel(const float* __restrict__ input,
                                  float* __restrict__ output,
                                  int batch_size,
                                  int channels,
                                  int input_length,
                                  int output_length,
                                  int kernel_size,
                                  int stride,
                                  int padding) {
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int total = batch_size * channels * output_length;
    if (index < total) {
        // Map the linear index to (batch, channel, out_idx).
        int tmp = index;
        int out_idx = tmp % output_length;
        tmp /= output_length;
        int channel = tmp % channels;
        int batch = tmp / channels;

        // Compute the starting index of the pooling window (may be negative due to padding).
        int start = out_idx * stride - padding;
        int input_base = batch * channels * input_length + channel * input_length;

        float sum = 0.0f;
        float invKernel = 1.0f / kernel_size;

        // Fast path: the entire pooling window is within bounds, so load via __ldg.
        if (start >= 0 && (start + kernel_size) <= input_length) {
            #pragma unroll
            for (int k = 0; k < kernel_size; k++) {
                sum += __ldg(&input[input_base + start + k]);
            }
        } else {
            // Boundary case: check each element against the input bounds.
            for (int k = 0; k < kernel_size; k++) {
                int in_idx = start + k;
                if (in_idx >= 0 && in_idx < input_length) {
                    sum += input[input_base + in_idx];
                }
            }
        }

        // Multiply by the precomputed reciprocal instead of dividing by kernel_size.
        output[index] = sum * invKernel;
    }
}
torch::Tensor forward(torch::Tensor x, int kernel_size, int stride, int padding) {
    // Basic input validation: the kernel assumes a contiguous float32 CUDA tensor.
    TORCH_CHECK(x.is_cuda(), "input must be a CUDA tensor");
    TORCH_CHECK(x.dim() == 3, "input must have shape [batch_size, channels, input_length]");
    TORCH_CHECK(x.scalar_type() == torch::kFloat, "input must be float32");
    x = x.contiguous();

    // Enforce correct CUDA device usage.
    c10::cuda::CUDAGuard device_guard(x.device());

    // Get dimensions: x shape is [batch_size, channels, input_length].
    int batch_size = x.size(0);
    int channels = x.size(1);
    int input_length = x.size(2);

    // Output length: floor((input_length + 2*padding - kernel_size) / stride) + 1.
    int output_length = (input_length + 2 * padding - kernel_size) / stride + 1;

    // Allocate the output tensor.
    auto output = torch::empty({batch_size, channels, output_length}, x.options());

    int total = batch_size * channels * output_length;
    int threads = 256;
    int blocks = (total + threads - 1) / threads;

    avg_pool1d_kernel<<<blocks, threads, 0, c10::cuda::getCurrentCUDAStream()>>>(
        x.data_ptr<float>(),
        output.data_ptr<float>(),
        batch_size,
        channels,
        input_length,
        output_length,
        kernel_size,
        stride,
        padding
    );

    return output;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &forward, "1D average pooling (optimized CUDA kernel with reciprocal division and __ldg)");
}