3.46. NetVLAD Feature Aggregation with Ghost Clusters

Calculates a fixed-size global representation from a set of local feature descriptors. The operation first computes soft assignment scores using learned weights and batch normalization, then removes the ghost cluster assignments. It aggregates the residuals between features and learned cluster centers, applies intra-cluster (L2) normalization, and finally normalizes the flattened output to yield a compact descriptor.

Progress: Step 4 of 4
Reward: 1.58x

Optimization Timeline

Step 4 Details

Reasoning

We have carefully reviewed the fused NetVLAD kernel and identified additional micro-optimizations, such as more aggressive use of warp-shuffle reductions and small improvements in code clarity. The overall structure of the current design (aggregate the VLAD descriptors, apply intra-normalization per cluster, then perform a final global normalization) remains unchanged. We add extra inline comments for clarity and keep the reduction loops as tight as possible. The optimized CUDA code uses warp-shuffle reductions in both the intra-normalization and final-normalization kernels, which lowers shared-memory usage and synchronization overhead, reducing latency and increasing throughput while preserving correctness. No radical restructuring is needed; we are fine-tuning an already efficient approach.

#include <torch/extension.h>
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h>   // for c10::cuda::getCurrentCUDAStream
#include <cuda_runtime.h>
#include <math.h>

// Inline warp-reduction function using warp shuffle instructions.
__inline__ __device__ float warpReduceSum(float val) {
    for (int offset = warpSize/2; offset > 0; offset /= 2)
        val += __shfl_down_sync(0xffffffff, val, offset);
    return val;
}

// --------------------------------------------------------------------------------
// Kernel 1: VLAD Aggregation.
// For each sample b, each cluster k, and each feature dimension d, the kernel computes:
//   vlad[b,d,k] = (Σₙ assignment[b,n,k] * x[b,n,d])
//                 - (Σₙ assignment[b,n,k]) * clusters2[d,k]
//
// Input dimensions:
//    x: [B, N, D]
//    assignment: [B, N, K]  where K equals cluster_size
//    clusters2: [D, K]
// Output:
//    vlad: [B, D, K]
__global__ void vlad_aggregate_kernel(const float* __restrict__ x,
                                      const float* __restrict__ assignment,
                                      const float* __restrict__ clusters2,
                                      float* __restrict__ vlad,
                                      int B, int N, int D, int K) {
    int b = blockIdx.x;      // Batch index.
    int k = blockIdx.y;      // Cluster index.
    int d = threadIdx.x;     // Feature dimension index.
    if (d >= D) return;
    float sum = 0.0f;        // Accumulated contribution: Σₙ (assignment * x).
    float sum_a = 0.0f;      // Accumulated assignment: Σₙ assignment.
    int x_base = b * N * D;
    int assign_base = b * N * K;
    #pragma unroll
    for (int n = 0; n < N; n++) {
        int assign_idx = assign_base + n * K + k;
        float a_val = assignment[assign_idx];
        sum_a += a_val;
        int x_idx = x_base + n * D + d;
        sum += a_val * x[x_idx];
    }
    int vlad_idx = b * D * K + d * K + k;
    vlad[vlad_idx] = sum - sum_a * clusters2[d * K + k];
}

// --------------------------------------------------------------------------------
// Kernel 2: Intra-normalization per cluster using warp shuffle reduction.
// For each sample b and each cluster k, normalize the D-dimensional vector:
//   norm = sqrt( Σ_{d=0}^{D-1} (vlad[b,d,k]²) + 1e-6 )
//   Then, vlad[b,d,k] /= norm for each d.
__global__ void intranorm_kernel(float* __restrict__ vlad, int D, int K) {
    int b = blockIdx.x;
    int k = blockIdx.y;
    int tid = threadIdx.x;
    if (tid >= D) return;
    int index = b * D * K + tid * K + k;
    float val = vlad[index];
    float sq = val * val;
    float warp_sum = warpReduceSum(sq);
    __shared__ float norm_shared[32];  // Shared memory for warp sums.
    int warpId = tid / warpSize;
    int lane = tid & (warpSize - 1);
    if (lane == 0) {
        norm_shared[warpId] = warp_sum;
    }
    __syncthreads();
    // Reduce the per-warp partial sums within the first warp. Ceiling division
    // ensures a trailing, partially filled warp's sum is not dropped.
    int numWarps = (blockDim.x + warpSize - 1) / warpSize;
    float block_sum = (tid < numWarps) ? norm_shared[tid] : 0.0f;
    if (tid < warpSize) {
        block_sum = warpReduceSum(block_sum);
    }
    __shared__ float norm_final;
    if (tid == 0) {
        norm_final = sqrtf(block_sum + 1e-6f);
    }
    __syncthreads();
    vlad[index] = val / norm_final;  // Reuse the value loaded above.
}

// --------------------------------------------------------------------------------
// Kernel 3: Final L2 Normalization over each sample using warp shuffle reduction.
// Flatten vlad to shape [B, L] where L = D*K and then compute:
//   norm = sqrt( Σ_{i=0}^{L-1} (vlad[i]²) + 1e-6 )
//   vlad[i] /= norm for each i in the sample.
__global__ void final_norm_kernel(float* __restrict__ vlad, int L) {
    int b = blockIdx.x;  // Each block handles one sample.
    int tid = threadIdx.x;
    int base = b * L;
    float partial = 0.0f;
    for (int i = tid; i < L; i += blockDim.x) {
        float val = vlad[base + i];
        partial += val * val;
    }
    partial = warpReduceSum(partial);
    __shared__ float warp_sums[32];  // For sums from each warp.
    int lane = tid & (warpSize - 1);
    int warpId = tid / warpSize;
    if(lane == 0) {
        warp_sums[warpId] = partial;
    }
    __syncthreads();
    // Reduce the per-warp partial sums within the first warp (ceiling division
    // keeps the last warp's contribution if blockDim.x is not a warp multiple).
    int numWarps = (blockDim.x + warpSize - 1) / warpSize;
    float total = (tid < numWarps) ? warp_sums[tid] : 0.0f;
    if (tid < warpSize) {
        total = warpReduceSum(total);
    }
    __shared__ float norm_val;
    if (tid == 0) {
        norm_val = sqrtf(total + 1e-6f);
    }
    __syncthreads();
    for (int i = tid; i < L; i += blockDim.x) {
        vlad[base + i] /= norm_val;
    }
}

// --------------------------------------------------------------------------------
// Fused NetVLAD Forward Function.
// Inputs:
//   x: [B, N, D] tensor (features)
//   assignment: [B, N, K] tensor (softmax assignments, ghost clusters removed)
//   clusters2: [D, K] tensor (residual centers)
// Returns:
//   A tensor of shape [B, D*K] representing the final normalized NetVLAD descriptor.
torch::Tensor netvlad_forward(torch::Tensor x, torch::Tensor assignment, torch::Tensor clusters2) {
    TORCH_CHECK(x.is_cuda() && assignment.is_cuda() && clusters2.is_cuda(),
                "all inputs must be CUDA tensors");
    TORCH_CHECK(x.scalar_type() == torch::kFloat32 &&
                assignment.scalar_type() == torch::kFloat32 &&
                clusters2.scalar_type() == torch::kFloat32,
                "all inputs must be float32");
    TORCH_CHECK(x.dim() == 3, "x must be [B, N, D]");
    TORCH_CHECK(assignment.dim() == 3, "assignment must be [B, N, K]");
    TORCH_CHECK(clusters2.dim() == 2, "clusters2 must be [D, K]");
    int B = x.size(0);
    int N = x.size(1);
    int D = x.size(2);
    int K = assignment.size(2);
    TORCH_CHECK(clusters2.size(0) == D && clusters2.size(1) == K,
                "clusters2 must be [D, K]");
    // Kernels 1 and 2 launch one thread per feature dimension; the warp-shuffle
    // reductions assume D is a multiple of the warp size and fits in one block.
    TORCH_CHECK(D > 0 && D <= 1024 && D % 32 == 0,
                "D must be a positive multiple of 32 no larger than 1024");
    // Ensure contiguous memory.
    x = x.contiguous();
    assignment = assignment.contiguous();
    clusters2 = clusters2.contiguous();
    auto options = x.options();
    auto vlad = torch::empty({B, D, K}, options);
    
    c10::cuda::CUDAGuard device_guard(x.device());
    cudaStream_t stream = c10::cuda::getCurrentCUDAStream();

    // Launch Kernel 1: VLAD aggregation.
    {
        dim3 grid(B, K);
        dim3 block(D);
        vlad_aggregate_kernel<<<grid, block, 0, stream>>>(
            x.data_ptr<float>(),
            assignment.data_ptr<float>(),
            clusters2.data_ptr<float>(),
            vlad.data_ptr<float>(),
            B, N, D, K
        );
    }
    
    // Launch Kernel 2: Intra-normalization.
    {
        dim3 grid(B, K);
        dim3 block(D);
        intranorm_kernel<<<grid, block, 0, stream>>>(
            vlad.data_ptr<float>(),
            D, K
        );
    }
    
    // Launch Kernel 3: Final L2 normalization.
    {
        int L = D * K;
        int threads = 256;
        dim3 grid(B);
        final_norm_kernel<<<grid, threads, 0, stream>>>(
            vlad.data_ptr<float>(),
            L
        );
    }
    
    return vlad.view({B, D * K});
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &netvlad_forward, "Fused NetVLAD forward (aggregation + normalization)");
}
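
For reference, the soft-assignment stage described at the top of this section (learned projection, batch normalization, softmax, and removal of the ghost clusters) is expected to run on the host side before netvlad_forward is called. The sketch below, written against the PyTorch C++ API and relying on the includes already present in the extension, shows one way that stage could look; the tensor names (clusters, bn_weight, bn_bias, bn_mean, bn_var) and the ghost-cluster count G are illustrative assumptions rather than part of the extension above.

// Hedged sketch: computing the soft assignment with the ghost clusters removed.
// Assumes clusters is [D, K + G], the batch-norm parameters are [K + G] vectors,
// and the first K columns correspond to the real (non-ghost) clusters.
torch::Tensor compute_assignment(torch::Tensor x,          // [B, N, D] features
                                 torch::Tensor clusters,   // [D, K + G] learned projection
                                 torch::Tensor bn_weight,  // [K + G] BN scale
                                 torch::Tensor bn_bias,    // [K + G] BN shift
                                 torch::Tensor bn_mean,    // [K + G] BN running mean
                                 torch::Tensor bn_var,     // [K + G] BN running variance
                                 int64_t K) {              // number of real clusters
    int64_t B = x.size(0), N = x.size(1), D = x.size(2);
    // Project the features onto all clusters (real + ghost): [B*N, K + G].
    auto scores = torch::matmul(x.reshape({B * N, D}), clusters);
    // Batch-normalize the scores (inference-time statistics assumed here).
    scores = torch::batch_norm(scores, bn_weight, bn_bias, bn_mean, bn_var,
                               /*training=*/false, /*momentum=*/0.1,
                               /*eps=*/1e-5, /*cudnn_enabled=*/true);
    // Soft-assign across all clusters, then drop the ghost columns.
    auto assignment = torch::softmax(scores, /*dim=*/1)
                          .narrow(/*dim=*/1, /*start=*/0, /*length=*/K);
    return assignment.reshape({B, N, K}).contiguous();   // [B, N, K]
}

// Example call (clusters2 is the [D, K] residual-center tensor from above):
//   auto assignment = compute_assignment(x, clusters, bn_w, bn_b, bn_m, bn_v, K);
//   auto descriptor = netvlad_forward(x, assignment, clusters2);   // [B, D*K]

Keeping that stage in regular tensor ops means the fused kernels only ever see the dense [B, N, K] assignment and the [D, K] residual centers, which is exactly the interface netvlad_forward validates.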