1.14. Upper Triangular Matrix Multiplication
Computes the product of two upper triangular matrices and returns an upper triangular result, equivalent to applying an upper triangular mask to the full matrix product.
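For reference, the custom kernel below is expected to match this masked dense product. A minimal sketch of that baseline in the PyTorch C++ API (assuming float32 CUDA tensors A and B, and a hypothetical helper name):

// Reference semantics (sketch): mask the full product down to its upper triangle.
torch::Tensor upper_tri_matmul_reference(const torch::Tensor& A, const torch::Tensor& B) {
    return torch::triu(torch::matmul(A, B));
}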
Progress: Step 4 of 4
Reward: 1.26x
Optimization Timeline: Step 4 Details
Reasoning
We can further reduce loop overhead by increasing the unroll factor from 4 to 8 and by using __ldg to load values of A and B through the read-only data cache. This should cut the number of loop iterations for output elements with long summations, especially in the top rows where j ≫ i, and further improve performance. These changes reduce per-thread overhead and use the cache more effectively without altering the mathematical result. The Python module remains unchanged.
#include <torch/extension.h>
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h>
#include <cuda.h>
#include <cuda_runtime.h>

// Optimized CUDA kernel for multiplying two upper triangular matrices using loop unrolling.
// For indices i > j, the output is zero.
// For indices i <= j, the output element is computed as:
//   C[i,j] = sum_{k=i}^{j} A[i,k] * B[k,j]
// This version unrolls the inner loop by a factor of 8 and uses __ldg for read-only cached loads.
__global__ void upper_tri_matmul_kernel(const float* __restrict__ A,
                                        const float* __restrict__ B,
                                        float* __restrict__ C,
                                        int N) {
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    int i = blockIdx.y * blockDim.y + threadIdx.y;
    if (i < N && j < N) {
        int index = i * N + j;
        if (i > j) {
            C[index] = 0.0f;
        } else {
            float sum = 0.0f;
            int start = i;
            int end = j;
            int len = end - start + 1;
            int k = start;
            int row_stride = N;
            const int factor = 8;
            int limit = start + (len / factor) * factor;
            // Main loop, unrolled by 8: each iteration accumulates eight products.
            for (; k < limit; k += factor) {
                sum += __ldg(&A[i * row_stride + k])     * __ldg(&B[k * row_stride + j])
                     + __ldg(&A[i * row_stride + k + 1]) * __ldg(&B[(k + 1) * row_stride + j])
                     + __ldg(&A[i * row_stride + k + 2]) * __ldg(&B[(k + 2) * row_stride + j])
                     + __ldg(&A[i * row_stride + k + 3]) * __ldg(&B[(k + 3) * row_stride + j])
                     + __ldg(&A[i * row_stride + k + 4]) * __ldg(&B[(k + 4) * row_stride + j])
                     + __ldg(&A[i * row_stride + k + 5]) * __ldg(&B[(k + 5) * row_stride + j])
                     + __ldg(&A[i * row_stride + k + 6]) * __ldg(&B[(k + 6) * row_stride + j])
                     + __ldg(&A[i * row_stride + k + 7]) * __ldg(&B[(k + 7) * row_stride + j]);
            }
            // Remainder loop for the final (len % 8) terms.
            for (; k <= end; k++) {
                sum += __ldg(&A[i * row_stride + k]) * __ldg(&B[k * row_stride + j]);
            }
            C[index] = sum;
        }
    }
}
torch::Tensor upper_tri_matmul(torch::Tensor A, torch::Tensor B) {
    TORCH_CHECK(A.is_cuda() && B.is_cuda(), "A and B must be CUDA tensors");
    TORCH_CHECK(A.scalar_type() == torch::kFloat32 && B.scalar_type() == torch::kFloat32,
                "A and B must be float32 tensors");
    TORCH_CHECK(A.is_contiguous() && B.is_contiguous(), "A and B must be contiguous");

    c10::cuda::CUDAGuard device_guard(A.device());
    int N = A.size(0);
    auto C = torch::empty({N, N}, A.options());

    // Launch configuration: 32x32 thread blocks covering the full N x N grid.
    const int BLOCK_DIM = 32;
    dim3 threads(BLOCK_DIM, BLOCK_DIM);
    dim3 blocks((N + BLOCK_DIM - 1) / BLOCK_DIM, (N + BLOCK_DIM - 1) / BLOCK_DIM);

    upper_tri_matmul_kernel<<<blocks, threads, 0, c10::cuda::getCurrentCUDAStream()>>>(
        A.data_ptr<float>(),
        B.data_ptr<float>(),
        C.data_ptr<float>(),
        N);
    return C;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("upper_tri_matmul", &upper_tri_matmul,
          "Multiply two upper triangular matrices using a custom CUDA kernel with loop unrolling (factor 8) and __ldg loads");
}
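A simple way to sanity-check the kernel is to compare it against the masked cuBLAS product. The snippet below is a sketch, not part of the extension: it assumes the translation unit above is linked in so that upper_tri_matmul is visible, and that the program is built against libtorch with a CUDA device available.

#include <torch/torch.h>
#include <iostream>

// Declared in the extension source above.
torch::Tensor upper_tri_matmul(torch::Tensor A, torch::Tensor B);

int main() {
    const int64_t N = 1024;
    auto opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
    // Upper triangular float32 inputs, matching the kernel's assumptions.
    auto A = torch::triu(torch::randn({N, N}, opts));
    auto B = torch::triu(torch::randn({N, N}, opts));

    auto C_custom = upper_tri_matmul(A, B);
    auto C_ref    = torch::triu(torch::matmul(A, B));

    std::cout << "max abs diff: "
              << (C_custom - C_ref).abs().max().item<float>() << std::endl;
    return 0;
}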