@@ -7506,8 +7506,8 @@ static __global__ void flash_attn_f32(
     }
 }
 
-template<int D, int ncols> // D head size
-__launch_bounds__(ncols == 8 ? (D + D % 32) : 2*D, 1)
+template<int D, int ncols> // D == head size
+__launch_bounds__(ncols == 8 || D > 128 ? D : 2*D, 1)
 static __global__ void flash_attn_ext_f16(
         const char * __restrict__ Q,
         const char * __restrict__ K,
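
Note on the `__launch_bounds__` change: the old bound always compiled the kernel for up to 2*D threads per block, while the new expression caps the block at D threads whenever ncols == 8 or the head size exceeds 128, matching the reduced warp count introduced further down. A minimal sketch of what the new expression evaluates to for a few instantiations (the helper `launch_bound_threads` is illustrative only, not part of the patch):

```cuda
// Illustrative only: evaluates the new __launch_bounds__ upper bound for a few (D, ncols) pairs.
constexpr int launch_bound_threads(int D, int ncols) {
    return ncols == 8 || D > 128 ? D : 2*D;
}
static_assert(launch_bound_threads( 64, 16) == 128, "D <= 128, ncols > 8: 2*D threads");
static_assert(launch_bound_threads(128, 16) == 256, "D <= 128, ncols > 8: 2*D threads");
static_assert(launch_bound_threads(256, 16) == 256, "D > 128: capped at D threads");
static_assert(launch_bound_threads(128,  8) == 128, "ncols == 8: D threads");
```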
@@ -7545,9 +7545,11 @@ static __global__ void flash_attn_ext_f16(
     typedef nvcuda::wmma::fragment<nvcuda::wmma::matrix_b, frag_m, frag_n, 16, half, nvcuda::wmma::col_major> frag_b;
     typedef nvcuda::wmma::fragment<nvcuda::wmma::accumulator, frag_m, frag_n, 16, half> frag_c;
 
-    constexpr int nwarps = D / frag_m;
+    constexpr int nwarps = (D <= 128 || ncols == 8 ? D : D/2) / frag_m;
     constexpr int nthreads = nwarps*WARP_SIZE;
     static_assert(nthreads % D == 0, "nthreads not divisible by D.");
+    constexpr int tc_vals_per_iter = nwarps*frag_m;
+    static_assert(D % tc_vals_per_iter == 0, "D not divisible by tensor core vals per iter.");
     const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
     __builtin_assume(tid < nthreads);
     constexpr int D_padded = D + 8; // Pad internal representation of KQ, KQV to reduce shared memory bank conflicts.
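
With fewer warps for D > 128, one pass of the tensor cores covers only `tc_vals_per_iter = nwarps*frag_m` values per tile, which is what the new constant and its `static_assert` express. A worked example under the assumptions frag_m == 16 and ncols > 8 (all `_ex` names are illustrative, not part of the patch):

```cuda
// Illustrative arithmetic for D == 256, ncols == 16, frag_m == 16:
constexpr int frag_m_ex           = 16;
constexpr int D_ex                = 256;
constexpr int nwarps_ex           = (D_ex/2) / frag_m_ex;      // D > 128 => half as many warps: 8
constexpr int tc_vals_per_iter_ex = nwarps_ex * frag_m_ex;     // 8*16 = 128 values covered per pass
static_assert(D_ex % tc_vals_per_iter_ex == 0, "tile size must split evenly across passes");
static_assert(D_ex / tc_vals_per_iter_ex == 2, "the KQ/VKQ loops below run two tensor core passes");
```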
@@ -7608,25 +7610,28 @@ static __global__ void flash_attn_ext_f16(
         const bool has_valid_data = 256 % D == 0 || k_VKQ_0 + frag_m*threadIdx.y < ne11;
 
         // Calculate tile of KQ:
-        frag_c KQ_c[ncols/frag_n];
 #pragma unroll
-        for (int j = 0; j < ncols/frag_n; ++j) {
-            nvcuda::wmma::fill_fragment(KQ_c[j], 0.0f);
-        }
-        if (has_valid_data) {
+        for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += tc_vals_per_iter) {
+            frag_c KQ_c[ncols/frag_n];
 #pragma unroll
-            for (int k_KQ_0 = 0; k_KQ_0 < D; k_KQ_0 += 16) {
-                frag_a_K K_a;
-                nvcuda::wmma::load_matrix_sync(K_a, K_h + (k_VKQ_0 + frag_m*threadIdx.y)*stride_KV + k_KQ_0, stride_KV);
+            for (int j = 0; j < ncols/frag_n; ++j) {
+                nvcuda::wmma::fill_fragment(KQ_c[j], 0.0f);
+            }
+            if (has_valid_data) {
 #pragma unroll
-                for (int j = 0; j < ncols/frag_n; ++j) {
-                    nvcuda::wmma::mma_sync(KQ_c[j], K_a, Q_b[k_KQ_0/16][j], KQ_c[j]);
+                for (int k_KQ_0 = 0; k_KQ_0 < D; k_KQ_0 += 16) {
+                    frag_a_K K_a;
+                    nvcuda::wmma::load_matrix_sync(K_a, K_h + (k_VKQ_0 + i_KQ_0 + frag_m*threadIdx.y)*stride_KV + k_KQ_0, stride_KV);
+#pragma unroll
+                    for (int j = 0; j < ncols/frag_n; ++j) {
+                        nvcuda::wmma::mma_sync(KQ_c[j], K_a, Q_b[k_KQ_0/16][j], KQ_c[j]);
+                    }
                 }
             }
-        }
 #pragma unroll
-        for (int j0 = 0; j0 < ncols; j0 += frag_n) {
-            nvcuda::wmma::store_matrix_sync(KQ + j0*D_padded + frag_m*threadIdx.y, KQ_c[j0/frag_n], D_padded, nvcuda::wmma::mem_col_major);
+            for (int j0 = 0; j0 < ncols; j0 += frag_n) {
+                nvcuda::wmma::store_matrix_sync(KQ + j0*D_padded + i_KQ_0 + frag_m*threadIdx.y, KQ_c[j0/frag_n], D_padded, nvcuda::wmma::mem_col_major);
+            }
         }
 
         __syncthreads();
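
The KQ computation is now wrapped in a loop over `i_KQ_0`: each pass fills, accumulates, and stores one chunk of `tc_vals_per_iter` rows of the KQ tile, so a warp only ever holds `ncols/frag_n` accumulator fragments at a time. A scalar reference model of the same tiling, without WMMA or the bounds check, is sketched below (host-side, all names illustrative, not part of the patch):

```cuda
// Host-side scalar model of the restructured KQ loop (illustrative only).
// K holds the current block of key vectors (row i = key position, column k = head element),
// Q holds ncols query vectors, and KQ is written column-major with leading dimension ld_KQ,
// mirroring the store_matrix_sync layout in the kernel.
template <int D, int ncols, int tc_vals_per_iter>
void kq_tile_reference(const float * K, const float * Q, float * KQ, int ld_KQ) {
    for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += tc_vals_per_iter) {    // one tensor core pass
        for (int i = i_KQ_0; i < i_KQ_0 + tc_vals_per_iter; ++i) {    // rows produced this pass
            for (int j = 0; j < ncols; ++j) {                         // query columns
                float acc = 0.0f;                                     // stands in for KQ_c
                for (int k_KQ_0 = 0; k_KQ_0 < D; k_KQ_0 += 16) {      // 16-wide slices of the head dim
                    for (int k = k_KQ_0; k < k_KQ_0 + 16; ++k) {
                        acc += K[i*D + k] * Q[j*D + k];
                    }
                }
                KQ[j*ld_KQ + i] = acc;                                // store like store_matrix_sync
            }
        }
    }
}
```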
@@ -7687,31 +7692,40 @@ static __global__ void flash_attn_ext_f16(
             }
         }
 
-        frag_c VKQ_c[ncols/frag_n];
+        frag_c VKQ_c[D/tc_vals_per_iter][ncols/frag_n];
 #pragma unroll
-        for (int j = 0; j < ncols/frag_n; ++j) {
-            nvcuda::wmma::fill_fragment(VKQ_c[j], 0.0f);
-        }
-
-#pragma unroll
-        for (int k0 = 0; k0 < D; k0 += 16) {
-            if (256 % D != 0 && k_VKQ_0 + k0 >= ne11) {
-                break;
+        for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += tc_vals_per_iter) {
+#pragma unroll
+            for (int j = 0; j < ncols/frag_n; ++j) {
+                nvcuda::wmma::fill_fragment(VKQ_c[i_KQ_0/tc_vals_per_iter][j], 0.0f);
             }
 
-            frag_a_V v_a;
-            nvcuda::wmma::load_matrix_sync(v_a, V_h + (k_VKQ_0 + k0)*stride_KV + frag_m*threadIdx.y, stride_KV);
-#pragma unroll
-            for (int j = 0; j < ncols/frag_n; ++j) {
-                nvcuda::wmma::mma_sync(VKQ_c[j], v_a, KQ_b[k0/16][j], VKQ_c[j]);
+#pragma unroll
+            for (int k0 = 0; k0 < D; k0 += 16) {
+                if (256 % D != 0 && k_VKQ_0 + k0 >= ne11) {
+                    break;
+                }
+
+                frag_a_V v_a;
+                nvcuda::wmma::load_matrix_sync(v_a, V_h + (k_VKQ_0 + k0)*stride_KV + i_KQ_0 + frag_m*threadIdx.y, stride_KV);
+#pragma unroll
+                for (int j = 0; j < ncols/frag_n; ++j) {
+                    nvcuda::wmma::mma_sync(VKQ_c[i_KQ_0/tc_vals_per_iter][j], v_a, KQ_b[k0/16][j], VKQ_c[i_KQ_0/tc_vals_per_iter][j]);
+                }
             }
         }
 
         __syncthreads();
 
 #pragma unroll
-        for (int j0 = 0; j0 < ncols; j0 += frag_n) {
-            nvcuda::wmma::store_matrix_sync(KQ + j0*D_padded + frag_m*threadIdx.y, VKQ_c[j0/frag_n], D_padded, nvcuda::wmma::mem_col_major);
+        for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += tc_vals_per_iter) {
+#pragma unroll
+            for (int j0 = 0; j0 < ncols; j0 += frag_n) {
+                nvcuda::wmma::store_matrix_sync(
+                    KQ + j0*D_padded + i_KQ_0 + frag_m*threadIdx.y,
+                    VKQ_c[i_KQ_0/tc_vals_per_iter][j0/frag_n],
+                    D_padded, nvcuda::wmma::mem_col_major);
+            }
         }
 
         __syncthreads();
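
For the V*KQ part the accumulator is widened to one set of fragments per output chunk, `VKQ_c[D/tc_vals_per_iter][ncols/frag_n]`, rather than being moved inside the loop as in the KQ step, since the results are written back into the shared KQ buffer only after the `__syncthreads()` that follows the compute loop. A rough, illustrative count of live accumulator fragments per warp, reusing the D == 256, ncols == 16, frag_n == 16 example (not part of the patch):

```cuda
// Illustrative only: accumulator fragments kept live per warp for the VKQ product.
//   before: VKQ_c[ncols/frag_n]                      -> ncols/frag_n fragments
//   after:  VKQ_c[D/tc_vals_per_iter][ncols/frag_n]  -> (D/tc_vals_per_iter)*(ncols/frag_n) fragments
// For D == 256, ncols == 16, frag_n == 16, tc_vals_per_iter == 128:
static_assert(16/16 == 1,             "before: one VKQ accumulator fragment per warp");
static_assert((256/128)*(16/16) == 2, "after: two VKQ accumulator fragments per warp");
```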
@@ -11453,7 +11467,7 @@ inline void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, const ggml
         cols_per_block = 8;
     }
     const int frag_m = cols_per_block == 8 ? 32 : 16;
-    const int nwarps = Q->ne[0] / frag_m;
+    const int nwarps = (Q->ne[0] <= 128 || cols_per_block == 8 ? Q->ne[0] : Q->ne[0]/2) / frag_m;
     const dim3 blocks_num((Q->ne[1] + cols_per_block - 1) / cols_per_block, Q->ne[2], Q->ne[3]);
     const dim3 block_dim(WARP_SIZE, nwarps, 1);
     const size_t shmem = 0;
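
The host-side `nwarps` mirrors the device-side constant so that `block_dim` matches the block size the kernel was compiled for. A hedged sketch of the resulting launch geometry for one hypothetical case, head size 256 with cols_per_block == 16 (all `_ex` names are illustrative):

```cuda
// Illustrative launch geometry for Q->ne[0] == 256 (head size), cols_per_block == 16:
constexpr int WARP_SIZE_ex      = 32;
constexpr int head_size_ex      = 256;
constexpr int cols_per_block_ex = 16;
constexpr int frag_m_host_ex    = cols_per_block_ex == 8 ? 32 : 16;
constexpr int nwarps_host_ex    = (head_size_ex <= 128 || cols_per_block_ex == 8 ? head_size_ex : head_size_ex/2) / frag_m_host_ex;
static_assert(nwarps_host_ex == 8,                  "8 warps instead of 16 for a 256 head size");
static_assert(nwarps_host_ex * WARP_SIZE_ex == 256, "block_dim gives 256 threads, matching __launch_bounds__ for D > 128");
// With e.g. Q->ne[1] == 32 queries, blocks_num.x = (32 + 16 - 1) / 16 = 2 blocks per head.
```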