#include "mmq.cuh"
#include "quantize.cuh"
#include "mmid.cuh"

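// Dispatch to the mul_mat_q kernel instantiation that matches the quantization type of src0.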
static void ggml_cuda_mul_mat_q_switch_type(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) {
    switch (args.type_x) {
        case GGML_TYPE_Q4_0:
            mul_mat_q_case<GGML_TYPE_Q4_0>(ctx, args, stream);
            break;
        case GGML_TYPE_Q4_1:
            mul_mat_q_case<GGML_TYPE_Q4_1>(ctx, args, stream);
            break;
        case GGML_TYPE_Q5_0:
            mul_mat_q_case<GGML_TYPE_Q5_0>(ctx, args, stream);
            break;
        case GGML_TYPE_Q5_1:
            mul_mat_q_case<GGML_TYPE_Q5_1>(ctx, args, stream);
            break;
        case GGML_TYPE_Q8_0:
            mul_mat_q_case<GGML_TYPE_Q8_0>(ctx, args, stream);
            break;
        case GGML_TYPE_MXFP4:
            mul_mat_q_case<GGML_TYPE_MXFP4>(ctx, args, stream);
            break;
        case GGML_TYPE_Q2_K:
            mul_mat_q_case<GGML_TYPE_Q2_K>(ctx, args, stream);
            break;
        case GGML_TYPE_Q3_K:
            mul_mat_q_case<GGML_TYPE_Q3_K>(ctx, args, stream);
            break;
        case GGML_TYPE_Q4_K:
            mul_mat_q_case<GGML_TYPE_Q4_K>(ctx, args, stream);
            break;
        case GGML_TYPE_Q5_K:
            mul_mat_q_case<GGML_TYPE_Q5_K>(ctx, args, stream);
            break;
        case GGML_TYPE_Q6_K:
            mul_mat_q_case<GGML_TYPE_Q6_K>(ctx, args, stream);
            break;
        case GGML_TYPE_IQ2_XXS:
            mul_mat_q_case<GGML_TYPE_IQ2_XXS>(ctx, args, stream);
            break;
        case GGML_TYPE_IQ2_XS:
            mul_mat_q_case<GGML_TYPE_IQ2_XS>(ctx, args, stream);
            break;
        case GGML_TYPE_IQ2_S:
            mul_mat_q_case<GGML_TYPE_IQ2_S>(ctx, args, stream);
            break;
        case GGML_TYPE_IQ3_XXS:
            mul_mat_q_case<GGML_TYPE_IQ3_XXS>(ctx, args, stream);
            break;
        case GGML_TYPE_IQ3_S:
            mul_mat_q_case<GGML_TYPE_IQ3_S>(ctx, args, stream);
            break;
        case GGML_TYPE_IQ1_S:
            mul_mat_q_case<GGML_TYPE_IQ1_S>(ctx, args, stream);
            break;
        case GGML_TYPE_IQ4_XS:
            mul_mat_q_case<GGML_TYPE_IQ4_XS>(ctx, args, stream);
            break;
        case GGML_TYPE_IQ4_NL:
            mul_mat_q_case<GGML_TYPE_IQ4_NL>(ctx, args, stream);
            break;
        default:
            GGML_ABORT("fatal error");
            break;
    }
}

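// Multiply the quantized matrix src0 with the F32 matrix src1 and write the F32 result to dst.
// If ids is given this implements GGML_MUL_MAT_ID: src0 holds one matrix per expert and ids selects the experts per token.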
void ggml_cuda_mul_mat_q(
        ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
    GGML_ASSERT( src1->type == GGML_TYPE_F32);
    GGML_ASSERT(  dst->type == GGML_TYPE_F32);
    GGML_ASSERT(!ids || ids->type == GGML_TYPE_I32); // Optional, used for batched GGML_MUL_MAT_ID.

    GGML_TENSOR_BINARY_OP_LOCALS;

    cudaStream_t stream = ctx.stream();
    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;

    const size_t ts_src0 = ggml_type_size(src0->type);
    const size_t ts_src1 = ggml_type_size(src1->type);
    const size_t ts_dst  = ggml_type_size(dst->type);

    GGML_ASSERT(        nb00       == ts_src0);
    GGML_ASSERT(        nb10       == ts_src1);
    GGML_ASSERT(        nb0        == ts_dst);
    GGML_ASSERT(!ids || ids->nb[0] == ggml_type_size(ids->type));

    const char  * src0_d = (const char  *) src0->data;
    const float * src1_d = (const float *) src1->data;
    float       * dst_d  = (float       *) dst->data;

    // If src0 is a temporary compute buffer, clear any potential padding.
    if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
        const size_t size_data  = ggml_nbytes(src0);
        const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0);
        if (size_alloc > size_data) {
            GGML_ASSERT(ggml_is_contiguously_allocated(src0));
            GGML_ASSERT(!src0->view_src);
            CUDA_CHECK(cudaMemsetAsync((char *) src0->data + size_data, 0, size_alloc - size_data, stream));
        }
    }

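    // Pad the src1 row length to a multiple of MATRIX_ROW_PADDING before quantizing it to q8_1.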
    const int64_t ne10_padded = GGML_PAD(ne10, MATRIX_ROW_PADDING);

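    // Strides in elements rather than bytes (the nb values are byte strides).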
    const int64_t s01 = src0->nb[1] / ts_src0;
    const int64_t s1  =  dst->nb[1] / ts_dst;
    const int64_t s02 = src0->nb[2] / ts_src0;
    const int64_t s2  =  dst->nb[2] / ts_dst;
    const int64_t s03 = src0->nb[3] / ts_src0;
    const int64_t s3  =  dst->nb[3] / ts_dst;

    const bool use_stream_k = (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
        || GGML_CUDA_CC_IS_CDNA(cc);

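    // Without ids this is a plain matrix multiplication: quantize src1 to the q8_1 MMQ layout and launch the kernel directly.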
    if (!ids) {
        const size_t nbytes_src1_q8_1 = ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1 +
            get_mmq_x_max_host(cc)*sizeof(block_q8_1_mmq);
        ggml_cuda_pool_alloc<char> src1_q8_1(ctx.pool(), nbytes_src1_q8_1);

        {
            const int64_t s11 = src1->nb[1] / ts_src1;
            const int64_t s12 = src1->nb[2] / ts_src1;
            const int64_t s13 = src1->nb[3] / ts_src1;
            quantize_mmq_q8_1_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type,
                ne10, s11, s12, s13, ne10_padded, ne11, ne12, ne13, stream);
            CUDA_CHECK(cudaGetLastError());
        }

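        // Strides of the quantized src1 data, in units of int (the MMQ kernels access the q8_1 data as const int *).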
        const int64_t s12 = ne11*ne10_padded * sizeof(block_q8_1)/(QK8_1*sizeof(int));
        const int64_t s13 = ne12*s12;

        const mmq_args args = {
            src0_d, src0->type, (const int *) src1_q8_1.get(), nullptr, nullptr, dst_d,
            ne00, ne01, ne1, s01, ne11, s1,
            ne02, ne12, s02, s12, s2,
            ne03, ne13, s03, s13, s3,
            use_stream_k, ne1};
        ggml_cuda_mul_mat_q_switch_type(ctx, args, stream);
        return;
    }

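    // MUL_MAT_ID path: src0 holds ne02 expert matrices and ids selects which experts process each token.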
    GGML_ASSERT(ne13 == 1);
    GGML_ASSERT(nb12 % nb11 == 0);
    GGML_ASSERT(nb2  % nb1  == 0);

    const int64_t n_expert_used = ids->ne[0];
    const int64_t ne_get_rows   = ne12 * n_expert_used;
    GGML_ASSERT(ne1 == n_expert_used);

    ggml_cuda_pool_alloc<int32_t> ids_src1(ctx.pool(), ne_get_rows);
    ggml_cuda_pool_alloc<int32_t> ids_dst(ctx.pool(), ne_get_rows);
    ggml_cuda_pool_alloc<int32_t> expert_bounds(ctx.pool(), ne02 + 1);

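    // Build the gather/scatter indices: ids_src1 lists the src1 rows each expert consumes, ids_dst the matching dst rows,
    // and expert_bounds the per-expert [start, end) offsets into those lists.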
    {
        GGML_ASSERT(ids->nb[0] == ggml_element_size(ids));
        const int si1  = ids->nb[1] / ggml_element_size(ids);
        const int sis1 = nb12 / nb11;

        ggml_cuda_launch_mm_ids_helper((const int32_t *) ids->data, ids_src1.get(), ids_dst.get(), expert_bounds.get(),
            ne02, ne12, n_expert_used, ne11, si1, sis1, stream);
        CUDA_CHECK(cudaGetLastError());
    }

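    // q8_1 buffer for the gathered src1 rows; the extra block_q8_1_mmq space keeps the last MMQ tile's accesses in bounds.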
    const size_t nbytes_src1_q8_1 = ne12*n_expert_used*ne10_padded * sizeof(block_q8_1)/QK8_1 +
        get_mmq_x_max_host(cc)*sizeof(block_q8_1_mmq);
    ggml_cuda_pool_alloc<char> src1_q8_1(ctx.pool(), nbytes_src1_q8_1);

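    // The gathered rows are quantized as a single flat batch: ne12*n_expert_used rows, no higher batch dimensions.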
    const int64_t ne11_flat = ne12*n_expert_used;
    const int64_t ne12_flat = 1;
    const int64_t ne13_flat = 1;

    {
        const int64_t s11 = src1->nb[1] / ts_src1;
        const int64_t s12 = src1->nb[2] / ts_src1;
        const int64_t s13 = src1->nb[3] / ts_src1;
        quantize_mmq_q8_1_cuda(src1_d, ids_src1.get(), src1_q8_1.get(), src0->type,
            ne10, s11, s12, s13, ne10_padded, ne11_flat, ne12_flat, ne13_flat, stream);
        CUDA_CHECK(cudaGetLastError());
    }

    const int64_t s12 = ne11*ne10_padded * sizeof(block_q8_1)/(QK8_1*sizeof(int));
    const int64_t s13 = ne12*s12;

    // Note: ne02 is passed where ne12 (the number of y channels) normally goes because that value sets the z dimension
    // of the CUDA grid, and with MUL_MAT_ID there is one z slice per expert matrix.
    const mmq_args args = {
        src0_d, src0->type, (const int *) src1_q8_1.get(), ids_dst.get(), expert_bounds.get(), dst_d,
        ne00, ne01, ne_get_rows, s01, ne_get_rows, s1,
        ne02, ne02, s02, s12, s2,
        ne03, ne13, s03, s13, s3,
        use_stream_k, ne12};

    ggml_cuda_mul_mat_q_switch_type(ctx, args, stream);
}

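// Variant used by the generic ggml_cuda_op_mul_mat path (e.g. for tensors split across multiple GPUs):
// it operates on the row slice [row_low, row_high) of src0 and expects src1 already quantized to q8_1 in src1_ddq_i.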
void ggml_cuda_op_mul_mat_q(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
    const int64_t src1_padded_row_size, cudaStream_t stream) {

    const int64_t ne00 = src0->ne[0];

    const int64_t ne10 = src1->ne[0];
    const int64_t ne11 = src1->ne[1];
    GGML_ASSERT(ne10 % QK8_1 == 0);

    const int64_t ne0 = dst->ne[0];

    const int64_t row_diff = row_high - row_low;
    const int64_t stride01 = ne00 / ggml_blck_size(src0->type);

    const int id = ggml_cuda_get_device();
    const int cc = ggml_cuda_info().devices[id].cc;

    // the main device has a larger memory buffer to hold the results from all GPUs
    // nrows_dst == nrows of the matrix that the kernel writes into
    const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;

    // The stream-k decomposition is only faster for recent NVIDIA GPUs.
    // Also its fixup needs to allocate a temporary buffer in the memory pool.
    // There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
    const bool use_stream_k = ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
        || GGML_CUDA_CC_IS_CDNA(cc))
        && src1_ncols == ne11;
    const mmq_args args = {
        src0_dd_i, src0->type, (const int *) src1_ddq_i, nullptr, nullptr, dst_dd_i,
        ne00, row_diff, src1_ncols, stride01, ne11, nrows_dst,
        1, 1, 0, 0, 0,
        1, 1, 0, 0, 0,
        use_stream_k, src1_ncols};

    ggml_cuda_mul_mat_q_switch_type(ctx, args, stream);

    GGML_UNUSED_VARS(src1, dst, src1_ddf_i, src1_padded_row_size);
}

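// Decide whether the MMQ kernels should be used instead of cuBLAS/hipBLAS for the given quantization type,
// compute capability, and number of src1 columns (ne11).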
bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
#ifdef GGML_CUDA_FORCE_CUBLAS
    return false;
#endif // GGML_CUDA_FORCE_CUBLAS

    bool mmq_supported;

    switch (type) {
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q4_1:
        case GGML_TYPE_Q5_0:
        case GGML_TYPE_Q5_1:
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_MXFP4:
        case GGML_TYPE_Q2_K:
        case GGML_TYPE_Q3_K:
        case GGML_TYPE_Q4_K:
        case GGML_TYPE_Q5_K:
        case GGML_TYPE_Q6_K:
        case GGML_TYPE_IQ2_XXS:
        case GGML_TYPE_IQ2_XS:
        case GGML_TYPE_IQ2_S:
        case GGML_TYPE_IQ3_XXS:
        case GGML_TYPE_IQ3_S:
        case GGML_TYPE_IQ1_S:
        case GGML_TYPE_IQ4_XS:
        case GGML_TYPE_IQ4_NL:
            mmq_supported = true;
            break;
        default:
            mmq_supported = false;
            break;
    }

    if (!mmq_supported) {
        return false;
    }

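    // Turing and newer NVIDIA GPUs have int8 tensor cores; the mma-based MMQ kernels are always used there.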
    if (turing_mma_available(cc)) {
        return true;
    }

    if (ggml_cuda_highest_compiled_arch(cc) < GGML_CUDA_CC_DP4A) {
        return false;
    }

#ifdef GGML_CUDA_FORCE_MMQ
    return true;
#endif // GGML_CUDA_FORCE_MMQ

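    // On other NVIDIA GPUs, prefer the DP4A-based MMQ kernels unless FP16 mma hardware is available and the batch is large.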
    if (GGML_CUDA_CC_IS_NVIDIA(cc)) {
        return !fp16_mma_hardware_available(cc) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
    }

    if (amd_mfma_available(cc)) {
        // As of ROCm 7.0, rocBLAS/Tensile performs very poorly on CDNA3; hipBLASLt (via ROCBLAS_USE_HIPBLASLT)
        // performs better but currently crashes on this architecture.
        // TODO: revisit when hipBLASLt is fixed on CDNA3.
        if (GGML_CUDA_CC_IS_CDNA3(cc)) {
            return true;
        }
        if (ne11 <= 128 || type == GGML_TYPE_Q4_0 || type == GGML_TYPE_Q4_1 || type == GGML_TYPE_Q5_0 || type == GGML_TYPE_Q5_1) {
            return true;
        }
        if (ne11 <= 256 && (type == GGML_TYPE_Q4_K || type == GGML_TYPE_Q5_K)) {
            return true;
        }
        return false;
    }

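    // Remaining GPUs: RDNA3, RDNA4 and CDNA (without MFMA) only benefit from the DP4A MMQ kernels for small batches;
    // everything else uses MMQ unconditionally.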
    return (!GGML_CUDA_CC_IS_RDNA4(cc) && !GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
}