#include "mmq.cuh"
#include "quantize.cuh"
#include "mmid.cuh"

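// Dispatches to the mul_mat_q kernel template instantiated for the quantized type of the weight matrix src0.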
static void ggml_cuda_mul_mat_q_switch_type(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) {
    switch (args.type_x) {
        case GGML_TYPE_Q4_0:
            mul_mat_q_case<GGML_TYPE_Q4_0>(ctx, args, stream);
            break;
        case GGML_TYPE_Q4_1:
            mul_mat_q_case<GGML_TYPE_Q4_1>(ctx, args, stream);
            break;
        case GGML_TYPE_Q5_0:
            mul_mat_q_case<GGML_TYPE_Q5_0>(ctx, args, stream);
            break;
        case GGML_TYPE_Q5_1:
            mul_mat_q_case<GGML_TYPE_Q5_1>(ctx, args, stream);
            break;
        case GGML_TYPE_Q8_0:
            mul_mat_q_case<GGML_TYPE_Q8_0>(ctx, args, stream);
            break;
        case GGML_TYPE_MXFP4:
            mul_mat_q_case<GGML_TYPE_MXFP4>(ctx, args, stream);
            break;
        case GGML_TYPE_Q2_K:
            mul_mat_q_case<GGML_TYPE_Q2_K>(ctx, args, stream);
            break;
        case GGML_TYPE_Q3_K:
            mul_mat_q_case<GGML_TYPE_Q3_K>(ctx, args, stream);
            break;
        case GGML_TYPE_Q4_K:
            mul_mat_q_case<GGML_TYPE_Q4_K>(ctx, args, stream);
            break;
        case GGML_TYPE_Q5_K:
            mul_mat_q_case<GGML_TYPE_Q5_K>(ctx, args, stream);
            break;
        case GGML_TYPE_Q6_K:
            mul_mat_q_case<GGML_TYPE_Q6_K>(ctx, args, stream);
            break;
        case GGML_TYPE_IQ2_XXS:
            mul_mat_q_case<GGML_TYPE_IQ2_XXS>(ctx, args, stream);
            break;
        case GGML_TYPE_IQ2_XS:
            mul_mat_q_case<GGML_TYPE_IQ2_XS>(ctx, args, stream);
            break;
        case GGML_TYPE_IQ2_S:
            mul_mat_q_case<GGML_TYPE_IQ2_S>(ctx, args, stream);
            break;
        case GGML_TYPE_IQ3_XXS:
            mul_mat_q_case<GGML_TYPE_IQ3_XXS>(ctx, args, stream);
            break;
        case GGML_TYPE_IQ3_S:
            mul_mat_q_case<GGML_TYPE_IQ3_S>(ctx, args, stream);
            break;
        case GGML_TYPE_IQ1_S:
            mul_mat_q_case<GGML_TYPE_IQ1_S>(ctx, args, stream);
            break;
        case GGML_TYPE_IQ4_XS:
            mul_mat_q_case<GGML_TYPE_IQ4_XS>(ctx, args, stream);
            break;
        case GGML_TYPE_IQ4_NL:
            mul_mat_q_case<GGML_TYPE_IQ4_NL>(ctx, args, stream);
            break;
        default:
            GGML_ABORT("fatal error");
            break;
    }
}

void ggml_cuda_mul_mat_q(
        ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
    GGML_ASSERT( src1->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);
    GGML_ASSERT(!ids || ids->type == GGML_TYPE_I32); // Optional, used for batched GGML_MUL_MAT_ID.

    GGML_TENSOR_BINARY_OP_LOCALS;

    cudaStream_t stream = ctx.stream();
    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;

    const size_t ts_src0 = ggml_type_size(src0->type);
    const size_t ts_src1 = ggml_type_size(src1->type);
    const size_t ts_dst = ggml_type_size(dst->type);

    GGML_ASSERT( nb00 == ts_src0);
    GGML_ASSERT( nb10 == ts_src1);
    GGML_ASSERT( nb0 == ts_dst);
    GGML_ASSERT(!ids || ids->nb[0] == ggml_type_size(ids->type));

    const char * src0_d = (const char *) src0->data;
    const float * src1_d = (const float *) src1->data;
    float * dst_d = (float *) dst->data;

    // If src0 is a temporary compute buffer, clear any potential padding.
    if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
        const size_t size_data = ggml_nbytes(src0);
        const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0);
        if (size_alloc > size_data) {
            GGML_ASSERT(ggml_is_contiguously_allocated(src0));
            GGML_ASSERT(!src0->view_src);
            CUDA_CHECK(cudaMemsetAsync((char *) src0->data + size_data, 0, size_alloc - size_data, stream));
        }
    }

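    // Pad the src1 row length to a multiple of MATRIX_ROW_PADDING so the q8_1-quantized copy below consists of whole blocks.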
    const int64_t ne10_padded = GGML_PAD(ne10, MATRIX_ROW_PADDING);

    const int64_t s01 = src0->nb[1] / ts_src0;
    const int64_t s1 = dst->nb[1] / ts_dst;
    const int64_t s02 = src0->nb[2] / ts_src0;
    const int64_t s2 = dst->nb[2] / ts_dst;
    const int64_t s03 = src0->nb[3] / ts_src0;
    const int64_t s3 = dst->nb[3] / ts_dst;

    const bool use_stream_k = (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
        || GGML_CUDA_CC_IS_CDNA(cc);

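    // Direct path (no expert ids): quantize src1 to the q8_1 MMQ layout and launch the kernel over the full batch.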
    if (!ids) {
        const size_t nbytes_src1_q8_1 = ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1 +
            get_mmq_x_max_host(cc)*sizeof(block_q8_1_mmq);
        ggml_cuda_pool_alloc<char> src1_q8_1(ctx.pool(), nbytes_src1_q8_1);

        {
            const int64_t s11 = src1->nb[1] / ts_src1;
            const int64_t s12 = src1->nb[2] / ts_src1;
            const int64_t s13 = src1->nb[3] / ts_src1;
            quantize_mmq_q8_1_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type,
                ne10, s11, s12, s13, ne10_padded, ne11, ne12, ne13, stream);
            CUDA_CHECK(cudaGetLastError());
        }

        const int64_t s12 = ne11*ne10_padded * sizeof(block_q8_1)/(QK8_1*sizeof(int));
        const int64_t s13 = ne12*s12;

        const mmq_args args = {
            src0_d, src0->type, (const int *) src1_q8_1.ptr, nullptr, nullptr, dst_d,
            ne00, ne01, ne1, s01, ne11, s1,
            ne02, ne12, s02, s12, s2,
            ne03, ne13, s03, s13, s3,
            use_stream_k, ne1};
        ggml_cuda_mul_mat_q_switch_type(ctx, args, stream);
        return;
    }

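    // Indirect path for GGML_MUL_MAT_ID (mixture of experts): rows of src1 are regrouped per expert before the multiplication.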
    GGML_ASSERT(ne13 == 1);
    GGML_ASSERT(nb12 % nb11 == 0);
    GGML_ASSERT(nb2 % nb1 == 0);

    const int64_t n_expert_used = ids->ne[0];
    const int64_t ne_get_rows = ne12 * n_expert_used;
    GGML_ASSERT(ne1 == n_expert_used);

    ggml_cuda_pool_alloc<int32_t> ids_src1(ctx.pool(), ne_get_rows);
    ggml_cuda_pool_alloc<int32_t> ids_dst(ctx.pool(), ne_get_rows);
    ggml_cuda_pool_alloc<int32_t> expert_bounds(ctx.pool(), ne02 + 1);

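    // Build the row mappings (as the names suggest): ids_src1 maps the gathered rows back to src1 rows, ids_dst maps them
    // to their destination rows, and expert_bounds holds the range of gathered rows assigned to each expert.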
    {
        GGML_ASSERT(ids->nb[0] == ggml_element_size(ids));
        const int si1 = ids->nb[1] / ggml_element_size(ids);
        const int sis1 = nb12 / nb11;

        ggml_cuda_launch_mm_ids_helper((const int32_t *) ids->data, ids_src1.get(), ids_dst.get(), expert_bounds.get(),
            ne02, ne12, n_expert_used, ne11, si1, sis1, stream);
        CUDA_CHECK(cudaGetLastError());
    }

    const size_t nbytes_src1_q8_1 = ne12*n_expert_used*ne10_padded * sizeof(block_q8_1)/QK8_1 +
        get_mmq_x_max_host(cc)*sizeof(block_q8_1_mmq);
    ggml_cuda_pool_alloc<char> src1_q8_1(ctx.pool(), nbytes_src1_q8_1);

    const int64_t ne11_flat = ne12*n_expert_used;
    const int64_t ne12_flat = 1;
    const int64_t ne13_flat = 1;

    {
        const int64_t s11 = src1->nb[1] / ts_src1;
        const int64_t s12 = src1->nb[2] / ts_src1;
        const int64_t s13 = src1->nb[2] / ts_src1;
        quantize_mmq_q8_1_cuda(src1_d, ids_src1.get(), src1_q8_1.get(), src0->type,
            ne10, s11, s12, s13, ne10_padded, ne11_flat, ne12_flat, ne13_flat, stream);
        CUDA_CHECK(cudaGetLastError());
    }

    const int64_t s12 = ne11*ne10_padded * sizeof(block_q8_1)/(QK8_1*sizeof(int));
    const int64_t s13 = ne12*s12;

    // Note that ne02 is used instead of ne12 because the number of y channels determines the z dimension of the CUDA grid.
    const mmq_args args = {
        src0_d, src0->type, (const int *) src1_q8_1.get(), ids_dst.get(), expert_bounds.get(), dst_d,
        ne00, ne01, ne_get_rows, s01, ne_get_rows, s1,
        ne02, ne02, s02, s12, s2,
        ne03, ne13, s03, s13, s3,
        use_stream_k, ne12};

    ggml_cuda_mul_mat_q_switch_type(ctx, args, stream);
}

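// Per-slice entry point, presumably used by the generic multi-GPU mul_mat path: it multiplies the rows [row_low, row_high)
// of src0 by an already quantized slice of src1 (src1_ddq_i) on the current device.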
void ggml_cuda_op_mul_mat_q(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
    const int64_t src1_padded_row_size, cudaStream_t stream) {

    const int64_t ne00 = src0->ne[0];

    const int64_t ne10 = src1->ne[0];
    const int64_t ne11 = src1->ne[1];
    GGML_ASSERT(ne10 % QK8_1 == 0);

    const int64_t ne0 = dst->ne[0];

    const int64_t row_diff = row_high - row_low;
    const int64_t stride01 = ne00 / ggml_blck_size(src0->type);

    const int id = ggml_cuda_get_device();
    const int cc = ggml_cuda_info().devices[id].cc;

    // the main device has a larger memory buffer to hold the results from all GPUs
    // nrows_dst == nrows of the matrix that the kernel writes into
    const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;

    // The stream-k decomposition is only faster for recent NVIDIA GPUs.
    // Also its fixup needs to allocate a temporary buffer in the memory pool.
    // There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
    const bool use_stream_k = ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
        || GGML_CUDA_CC_IS_CDNA(cc))
        && src1_ncols == ne11;
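    // This path multiplies a single 2D slice per call, so the channel and sample dimensions are fixed to 1 with zero strides.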
    const mmq_args args = {
        src0_dd_i, src0->type, (const int *) src1_ddq_i, nullptr, nullptr, dst_dd_i,
        ne00, row_diff, src1_ncols, stride01, ne11, nrows_dst,
        1, 1, 0, 0, 0,
        1, 1, 0, 0, 0,
        use_stream_k, src1_ncols};

    ggml_cuda_mul_mat_q_switch_type(ctx, args, stream);

    GGML_UNUSED_VARS(src1, dst, src1_ddf_i, src1_padded_row_size);
}

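// Heuristic for choosing MMQ over the cuBLAS/hipBLAS back ends, based on the quantization type, the compute
// capability cc, and the src1 column count ne11 (i.e. the batch size).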
bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
#ifdef GGML_CUDA_FORCE_CUBLAS
    return false;
#endif // GGML_CUDA_FORCE_CUBLAS

    bool mmq_supported;

    switch (type) {
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q4_1:
        case GGML_TYPE_Q5_0:
        case GGML_TYPE_Q5_1:
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_MXFP4:
        case GGML_TYPE_Q2_K:
        case GGML_TYPE_Q3_K:
        case GGML_TYPE_Q4_K:
        case GGML_TYPE_Q5_K:
        case GGML_TYPE_Q6_K:
        case GGML_TYPE_IQ2_XXS:
        case GGML_TYPE_IQ2_XS:
        case GGML_TYPE_IQ2_S:
        case GGML_TYPE_IQ3_XXS:
        case GGML_TYPE_IQ3_S:
        case GGML_TYPE_IQ1_S:
        case GGML_TYPE_IQ4_XS:
        case GGML_TYPE_IQ4_NL:
            mmq_supported = true;
            break;
        default:
            mmq_supported = false;
            break;
    }

    if (!mmq_supported) {
        return false;
    }

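    // With int8 tensor cores (Turing and newer) MMQ is always used; below, the decision depends on the
    // architecture and the batch size.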
    if (turing_mma_available(cc)) {
        return true;
    }

    if (ggml_cuda_highest_compiled_arch(cc) < GGML_CUDA_CC_DP4A) {
        return false;
    }

#ifdef GGML_CUDA_FORCE_MMQ
    return true;
#endif //GGML_CUDA_FORCE_MMQ

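    // Without int8 MMA, NVIDIA GPUs use MMQ unless FP16 tensor cores are available and the batch is large,
    // in which case cuBLAS is presumably faster.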
    if (GGML_CUDA_CC_IS_NVIDIA(cc)) {
        return !fp16_mma_hardware_available(cc) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
    }

    if (amd_mfma_available(cc)) {
        // As of ROCm 7.0, rocBLAS/Tensile performs very poorly on CDNA3; hipBLASLt (via ROCBLAS_USE_HIPBLASLT)
        // performs better but currently crashes on this architecture.
        // TODO: revisit once hipBLASLt is fixed on CDNA3.
        if (GGML_CUDA_CC_IS_CDNA3(cc)) {
            return true;
        }
        if (ne11 <= 128 || type == GGML_TYPE_Q4_0 || type == GGML_TYPE_Q4_1 || type == GGML_TYPE_Q5_0 || type == GGML_TYPE_Q5_1) {
            return true;
        }
        if (ne11 <= 256 && (type == GGML_TYPE_Q4_K || type == GGML_TYPE_Q5_K)) {
            return true;
        }
        return false;
    }

    return (!GGML_CUDA_CC_IS_RDNA4(cc) && !GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
}