#include "mmq.cuh"
#include "quantize.cuh"
#include "mmid.cuh"

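// Dispatches to the mul_mat_q kernel template instantiated for the quantized type of the weight matrix src0.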
static void ggml_cuda_mul_mat_q_switch_type(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) {
    switch (args.type_x) {
        case GGML_TYPE_Q4_0:
            mul_mat_q_case<GGML_TYPE_Q4_0>(ctx, args, stream);
            break;
        case GGML_TYPE_Q4_1:
            mul_mat_q_case<GGML_TYPE_Q4_1>(ctx, args, stream);
            break;
        case GGML_TYPE_Q5_0:
            mul_mat_q_case<GGML_TYPE_Q5_0>(ctx, args, stream);
            break;
        case GGML_TYPE_Q5_1:
            mul_mat_q_case<GGML_TYPE_Q5_1>(ctx, args, stream);
            break;
        case GGML_TYPE_Q8_0:
            mul_mat_q_case<GGML_TYPE_Q8_0>(ctx, args, stream);
            break;
        case GGML_TYPE_MXFP4:
            mul_mat_q_case<GGML_TYPE_MXFP4>(ctx, args, stream);
            break;
        case GGML_TYPE_Q2_K:
            mul_mat_q_case<GGML_TYPE_Q2_K>(ctx, args, stream);
            break;
        case GGML_TYPE_Q3_K:
            mul_mat_q_case<GGML_TYPE_Q3_K>(ctx, args, stream);
            break;
        case GGML_TYPE_Q4_K:
            mul_mat_q_case<GGML_TYPE_Q4_K>(ctx, args, stream);
            break;
        case GGML_TYPE_Q5_K:
            mul_mat_q_case<GGML_TYPE_Q5_K>(ctx, args, stream);
            break;
        case GGML_TYPE_Q6_K:
            mul_mat_q_case<GGML_TYPE_Q6_K>(ctx, args, stream);
            break;
        case GGML_TYPE_IQ2_XXS:
            mul_mat_q_case<GGML_TYPE_IQ2_XXS>(ctx, args, stream);
            break;
        case GGML_TYPE_IQ2_XS:
            mul_mat_q_case<GGML_TYPE_IQ2_XS>(ctx, args, stream);
            break;
        case GGML_TYPE_IQ2_S:
            mul_mat_q_case<GGML_TYPE_IQ2_S>(ctx, args, stream);
            break;
        case GGML_TYPE_IQ3_XXS:
            mul_mat_q_case<GGML_TYPE_IQ3_XXS>(ctx, args, stream);
            break;
        case GGML_TYPE_IQ3_S:
            mul_mat_q_case<GGML_TYPE_IQ3_S>(ctx, args, stream);
            break;
        case GGML_TYPE_IQ1_S:
            mul_mat_q_case<GGML_TYPE_IQ1_S>(ctx, args, stream);
            break;
        case GGML_TYPE_IQ4_XS:
            mul_mat_q_case<GGML_TYPE_IQ4_XS>(ctx, args, stream);
            break;
        case GGML_TYPE_IQ4_NL:
            mul_mat_q_case<GGML_TYPE_IQ4_NL>(ctx, args, stream);
            break;
        default:
            GGML_ABORT("fatal error");
            break;
    }
}

void ggml_cuda_mul_mat_q(
        ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
    GGML_ASSERT( src1->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);
    GGML_ASSERT(!ids || ids->type == GGML_TYPE_I32); // Optional, used for batched GGML_MUL_MAT_ID.

    GGML_TENSOR_BINARY_OP_LOCALS;

    cudaStream_t stream = ctx.stream();
    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;

    const size_t ts_src0 = ggml_type_size(src0->type);
    const size_t ts_src1 = ggml_type_size(src1->type);
    const size_t ts_dst = ggml_type_size(dst->type);

    GGML_ASSERT( nb00 == ts_src0);
    GGML_ASSERT( nb10 == ts_src1);
    GGML_ASSERT( nb0 == ts_dst);
    GGML_ASSERT(!ids || ids->nb[0] == ggml_type_size(ids->type));

    const char * src0_d = (const char *) src0->data;
    const float * src1_d = (const float *) src1->data;
    float * dst_d = (float *) dst->data;

    // If src0 is a temporary compute buffer, clear any potential padding.
    if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
        const size_t size_data = ggml_nbytes(src0);
        const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0);
        if (size_alloc > size_data) {
            GGML_ASSERT(ggml_is_contiguously_allocated(src0));
            GGML_ASSERT(!src0->view_src);
            CUDA_CHECK(cudaMemsetAsync((char *) src0->data + size_data, 0, size_alloc - size_data, stream));
        }
    }

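    // Pad the src1 row length to a multiple of MATRIX_ROW_PADDING so the q8_1-quantized copy below consists of whole blocks.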
    const int64_t ne10_padded = GGML_PAD(ne10, MATRIX_ROW_PADDING);

    const int64_t s01 = src0->nb[1] / ts_src0;
    const int64_t s1 = dst->nb[1] / ts_dst;
    const int64_t s02 = src0->nb[2] / ts_src0;
    const int64_t s2 = dst->nb[2] / ts_dst;
    const int64_t s03 = src0->nb[3] / ts_src0;
    const int64_t s3 = dst->nb[3] / ts_dst;

    const bool use_stream_k = (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
        || GGML_CUDA_CC_IS_CDNA(cc);

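    // Direct path (no expert ids): quantize src1 to the q8_1 MMQ layout and launch the kernel over the full batch.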
    if (!ids) {
        const size_t nbytes_src1_q8_1 = ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1 +
            get_mmq_x_max_host(cc)*sizeof(block_q8_1_mmq);
        ggml_cuda_pool_alloc<char> src1_q8_1(ctx.pool(), nbytes_src1_q8_1);

        {
            const int64_t s11 = src1->nb[1] / ts_src1;
            const int64_t s12 = src1->nb[2] / ts_src1;
            const int64_t s13 = src1->nb[3] / ts_src1;
            quantize_mmq_q8_1_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type,
                ne10, s11, s12, s13, ne10_padded, ne11, ne12, ne13, stream);
            CUDA_CHECK(cudaGetLastError());
        }

        const int64_t s12 = ne11*ne10_padded * sizeof(block_q8_1)/(QK8_1*sizeof(int));
        const int64_t s13 = ne12*s12;

        const mmq_args args = {
            src0_d, src0->type, (const int *) src1_q8_1.ptr, nullptr, nullptr, dst_d,
            ne00, ne01, ne1, s01, ne11, s1,
            ne02, ne12, s02, s12, s2,
            ne03, ne13, s03, s13, s3,
            use_stream_k, ne1};
        ggml_cuda_mul_mat_q_switch_type(ctx, args, stream);
        return;
    }

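    // Indirect path for GGML_MUL_MAT_ID (mixture of experts): rows of src1 are regrouped per expert before the multiplication.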
    GGML_ASSERT(ne13 == 1);
    GGML_ASSERT(nb12 % nb11 == 0);
    GGML_ASSERT(nb2 % nb1 == 0);

    const int64_t n_expert_used = ids->ne[0];
    const int64_t ne_get_rows = ne12 * n_expert_used;
    GGML_ASSERT(ne1 == n_expert_used);

    ggml_cuda_pool_alloc<int32_t> ids_src1(ctx.pool(), ne_get_rows);
    ggml_cuda_pool_alloc<int32_t> ids_dst(ctx.pool(), ne_get_rows);
    ggml_cuda_pool_alloc<int32_t> expert_bounds(ctx.pool(), ne02 + 1);

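    // Build the row mappings (as the names suggest): ids_src1 maps the gathered rows back to src1 rows, ids_dst maps them
    // to their destination rows, and expert_bounds holds the range of gathered rows assigned to each expert.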
    {
        GGML_ASSERT(ids->nb[0] == ggml_element_size(ids));
        const int si1 = ids->nb[1] / ggml_element_size(ids);
        const int sis1 = nb12 / nb11;

        ggml_cuda_launch_mm_ids_helper((const int32_t *) ids->data, ids_src1.get(), ids_dst.get(), expert_bounds.get(),
            ne02, ne12, n_expert_used, ne11, si1, sis1, stream);
        CUDA_CHECK(cudaGetLastError());
    }

    const size_t nbytes_src1_q8_1 = ne12*n_expert_used*ne10_padded * sizeof(block_q8_1)/QK8_1 +
        get_mmq_x_max_host(cc)*sizeof(block_q8_1_mmq);
    ggml_cuda_pool_alloc<char> src1_q8_1(ctx.pool(), nbytes_src1_q8_1);

    const int64_t ne11_flat = ne12*n_expert_used;
    const int64_t ne12_flat = 1;
    const int64_t ne13_flat = 1;

    {
        const int64_t s11 = src1->nb[1] / ts_src1;
        const int64_t s12 = src1->nb[2] / ts_src1;
        const int64_t s13 = src1->nb[2] / ts_src1;
        quantize_mmq_q8_1_cuda(src1_d, ids_src1.get(), src1_q8_1.get(), src0->type,
            ne10, s11, s12, s13, ne10_padded, ne11_flat, ne12_flat, ne13_flat, stream);
        CUDA_CHECK(cudaGetLastError());
    }

    const int64_t s12 = ne11*ne10_padded * sizeof(block_q8_1)/(QK8_1*sizeof(int));
    const int64_t s13 = ne12*s12;

    // Note that ne02 is used instead of ne12 because the number of y channels determines the z dimension of the CUDA grid.
    const mmq_args args = {
        src0_d, src0->type, (const int *) src1_q8_1.get(), ids_dst.get(), expert_bounds.get(), dst_d,
        ne00, ne01, ne_get_rows, s01, ne_get_rows, s1,
        ne02, ne02, s02, s12, s2,
        ne03, ne13, s03, s13, s3,
        use_stream_k, ne12};

    ggml_cuda_mul_mat_q_switch_type(ctx, args, stream);
}

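// Per-slice entry point, presumably used by the generic multi-GPU mul_mat path: it multiplies the rows [row_low, row_high)
// of src0 by an already quantized slice of src1 (src1_ddq_i) on the current device.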
void ggml_cuda_op_mul_mat_q(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
    const int64_t src1_padded_row_size, cudaStream_t stream) {

    const int64_t ne00 = src0->ne[0];

    const int64_t ne10 = src1->ne[0];
    const int64_t ne11 = src1->ne[1];
    GGML_ASSERT(ne10 % QK8_1 == 0);

    const int64_t ne0 = dst->ne[0];

    const int64_t row_diff = row_high - row_low;
    const int64_t stride01 = ne00 / ggml_blck_size(src0->type);

    const int id = ggml_cuda_get_device();
    const int cc = ggml_cuda_info().devices[id].cc;

    // the main device has a larger memory buffer to hold the results from all GPUs
    // nrows_dst == nrows of the matrix that the kernel writes into
    const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;

    // The stream-k decomposition is only faster for recent NVIDIA GPUs.
    // Also its fixup needs to allocate a temporary buffer in the memory pool.
    // There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
    const bool use_stream_k = ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
        || GGML_CUDA_CC_IS_CDNA(cc))
        && src1_ncols == ne11;
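    // This path multiplies a single 2D slice per call, so the channel and sample dimensions are fixed to 1 with zero strides.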
    const mmq_args args = {
        src0_dd_i, src0->type, (const int *) src1_ddq_i, nullptr, nullptr, dst_dd_i,
        ne00, row_diff, src1_ncols, stride01, ne11, nrows_dst,
        1, 1, 0, 0, 0,
        1, 1, 0, 0, 0,
        use_stream_k, src1_ncols};

    ggml_cuda_mul_mat_q_switch_type(ctx, args, stream);

    GGML_UNUSED_VARS(src1, dst, src1_ddf_i, src1_padded_row_size);
}

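// Heuristic for choosing MMQ over the cuBLAS/hipBLAS back ends, based on the quantization type, the compute
// capability cc, and the src1 column count ne11 (i.e. the batch size).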
bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
#ifdef GGML_CUDA_FORCE_CUBLAS
    return false;
#endif // GGML_CUDA_FORCE_CUBLAS

    bool mmq_supported;

    switch (type) {
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q4_1:
        case GGML_TYPE_Q5_0:
        case GGML_TYPE_Q5_1:
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_MXFP4:
        case GGML_TYPE_Q2_K:
        case GGML_TYPE_Q3_K:
        case GGML_TYPE_Q4_K:
        case GGML_TYPE_Q5_K:
        case GGML_TYPE_Q6_K:
        case GGML_TYPE_IQ2_XXS:
        case GGML_TYPE_IQ2_XS:
        case GGML_TYPE_IQ2_S:
        case GGML_TYPE_IQ3_XXS:
        case GGML_TYPE_IQ3_S:
        case GGML_TYPE_IQ1_S:
        case GGML_TYPE_IQ4_XS:
        case GGML_TYPE_IQ4_NL:
            mmq_supported = true;
            break;
        default:
            mmq_supported = false;
            break;
    }

    if (!mmq_supported) {
        return false;
    }

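    // With int8 tensor cores (Turing and newer) MMQ is always used; below, the decision depends on the
    // architecture and the batch size.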
    if (turing_mma_available(cc)) {
        return true;
    }

    if (ggml_cuda_highest_compiled_arch(cc) < GGML_CUDA_CC_DP4A) {
        return false;
    }

#ifdef GGML_CUDA_FORCE_MMQ
    return true;
#endif //GGML_CUDA_FORCE_MMQ

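    // Without int8 MMA, NVIDIA GPUs use MMQ unless FP16 tensor cores are available and the batch is large,
    // in which case cuBLAS is presumably faster.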
    if (GGML_CUDA_CC_IS_NVIDIA(cc)) {
        return !fp16_mma_hardware_available(cc) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
    }

    if (amd_mfma_available(cc)) {
        // As of ROCm 7.0, rocBLAS/Tensile performs very poorly on CDNA3; hipBLASLt (via ROCBLAS_USE_HIPBLASLT)
        // performs better but currently crashes on this architecture.
        // TODO: revisit once hipBLASLt is fixed on CDNA3.
        if (GGML_CUDA_CC_IS_CDNA3(cc)) {
            return true;
        }
        if (ne11 <= 128 || type == GGML_TYPE_Q4_0 || type == GGML_TYPE_Q4_1 || type == GGML_TYPE_Q5_0 || type == GGML_TYPE_Q5_1) {
            return true;
        }
        if (ne11 <= 256 && (type == GGML_TYPE_Q4_K || type == GGML_TYPE_Q5_K)) {
            return true;
        }
        return false;
    }

    return (!GGML_CUDA_CC_IS_RDNA4(cc) && !GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
}