#include "mean.cuh"
#include "reduce_rows.cuh"

#ifdef GGML_CUDA_USE_CUB
#include <cub/cub.cuh>
using namespace cub;
#endif // GGML_CUDA_USE_CUB

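// Trivial single-thread kernel: divides the value produced by the device-wide
// reduction by the element count, turning the sum into a mean in place.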
template <typename T> __global__ void divide_by_count(T * result, size_t count) {
    *result /= static_cast<T>(count);
}

void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const float * src0_d = (const float *) src0->data;
    float * dst_d = (float *) dst->data;
    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);
    GGML_ASSERT(ggml_is_contiguous(src0));

    const int64_t ncols = src0->ne[0];
    const int64_t nrows = ggml_nrows(src0);

    // Special case for reducing vectors
#ifdef GGML_CUDA_USE_CUB
#ifdef USE_CUDA_GRAPH
    cudaStreamCaptureStatus iscapturing;
    CUDA_CHECK(cudaStreamIsCapturing(stream, &iscapturing));
#endif // USE_CUDA_GRAPH
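    // Take the CUB fast path only for single-row tensors wide enough to amortize the
    // extra kernel launches. The column threshold depends on whether CUDA graphs are
    // effectively in use: 32768 columns when graph capture is active, 65536 otherwise
    // (or when graph support is compiled out entirely).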
    if ((nrows == 1) &&
#ifdef USE_CUDA_GRAPH
        // CUDA_GRAPHS_DISABLED
        ((ncols > 65536) &&
         ((ctx.cuda_graph->instance == nullptr) && (iscapturing == cudaStreamCaptureStatusNone) ||
          ctx.cuda_graph->disable_due_to_gpu_arch || ctx.cuda_graph->disable_due_to_too_many_updates ||
          ctx.cuda_graph->disable_due_to_failed_graph_capture)) ||
        // CUDA_GRAPHS_ENABLED
        ((ncols > 32768) &&
         !((ctx.cuda_graph->instance == nullptr) && (iscapturing == cudaStreamCaptureStatusNone) ||
           ctx.cuda_graph->disable_due_to_gpu_arch || ctx.cuda_graph->disable_due_to_too_many_updates ||
           ctx.cuda_graph->disable_due_to_failed_graph_capture))) {
#else
        (ncols > 65536)) {
#endif // USE_CUDA_GRAPH
        // Single row - use device-wide reduction
        size_t tmp_size = 0;
        ggml_cuda_pool & pool = ctx.pool();

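        // Standard two-phase CUB usage: the first DeviceReduce::Sum call, with a null
        // temporary-storage pointer, only reports the scratch size required; the second
        // call performs the actual reduction into dst_d.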
        DeviceReduce::Sum(nullptr, tmp_size, src0_d, dst_d, ncols, stream);

        ggml_cuda_pool_alloc<uint8_t> tmp_alloc(pool, tmp_size);
        DeviceReduce::Sum(tmp_alloc.ptr, tmp_size, src0_d, dst_d, ncols, stream);

        // Divide by ncols
        divide_by_count<float><<<1, 1, 0, stream>>>(dst_d, ncols);
        return;
    }
#endif // GGML_CUDA_USE_CUB

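    // General path: reduce_rows_f32 processes one row per thread block; the
    // /*norm=*/true template parameter has the kernel divide each row sum by ncols,
    // so the output is already the mean.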
    const dim3 block_nums(nrows, 1, 1);

    const int id = ggml_cuda_get_device();
    const int nsm = ggml_cuda_info().devices[id].nsm;
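    // Block-size heuristic: with few rows relative to the number of SMs, launch wide
    // 512-thread blocks so each row's reduction exposes more parallelism; with many
    // rows, smaller blocks (32 or 128 threads depending on row width) are presumably
    // enough to keep all SMs occupied.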
    if ((nrows / nsm) < 2) {
        const dim3 block_dims(512, 1, 1);
        reduce_rows_f32</*norm=*/true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
    } else {
        const dim3 block_dims(ncols < 1024 ? 32 : 128, 1, 1);
        reduce_rows_f32</*norm=*/true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
    }
}