#include "mean.cuh"
#include "reduce_rows.cuh"

#ifdef GGML_CUDA_USE_CUB
#include <cub/cub.cuh>
using namespace cub;
#endif // GGML_CUDA_USE_CUB

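// Trivial single-thread kernel: divides the value produced by the device-wide
// reduction by the element count, turning the sum into a mean in place.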
template <typename T> __global__ void divide_by_count(T * result, size_t count) {
    *result /= static_cast<T>(count);
}

void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const float * src0_d = (const float *) src0->data;
    float * dst_d = (float *) dst->data;
    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);
    GGML_ASSERT(ggml_is_contiguous(src0));

    const int64_t ncols = src0->ne[0];
    const int64_t nrows = ggml_nrows(src0);

    // Special case for reducing vectors
#ifdef GGML_CUDA_USE_CUB
#ifdef USE_CUDA_GRAPH
    cudaStreamCaptureStatus iscapturing;
    CUDA_CHECK(cudaStreamIsCapturing(stream, &iscapturing));
#endif // USE_CUDA_GRAPH
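    // Take the CUB fast path only for single-row tensors wide enough to amortize the
    // extra kernel launches. The column threshold depends on whether CUDA graphs are
    // effectively in use: 32768 columns when graph capture is active, 65536 otherwise
    // (or when graph support is compiled out entirely).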
    if ((nrows == 1) &&
#ifdef USE_CUDA_GRAPH
        // CUDA_GRAPHS_DISABLED
        ((ncols > 65536) &&
         ((ctx.cuda_graph->instance == nullptr) && (iscapturing == cudaStreamCaptureStatusNone) ||
          ctx.cuda_graph->disable_due_to_gpu_arch || ctx.cuda_graph->disable_due_to_too_many_updates ||
          ctx.cuda_graph->disable_due_to_failed_graph_capture)) ||
        // CUDA_GRAPHS_ENABLED
        ((ncols > 32768) &&
         !((ctx.cuda_graph->instance == nullptr) && (iscapturing == cudaStreamCaptureStatusNone) ||
           ctx.cuda_graph->disable_due_to_gpu_arch || ctx.cuda_graph->disable_due_to_too_many_updates ||
           ctx.cuda_graph->disable_due_to_failed_graph_capture))) {
#else
        (ncols > 65536)) {
#endif // USE_CUDA_GRAPH
        // Single row - use device-wide reduction
        size_t tmp_size = 0;
        ggml_cuda_pool & pool = ctx.pool();

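        // Standard two-phase CUB usage: the first DeviceReduce::Sum call, with a null
        // temporary-storage pointer, only reports the scratch size required; the second
        // call performs the actual reduction into dst_d.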
        DeviceReduce::Sum(nullptr, tmp_size, src0_d, dst_d, ncols, stream);

        ggml_cuda_pool_alloc<uint8_t> tmp_alloc(pool, tmp_size);
        DeviceReduce::Sum(tmp_alloc.ptr, tmp_size, src0_d, dst_d, ncols, stream);

        // Divide by ncols
        divide_by_count<float><<<1, 1, 0, stream>>>(dst_d, ncols);
        return;
    }
#endif // GGML_CUDA_USE_CUB

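    // General path: reduce_rows_f32 processes one row per thread block; the
    // /*norm=*/true template parameter has the kernel divide each row sum by ncols,
    // so the output is already the mean.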
    const dim3 block_nums(nrows, 1, 1);

    const int id = ggml_cuda_get_device();
    const int nsm = ggml_cuda_info().devices[id].nsm;
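    // Block-size heuristic: with few rows relative to the number of SMs, launch wide
    // 512-thread blocks so each row's reduction exposes more parallelism; with many
    // rows, smaller blocks (32 or 128 threads depending on row width) are presumably
    // enough to keep all SMs occupied.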
    if ((nrows / nsm) < 2) {
        const dim3 block_dims(512, 1, 1);
        reduce_rows_f32</*norm=*/true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
    } else {
        const dim3 block_dims(ncols < 1024 ? 32 : 128, 1, 1);
        reduce_rows_f32</*norm=*/true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
    }
}