#include "arange.cuh"

static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) {
    // one thread per element; blockIdx.x indexes the ne0 / CUDA_ARANGE_BLOCK_SIZE blocks
    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
    if (nidx >= ne0) {
        return;
    }
    dst[nidx] = start + step * nidx;
}

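// rounds ne0 up to a whole number of CUDA_ARANGE_BLOCK_SIZE-thread blocks and launches the kernel on the given stream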
static void arange_f32_cuda(float * dst, const int ne0, const float start, const float step, cudaStream_t stream) {
    int num_blocks = (ne0 + CUDA_ARANGE_BLOCK_SIZE - 1) / CUDA_ARANGE_BLOCK_SIZE;
    arange_f32<<<num_blocks, CUDA_ARANGE_BLOCK_SIZE, 0, stream>>>(dst, ne0, start, step);
}

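// reads start/stop/step from dst->op_params and fills dst (F32) with the
// sequence start, start + step, ... on the context's CUDA stream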
void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    float * dst_d = (float *)dst->data;
    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(dst->type == GGML_TYPE_F32);

    // op_params layout: [0] = start, [1] = stop, [2] = step (all f32)
    float start;
    float stop;
    float step;
    memcpy(&start, (float *)dst->op_params + 0, sizeof(float));
    memcpy(&stop,  (float *)dst->op_params + 1, sizeof(float));
    memcpy(&step,  (float *)dst->op_params + 2, sizeof(float));

    int64_t steps = (int64_t)ceil((stop - start) / step);
    GGML_ASSERT(ggml_nelements(dst) == steps);

    arange_f32_cuda(dst_d, dst->ne[0], start, step, stream);
}