| 1 | /******************************************************************************* |
| 2 | * Copyright 2017-2018 Intel Corporation |
| 3 | * |
| 4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | * you may not use this file except in compliance with the License. |
| 6 | * You may obtain a copy of the License at |
| 7 | * |
| 8 | * http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | * |
| 10 | * Unless required by applicable law or agreed to in writing, software |
| 11 | * distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | * See the License for the specific language governing permissions and |
| 14 | * limitations under the License. |
| 15 | *******************************************************************************/ |
| 16 | |
| 17 | #include <assert.h> |
| 18 | |
| 19 | #include "mkldnn_thread.hpp" |
| 20 | #include "mkldnn_types.h" |
| 21 | #include "nstl.hpp" |
| 22 | #include "utils.hpp" |
| 23 | |
| 24 | #include "cpu_reducer.hpp" |
| 25 | |
| 26 | namespace mkldnn { |
| 27 | namespace impl { |
| 28 | namespace cpu { |
| 29 | |
| 30 | using namespace memory_tracking::names; |
| 31 | |
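/* Splits nthr_ threads into ngroups_ independent groups of nthr_per_group_
 * threads each; a group is assigned at most njobs_per_group_ub_ jobs, and
 * within a group the reduction over reduction_size_ is shared by the group's
 * threads. Illustrative initial guess (example numbers only): with nthr_ = 8,
 * njobs_ = 4, and reduction_size_ = 16 the code below starts from
 * ngroups = 4 groups of nthr_per_group = 2 threads, one job per group
 * (assuming syncable_; the brute-force search then refines this). */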
| 32 | void reduce_balancer_t::balance() { |
| 33 | using namespace nstl; |
| 34 | using namespace utils; |
| 35 | |
| 36 | assert(nthr_ > 0 && job_size_ > 0 && njobs_ > 0 && reduction_size_ > 0); |
| 37 | |
| 38 | const int job_complexity = 1; |
| 39 | |
| 40 | const int min_njobs_per_group = max(1, njobs_ / nthr_); |
| 41 | const int max_njobs_per_group = max(1, |
| 42 | static_cast<int>(max_buffer_size_ / (nthr_ * job_size_))); |
| 43 | |
| 44 | /* initial guess */ |
| 45 | int ngroups = min(njobs_ / min_njobs_per_group, nthr_); |
| 46 | int nthr_per_group = syncable_ ? min(nthr_ / ngroups, reduction_size_) : 1; |
| 47 | int njobs_per_group_ub = div_up(njobs_, ngroups); |
| 48 | |
    /* rough upper-bound estimate; refined by the brute-force search below */
| 50 | size_t thread_complexity_ub = njobs_ * job_size_ * reduction_size_; |
| 51 | |
    /* brute-force search over the parameters for the best balance */
| 53 | for (int c_njobs_per_group = min_njobs_per_group; |
| 54 | c_njobs_per_group < njobs_; ++c_njobs_per_group) { |
        /* current candidate */
| 56 | int c_ngroups = min(njobs_ / c_njobs_per_group, nthr_); |
| 57 | int c_nthr_per_group = syncable_ |
| 58 | ? min(nthr_ / c_ngroups, reduction_size_) : 1; |
| 59 | int c_njobs_per_group_ub = div_up(njobs_, c_ngroups); |
| 60 | |
| 61 | if (c_nthr_per_group > 1 && c_njobs_per_group_ub > max_njobs_per_group) |
| 62 | continue; |
| 63 | |
| 64 | int c_thread_reduction_ub = div_up(reduction_size_, c_nthr_per_group); |
| 65 | size_t c_group_size_ub = job_size_ * c_njobs_per_group_ub; |
| 66 | size_t c_thread_complexity_ub = c_group_size_ub * ( |
| 67 | job_complexity * c_thread_reduction_ub |
| 68 | + (c_nthr_per_group != 1)); |
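        /* e.g. (illustrative numbers) c_njobs_per_group_ub = 2,
         * job_size_ = 1024, c_nthr_per_group = 4, reduction_size_ = 16:
         * c_thread_reduction_ub = div_up(16, 4) = 4, c_group_size_ub = 2048,
         * so c_thread_complexity_ub = 2048 * (1 * 4 + 1) = 10240; the
         * trailing "+ 1" charges one extra pass over the group's data,
         * presumably for the final cross-thread accumulation. */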
| 69 | |
| 70 | if (c_thread_complexity_ub < thread_complexity_ub) { |
| 71 | ngroups = c_ngroups; |
| 72 | nthr_per_group = c_nthr_per_group; |
| 73 | njobs_per_group_ub = c_njobs_per_group_ub; |
| 74 | thread_complexity_ub = c_thread_complexity_ub; |
| 75 | } |
| 76 | } |
| 77 | |
| 78 | assert(njobs_per_group_ub <= max_njobs_per_group || nthr_per_group == 1); |
| 79 | assert(ngroups * nthr_per_group <= nthr_); |
| 80 | assert((size_t)njobs_per_group_ub * job_size_ * nthr_ <= max_buffer_size_ |
| 81 | || nthr_per_group == 1); /* no reduction buffer overflow */ |
| 82 | assert(IMPLICATION(!syncable_, nthr_per_group == 1)); |
| 83 | |
| 84 | ngroups_ = ngroups; |
| 85 | nthr_per_group_ = nthr_per_group; |
| 86 | njobs_per_group_ub_ = njobs_per_group_ub; |
| 87 | } |
| 88 | |
/* JIT-ed reducer driver */
| 90 | |
| 91 | using namespace Xbyak; |
| 92 | |
| 93 | template <impl::data_type_t data_type> |
| 94 | struct reducer_2d_driver_t: public c_compatible { |
| 95 | typedef typename prec_traits<data_type>::type data_t; |
| 96 | |
| 97 | reducer_2d_driver_t(int n_src, size_t src_ld, |
| 98 | size_t src_step, size_t dst_step, bool nullify_dst) |
| 99 | : n_src_(n_src), src_ld_(src_ld), src_step_(src_step) |
| 100 | , dst_step_(dst_step), nullify_dst_(nullify_dst), ker_(nullptr) {} |
| 101 | virtual ~reducer_2d_driver_t() {} |
| 102 | void operator()(data_t *dst, const data_t *srcs, size_t ny, size_t nx) |
| 103 | { assert(ker_); ker_(dst, srcs, ny, nx); } |
| 104 | |
| 105 | protected: |
| 106 | int n_src_; |
| 107 | size_t src_ld_, src_step_, dst_step_; |
| 108 | bool nullify_dst_; |
| 109 | void (*ker_)(data_t *dst, const data_t *srcs, size_t ny, size_t nx); |
| 110 | }; |
| 111 | |
| 112 | template <impl::data_type_t data_type, cpu_isa_t isa> |
| 113 | struct reducer_2d_driver_f_s_32_t: public reducer_2d_driver_t<data_type>, |
| 114 | public jit_generator |
| 115 | { |
| 116 | DECLARE_CPU_JIT_AUX_FUNCTIONS(reducer_2d_driver_f_s_32_t) |
| 117 | |
| 118 | /* cpu specific part */ |
| 119 | using Vmm = typename utils::conditional<isa == avx2, Ymm, Zmm>::type; |
| 120 | const AddressFrame &vmmword = (isa == avx2) ? yword : zword; |
| 121 | void uni_vadd(const Xmm& x1, const Xmm& x2, const Operand& op) |
| 122 | { if (data_type == data_type::f32) vaddps(x1, x2, op); |
| 123 | else vpaddd(x1, x2, op); } |
| 124 | void uni_add(const Xmm& x1, const Operand& op) |
| 125 | { if (data_type == data_type::f32) addss(x1, op); else paddd(x1, op); } |
| 126 | |
| 127 | const int vlen = cpu_isa_traits<isa>::vlen; |
| 128 | const int typesize |
| 129 | = sizeof(typename mkldnn::impl::prec_traits<data_type>::type); |
| 130 | Xbyak::Reg64 reg_dst = abi_param1; |
| 131 | Xbyak::Reg64 reg_src = abi_param2; |
| 132 | Xbyak::Reg64 reg_ny = abi_param3; |
| 133 | Xbyak::Reg64 reg_nx = abi_param4; |
| 134 | |
| 135 | Xbyak::Reg64 reg_x = rax; |
| 136 | Xbyak::Reg64 reg_src_id = r10; |
| 137 | |
| 138 | reducer_2d_driver_f_s_32_t(int n_src, size_t src_ld, size_t src_step, |
| 139 | size_t dst_step, bool nullify_dst) |
| 140 | : reducer_2d_driver_t<data_type>(n_src, src_ld, src_step, |
| 141 | dst_step, nullify_dst) |
| 142 | { generate(); } |
| 143 | |
| 144 | void nullify_dst(int nloads, int load_len) { |
| 145 | UNUSED(load_len); |
| 146 | for (int i = 0; i < nloads; ++i) |
| 147 | uni_vpxor(Vmm(i), Vmm(i), Vmm(i)); |
        /* TODO: consider prefetching dst here? */
| 149 | } |
| 150 | |
| 151 | void load_dst(int nloads, int load_len) { |
| 152 | for (int i = 0; i < nloads; ++i) { |
| 153 | if (load_len == typesize) |
| 154 | movd(Xmm(i), ptr[reg_dst + i * load_len]); |
| 155 | else if (load_len == vlen) |
| 156 | vmovups(Vmm(i), ptr[reg_dst + i * load_len]); |
| 157 | else |
                assert(!"unsupported");
| 159 | } |
| 160 | } |
| 161 | |
| 162 | void store_dst(int nloads, int load_len) { |
| 163 | for (int i = 0; i < nloads; ++i) { |
| 164 | if (load_len == typesize) |
| 165 | movd(ptr[reg_dst + i * load_len], Xmm(i)); |
| 166 | else if (load_len == vlen) |
| 167 | vmovups(ptr[reg_dst + i * load_len], Vmm(i)); |
| 168 | else |
                assert(!"unsupported");
| 170 | } |
| 171 | } |
| 172 | |
| 173 | void accumulate(int nloads, int load_len, size_t base_off) { |
| 174 | for (int i = 0; i < nloads; ++i) { |
| 175 | size_t off = base_off + i * load_len; |
| 176 | |
| 177 | if (load_len == typesize) |
| 178 | uni_add(Xmm(i), ptr[reg_src + off]); |
| 179 | else if (load_len == vlen) |
| 180 | uni_vadd(Vmm(i), Vmm(i), vmmword[reg_src + off]); |
| 181 | else |
                assert(!"unsupported");
| 183 | } |
| 184 | } |
| 185 | |
| 186 | void loop_x() { |
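        /* Three progressively narrower branches consume reg_nx (in bytes at
         * this point, see generate()): branch 0 handles n_vregs full vectors
         * per iteration, branch 1 a single vector, and branch 2 a scalar
         * tail element, so no separate remainder loop is needed. */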
| 187 | const int nloads[] = {cpu_isa_traits<isa>::n_vregs, 1, 1}; |
| 188 | const int nbranches = sizeof(nloads) / sizeof(nloads[0]); |
| 189 | |
| 190 | const int load_len[nbranches] = {vlen, vlen, typesize}; |
| 191 | Label loop_x_label[nbranches + 1]; |
| 192 | |
| 193 | mov(reg_x, reg_nx); |
| 194 | |
| 195 | for (int id = 0; id < nbranches; ++id) { |
| 196 | L(loop_x_label[id]); |
| 197 | |
| 198 | cmp(reg_x, nloads[id] * load_len[id]); |
| 199 | jl(loop_x_label[id + 1], T_NEAR); |
| 200 | |
| 201 | if (this->nullify_dst_) |
| 202 | nullify_dst(nloads[id], load_len[id]); |
| 203 | else |
| 204 | load_dst(nloads[id], load_len[id]); |
| 205 | |
| 206 | if (nloads[id] > 1) { |
| 207 | Label loop_srcs; |
| 208 | mov(reg_src_id, this->n_src_); |
| 209 | L(loop_srcs); |
| 210 | |
| 211 | accumulate(nloads[id], load_len[id], 0); |
| 212 | add(reg_src, this->src_ld_ * typesize); |
| 213 | |
| 214 | dec(reg_src_id); |
| 215 | jnz(loop_srcs, T_NEAR); |
| 216 | |
| 217 | sub(reg_src, this->n_src_ * this->src_ld_ * typesize); |
| 218 | } else { |
| 219 | for (int src_id = 0; src_id < this->n_src_; ++src_id) { |
| 220 | const size_t base_off = src_id * this->src_ld_ * typesize; |
| 221 | accumulate(nloads[id], load_len[id], base_off); |
| 222 | } |
| 223 | } |
| 224 | |
| 225 | store_dst(nloads[id], load_len[id]); |
| 226 | |
| 227 | add(reg_src, nloads[id] * load_len[id]); |
| 228 | add(reg_dst, nloads[id] * load_len[id]); |
| 229 | |
| 230 | sub(reg_x, nloads[id] * load_len[id]); |
| 231 | |
| 232 | jmp(loop_x_label[id], T_NEAR); |
| 233 | } |
| 234 | |
| 235 | L(loop_x_label[nbranches]); |
| 236 | |
| 237 | /* restore address registers */ |
| 238 | sub(reg_src, reg_nx); |
| 239 | sub(reg_dst, reg_nx); |
| 240 | } |
| 241 | |
| 242 | void generate() { |
| 243 | assert(isa == avx2 || isa == avx512_common || isa == avx512_mic); |
| 244 | |
| 245 | preamble(); |
| 246 | |
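        /* reg_nx arrives in elements; convert it to bytes. The shift by 2
         * assumes a 4-byte data type (f32 or s32), matching typesize. */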
| 247 | shl(reg_nx, 2); |
| 248 | |
| 249 | Label ny_loop; |
| 250 | L(ny_loop); |
| 251 | |
| 252 | loop_x(); |
| 253 | |
| 254 | add(reg_dst, this->dst_step_ * typesize); |
| 255 | add(reg_src, this->src_step_ * typesize); |
| 256 | |
| 257 | dec(reg_ny); |
| 258 | jnz(ny_loop, T_NEAR); |
| 259 | |
| 260 | postamble(); |
| 261 | this->ker_ = reinterpret_cast<decltype(this->ker_)>( |
| 262 | const_cast<uint8_t*>(this->getCode())); |
| 263 | } |
| 264 | }; |
| 265 | |
| 266 | template <impl::data_type_t data_type> |
| 267 | inline reducer_2d_driver_t<data_type> *create_reduce_2d_drv(int n_src, |
| 268 | size_t src_ld, size_t src_step, size_t dst_step, bool nullify_dst) { |
| 269 | if (mayiuse(avx512_common)) |
| 270 | return new reducer_2d_driver_f_s_32_t<data_type, avx512_common>(n_src, |
| 271 | src_ld, src_step, dst_step, nullify_dst); |
| 272 | else if (mayiuse(avx2)) |
| 273 | return new reducer_2d_driver_f_s_32_t<data_type, avx2>(n_src, src_ld, |
| 274 | src_step, dst_step, nullify_dst); |
    assert(!"unimplemented");
| 276 | return nullptr; |
| 277 | } |
| 278 | |
| 279 | /* cpu_reducer_t */ |
| 280 | |
| 281 | template <impl::data_type_t data_type> |
| 282 | void cpu_reducer_t<data_type>::conf_t::init_scratchpad( |
| 283 | memory_tracking::registrar_t &scratchpad) const { |
| 284 | if (balancer_.nthr_per_group_ == 1) return; |
| 285 | |
| 286 | const size_t space_size = balancer_.ngroups_ |
| 287 | * (balancer_.nthr_per_group_ - 1) |
| 288 | * cpu_reducer_t<data_type>::space_per_thread(balancer_); |
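    /* Illustrative sizing: 4 groups of 2 threads with space_per_thread()
     * = 8192 f32 elements book 4 * (2 - 1) * 8192 * 4 bytes = 128 KiB of
     * reduction space plus one barrier context per group; thread 0 of each
     * group needs no scratch row since it writes directly to dst. */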
| 289 | scratchpad.book(key_reducer_space, sizeof(data_t) * space_size, PAGE_4K); |
| 290 | scratchpad.book(key_reducer_space_bctx, |
| 291 | sizeof(simple_barrier::ctx_t) * balancer_.ngroups_); |
| 292 | } |
| 293 | |
| 294 | template <impl::data_type_t data_type> |
| 295 | cpu_reducer_t<data_type>::cpu_reducer_t(const conf_t &conf) |
| 296 | : conf_(conf), drv_(nullptr) |
| 297 | { |
| 298 | if (balancer().nthr_per_group_ == 1) return; |
| 299 | |
| 300 | drv_ = create_reduce_2d_drv<data_type>(balancer().nthr_per_group_ - 1, |
| 301 | space_per_thread(balancer()), 0, 0, false); |
| 302 | } |
| 303 | |
| 304 | template <impl::data_type_t data_type> |
| 305 | cpu_reducer_t<data_type>::~cpu_reducer_t() { delete drv_; } |
| 306 | |
| 307 | template <impl::data_type_t data_type> |
| 308 | typename cpu_reducer_t<data_type>::data_t * |
| 309 | cpu_reducer_t<data_type>::get_local_ptr(int ithr, data_t *dst, |
| 310 | const memory_tracking::grantor_t &scratchpad) const { |
| 311 | const int id_in_grp = balancer().id_in_group(ithr); |
| 312 | |
    /* thread 0 of each group writes directly to the destination */
| 314 | if (id_in_grp == 0) |
| 315 | return dst + balancer().ithr_job_off(ithr) * balancer().job_size_; |
| 316 | |
| 317 | const int grp_id = balancer().group_id(ithr); |
| 318 | const int offset_factor = grp_id * (balancer().nthr_per_group_ - 1) |
| 319 | + (id_in_grp - 1); |
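    /* e.g. with 2 groups of 3 threads the scratch rows (each
     * space_per_thread() elements long) are laid out as:
     *     row 0: group 0, thread 1    row 2: group 1, thread 1
     *     row 1: group 0, thread 2    row 3: group 1, thread 2 */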
| 320 | |
| 321 | auto space = scratchpad.template get<data_t>(key_reducer_space); |
| 322 | return space + offset_factor * space_per_thread(balancer()); |
| 323 | } |
| 324 | |
| 325 | template <impl::data_type_t data_type> |
| 326 | void cpu_reducer_t<data_type>::reduce_nolock(int ithr, data_t *dst, |
| 327 | const memory_tracking::grantor_t &scratchpad) const { |
| 328 | bool redundant_reduction = balancer().nthr_per_group_ == 1 |
| 329 | || balancer().idle(ithr); |
| 330 | if (redundant_reduction) return; |
| 331 | |
| 332 | #ifdef SIMPLE_IMPL |
| 333 | if (balancer().id_in_group(ithr) != 0) |
        return; /* only thread 0 of each group does the reduction */
| 335 | |
| 336 | const int njobs_in_grp = balancer().ithr_njobs(ithr); |
| 337 | data_t *d = get_local_ptr(ithr, dst, scratchpad); |
| 338 | for (int id_in_grp = 1; id_in_grp < balancer_.nthr_per_group_; ++id_in_grp) |
| 339 | { |
| 340 | const data_t *space = get_local_ptr(ithr + id_in_grp, dst, scratchpad); |
| 341 | for (size_t i = 0; i < (size_t)njobs_in_grp * balancer().job_size_; ++i) |
| 342 | d[i] += space[i]; |
| 343 | } |
| 344 | #else |
| 345 | using namespace utils; |
| 346 | |
| 347 | const int id_in_grp = balancer().id_in_group(ithr); |
| 348 | const int njobs_in_grp = balancer().ithr_njobs(ithr); |
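    /* Each thread of the group reduces a disjoint, cache-line-aligned slice
     * of the group's output (cl elements per 64-byte line below), so no two
     * threads ever store to the same cache line. */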
| 349 | const size_t cl = 64 / sizeof(data_t); |
| 350 | |
| 351 | const size_t reduction_size = njobs_in_grp * balancer().job_size_; |
| 352 | size_t start{0}, end{0}; |
| 353 | balance211(div_up(reduction_size, cl), balancer().nthr_per_group_, |
| 354 | id_in_grp, start, end); |
| 355 | |
| 356 | if (start == end) return; |
| 357 | |
| 358 | data_t *d = get_local_ptr(ithr - id_in_grp, dst, scratchpad) + start * cl; |
| 359 | const data_t *space = get_local_ptr(ithr - id_in_grp + 1, dst, scratchpad) |
| 360 | + start * cl; |
| 361 | const size_t len = nstl::min(end * cl, reduction_size) - start * cl; |
| 362 | |
| 363 | (*drv_)(d, space, 1, len); |
| 364 | #endif |
| 365 | } |
| 366 | |
| 367 | template struct cpu_reducer_t<data_type::f32>; |
| 368 | template struct cpu_reducer_t<data_type::s32>; |
| 369 | |
| 370 | /* cpu_reducer_2d_t */ |
| 371 | |
| 372 | template <impl::data_type_t data_type> |
| 373 | void cpu_reducer_2d_t<data_type>::conf_t::init_scratchpad( |
| 374 | memory_tracking::registrar_t &scratchpad) const { |
| 375 | if (balancer_.nthr_per_group_ == 1) return; |
| 376 | |
| 377 | const size_t space_size = balancer_.ngroups_ * balancer_.nthr_per_group_ |
| 378 | * cpu_reducer_2d_t<data_type>::space_per_thread(balancer_); |
| 379 | scratchpad.book(key_reducer_space, sizeof(data_t) * space_size); |
| 380 | scratchpad.book(key_reducer_space_bctx, |
| 381 | sizeof(simple_barrier::ctx_t) * balancer_.ngroups_); |
| 382 | } |
| 383 | |
| 384 | template <impl::data_type_t data_type> |
| 385 | cpu_reducer_2d_t<data_type>::cpu_reducer_2d_t(const conf_t &conf) |
| 386 | : conf_(conf), drv_(nullptr) |
| 387 | { |
| 388 | if (balancer().nthr_per_group_ == 1) return; |
| 389 | |
| 390 | drv_ = create_reduce_2d_drv<data_type>(balancer().nthr_per_group_, |
| 391 | space_per_thread(balancer()), conf_.job_size_x_, conf_.dst_x_, |
| 392 | true); |
| 393 | } |
| 394 | |
| 395 | template <impl::data_type_t data_type> |
| 396 | cpu_reducer_2d_t<data_type>::~cpu_reducer_2d_t() { delete drv_; } |
| 397 | |
| 398 | template <impl::data_type_t data_type> |
| 399 | typename cpu_reducer_2d_t<data_type>::data_t *cpu_reducer_2d_t<data_type>:: |
| 400 | get_local_ptr(int ithr, const memory_tracking::grantor_t &scratchpad) const { |
| 401 | const int id_in_grp = balancer().id_in_group(ithr); |
| 402 | const int grp_id = balancer().group_id(ithr); |
| 403 | const int offset_factor = grp_id * balancer().nthr_per_group_ + id_in_grp; |
| 404 | auto space = scratchpad.template get<data_t>(key_reducer_space); |
| 405 | return space + offset_factor * space_per_thread(balancer()); |
| 406 | } |
| 407 | |
| 408 | template <impl::data_type_t data_type> |
| 409 | int cpu_reducer_2d_t<data_type>::choose_x_blocking(int nx, int ny, |
| 410 | int nthr_per_grp) const { |
    // choose an x_blocking that balances the reduction work across threads
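    /* Worked example (illustrative): nx = 48, conf_.x_block_ = 4, ny = 1,
     * nthr_per_grp = 4 gives x_blocking = 12 and min_x_blocking = 3; the
     * loop halves 12 -> 6 -> 3 and stops, 3 < min_x_blocking * 4, so the
     * function returns 3 * 4 = 12 elements per block. */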
| 412 | assert(conf_.x_block_ > 0 && nx > conf_.x_block_ |
| 413 | && nx % conf_.x_block_ == 0); |
| 414 | int x_blocking = nx / conf_.x_block_; |
| 415 | int min_x_blocking = |
| 416 | utils::div_up(x_blocking, nstl::max(1, nthr_per_grp / ny)); |
| 417 | while (true) { |
| 418 | if (x_blocking % 2 == 0 && x_blocking >= min_x_blocking * 2) |
| 419 | x_blocking /= 2; |
| 420 | else if (x_blocking % 3 == 0 && x_blocking >= min_x_blocking * 3) |
| 421 | x_blocking /= 3; |
| 422 | else |
| 423 | break; |
| 424 | } |
| 425 | if (x_blocking >= min_x_blocking * 4) x_blocking = 1; |
| 426 | x_blocking *= conf_.x_block_; |
| 427 | return x_blocking; |
| 428 | } |
| 429 | |
| 430 | template <impl::data_type_t data_type> |
| 431 | void cpu_reducer_2d_t<data_type>::reduce_block(const data_t* space_base, |
| 432 | data_t *dst, int job, int start_y, int start_x, |
| 433 | int ny_start, int nx_start, int ny_step, int nx_step) const { |
| 434 | data_t *d = dst + (start_y + ny_start) * conf_.dst_x_ |
| 435 | + start_x + nx_start; |
| 436 | const data_t *space = space_base + job * balancer().job_size_ |
| 437 | + ny_start * conf_.job_size_x_ + nx_start; |
| 438 | #ifdef SIMPLE_IMPL |
| 439 | for (int idg = 0; idg < balancer().nthr_per_group_; ++idg) { |
| 440 | const data_t *w = &space[idg * space_per_thread(balancer())]; |
| 441 | for (int y = 0; y < ny_step; ++y) |
| 442 | for (int x = 0; x < nx_step; ++x) { |
| 443 | d[y * conf_.dst_x_ + x] |
| 444 | = (idg == 0 ? 0 : d[y * conf_.dst_x_ + x]) |
| 445 | + w[y * conf_.job_size_x_ + x]; |
| 446 | } |
| 447 | } |
| 448 | #else |
| 449 | (*drv_)(d, space, ny_step, nx_step); |
| 450 | #endif |
| 451 | } |
| 452 | |
| 453 | template <impl::data_type_t data_type> |
| 454 | void cpu_reducer_2d_t<data_type>::reduce_nolock(int ithr, data_t *dst, |
| 455 | const memory_tracking::grantor_t &scratchpad) const { |
| 456 | bool redundant_reduction = balancer().nthr_per_group_ == 1 |
| 457 | || balancer().idle(ithr); |
| 458 | if (redundant_reduction) return; |
| 459 | |
| 460 | const int id_in_grp = balancer().id_in_group(ithr); |
| 461 | const int njobs_in_grp = balancer().ithr_njobs(ithr); |
| 462 | const int njobs_x = utils::div_up(conf_.dst_x_, conf_.job_size_x_); |
| 463 | const int global_job_start = balancer().ithr_job_off(ithr); |
| 464 | |
| 465 | const data_t *space_base = get_local_ptr(ithr - id_in_grp, scratchpad); |
| 466 | |
| 467 | const int pr_grps = nstl::min(njobs_in_grp, balancer().nthr_per_group_); |
| 468 | const int pr_nthr_per_grp = balancer().nthr_per_group_ / pr_grps; |
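    /* Second-level split (illustrative): with njobs_in_grp = 2 and
     * nthr_per_group_ = 8 the group forms pr_grps = 2 sub-groups of
     * pr_nthr_per_grp = 4 threads; each sub-group owns one job, and its
     * threads share that job's y*x space via balance211() below. */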
| 469 | |
| 470 | if (id_in_grp >= pr_grps * pr_nthr_per_grp) |
| 471 | return; /* idle */ |
| 472 | |
| 473 | const int pr_my_grp = id_in_grp / pr_nthr_per_grp; |
| 474 | const int pr_my_id = id_in_grp % pr_nthr_per_grp; |
| 475 | |
| 476 | int pr_job_start{0}, pr_job_end{0}; |
| 477 | balance211(njobs_in_grp, pr_grps, pr_my_grp, pr_job_start, pr_job_end); |
| 478 | |
| 479 | for (int j = pr_job_start; j < pr_job_end; ++j) { |
| 480 | const int global_job = global_job_start + j; |
| 481 | const int j_y = global_job / njobs_x; |
| 482 | const int j_x = global_job % njobs_x; |
| 483 | const int start_y = j_y * conf_.job_size_y_; |
| 484 | const int start_x = j_x * conf_.job_size_x_; |
| 485 | const int ny = nstl::min(conf_.dst_y_ - start_y, conf_.job_size_y_); |
| 486 | const int nx = nstl::min(conf_.dst_x_ - start_x, conf_.job_size_x_); |
| 487 | int x_blocking = choose_x_blocking(nx, ny, pr_nthr_per_grp); |
| 488 | |
| 489 | int nxy_start{0}, nxy_end{0}; |
| 490 | balance211(ny * nx / x_blocking, pr_nthr_per_grp, pr_my_id, |
| 491 | nxy_start, nxy_end); |
| 492 | if (nxy_start == nxy_end) continue; |
| 493 | nxy_start *= x_blocking; |
| 494 | nxy_end *= x_blocking; |
| 495 | |
| 496 | int nxy = nxy_start; |
| 497 | if (nxy % nx != 0) { |
| 498 | int nx_step = nstl::min(nx - nxy % nx, nxy_end - nxy); |
| 499 | reduce_block(space_base, dst, j, start_y, start_x, |
| 500 | nxy / nx, nxy % nx, 1, nx_step); |
| 501 | nxy += nx_step; |
| 502 | } |
| 503 | if ((nxy_end - nxy) > nx) { |
| 504 | int ny_step = (nxy_end - nxy) / nx; |
| 505 | reduce_block(space_base, dst, j, start_y, start_x, |
| 506 | nxy / nx, nxy % nx, ny_step, nx); |
| 507 | nxy += nx * ny_step; |
| 508 | } |
| 509 | if ((nxy_end - nxy) > 0) { |
| 510 | reduce_block(space_base, dst, j, start_y, start_x, |
| 511 | nxy / nx, nxy % nx, 1, nxy_end - nxy); |
| 512 | } |
| 513 | } |
| 514 | } |
| 515 | |
| 516 | template struct cpu_reducer_2d_t<data_type::f32>; |
| 517 | template struct cpu_reducer_2d_t<data_type::s32>; |
| 518 | |
| 519 | /* accumulator section */ |
| 520 | |
| 521 | template <impl::data_type_t data_type> |
| 522 | cpu_accumulator_1d_t<data_type>::cpu_accumulator_1d_t(): drv_(nullptr) { |
| 523 | drv_ = create_reduce_2d_drv<data_type>(1, 0, 0, 0, false); |
| 524 | } |
| 525 | |
| 526 | template <impl::data_type_t data_type> |
| 527 | cpu_accumulator_1d_t<data_type>::~cpu_accumulator_1d_t() { |
| 528 | delete drv_; |
| 529 | } |
| 530 | |
| 531 | template <impl::data_type_t data_type> |
| 532 | void cpu_accumulator_1d_t<data_type>::accumulate(data_t *dst, |
| 533 | const data_t *src, size_t size) { |
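    /* dst[i] += src[i] for i in [0, size); the driver was created with
     * n_src = 1 and nullify_dst = false, so dst is accumulated into,
     * never overwritten. */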
| 534 | (*drv_)(dst, src, 1, size); |
| 535 | } |
| 536 | |
| 537 | template struct cpu_accumulator_1d_t<data_type::f32>; |
| 538 | template struct cpu_accumulator_1d_t<data_type::s32>; |
| 539 | |
| 540 | } |
| 541 | } |
| 542 | } |
| 543 | |
| 544 | // vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s |
| 545 | |