| 1 | /******************************************************************************* |
| 2 | * Copyright 2016-2018 Intel Corporation |
| 3 | * |
| 4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | * you may not use this file except in compliance with the License. |
| 6 | * You may obtain a copy of the License at |
| 7 | * |
| 8 | * http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | * |
| 10 | * Unless required by applicable law or agreed to in writing, software |
| 11 | * distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | * See the License for the specific language governing permissions and |
| 14 | * limitations under the License. |
| 15 | *******************************************************************************/ |
| 16 | |
| 17 | #ifndef CPU_SIMPLE_REORDER_HPP |
| 18 | #define CPU_SIMPLE_REORDER_HPP |
| 19 | |
| 20 | #include <assert.h> |
| 21 | |
| 22 | #include "c_types_map.hpp" |
| 23 | #include "type_helpers.hpp" |
| 24 | #include "math_utils.hpp" |
| 25 | #include "mkldnn_thread.hpp" |
| 26 | #include "utils.hpp" |
| 27 | |
| 28 | #include "tag_traits.hpp" |
| 29 | #include "cpu_reorder_pd.hpp" |
| 30 | #include "cpu_primitive.hpp" |
| 31 | |
| 32 | #include "simple_q10n.hpp" |
| 33 | #include "cpu_isa_traits.hpp" |
| 34 | |
| 35 | namespace mkldnn { |
| 36 | namespace impl { |
| 37 | namespace cpu { |
| 38 | |
| 39 | using namespace mkldnn::impl::status; |
| 40 | using namespace mkldnn::impl::format_tag; |
| 41 | using namespace mkldnn::impl::data_type; |
| 42 | |
| 43 | using bd = block_dim_t; |
| 44 | using ib = inner_blk_t; |
| 45 | |
| 46 | using namespace mkldnn::impl::utils; |
| 47 | using math::saturate; |
| 48 | |
| 49 | template<impl::data_type_t type> |
| 50 | using data_t = typename prec_traits<type>::type; |
| 51 | |
| 52 | template<impl::data_type_t type_i, impl::data_type_t type_o> |
| 53 | using _qz_a1b0 = qz_a1b0<data_t<type_i>, data_t<type_o>>; |
| 54 | |
| 55 | template<impl::data_type_t type_i, impl::data_type_t type_o> |
| 56 | using _qz = qz<data_t<type_i>, data_t<type_o>>; |
| 57 | |
/* Values for the `order_keep` template parameter of the reorders below. */
namespace fmt_order {
    /* input is laid out as tag_i and output as tag_o */
    const bool keep = true;
    /* the opposite direction: input is tag_o and output is tag_i */
    const bool reverse = !keep;
    /* for implementations that work on `any` tags; same value as `keep` */
    const bool any = keep;
}
| 63 | |
/* Empty tag types used as the trailing `spec` template argument of
 * simple_reorder_impl to select a family of specializations. */
namespace spec {

/* used by the `any` -> `any` whole-tensor copy specialization */
struct direct_copy {
};

/* used by the `any` -> `any` copy specialization that treats dim 0 apart */
struct direct_copy_except_dim_0 {
};

/* no specialization using this tag is visible in this part of the file */
struct reference {
};

/* used by the weight reorders that also produce s8s8 compensation */
struct conv_s8s8 {
};

}
| 70 | |
/* Template parameter list shared by every simple_reorder_impl declaration:
 * input data type and tag, output data type and tag, direction flag. */
#define SIMPLE_REORDER_TEMPL_DECL \
    impl::data_type_t type_i, \
    impl::format_tag_t tag_i, \
    impl::data_type_t type_o, \
    impl::format_tag_t tag_o, \
    bool order_keep
/* The same parameters as an argument list, in the same order. */
#define SIMPLE_REORDER_TEMPL_CALL \
    type_i, \
    tag_i, \
    type_o, \
    tag_o, \
    order_keep
| 76 | |
/* Declares the locals every execute() starts from. Requires a `pd` pointer in
 * scope and introduces `input_d`, `output_d`, `alpha` and `beta`; the latter
 * two are marked as possibly unused. */
#define DECLARE_COMMON_PARAMS() \
    const memory_desc_wrapper &input_d = pd->src_md(); \
    const memory_desc_wrapper &output_d = pd->dst_md(); \
    const float alpha = pd->alpha(); \
    MAYBE_UNUSED(alpha); \
    const float beta = pd->beta(); \
    MAYBE_UNUSED(beta);
| 82 | |
| 83 | /* specific reorders: common template */ |
| 84 | template <SIMPLE_REORDER_TEMPL_DECL, typename spec = void> |
| 85 | struct simple_reorder_impl {}; |
| 86 | |
| 87 | namespace { |
| 88 | inline bool simple_fmt_check(bool order_keep, impl::format_tag_t tag_i, |
| 89 | impl::format_tag_t tag_o, const memory_desc_wrapper &input_d, |
| 90 | const memory_desc_wrapper &output_d) { |
| 91 | return input_d.matches_tag(order_keep ? tag_i : tag_o) |
| 92 | && output_d.matches_tag(order_keep ? tag_o : tag_i); |
| 93 | } |
| 94 | inline bool simple_attr_check(const primitive_attr_t *attr, bool many_scales_support) { |
| 95 | if (many_scales_support) |
| 96 | return true; |
| 97 | return IMPLICATION(attr, attr->output_scales_.mask_ == 0); |
| 98 | } |
| 99 | } |
| 100 | |
| 101 | /* specific reorders: implementation */ |
| 102 | template <SIMPLE_REORDER_TEMPL_DECL> |
| 103 | struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL, |
| 104 | typename utils::enable_if<tag_i == any && (false |
| 105 | || tag_o == hwio |
| 106 | || tag_o == hwigo) |
| 107 | , spec::conv_s8s8>::type> |
| 108 | { |
| 109 | static bool is_applicable(const memory_desc_wrapper &input_d, |
| 110 | const memory_desc_wrapper &output_d, const primitive_attr_t *attr) |
| 111 | { |
| 112 | const size_t D_mask = utils::array_product(input_d.dims(), |
| 113 | math::ilog2q(attr->output_scales_.mask_ + 1)); |
| 114 | const int oc = (input_d.dims()[tag_o == hwigo + 0]); |
| 115 | const int g = (tag_o == hwigo) ? (input_d.dims()[0]) : 1; |
| 116 | |
| 117 | return output_d.matches_tag(tag_o) |
| 118 | && (output_d.extra().flags & memory_extra_flags::compensation_conv_s8s8) |
| 119 | && (input_d.data_type() == f32 || input_d.data_type() == s8) |
| 120 | && output_d.data_type() == s8 |
| 121 | && (D_mask == 1 || D_mask == (size_t)g * oc); |
| 122 | } |
| 123 | |
| 124 | static status_t execute(const cpu_reorder_pd_t *pd, |
| 125 | const data_t<type_i> *input, data_t<type_o> *output) { |
| 126 | DECLARE_COMMON_PARAMS(); |
| 127 | |
| 128 | static constexpr bool w_groups = tag_o == hwigo; |
| 129 | |
| 130 | const auto &dims = input_d.dims(); |
| 131 | const auto &pdims = output_d.padded_dims(); |
| 132 | |
| 133 | const int G = w_groups ? dims[0] : 1; |
| 134 | const int OC = dims[w_groups + 0]; |
| 135 | const int IC = dims[w_groups + 1]; |
| 136 | const int H = dims[w_groups + 2]; |
| 137 | const int W = dims[w_groups + 3]; |
| 138 | |
| 139 | const float *scales = pd->attr()->output_scales_.scales_; |
| 140 | const size_t D_mask = utils::array_product(input_d.dims(), |
| 141 | math::ilog2q(pd->attr()->output_scales_.mask_ + 1)); |
| 142 | |
| 143 | assert(output_d.extra().flags |
| 144 | & memory_extra_flags::compensation_conv_s8s8); |
| 145 | float adj_scale = |
| 146 | (output_d.extra().flags & memory_extra_flags::scale_adjust) |
| 147 | ? output_d.extra().scale_adjust : 1.f; |
| 148 | |
| 149 | size_t offset = G * pdims[w_groups + 0] * pdims[w_groups + 1] * H * W; |
| 150 | int32_t *cp = reinterpret_cast<int32_t *>(output + offset); |
| 151 | |
| 152 | parallel_nd(G, OC, [&](int g, int oc) { |
| 153 | cp[g * OC + oc] = 0; |
| 154 | for (int ic = 0; ic < IC; ic++) |
| 155 | for (int h = 0; h < H; h++) |
| 156 | for (int w = 0; w < W; w++) { |
| 157 | auto i = input[input_d.blk_off<!w_groups>(g, oc, ic, h, w)]; |
| 158 | auto &o = output[output_d.blk_off<!w_groups>(g, oc, ic, h, w)]; |
| 159 | const float s = scales[(D_mask == 1) ? 0 : g * OC + oc]; |
| 160 | |
| 161 | o = qz_b0<data_t<type_i>, data_t<type_o>>()( |
| 162 | i, s * adj_scale); |
| 163 | cp[g * OC + oc] -= (int32_t)o; |
| 164 | } |
| 165 | cp [g * OC + oc] *= 128; |
| 166 | }); |
| 167 | return success; |
| 168 | } |
| 169 | }; |
| 170 | |
| 171 | template <SIMPLE_REORDER_TEMPL_DECL> |
| 172 | struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL, |
| 173 | typename utils::enable_if< |
| 174 | (tag_i == goiw && tag_o == gOIw4i16o4i) |
| 175 | || (tag_i == oiw && tag_o == OIw4i16o4i) |
| 176 | || (tag_i == goihw && tag_o == gOIhw4i16o4i) |
| 177 | || (tag_i == oihw && tag_o == OIhw4i16o4i) |
| 178 | || (tag_i == goihw && tag_o == gOIhw2i8o4i) |
| 179 | || (tag_i == goihw && tag_o == gOIhw4o4i) |
| 180 | , spec::conv_s8s8>::type> |
| 181 | { |
| 182 | static bool is_applicable(const memory_desc_wrapper &input_d, |
| 183 | const memory_desc_wrapper &output_d, const primitive_attr_t *attr) |
| 184 | { |
| 185 | const size_t D_mask = utils::array_product(input_d.dims(), |
| 186 | math::ilog2q(attr->output_scales_.mask_ + 1)); |
| 187 | const bool w_groups = !utils::one_of(tag_o, OIw4i16o4i, OIhw4i16o4i); |
| 188 | const int oc = (input_d.dims()[w_groups ? 1 : 0]); |
| 189 | const int g = w_groups ? input_d.dims()[0] : 1; |
| 190 | |
| 191 | return input_d.matches_tag(tag_i) |
| 192 | && output_d.matches_tag(tag_o) |
| 193 | && (output_d.extra().flags & memory_extra_flags::compensation_conv_s8s8) |
| 194 | && (input_d.data_type() == f32 || input_d.data_type() == s8) |
| 195 | && output_d.data_type() == s8 |
| 196 | && (D_mask == 1 || D_mask == (size_t)g * oc); |
| 197 | } |
| 198 | |
| 199 | static status_t execute(const cpu_reorder_pd_t *pd, |
| 200 | const data_t<type_i> *input, data_t<type_o> *output) { |
| 201 | DECLARE_COMMON_PARAMS(); |
| 202 | |
| 203 | static constexpr bool w_groups = |
| 204 | !utils::one_of(tag_o, OIw4i16o4i, OIhw4i16o4i); |
| 205 | constexpr int is_1d = |
| 206 | utils::one_of(tag_o, gOIw4i16o4i, OIw4i16o4i); |
| 207 | constexpr int blksize = tag_traits<tag_o>::inner_blks == ib::_4b4c |
| 208 | ? 4 |
| 209 | : tag_traits<tag_o>::inner_blks == ib::_2c8b4c |
| 210 | ? 8 |
| 211 | : 16; |
| 212 | |
| 213 | const auto &_g_oihw_d = order_keep ? input_d : output_d; |
| 214 | const auto &dims = input_d.dims(); |
| 215 | const auto &pdims = order_keep |
| 216 | ? output_d.padded_dims() |
| 217 | : input_d.padded_dims(); |
| 218 | |
| 219 | const int G = w_groups ? dims[0] : 1; |
| 220 | const int OC = dims[w_groups + 0]; |
| 221 | const int NB_OC = pdims[w_groups + 0] / blksize; |
| 222 | const int IC = dims[w_groups + 1]; |
| 223 | const int NB_IC = pdims[w_groups + 1] / blksize; |
| 224 | const int H = is_1d ? 1 : dims[w_groups + 2]; |
| 225 | const int W = dims[w_groups + 3 - is_1d]; |
| 226 | |
| 227 | const float *scales = pd->attr()->output_scales_.scales_; |
| 228 | const size_t D_mask = utils::array_product(input_d.dims(), |
| 229 | math::ilog2q(pd->attr()->output_scales_.mask_ + 1)); |
| 230 | |
| 231 | assert(output_d.extra().flags |
| 232 | & memory_extra_flags::compensation_conv_s8s8); |
| 233 | float adj_scale = |
| 234 | (output_d.extra().flags & memory_extra_flags::scale_adjust) |
| 235 | ? output_d.extra().scale_adjust : 1.f; |
| 236 | |
| 237 | auto ker = [&](const data_t<type_i> *inp, data_t<type_o> *out, |
| 238 | int32_t *c, const float *s, const int oc_block, const int ic_block) { |
| 239 | # define index AB_or_BC_blk_off<tag_traits<tag_o>::inner_blks> |
| 240 | |
| 241 | for (int ic = 0; ic < ic_block; ++ic) { |
| 242 | for (int oc = 0; oc < oc_block; ++oc) { |
| 243 | const auto _g_oihw_off = |
| 244 | oc * _g_oihw_d.blocking_desc().strides[w_groups + 0] |
| 245 | + ic * _g_oihw_d.blocking_desc().strides[w_groups + 1]; |
| 246 | out[index(oc, ic)] |
| 247 | = qz_b0<data_t<type_i>, data_t<type_o>>()( |
| 248 | inp[_g_oihw_off], s[oc] * adj_scale); |
| 249 | c[oc] -= (128 * (int32_t)(out[index(oc, ic)])); |
| 250 | } |
| 251 | } |
| 252 | # undef index |
| 253 | }; |
| 254 | |
| 255 | constexpr int i_mult = blksize; |
| 256 | constexpr int o_mult = 1; |
| 257 | |
| 258 | size_t offset = G * pdims[w_groups+0] * pdims[w_groups+1] * H * W; |
| 259 | int32_t *cp = reinterpret_cast<int32_t *>(output + offset); |
| 260 | parallel_nd(G * NB_OC * blksize, [&](int i) { |
| 261 | cp[i] = 0; |
| 262 | }); |
| 263 | |
| 264 | # define wei_blk_off(md, g, o, i, h, w) \ |
| 265 | (is_1d ? (md).blk_off<!w_groups>(g, o, i, w) \ |
| 266 | : (md).blk_off<!w_groups>(g, o, i, h, w)) |
| 267 | |
| 268 | parallel_nd(G, NB_OC, [&](int g, int O) { |
| 269 | for (int I = 0; I < NB_IC; I++) |
| 270 | for (int h = 0; h < H; h++) |
| 271 | for (int w = 0; w < W; w++) { |
| 272 | auto i = &input[wei_blk_off( |
| 273 | input_d, g, i_mult * O, i_mult * I, h, w)]; |
| 274 | auto o = &output[wei_blk_off( |
| 275 | output_d, g, o_mult * O, o_mult * I, h, w)]; |
| 276 | const int oc_block = nstl::min(blksize, OC - O * blksize); |
| 277 | const int ic_block = nstl::min(blksize, IC - I * blksize); |
| 278 | |
| 279 | int _offset = (g * NB_OC + O) * blksize; |
| 280 | ker(i, o, (order_keep) ? &cp[_offset] : nullptr, |
| 281 | &scales[(D_mask == 1) ? 0 : _offset], |
| 282 | oc_block, ic_block); |
| 283 | } |
| 284 | }); |
| 285 | |
| 286 | # undef wei_blk_off |
| 287 | |
| 288 | return success; |
| 289 | } |
| 290 | }; |
| 291 | |
| 292 | template <SIMPLE_REORDER_TEMPL_DECL> |
| 293 | struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL, |
| 294 | typename utils::enable_if<false |
| 295 | ||(tag_i == goiw && tag_o == Goiw16g) |
| 296 | ||(tag_i == goihw && tag_o == Goihw16g) |
| 297 | , spec::conv_s8s8>::type> |
| 298 | { |
| 299 | static bool is_applicable(const memory_desc_wrapper &input_d, |
| 300 | const memory_desc_wrapper &output_d, const primitive_attr_t *attr) { |
| 301 | const size_t D_mask = utils::array_product(input_d.dims(), |
| 302 | math::ilog2q(attr->output_scales_.mask_ + 1)); |
| 303 | const int oc = input_d.dims()[1]; |
| 304 | const int g = input_d.dims()[0]; |
| 305 | |
| 306 | return true |
| 307 | && order_keep |
| 308 | && input_d.matches_tag(tag_i) |
| 309 | && output_d.matches_tag(tag_o) |
| 310 | && (output_d.extra().flags & memory_extra_flags::compensation_conv_s8s8) |
| 311 | && (input_d.data_type() == f32 || input_d.data_type() == s8) |
| 312 | && output_d.data_type() == s8 |
| 313 | && (D_mask == 1 || D_mask == (size_t)g * oc); |
| 314 | } |
| 315 | |
| 316 | static status_t execute(const cpu_reorder_pd_t *pd, |
| 317 | const data_t<type_i> *input, data_t<type_o> *output) { |
| 318 | DECLARE_COMMON_PARAMS(); |
| 319 | |
| 320 | constexpr bool is_1d = tag_i == goiw; |
| 321 | constexpr int blksize = 16; |
| 322 | |
| 323 | const auto &dims = input_d.dims(); |
| 324 | const auto &pdims = output_d.padded_dims(); |
| 325 | const int G = dims[0]; |
| 326 | const int Gp = pdims[0]; |
| 327 | const int OC = dims[1]; |
| 328 | const int IC = dims[2]; |
| 329 | const int H = is_1d ? 1 : dims[3]; |
| 330 | const int W = dims[4 - is_1d]; |
| 331 | |
| 332 | const size_t D_mask = utils::array_product(input_d.dims(), |
| 333 | math::ilog2q(pd->attr()->output_scales_.mask_ + 1)); |
| 334 | const float *scales = pd->attr()->output_scales_.scales_; |
| 335 | |
| 336 | assert(output_d.extra().flags |
| 337 | & memory_extra_flags::compensation_conv_s8s8); |
| 338 | float adj_scale = |
| 339 | (output_d.extra().flags & memory_extra_flags::scale_adjust) |
| 340 | ? output_d.extra().scale_adjust : 1.f; |
| 341 | |
| 342 | auto ker = [&](const data_t<type_i> *inp, data_t<type_o> *out, |
| 343 | int32_t *cp, const float *s, const int g_block) { |
| 344 | PRAGMA_OMP_SIMD() |
| 345 | for (int g = 0; g < g_block; g++) { |
| 346 | const auto i_off = g * input_d.blocking_desc().strides[0]; |
| 347 | out[g] = qz_b0<data_t<type_i>, data_t<type_o>>()( |
| 348 | inp[i_off], s[g * OC] * adj_scale); |
| 349 | cp[g * OC] -= 128 * (int32_t)(out[g]); |
| 350 | } |
| 351 | }; |
| 352 | |
| 353 | size_t cp_offset = output_d.size() - output_d.additional_buffer_size(); |
| 354 | int32_t *cp = reinterpret_cast<int32_t *>(output + cp_offset); |
| 355 | parallel_nd((Gp/blksize) * OC, [&](int ib) { |
| 356 | PRAGMA_OMP_SIMD() |
| 357 | for (int i = 0; i < blksize; i++) |
| 358 | cp[ib * blksize + i] = 0; |
| 359 | }); |
| 360 | |
| 361 | # define wei_blk_off(md, g, o, i, h, w) \ |
| 362 | (is_1d ? (md).blk_off(g, o, i, w) : (md).blk_off(g, o, i, h, w)) |
| 363 | |
| 364 | parallel_nd(Gp/blksize, OC, [&](int gb, int O) { |
| 365 | for (int I = 0; I < IC; I++) { |
| 366 | for (int h = 0; h < H; h++) |
| 367 | for (int w = 0; w < W; w++) |
| 368 | { |
| 369 | const int g_block = nstl::min(G - gb * blksize, blksize); |
| 370 | const auto inp = &input[wei_blk_off( |
| 371 | input_d, gb * blksize, O, I, h, w)]; |
| 372 | const auto out = &output[wei_blk_off( |
| 373 | output_d, gb, O, I, h, w)]; |
| 374 | int offset = gb * blksize + O; |
| 375 | ker(inp, out, &cp[offset], |
| 376 | &scales[(D_mask == 1) ? 0 : offset], g_block); |
| 377 | } |
| 378 | } |
| 379 | }); |
| 380 | |
| 381 | # undef wei_blk_off |
| 382 | |
| 383 | return success; |
| 384 | } |
| 385 | }; |
| 386 | |
| 387 | /* reorders with tail support */ |
| 388 | |
| 389 | template <SIMPLE_REORDER_TEMPL_DECL> |
| 390 | struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL, |
| 391 | typename utils::enable_if<false |
| 392 | || (tag_i == nCdhw8c && tag_o == nCdhw16c) |
| 393 | || (tag_i == nChw8c && tag_o == nChw16c) |
| 394 | || (tag_i == nCw8c && tag_o == nCw16c) |
| 395 | >::type> |
| 396 | { |
| 397 | static bool is_applicable(const memory_desc_wrapper &input_d, |
| 398 | const memory_desc_wrapper &output_d, const primitive_attr_t *attr) |
| 399 | { |
| 400 | return simple_fmt_check(order_keep, tag_i, tag_o, input_d, output_d) |
| 401 | && simple_attr_check(attr, false); |
| 402 | } |
| 403 | |
| 404 | static status_t execute(const cpu_reorder_pd_t *pd, |
| 405 | const data_t<type_i> *input, data_t<type_o> *output) { |
| 406 | DECLARE_COMMON_PARAMS(); |
| 407 | |
| 408 | constexpr int is_1d = tag_i == nCw8c; |
| 409 | constexpr int is_3d = tag_i == nCdhw8c; |
| 410 | constexpr int blksize_16 = 16; |
| 411 | constexpr int blksize_8 = 8; |
| 412 | constexpr int ic_mult = order_keep ? 2 : 1; |
| 413 | constexpr int oc_mult = order_keep ? 1 : 2; |
| 414 | |
| 415 | const auto &dims = input_d.dims(); |
| 416 | const auto &pdims = order_keep ? output_d.padded_dims() |
| 417 | : input_d.padded_dims(); |
| 418 | |
| 419 | const int C = dims[1]; |
| 420 | const int D = is_3d ? dims[2] : 1; |
| 421 | const int H = is_1d ? 1 : dims[2 + is_3d]; |
| 422 | const int W = dims[3 + is_3d - is_1d]; |
| 423 | |
| 424 | auto ker = [&](const data_t<type_i> *i, data_t<type_o> *o, |
| 425 | const int block_16) { |
| 426 | const int nb = (block_16 - 1) / blksize_8 + 1; |
| 427 | if (alpha == 1.0 && beta == 0.0) { |
| 428 | for (int b = 0; b < nb; ++b) { |
| 429 | const ptrdiff_t i_off = order_keep ? b : b * blksize_8; |
| 430 | const ptrdiff_t o_off = order_keep ? b * blksize_8 : b; |
| 431 | const int block_8 = nstl::min(blksize_8, |
| 432 | block_16 - b * blksize_8); |
| 433 | for (int c = 0; c < block_8; ++c) { |
| 434 | o[o_off + c] = _qz_a1b0<type_i, type_o>()( |
| 435 | i[i_off + c]); |
| 436 | } |
| 437 | } |
| 438 | } else { |
| 439 | for (int b = 0; b < nb; ++b) { |
| 440 | const ptrdiff_t i_off = order_keep ? b : b * blksize_8; |
| 441 | const ptrdiff_t o_off = order_keep ? b * blksize_8 : b; |
| 442 | const int block_8 = nstl::min(blksize_8, |
| 443 | block_16 - b * blksize_8); |
| 444 | for (int c = 0; c < block_8; ++c) { |
| 445 | o[o_off + c] = _qz<type_i, type_o>()(i[i_off + c], |
| 446 | o[o_off + c], alpha, beta); |
| 447 | } |
| 448 | } |
| 449 | } |
| 450 | }; |
| 451 | |
| 452 | # define data_blk_off(md, n, c, d, h, w) \ |
| 453 | ( is_1d ? (md).blk_off(n, c, w) \ |
| 454 | : is_3d ? (md).blk_off(n, c, d, h, w) : (md).blk_off(n, c, h, w)) |
| 455 | |
| 456 | parallel_nd(dims[0], pdims[1] / blksize_16, D, H, W, |
| 457 | [&](int n, int nb_c, int d, int h, int w) { |
| 458 | auto i = &input[data_blk_off(input_d, n, ic_mult * nb_c, d, h, w)]; |
| 459 | auto o = &output[data_blk_off(output_d, n, oc_mult * nb_c, d, h, w)]; |
| 460 | const int block_16 = nstl::min(blksize_16, C - nb_c * blksize_16); |
| 461 | ker(i, o, block_16); |
| 462 | }); |
| 463 | |
| 464 | # undef data_blk_off |
| 465 | |
| 466 | return success; |
| 467 | } |
| 468 | }; |
| 469 | |
/* Defines is_applicable() for the plain <-> blocked reorders: the attributes
 * must pass simple_attr_check without many-scales support, the blocked side
 * (output if order_keep, input otherwise) must match tag_o and the other side
 * must be plain. */
#define PLAIN_TO_BLOCKED_IS_APPLICABLE() \
    static bool is_applicable(const memory_desc_wrapper &input_d, \
        const memory_desc_wrapper &output_d, const primitive_attr_t *attr) { \
        if (!simple_attr_check(attr, false)) return false; \
        const memory_desc_wrapper &blocked_d = order_keep ? output_d : input_d; \
        const memory_desc_wrapper &plain_d = order_keep ? input_d : output_d; \
        return blocked_d.matches_tag(tag_o) && plain_d.is_plain(); \
    }
| 477 | |
| 478 | template <SIMPLE_REORDER_TEMPL_DECL> |
| 479 | struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL, |
| 480 | typename utils::enable_if<tag_i == any |
| 481 | && (tag_traits<tag_o>::block_dims == bd::_A |
| 482 | || tag_traits<tag_o>::block_dims == bd::_B) |
| 483 | && tag_traits<tag_o>::ndims >= 3 |
| 484 | && tag_traits<tag_o>::ndims <= 6 |
| 485 | >::type> |
| 486 | { |
| 487 | PLAIN_TO_BLOCKED_IS_APPLICABLE(); |
| 488 | |
| 489 | static status_t execute(const cpu_reorder_pd_t *pd, |
| 490 | const data_t<type_i> *input, data_t<type_o> *output) { |
| 491 | DECLARE_COMMON_PARAMS(); |
| 492 | |
| 493 | const auto &flat_d = order_keep ? input_d : output_d; |
| 494 | const auto &block_d = order_keep ? output_d : input_d; |
| 495 | const auto &dims = input_d.dims(); |
| 496 | const auto &pdims = block_d.padded_dims(); |
| 497 | |
| 498 | constexpr int ndims = tag_traits<tag_o>::ndims; |
| 499 | constexpr int blk_idx = tag_traits<tag_o>::block_dims == bd::_A ? 0 : 1; |
| 500 | |
| 501 | const dim_t H0 = dims[0]; |
| 502 | const dim_t H1 = dims[1]; |
| 503 | const dim_t M0 = ndims >= 6 ? dims[ndims - 4] : 1; |
| 504 | const dim_t M1 = ndims >= 5 ? dims[ndims - 3] : 1; |
| 505 | const dim_t M2 = ndims >= 4 ? dims[ndims - 2] : 1; |
| 506 | const dim_t L = dims[ndims - 1]; |
| 507 | const dim_t l_blk_stride = block_d.blocking_desc().strides[ndims - 1]; |
| 508 | |
| 509 | constexpr int blksize = false ? 0 |
| 510 | : utils::one_of(tag_traits<tag_o>::inner_blks, ib::_4a, ib::_4b) ? 4 |
| 511 | : utils::one_of(tag_traits<tag_o>::inner_blks, ib::_8a, ib::_8b) ? 8 |
| 512 | : 16; |
| 513 | |
| 514 | auto ker = [&](const data_t<type_i> *i, data_t<type_o> *o, int block) { |
| 515 | if (alpha == 1.0 && beta == 0.0) { |
| 516 | for (int l = 0; l < L; ++l) |
| 517 | for (int blk = 0; blk < block; ++blk) { |
| 518 | const dim_t flat_off = 0 |
| 519 | + blk * flat_d.blocking_desc().strides[blk_idx] |
| 520 | + l * flat_d.blocking_desc().strides[ndims - 1]; |
| 521 | if (order_keep) { |
| 522 | o[l * l_blk_stride + blk] = _qz_a1b0<type_i, type_o>()( |
| 523 | i[flat_off]); |
| 524 | } else { |
| 525 | o[flat_off] = _qz_a1b0<type_i, type_o>()( |
| 526 | i[l * l_blk_stride + blk]); |
| 527 | } |
| 528 | } |
| 529 | } else { |
| 530 | for (int l = 0; l < L; ++l) |
| 531 | for (int blk = 0; blk < block; ++blk) { |
| 532 | const dim_t flat_off = 0 |
| 533 | + blk * flat_d.blocking_desc().strides[blk_idx] |
| 534 | + l * flat_d.blocking_desc().strides[ndims - 1]; |
| 535 | if (order_keep) { |
| 536 | o[l * l_blk_stride + blk] = _qz<type_i, type_o>()( |
| 537 | i[flat_off], o[l * blksize + blk], |
| 538 | alpha, beta); |
| 539 | } else { |
| 540 | o[flat_off] = _qz<type_i, type_o>()( |
| 541 | i[l * l_blk_stride + blk], o[flat_off], |
| 542 | alpha, beta); |
| 543 | } |
| 544 | } |
| 545 | } |
| 546 | }; |
| 547 | |
| 548 | # define off(md, h0, h1, m0, m1, m2) \ |
| 549 | (ndims >= 6 ? (md).blk_off(h0, h1, m0, m1, m2) \ |
| 550 | : ndims >= 5 ? (md).blk_off(h0, h1, m1, m2) \ |
| 551 | : ndims >= 4 ? (md).blk_off(h0, h1, m2) \ |
| 552 | : /* ndims >= 3 ? */ (md).blk_off(h0, h1)) |
| 553 | |
| 554 | constexpr int i_mult = order_keep ? blksize : 1; |
| 555 | constexpr int o_mult = order_keep ? 1 : blksize; |
| 556 | |
| 557 | if (blk_idx == 0) { |
| 558 | const dim_t BH0 = pdims[0] / blksize; |
| 559 | parallel_nd(BH0, H1, M0, M1, M2, |
| 560 | [&](dim_t bh0, dim_t h1, dim_t m0, dim_t m1, dim_t m2) { |
| 561 | auto i = &input[off(input_d, bh0 * i_mult, h1, m0, m1, m2)]; |
| 562 | auto o = &output[off(output_d, bh0 * o_mult, h1, m0, m1, m2)]; |
| 563 | const int block = nstl::min<int>(blksize, H0 - bh0 * blksize); |
| 564 | ker(i, o, block); |
| 565 | }); |
| 566 | } else if (blk_idx == 1) { |
| 567 | const dim_t BH1 = pdims[1] / blksize; |
| 568 | parallel_nd(H0, BH1, M0, M1, M2, |
| 569 | [&](dim_t h0, dim_t bh1, dim_t m0, dim_t m1, dim_t m2) { |
| 570 | auto i = &input[off(input_d, h0, bh1 * i_mult, m0, m1, m2)]; |
| 571 | auto o = &output[off(output_d, h0, bh1 * o_mult, m0, m1, m2)]; |
| 572 | const int block = nstl::min<int>(blksize, H1 - bh1 * blksize); |
| 573 | ker(i, o, block); |
| 574 | }); |
| 575 | } else { |
| 576 | assert(!"unimplemented" ); |
| 577 | } |
| 578 | |
| 579 | # undef off |
| 580 | |
| 581 | return success; |
| 582 | } |
| 583 | }; |
| 584 | |
| 585 | template <SIMPLE_REORDER_TEMPL_DECL> |
| 586 | struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL, |
| 587 | typename utils::enable_if<tag_i == any |
| 588 | && (tag_traits<tag_o>::block_dims == bd::_AB |
| 589 | || tag_traits<tag_o>::block_dims == bd::_BC) |
| 590 | && IMPLICATION(tag_traits<tag_o>::block_dims == bd::_AB, |
| 591 | tag_traits<tag_o>::ndims >= 3 && tag_traits<tag_o>::ndims <= 5) |
| 592 | && IMPLICATION(tag_traits<tag_o>::block_dims == bd::_BC, |
| 593 | tag_traits<tag_o>::ndims >= 4 && tag_traits<tag_o>::ndims <= 6) |
| 594 | >::type> |
| 595 | { |
| 596 | PLAIN_TO_BLOCKED_IS_APPLICABLE(); |
| 597 | |
| 598 | static status_t execute(const cpu_reorder_pd_t *pd, |
| 599 | const data_t<type_i> *input, data_t<type_o> *output) { |
| 600 | DECLARE_COMMON_PARAMS(); |
| 601 | |
| 602 | const auto &flat_d = order_keep ? input_d : output_d; |
| 603 | const auto &dims = input_d.dims(); |
| 604 | const auto &pdims = order_keep |
| 605 | ? output_d.padded_dims() |
| 606 | : input_d.padded_dims(); |
| 607 | |
| 608 | constexpr int ndims = tag_traits<tag_o>::ndims; |
| 609 | |
| 610 | static constexpr bool with_g = tag_traits<tag_o>::block_dims == bd::_BC; |
| 611 | const dim_t G = with_g ? dims[0] : 1; |
| 612 | |
| 613 | const dim_t H0 = dims[0 + with_g]; |
| 614 | const dim_t H1 = dims[1 + with_g]; |
| 615 | |
| 616 | const dim_t M0 = ndims >= 5 + with_g ? dims[ndims - 3] : 1; |
| 617 | const dim_t M1 = ndims >= 4 + with_g ? dims[ndims - 2] : 1; |
| 618 | const dim_t M2 = ndims >= 3 + with_g ? dims[ndims - 1] : 1; |
| 619 | |
| 620 | constexpr int blksize_0 = false ? 0 |
| 621 | : utils::one_of(tag_traits<tag_o>::inner_blks, |
| 622 | ib::_4b4a, ib::_4b4c, ib::_4c4b) |
| 623 | ? 4 |
| 624 | : utils::one_of(tag_traits<tag_o>::inner_blks, |
| 625 | ib::_8a8b, ib::_8b8a, ib::_8b8c, ib::_8c8b, ib::_2c8b4c) |
| 626 | ? 8 |
| 627 | : utils::one_of(tag_traits<tag_o>::inner_blks, |
| 628 | ib::_16a16b, ib::_16a4b, ib::_16b16a, ib::_16b4c, |
| 629 | ib::_16b16c, ib::_16c16b, ib::_8a16b2a, ib::_4b16a4b, |
| 630 | ib::_8b16a2b, ib::_8b16c2b, ib::_4c16b4c, ib::_8c16b2c) |
| 631 | ? 16 : INT_MIN; |
| 632 | |
| 633 | constexpr int blksize_1 = utils::one_of(tag_traits<tag_o>::inner_blks, |
| 634 | ib::_8a8b, ib::_8b8a, ib::_8b8c, ib::_8c8b, ib::_2c8b4c) |
| 635 | ? 8 |
| 636 | : utils::one_of(tag_traits<tag_o>::inner_blks, |
| 637 | ib::_16a16b, ib::_16b16a, ib::_16b16c, ib::_16c16b, |
| 638 | ib::_8a16b2a, ib::_4b16a4b, ib::_8b16a2b, ib::_8b16c2b, |
| 639 | ib::_4c16b4c, ib::_8c16b2c) |
| 640 | ? 16 |
| 641 | : utils::one_of(tag_traits<tag_o>::inner_blks, |
| 642 | ib::_4b4a, ib::_4b4c, ib::_4c4b, |
| 643 | ib::_16a4b, ib::_16b4c) |
| 644 | ? 4 |
| 645 | : INT_MIN; |
| 646 | |
| 647 | const dim_t NB_H0 = pdims[0 + with_g] / blksize_0; |
| 648 | const dim_t NB_H1 = pdims[1 + with_g] / blksize_1; |
| 649 | |
| 650 | auto ker = [&](const data_t<type_i> *i, data_t<type_o> *o, |
| 651 | const int block_h0, const int block_h1) { |
| 652 | # define blk_off AB_or_BC_blk_off<tag_traits<tag_o>::inner_blks> |
| 653 | |
| 654 | if (alpha == 1.0 && beta == 0.0) { |
| 655 | for (int h0 = 0; h0 < block_h0; ++h0) |
| 656 | for (int h1 = 0; h1 < block_h1; ++h1) { |
| 657 | const dim_t flat_off = 0 |
| 658 | + h0 * flat_d.blocking_desc().strides[with_g + 0] |
| 659 | + h1 * flat_d.blocking_desc().strides[with_g + 1]; |
| 660 | if (order_keep) { |
| 661 | o[blk_off(h0, h1)] = _qz_a1b0<type_i, type_o>()( |
| 662 | i[flat_off]); |
| 663 | } else { |
| 664 | o[flat_off] = _qz_a1b0<type_i, type_o>()( |
| 665 | i[blk_off(h0, h1)]); |
| 666 | } |
| 667 | } |
| 668 | } else { |
| 669 | for (int h0 = 0; h0 < block_h0; ++h0) |
| 670 | for (int h1 = 0; h1 < block_h1; ++h1) { |
| 671 | const dim_t flat_off = 0 |
| 672 | + h0 * flat_d.blocking_desc().strides[with_g + 0] |
| 673 | + h1 * flat_d.blocking_desc().strides[with_g + 1]; |
| 674 | if (order_keep) { |
| 675 | o[blk_off(h0, h1)] = _qz<type_i, type_o>()(i[flat_off], |
| 676 | o[blk_off(h0, h1)], alpha, beta); |
| 677 | } else { |
| 678 | o[flat_off] = _qz<type_i, type_o>()(i[blk_off(h0, h1)], |
| 679 | o[flat_off], alpha, beta); |
| 680 | } |
| 681 | } |
| 682 | } |
| 683 | |
| 684 | # undef blk_off |
| 685 | }; |
| 686 | |
| 687 | constexpr int i_mult_0 = order_keep ? blksize_0 : 1; |
| 688 | constexpr int o_mult_0 = order_keep ? 1 : blksize_0; |
| 689 | |
| 690 | constexpr int i_mult_1 = order_keep ? blksize_1 : 1; |
| 691 | constexpr int o_mult_1 = order_keep ? 1 : blksize_1; |
| 692 | |
| 693 | # define off(md, g, h0, h1, m0, m1, m2) \ |
| 694 | (ndims >= 5 + with_g ? (md).blk_off<!with_g>(g, h0, h1, m0, m1, m2) \ |
| 695 | : ndims >= 4 + with_g ? (md).blk_off<!with_g>(g, h0, h1, m1, m2) \ |
| 696 | : /* ndims >= 3 + with_g ? */ (md).blk_off<!with_g>(g, h0, h1, m2)) |
| 697 | |
| 698 | parallel_nd(G, NB_H0, NB_H1, M0, M1, M2, |
| 699 | [&](dim_t g, dim_t nb_h0, dim_t nb_h1, dim_t m0, dim_t m1, dim_t m2) { |
| 700 | auto i = &input[off(input_d, |
| 701 | g, i_mult_0 * nb_h0, i_mult_1 * nb_h1, m0, m1, m2)]; |
| 702 | auto o = &output[off(output_d, |
| 703 | g, o_mult_0 * nb_h0, o_mult_1 * nb_h1, m0, m1, m2)]; |
| 704 | const int block_h0 = nstl::min<int>(blksize_0, H0 - nb_h0 * blksize_0); |
| 705 | const int block_h1 = nstl::min<int>(blksize_1, H1 - nb_h1 * blksize_1); |
| 706 | ker(i, o, block_h0, block_h1); |
| 707 | }); |
| 708 | |
| 709 | # undef off |
| 710 | |
| 711 | return success; |
| 712 | } |
| 713 | }; |
| 714 | |
| 715 | /* generic and direct-copy reorders */ |
| 716 | |
| 717 | template <SIMPLE_REORDER_TEMPL_DECL> |
| 718 | struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL, |
| 719 | typename utils::enable_if< |
| 720 | tag_i == any && tag_o == any && order_keep == fmt_order::any, |
| 721 | spec::direct_copy>::type> |
| 722 | { |
| 723 | static bool is_applicable(const memory_desc_wrapper &input_d, |
| 724 | const memory_desc_wrapper &output_d, const primitive_attr_t *attr) { |
| 725 | /* FIXME: is the formula correct? */ |
| 726 | return input_d.similar_to(output_d, true, false, 0) |
| 727 | && input_d.is_dense() && output_d.is_dense() |
| 728 | && simple_attr_check(attr, false); |
| 729 | } |
| 730 | |
| 731 | static status_t execute(const cpu_reorder_pd_t *pd, |
| 732 | const data_t<type_i> *input, data_t<type_o> *output) { |
| 733 | DECLARE_COMMON_PARAMS(); |
| 734 | |
| 735 | assert(input_d.is_dense()); |
| 736 | |
| 737 | input += input_d.blk_off(0); |
| 738 | output += output_d.blk_off(0); |
| 739 | |
| 740 | const size_t nelems = input_d.nelems(); |
| 741 | |
| 742 | constexpr int block_size = 16; |
| 743 | const auto num_blocks = nelems / block_size; |
| 744 | const auto rem_elems = nelems % block_size; |
| 745 | |
| 746 | parallel(0, [&](const int ithr, const int nthr) { |
| 747 | size_t start{0}, end{0}; |
| 748 | balance211(num_blocks, nthr, ithr, start, end); |
| 749 | start = start * block_size; |
| 750 | end = end * block_size; |
| 751 | |
| 752 | if (alpha == 1.0 && beta == 0.0) { |
| 753 | PRAGMA_OMP_SIMD() |
| 754 | for (size_t e = start; e < end; ++e) { |
| 755 | output[e] = qz_a1b0<data_t<type_i>, data_t<type_o>>() |
| 756 | (input[e]); |
| 757 | } |
| 758 | } else if (alpha == 1.0) { |
| 759 | PRAGMA_OMP_SIMD() |
| 760 | for (size_t e = start; e < end; ++e) { |
| 761 | output[e] = qz_a1<data_t<type_i>, data_t<type_o>>() |
| 762 | (input[e], output[e], beta); |
| 763 | } |
| 764 | } else if (beta == 0.0) { |
| 765 | PRAGMA_OMP_SIMD() |
| 766 | for (size_t e = start; e < end; ++e) { |
| 767 | output[e] = qz_b0<data_t<type_i>, data_t<type_o>>() |
| 768 | (input[e], alpha); |
| 769 | } |
| 770 | } else { |
| 771 | PRAGMA_OMP_SIMD() |
| 772 | for (size_t e = start; e < end; ++e) { |
| 773 | output[e] = qz<data_t<type_i>, data_t<type_o>>() |
| 774 | (input[e], output[e], alpha, beta); |
| 775 | } |
| 776 | } |
| 777 | |
| 778 | if (rem_elems != 0 && ithr == nthr - 1){ |
| 779 | if (alpha == 1.0 && beta == 0.0) { |
| 780 | PRAGMA_OMP_SIMD() |
| 781 | for (size_t e = nelems - rem_elems; e < nelems; ++e) { |
| 782 | output[e] = qz_a1b0<data_t<type_i>, |
| 783 | data_t<type_o>>()(input[e]); |
| 784 | } |
| 785 | } else if (alpha == 1.0) { |
| 786 | PRAGMA_OMP_SIMD() |
| 787 | for (size_t e = nelems - rem_elems; e < nelems; ++e) { |
| 788 | output[e] = qz_a1<data_t<type_i>, |
| 789 | data_t<type_o>>()(input[e], output[e], beta); |
| 790 | } |
| 791 | } else if (beta == 0.0) { |
| 792 | PRAGMA_OMP_SIMD() |
| 793 | for (size_t e = nelems - rem_elems; e < nelems; ++e) { |
| 794 | output[e] = qz_b0<data_t<type_i>, |
| 795 | data_t<type_o>>()(input[e], alpha); |
| 796 | } |
| 797 | } else { |
| 798 | PRAGMA_OMP_SIMD() |
| 799 | for (size_t e = nelems - rem_elems; e < nelems; ++e) { |
| 800 | output[e] = qz<data_t<type_i>, data_t<type_o>>() |
| 801 | (input[e], output[e], alpha, beta); |
| 802 | } |
| 803 | } |
| 804 | } |
| 805 | }); |
| 806 | return success; |
| 807 | } |
| 808 | }; |
| 809 | |
| 810 | template <SIMPLE_REORDER_TEMPL_DECL> |
| 811 | struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL, |
| 812 | typename utils::enable_if< |
| 813 | tag_i == any && tag_o == any && order_keep == fmt_order::any, |
| 814 | spec::direct_copy_except_dim_0>::type> |
| 815 | { |
| 816 | static bool is_applicable(const memory_desc_wrapper &input_d, |
| 817 | const memory_desc_wrapper &output_d, const primitive_attr_t *attr) { |
| 818 | auto is_dense_no_0 = [](const memory_desc_wrapper &data_d) { |
| 819 | return nelems_no_dim_0(data_d) == _size_no_dim_0(data_d); |
| 820 | }; |
| 821 | /* FIXME: is the formula correct? */ |
| 822 | return input_d.similar_to(output_d, true, false, 1) |
| 823 | && is_dense_no_0(input_d) && is_dense_no_0(output_d) |
| 824 | && simple_attr_check(attr, false); |
| 825 | } |
| 826 | |
| 827 | static status_t execute(const cpu_reorder_pd_t *pd, |
| 828 | const data_t<type_i> *input, data_t<type_o> *output) { |
| 829 | DECLARE_COMMON_PARAMS(); |
| 830 | |
| 831 | input += input_d.blk_off(0); |
| 832 | output += output_d.blk_off(0); |
| 833 | |
| 834 | const int N = input_d.dims()[0]; |
| 835 | const dim_t is = input_d.blocking_desc().strides[0]; |
| 836 | const dim_t os = output_d.blocking_desc().strides[0]; |
| 837 | const dim_t nelems_no_d0 = nelems_no_dim_0(input_d); |
| 838 | const dim_t work_amount = N * nelems_no_d0; |
| 839 | |
| 840 | if (alpha == 1.0 && beta == 0.0) { |
| 841 | parallel(0, [&](const int ithr, const int nthr) { |
| 842 | dim_t n{0}, dim1_s{0}; |
| 843 | dim_t start{0}, end{0}; |
| 844 | balance211(work_amount, nthr, ithr, start, end); |
| 845 | nd_iterator_init(start, n, N, dim1_s, nelems_no_d0); |
| 846 | while(start < end) { |
| 847 | dim_t work_rem = end - start; |
| 848 | dim_t dim1_e = dim1_s + work_rem > nelems_no_d0 |
| 849 | ? nelems_no_d0 : dim1_s + work_rem; |
| 850 | PRAGMA_OMP_SIMD() |
| 851 | for (dim_t e = dim1_s; e < dim1_e; ++e) { |
| 852 | output[os * n + e] = _qz_a1b0<type_i, type_o>()( |
| 853 | input[is * n + e]); |
| 854 | } |
| 855 | nd_iterator_jump(start, end, n, N, dim1_s, nelems_no_d0); |
| 856 | } |
| 857 | }); |
| 858 | } else { |
| 859 | parallel(0, [&](const int ithr, const int nthr) { |
| 860 | dim_t n{0}, dim1_s{0}; |
| 861 | dim_t start{0}, end{0}; |
| 862 | balance211(work_amount, nthr, ithr, start, end); |
| 863 | nd_iterator_init(start, n, N, dim1_s, nelems_no_d0); |
| 864 | while(start < end) { |
| 865 | dim_t work_rem = end - start; |
| 866 | dim_t dim1_e = |
| 867 | dim1_s + work_rem > nelems_no_d0 ? nelems_no_d0 |
| 868 | : dim1_s + work_rem; |
| 869 | PRAGMA_OMP_SIMD() |
| 870 | for (dim_t e = dim1_s; e < dim1_e; ++e){ |
| 871 | output[os * n + e] = _qz<type_i, type_o>()( |
| 872 | input[is * n + e], output[os * n + e], alpha, |
| 873 | beta); |
| 874 | } |
| 875 | nd_iterator_jump(start, end, n, N, dim1_s, nelems_no_d0); |
| 876 | } |
| 877 | }); |
| 878 | } |
| 879 | |
| 880 | return success; |
| 881 | } |
| 882 | |
| 883 | private: |
| 884 | static dim_t nelems_no_dim_0(const memory_desc_wrapper &data_d) { |
| 885 | const int ndims = data_d.ndims(); |
| 886 | if (ndims <= 1) return 1; |
| 887 | return utils::array_product(data_d.dims() + 1, data_d.ndims() - 1); |
| 888 | } |
| 889 | |
| 890 | static dim_t _size_no_dim_0(const memory_desc_wrapper &data_d) { |
| 891 | dims_t blocks; |
| 892 | data_d.compute_blocks(blocks); |
| 893 | |
| 894 | const auto &blk = data_d.blocking_desc(); |
| 895 | |
| 896 | dim_t blk_size = 1; |
| 897 | for (int iblk = 0; iblk < blk.inner_nblks; ++iblk) |
| 898 | blk_size *= blk.inner_blks[iblk]; |
| 899 | |
| 900 | dim_t max_size = blk_size; |
| 901 | for (int d = 1; d < data_d.ndims(); ++d) { |
| 902 | max_size = nstl::max(max_size, |
| 903 | data_d.padded_dims()[d] / blocks[d] * blk.strides[d]); |
| 904 | } |
| 905 | |
| 906 | return max_size; |
| 907 | } |
| 908 | }; |
| 909 | |
| 910 | template <SIMPLE_REORDER_TEMPL_DECL> |
| 911 | struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL, |
| 912 | typename utils::enable_if< |
| 913 | tag_i == any && tag_o == any && order_keep == fmt_order::any, |
| 914 | spec::reference>::type> |
| 915 | { |
| 916 | static bool is_applicable(const memory_desc_wrapper &input_d, |
| 917 | const memory_desc_wrapper &output_d, const primitive_attr_t *attr) { |
| 918 | /* supported smask: 0x0...011..10...0, |
| 919 | * i.e. 1 should be contiguous */ |
| 920 | int smask = attr ? attr->output_scales_.mask_ : 0; |
| 921 | for (; smask > 0 && !(smask & 0x1); smask >>= 1); |
| 922 | for (; smask > 0 && smask & 0x1; smask >>= 1); |
| 923 | return true |
| 924 | && input_d.is_blocking_desc() |
| 925 | && output_d.is_blocking_desc() |
| 926 | && !output_d.is_additional_buffer() |
| 927 | && !input_d.is_additional_buffer() |
| 928 | && smask == 0; |
| 929 | } |
| 930 | |
| 931 | static status_t execute(const cpu_reorder_pd_t *pd, |
| 932 | const data_t<type_i> *input, data_t<type_o> *output) { |
| 933 | DECLARE_COMMON_PARAMS(); |
| 934 | |
| 935 | const size_t nelems = input_d.nelems(); |
| 936 | |
| 937 | int ndims_start = 0, ndims_mask = 0; |
| 938 | int smask = pd->attr()->output_scales_.mask_; |
| 939 | for (; smask > 0 && !(smask & 0x1); smask >>= 1) ++ndims_start; |
| 940 | for (; smask > 0 && smask & 0x1; smask >>= 1) ++ndims_mask; |
| 941 | assert(smask == 0); |
| 942 | |
| 943 | const ptrdiff_t D_start |
| 944 | = utils::array_product(input_d.dims(), ndims_start); |
| 945 | const ptrdiff_t D_mask |
| 946 | = utils::array_product(input_d.dims() + ndims_start, ndims_mask); |
| 947 | const ptrdiff_t D_rest = nelems / D_start / D_mask; |
| 948 | |
| 949 | const float *scales = pd->attr()->output_scales_.scales_; |
| 950 | |
| 951 | parallel_nd(D_start, D_mask, D_rest, |
| 952 | [&](ptrdiff_t ds, ptrdiff_t dm, ptrdiff_t dr) { |
| 953 | const float scale = scales[dm]; |
| 954 | |
| 955 | const size_t e = (ds * D_mask + dm) * D_rest + dr; |
| 956 | const auto &i = input[input_d.off_l(e)]; |
| 957 | auto &o = output[output_d.off_l(e)]; |
| 958 | |
| 959 | o = _qz<type_i, type_o>()(i, o, scale, beta); |
| 960 | }); |
| 961 | |
| 962 | return success; |
| 963 | } |
| 964 | }; |
| 965 | |
| 966 | |
| 967 | /* high level class declaration */ |
| 968 | |
| 969 | template <SIMPLE_REORDER_TEMPL_DECL, typename spec = void> |
| 970 | struct simple_reorder_t: public cpu_primitive_t { |
| 971 | struct pd_t: public cpu_reorder_pd_t { |
| 972 | using cpu_reorder_pd_t::cpu_reorder_pd_t; |
| 973 | |
| 974 | DECLARE_COMMON_PD_T("simple:any" , simple_reorder_t); |
| 975 | |
| 976 | static status_t create(reorder_pd_t **reorder_pd, |
| 977 | engine_t *engine, const primitive_attr_t *attr, |
| 978 | engine_t *src_engine, const memory_desc_t *src_md, |
| 979 | engine_t *dst_engine, const memory_desc_t *dst_md) { |
| 980 | bool args_ok = true |
| 981 | && src_md->data_type == type_i |
| 982 | && dst_md->data_type == type_o |
| 983 | && simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL, spec>:: |
| 984 | is_applicable(src_md, dst_md, attr); |
| 985 | if (!args_ok) |
| 986 | return status::invalid_arguments; |
| 987 | |
| 988 | auto _pd = new pd_t(engine, attr, src_engine, src_md, dst_engine, |
| 989 | dst_md); |
| 990 | if (_pd == nullptr) return status::out_of_memory; |
| 991 | if (_pd->init() != status::success) { |
| 992 | delete _pd; |
| 993 | return status::unimplemented; |
| 994 | } |
| 995 | return safe_ptr_assign<reorder_pd_t>(*reorder_pd, _pd); |
| 996 | } |
| 997 | }; |
| 998 | |
| 999 | simple_reorder_t(const pd_t *apd): cpu_primitive_t(apd) {} |
| 1000 | |
| 1001 | virtual status_t execute(const exec_ctx_t &ctx) const override { |
| 1002 | auto input = CTX_IN_MEM(const data_t<type_i> *, MKLDNN_ARG_FROM); |
| 1003 | auto output = CTX_OUT_MEM(data_t<type_o> *, MKLDNN_ARG_TO); |
| 1004 | simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL, spec>::execute( |
| 1005 | pd(), input, output); |
| 1006 | return status::success; |
| 1007 | } |
| 1008 | |
| 1009 | private: |
| 1010 | const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } |
| 1011 | }; |
| 1012 | |
| 1013 | #undef SIMPLE_REORDER_TEMPL_DECL |
| 1014 | #undef SIMPLE_REORDER_TEMPL_CALL |
| 1015 | |
| 1016 | } |
| 1017 | } |
| 1018 | } |
| 1019 | |
| 1020 | #endif |
| 1021 | |
| 1022 | // vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s |
| 1023 | |