| 1 | /******************************************************************************* |
| 2 | * Copyright 2018 Intel Corporation |
| 3 | * |
| 4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | * you may not use this file except in compliance with the License. |
| 6 | * You may obtain a copy of the License at |
| 7 | * |
| 8 | * http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | * |
| 10 | * Unless required by applicable law or agreed to in writing, software |
| 11 | * distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | * See the License for the specific language governing permissions and |
| 14 | * limitations under the License. |
| 15 | *******************************************************************************/ |
| 16 | |
| 17 | #include <assert.h> |
| 18 | |
| 19 | #include "mkldnn_traits.hpp" |
| 20 | #include "mkldnn_thread.hpp" |
| 21 | #include "type_helpers.hpp" |
| 22 | #include "utils.hpp" |
| 23 | |
| 24 | #include "cpu_memory.hpp" |
| 25 | |
| 26 | namespace mkldnn { |
| 27 | namespace impl { |
| 28 | namespace cpu { |
| 29 | |
| 30 | using namespace mkldnn::impl; |
| 31 | using namespace mkldnn::impl::data_type; |
| 32 | using namespace mkldnn::impl::status; |
| 33 | using namespace mkldnn::impl::format_tag; |
| 34 | |
/* Blocking pattern over the three leading dimensions (named a, b and c).
 * A one-letter kind means a single blocked dimension; a two-letter kind lists
 * the two blocked dimensions in the order in which they appear in the
 * descriptor's inner-block list (inner_idxs[0] first, then inner_idxs[1]). */
enum blk_kind_t {
    a, // only dimension 0 is blocked
    b, // only dimension 1 is blocked
    c, // only dimension 2 is blocked
    ab, // dimension 0 first, then dimension 1
    ba, // dimension 1 first, then dimension 0
    bc, // dimension 1 first, then dimension 2
    cb // dimension 2 first, then dimension 1
};
| 36 | |
| 37 | template <data_type_t dt, blk_kind_t blk_kind, int blksize> |
| 38 | void typed_zero_pad_blk( |
| 39 | const memory_desc_wrapper &m_d, typename prec_traits<dt>::type *data) { |
| 40 | using data_t = typename prec_traits<dt>::type; |
| 41 | const auto &dims = m_d.dims(); |
| 42 | const auto &pdims = m_d.padded_dims(); |
| 43 | const auto &blk = m_d.blocking_desc(); |
| 44 | auto dim_is_blocked = [&](int dim) { |
| 45 | for (int i = 0; i < blk.inner_nblks; i++) |
| 46 | if (blk.inner_idxs[i] == dim) |
| 47 | return true; |
| 48 | return false; |
| 49 | }; |
| 50 | bool A_blocked = dim_is_blocked(0), B_blocked = dim_is_blocked(1), |
| 51 | C_blocked = dim_is_blocked(2); |
| 52 | |
| 53 | assert(blk.inner_nblks < 4); |
| 54 | assert((A_blocked || B_blocked || C_blocked) || (A_blocked && B_blocked) |
| 55 | || (C_blocked && B_blocked)); |
| 56 | |
| 57 | const int a_tail_s = A_blocked ? dims[0] % blksize : 0; |
| 58 | const int b_tail_s = B_blocked ? dims[1] % blksize : 0; |
| 59 | const int c_tail_s = C_blocked ? dims[2] % blksize : 0; |
| 60 | assert(a_tail_s || b_tail_s || c_tail_s); |
| 61 | |
| 62 | const int A = A_blocked ? pdims[0] / blksize : dims[0]; |
| 63 | const int B = B_blocked ? pdims[1] / blksize : dims[1]; |
| 64 | const int C = C_blocked ? pdims[2] / blksize : dims[2]; |
| 65 | const int D = m_d.ndims() > 3 ? dims[3] : 1; |
| 66 | const int E = m_d.ndims() > 4 ? dims[4] : 1; |
| 67 | const int F = m_d.ndims() > 5 ? dims[5] : 1; |
| 68 | const int inner_blk = blk.inner_nblks == 3 ? blk.inner_blks[2] : 1; |
| 69 | |
| 70 | auto zeroize_tail = [&](data_t *d, const int tail_s) { |
| 71 | for (int b = tail_s; b < blksize; ++b) |
| 72 | d[b] = 0; |
| 73 | }; |
| 74 | auto zeroize_tail_inner = [&](data_t *d, const int tail_s) { |
| 75 | for (int b1 = 0; b1 < blksize; ++b1) |
| 76 | for (int b2 = tail_s; b2 < blksize; ++b2) |
| 77 | d[(b1 / inner_blk) * blksize * inner_blk + inner_blk * b2 |
| 78 | + b1 % inner_blk] |
| 79 | = 0; |
| 80 | }; |
| 81 | auto zeroize_tail_outer = [&](data_t *d, const int tail_s) { |
| 82 | for (int b1 = tail_s; b1 < blksize; ++b1) |
| 83 | for (int b2 = 0; b2 < blksize; ++b2) |
| 84 | d[(b1 / inner_blk) * blksize * inner_blk + inner_blk * b2 |
| 85 | + b1 % inner_blk] |
| 86 | = 0; |
| 87 | }; |
| 88 | |
| 89 | if (c_tail_s) { |
| 90 | parallel_nd(A, B, D, E, F, [&](int a, int b, int d, int e, int f) { |
| 91 | auto x = &data[m_d.blk_off(a, b, C - 1, d, e, f)]; |
| 92 | if (blk_kind == c) |
| 93 | zeroize_tail(x, c_tail_s); |
| 94 | else if (blk_kind == bc) |
| 95 | zeroize_tail_inner(x, c_tail_s); |
| 96 | else if (blk_kind == cb) |
| 97 | zeroize_tail_outer(x, c_tail_s); |
| 98 | }); |
| 99 | } |
| 100 | |
| 101 | if (b_tail_s) { |
| 102 | parallel_nd(A, C, D, E, F, [&](int a, int c, int d, int e, int f) { |
| 103 | auto x = &data[m_d.blk_off(a, B - 1, c, d, e, f)]; |
| 104 | if (blk_kind == b) |
| 105 | zeroize_tail(x, b_tail_s); |
| 106 | else if (blk_kind == ab || blk_kind == cb) |
| 107 | zeroize_tail_inner(x, b_tail_s); |
| 108 | else if (blk_kind == ba || blk_kind == bc) |
| 109 | zeroize_tail_outer(x, b_tail_s); |
| 110 | }); |
| 111 | } |
| 112 | |
| 113 | if (a_tail_s) { |
| 114 | parallel_nd(B, C, D, E, F, [&](int b, int c, int d, int e, int f) { |
| 115 | auto x = &data[m_d.blk_off(A - 1, b, c, d, e, f)]; |
| 116 | if (blk_kind == a) |
| 117 | zeroize_tail(x, a_tail_s); |
| 118 | else if (blk_kind == ba) |
| 119 | zeroize_tail_inner(x, a_tail_s); |
| 120 | else if (blk_kind == ab) |
| 121 | zeroize_tail_outer(x, a_tail_s); |
| 122 | }); |
| 123 | } |
| 124 | } |
| 125 | |
| 126 | /* |
| 127 | * all |
| 128 | */ |
| 129 | template <data_type_t dt> |
| 130 | void typed_zero_pad_generic_blocked( |
| 131 | const memory_desc_wrapper &m_d, typename prec_traits<dt>::type *data) { |
| 132 | const int ndims = m_d.ndims(); |
| 133 | const auto &dims = m_d.dims(); |
| 134 | const auto &pdims = m_d.padded_dims(); |
| 135 | |
| 136 | const ptrdiff_t nelems = (ptrdiff_t)m_d.nelems(true); |
| 137 | |
| 138 | /* [D_0] .. [D_k][D_k+1] .. [D_ndim - 1] |
| 139 | * | \ / |
| 140 | * | --------------------- |
| 141 | * has contiguous |
| 142 | * padding |
| 143 | * |
| 144 | * step <-- D_k+1 * ... * D_ndims-1 |
| 145 | * step_dim <-- k |
| 146 | */ |
| 147 | |
| 148 | ptrdiff_t step = 1; |
| 149 | int step_dim = ndims - 1; |
| 150 | for (; step_dim >= 0; --step_dim) { |
| 151 | if (dims[step_dim] != pdims[step_dim]) |
| 152 | break; |
| 153 | step *= dims[step_dim]; |
| 154 | } |
| 155 | |
| 156 | assert(step_dim >= 0 && "no zero padding is required" ); |
| 157 | if (step_dim < 0) |
| 158 | return; |
| 159 | |
| 160 | parallel_nd(nelems / step, [&](ptrdiff_t e1) { |
| 161 | bool need_zero = false; |
| 162 | |
| 163 | ptrdiff_t idx = e1; |
| 164 | for (int d = step_dim; d >= 0; --d) { |
| 165 | if (idx % pdims[d] >= dims[d]) { |
| 166 | need_zero = true; |
| 167 | break; |
| 168 | } |
| 169 | idx /= pdims[d]; |
| 170 | } |
| 171 | |
| 172 | if (need_zero) { |
| 173 | for (ptrdiff_t e0 = 0; e0 < step; ++e0) |
| 174 | data[m_d.off_l(e1 * step + e0, true)] = 0; |
| 175 | } |
| 176 | }); |
| 177 | } |
| 178 | |
| 179 | template <data_type_t dt> |
| 180 | status_t cpu_memory_t::typed_zero_pad() const { |
| 181 | const memory_desc_wrapper mdw(md()); |
| 182 | |
| 183 | if (mdw.format_kind() != format_kind::blocked) |
| 184 | return unimplemented; |
| 185 | |
| 186 | if (mdw.nelems(false) == mdw.nelems(true)) |
| 187 | return success; |
| 188 | |
| 189 | auto *data = (typename prec_traits<dt>::type *)data_; |
| 190 | auto blk = mdw.blocking_desc(); |
| 191 | |
| 192 | auto get_blksize = [&](int ind) { |
| 193 | int blksize = 1; |
| 194 | for (int i = 0; i < blk.inner_nblks; i++) { |
| 195 | if (blk.inner_idxs[i] == ind) |
| 196 | blksize *= blk.inner_blks[i]; |
| 197 | } |
| 198 | return blksize; |
| 199 | }; |
| 200 | const int blksize = get_blksize(blk.inner_idxs[0]); |
| 201 | |
| 202 | # define CASE(blksize_, blk_kind) \ |
| 203 | do { \ |
| 204 | if (blksize == blksize_) { \ |
| 205 | typed_zero_pad_blk<dt, blk_kind, blksize_>(mdw, data); \ |
| 206 | return success; \ |
| 207 | } \ |
| 208 | } while(0) |
| 209 | |
| 210 | switch (blk.inner_nblks) { |
| 211 | case 1: |
| 212 | if (blk.inner_idxs[0] == 0) { |
| 213 | CASE(4, a); |
| 214 | CASE(8, a); |
| 215 | CASE(16, a); |
| 216 | } else if (blk.inner_idxs[0] == 1) { |
| 217 | CASE(4, b); |
| 218 | CASE(8, b); |
| 219 | CASE(16, b); |
| 220 | } |
| 221 | break; |
| 222 | case 2: |
| 223 | case 3: |
| 224 | if (!IMPLICATION(blk.inner_nblks == 3, |
| 225 | blk.inner_idxs[0] == blk.inner_idxs[2])) |
| 226 | break; |
| 227 | |
| 228 | if (blk.inner_idxs[0] == 0 && blk.inner_idxs[1] == 1) { |
| 229 | CASE(4, ab); |
| 230 | CASE(8, ab); |
| 231 | CASE(16, ab); |
| 232 | } else if (blk.inner_idxs[0] == 1 && blk.inner_idxs[1] == 0) { |
| 233 | CASE(4, ba); |
| 234 | CASE(8, ba); |
| 235 | CASE(16, ba); |
| 236 | } |
| 237 | if (blk.inner_idxs[0] == 1 && blk.inner_idxs[1] == 2) { |
| 238 | CASE(4, bc); |
| 239 | CASE(8, bc); |
| 240 | CASE(16, bc); |
| 241 | } else if (blk.inner_idxs[0] == 2 && blk.inner_idxs[1] == 1) { |
| 242 | CASE(4, cb); |
| 243 | CASE(8, cb); |
| 244 | CASE(16, cb); |
| 245 | } |
| 246 | break; |
| 247 | default: break; |
| 248 | } |
| 249 | |
| 250 | # undef CASE |
| 251 | |
| 252 | // the last line of defence |
| 253 | typed_zero_pad_generic_blocked<dt>(mdw, data); |
| 254 | return success; |
| 255 | } |
| 256 | |
| 257 | status_t cpu_memory_t::zero_pad() const { |
| 258 | memory_desc_wrapper mdw(md()); |
| 259 | const bool skip_zeroing = false |
| 260 | || data_ == nullptr |
| 261 | || mdw.is_zero() |
| 262 | || !mdw.is_blocking_desc(); |
| 263 | if (skip_zeroing) return success; |
| 264 | |
| 265 | switch (mdw.data_type()) { |
| 266 | case f32: return typed_zero_pad<f32>(); |
| 267 | case s32: return typed_zero_pad<s32>(); |
| 268 | case s8: return typed_zero_pad<s8>(); |
| 269 | case u8: return typed_zero_pad<u8>(); |
| 270 | default: assert(!"memory is undefined" ); return unimplemented; |
| 271 | } |
| 272 | return unimplemented; |
| 273 | } |
| 274 | |
| 275 | } |
| 276 | } |
| 277 | } |
| 278 | |