| 1 | #pragma once |
| 2 | |
| 3 | #include <Columns/ColumnArray.h> |
| 4 | #include <Columns/ColumnConst.h> |
| 5 | #include <Columns/ColumnString.h> |
| 6 | #include <Columns/ColumnVector.h> |
| 7 | #include <DataTypes/DataTypeArray.h> |
| 8 | #include <DataTypes/DataTypeString.h> |
| 9 | #include <DataTypes/DataTypesNumber.h> |
| 10 | #include <Functions/FunctionHelpers.h> |
| 11 | #include <Functions/IFunctionImpl.h> |
| 12 | #include <IO/WriteHelpers.h> |
| 13 | #include <Interpreters/Context.h> |
| 14 | #include <common/StringRef.h> |
| 15 | |
| 16 | namespace DB |
| 17 | { |
| 18 | /** Search and replace functions in strings: |
| 19 | * |
| 20 | * position(haystack, needle) - the normal search for a substring in a string, returns the position (in bytes) of the found substring starting with 1, or 0 if no substring is found. |
| 21 | * positionUTF8(haystack, needle) - the same, but the position is calculated at code points, provided that the string is encoded in UTF-8. |
| 22 | * positionCaseInsensitive(haystack, needle) |
| 23 | * positionCaseInsensitiveUTF8(haystack, needle) |
| 24 | * |
| 25 | * like(haystack, pattern) - search by the regular expression LIKE; Returns 0 or 1. Case-insensitive, but only for Latin. |
| 26 | * notLike(haystack, pattern) |
| 27 | * |
| 28 | * match(haystack, pattern) - search by regular expression re2; Returns 0 or 1. |
| 29 | * multiMatchAny(haystack, [pattern_1, pattern_2, ..., pattern_n]) -- search by re2 regular expressions pattern_i; Returns 0 or 1 if any pattern_i matches. |
| 30 | * multiMatchAnyIndex(haystack, [pattern_1, pattern_2, ..., pattern_n]) -- search by re2 regular expressions pattern_i; Returns index of any match or zero if none; |
| 31 | * multiMatchAllIndices(haystack, [pattern_1, pattern_2, ..., pattern_n]) -- search by re2 regular expressions pattern_i; Returns an array of matched indices in any order; |
| 32 | * |
| 33 | * Applies regexp re2 and pulls: |
| 34 | * - the first subpattern, if the regexp has a subpattern; |
| 35 | * - otherwise, the zero subpattern (the whole match); |
| 36 | * - if not match - an empty string. |
| 37 | * extract(haystack, pattern) |
| 38 | * |
| 39 | * replaceOne(haystack, pattern, replacement) - replacing the pattern with the specified rules, only the first occurrence. |
| 40 | * replaceAll(haystack, pattern, replacement) - replacing the pattern with the specified rules, all occurrences. |
| 41 | * |
| 42 | * replaceRegexpOne(haystack, pattern, replacement) - replaces the pattern with the specified regexp, only the first occurrence. |
| 43 | * replaceRegexpAll(haystack, pattern, replacement) - replaces the pattern with the specified regexp, all occurrences. |
| 44 | * |
| 45 | * multiSearchAllPositions(haystack, [pattern_1, pattern_2, ..., pattern_n]) -- find first occurrences (positions) of all the const patterns inside haystack |
| 46 | * multiSearchAllPositionsUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n]) |
| 47 | * multiSearchAllPositionsCaseInsensitive(haystack, [pattern_1, pattern_2, ..., pattern_n]) |
| 48 | * multiSearchAllPositionsCaseInsensitiveUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n]) |
| 49 | * |
| 50 | * multiSearchFirstPosition(haystack, [pattern_1, pattern_2, ..., pattern_n]) -- returns the first position of the haystack matched by strings or zero if nothing was found |
| 51 | * multiSearchFirstPositionUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n]) |
| 52 | * multiSearchFirstPositionCaseInsensitive(haystack, [pattern_1, pattern_2, ..., pattern_n]) |
| 53 | * multiSearchFirstPositionCaseInsensitiveUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n]) |
| 54 | * |
| 55 | * multiSearchAny(haystack, [pattern_1, pattern_2, ..., pattern_n]) -- find any of the const patterns inside haystack and return 0 or 1 |
| 56 | * multiSearchAnyUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n]) |
| 57 | * multiSearchAnyCaseInsensitive(haystack, [pattern_1, pattern_2, ..., pattern_n]) |
| 58 | * multiSearchAnyCaseInsensitiveUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n]) |
| 59 | * |
| 60 | * multiSearchFirstIndex(haystack, [pattern_1, pattern_2, ..., pattern_n]) -- returns the first index of the matched string or zero if nothing was found |
| 61 | * multiSearchFirstIndexUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n]) |
| 62 | * multiSearchFirstIndexCaseInsensitive(haystack, [pattern_1, pattern_2, ..., pattern_n]) |
| 63 | * multiSearchFirstIndexCaseInsensitiveUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n]) |
| 64 | */ |
| 65 | |
| 66 | namespace ErrorCodes |
| 67 | { |
| 68 | extern const int ILLEGAL_TYPE_OF_ARGUMENT; |
| 69 | extern const int ILLEGAL_COLUMN; |
| 70 | extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; |
| 71 | extern const int FUNCTION_NOT_ALLOWED; |
| 72 | } |
| 73 | |
| 74 | template <typename Impl, typename Name> |
| 75 | class FunctionsStringSearch : public IFunction |
| 76 | { |
| 77 | public: |
| 78 | static constexpr auto name = Name::name; |
| 79 | static FunctionPtr create(const Context &) { return std::make_shared<FunctionsStringSearch>(); } |
| 80 | |
| 81 | String getName() const override { return name; } |
| 82 | |
| 83 | size_t getNumberOfArguments() const override { return 2; } |
| 84 | |
| 85 | DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override |
| 86 | { |
| 87 | if (!isString(arguments[0])) |
| 88 | throw Exception( |
| 89 | "Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); |
| 90 | |
| 91 | if (!isString(arguments[1])) |
| 92 | throw Exception( |
| 93 | "Illegal type " + arguments[1]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); |
| 94 | |
| 95 | return std::make_shared<DataTypeNumber<typename Impl::ResultType>>(); |
| 96 | } |
| 97 | |
| 98 | void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/) override |
| 99 | { |
| 100 | using ResultType = typename Impl::ResultType; |
| 101 | |
| 102 | const ColumnPtr & column_haystack = block.getByPosition(arguments[0]).column; |
| 103 | const ColumnPtr & column_needle = block.getByPosition(arguments[1]).column; |
| 104 | |
| 105 | const ColumnConst * col_haystack_const = typeid_cast<const ColumnConst *>(&*column_haystack); |
| 106 | const ColumnConst * col_needle_const = typeid_cast<const ColumnConst *>(&*column_needle); |
| 107 | |
| 108 | if (col_haystack_const && col_needle_const) |
| 109 | { |
| 110 | ResultType res{}; |
| 111 | Impl::constant_constant(col_haystack_const->getValue<String>(), col_needle_const->getValue<String>(), res); |
| 112 | block.getByPosition(result).column |
| 113 | = block.getByPosition(result).type->createColumnConst(col_haystack_const->size(), toField(res)); |
| 114 | return; |
| 115 | } |
| 116 | |
| 117 | auto col_res = ColumnVector<ResultType>::create(); |
| 118 | |
| 119 | typename ColumnVector<ResultType>::Container & vec_res = col_res->getData(); |
| 120 | vec_res.resize(column_haystack->size()); |
| 121 | |
| 122 | const ColumnString * col_haystack_vector = checkAndGetColumn<ColumnString>(&*column_haystack); |
| 123 | const ColumnString * col_needle_vector = checkAndGetColumn<ColumnString>(&*column_needle); |
| 124 | |
| 125 | if (col_haystack_vector && col_needle_vector) |
| 126 | Impl::vector_vector( |
| 127 | col_haystack_vector->getChars(), |
| 128 | col_haystack_vector->getOffsets(), |
| 129 | col_needle_vector->getChars(), |
| 130 | col_needle_vector->getOffsets(), |
| 131 | vec_res); |
| 132 | else if (col_haystack_vector && col_needle_const) |
| 133 | Impl::vector_constant( |
| 134 | col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), col_needle_const->getValue<String>(), vec_res); |
| 135 | else if (col_haystack_const && col_needle_vector) |
| 136 | Impl::constant_vector( |
| 137 | col_haystack_const->getValue<String>(), col_needle_vector->getChars(), col_needle_vector->getOffsets(), vec_res); |
| 138 | else |
| 139 | throw Exception( |
| 140 | "Illegal columns " + block.getByPosition(arguments[0]).column->getName() + " and " |
| 141 | + block.getByPosition(arguments[1]).column->getName() + " of arguments of function " + getName(), |
| 142 | ErrorCodes::ILLEGAL_COLUMN); |
| 143 | |
| 144 | block.getByPosition(result).column = std::move(col_res); |
| 145 | } |
| 146 | }; |
| 147 | |
| 148 | |
| 149 | template <typename Impl, typename Name> |
| 150 | class FunctionsStringSearchToString : public IFunction |
| 151 | { |
| 152 | public: |
| 153 | static constexpr auto name = Name::name; |
| 154 | static FunctionPtr create(const Context &) { return std::make_shared<FunctionsStringSearchToString>(); } |
| 155 | |
| 156 | String getName() const override { return name; } |
| 157 | |
| 158 | size_t getNumberOfArguments() const override { return 2; } |
| 159 | |
| 160 | bool useDefaultImplementationForConstants() const override { return true; } |
| 161 | ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; } |
| 162 | |
| 163 | DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override |
| 164 | { |
| 165 | if (!isString(arguments[0])) |
| 166 | throw Exception( |
| 167 | "Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); |
| 168 | |
| 169 | if (!isString(arguments[1])) |
| 170 | throw Exception( |
| 171 | "Illegal type " + arguments[1]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); |
| 172 | |
| 173 | return std::make_shared<DataTypeString>(); |
| 174 | } |
| 175 | |
| 176 | void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/) override |
| 177 | { |
| 178 | const ColumnPtr column = block.getByPosition(arguments[0]).column; |
| 179 | const ColumnPtr column_needle = block.getByPosition(arguments[1]).column; |
| 180 | |
| 181 | const ColumnConst * col_needle = typeid_cast<const ColumnConst *>(&*column_needle); |
| 182 | if (!col_needle) |
| 183 | throw Exception("Second argument of function " + getName() + " must be constant string" , ErrorCodes::ILLEGAL_COLUMN); |
| 184 | |
| 185 | if (const ColumnString * col = checkAndGetColumn<ColumnString>(column.get())) |
| 186 | { |
| 187 | auto col_res = ColumnString::create(); |
| 188 | |
| 189 | ColumnString::Chars & vec_res = col_res->getChars(); |
| 190 | ColumnString::Offsets & offsets_res = col_res->getOffsets(); |
| 191 | Impl::vector(col->getChars(), col->getOffsets(), col_needle->getValue<String>(), vec_res, offsets_res); |
| 192 | |
| 193 | block.getByPosition(result).column = std::move(col_res); |
| 194 | } |
| 195 | else |
| 196 | throw Exception( |
| 197 | "Illegal column " + block.getByPosition(arguments[0]).column->getName() + " of argument of function " + getName(), |
| 198 | ErrorCodes::ILLEGAL_COLUMN); |
| 199 | } |
| 200 | }; |
| 201 | |
| 202 | template <typename Impl, typename Name> |
| 203 | class FunctionsMultiStringPosition : public IFunction |
| 204 | { |
| 205 | public: |
| 206 | static constexpr auto name = Name::name; |
| 207 | static FunctionPtr create(const Context &) { return std::make_shared<FunctionsMultiStringPosition>(); } |
| 208 | |
| 209 | String getName() const override { return name; } |
| 210 | |
| 211 | size_t getNumberOfArguments() const override { return 2; } |
| 212 | bool useDefaultImplementationForConstants() const override { return true; } |
| 213 | ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; } |
| 214 | |
| 215 | DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override |
| 216 | { |
| 217 | if (!isString(arguments[0])) |
| 218 | throw Exception( |
| 219 | "Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); |
| 220 | |
| 221 | const DataTypeArray * array_type = checkAndGetDataType<DataTypeArray>(arguments[1].get()); |
| 222 | if (!array_type || !checkAndGetDataType<DataTypeString>(array_type->getNestedType().get())) |
| 223 | throw Exception( |
| 224 | "Illegal type " + arguments[1]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); |
| 225 | |
| 226 | return std::make_shared<DataTypeArray>(std::make_shared<DataTypeUInt64>()); |
| 227 | } |
| 228 | |
| 229 | void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/) override |
| 230 | { |
| 231 | using ResultType = typename Impl::ResultType; |
| 232 | |
| 233 | const ColumnPtr & column_haystack = block.getByPosition(arguments[0]).column; |
| 234 | |
| 235 | const ColumnString * col_haystack_vector = checkAndGetColumn<ColumnString>(&*column_haystack); |
| 236 | |
| 237 | const ColumnPtr & arr_ptr = block.getByPosition(arguments[1]).column; |
| 238 | const ColumnConst * col_const_arr = checkAndGetColumnConst<ColumnArray>(arr_ptr.get()); |
| 239 | |
| 240 | if (!col_const_arr) |
| 241 | throw Exception( |
| 242 | "Illegal column " + block.getByPosition(arguments[1]).column->getName() + ". The array is not const" , |
| 243 | ErrorCodes::ILLEGAL_COLUMN); |
| 244 | |
| 245 | Array src_arr = col_const_arr->getValue<Array>(); |
| 246 | |
| 247 | if (src_arr.size() > std::numeric_limits<UInt8>::max()) |
| 248 | throw Exception( |
| 249 | "Number of arguments for function " + getName() + " doesn't match: passed " + std::to_string(src_arr.size()) |
| 250 | + ", should be at most 255" , |
| 251 | ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); |
| 252 | |
| 253 | std::vector<StringRef> refs; |
| 254 | for (const auto & el : src_arr) |
| 255 | refs.emplace_back(el.get<String>()); |
| 256 | |
| 257 | const size_t column_haystack_size = column_haystack->size(); |
| 258 | |
| 259 | auto col_res = ColumnVector<ResultType>::create(); |
| 260 | auto col_offsets = ColumnArray::ColumnOffsets::create(column_haystack_size); |
| 261 | |
| 262 | auto & vec_res = col_res->getData(); |
| 263 | auto & offsets_res = col_offsets->getData(); |
| 264 | |
| 265 | vec_res.resize(column_haystack_size * refs.size()); |
| 266 | |
| 267 | if (col_haystack_vector) |
| 268 | Impl::vector_constant(col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), refs, vec_res); |
| 269 | else |
| 270 | throw Exception("Illegal column " + block.getByPosition(arguments[0]).column->getName(), ErrorCodes::ILLEGAL_COLUMN); |
| 271 | |
| 272 | size_t refs_size = refs.size(); |
| 273 | size_t accum = refs_size; |
| 274 | |
| 275 | for (size_t i = 0; i < column_haystack_size; ++i, accum += refs_size) |
| 276 | offsets_res[i] = accum; |
| 277 | |
| 278 | block.getByPosition(result).column = ColumnArray::create(std::move(col_res), std::move(col_offsets)); |
| 279 | } |
| 280 | }; |
| 281 | |
| 282 | /// The argument limiting raises from Volnitsky searcher -- it is performance crucial to save only one byte for pattern number. |
| 283 | /// But some other searchers use this function, for example, multiMatchAny -- hyperscan does not have such restrictions |
| 284 | template <typename Impl, typename Name, size_t LimitArgs = std::numeric_limits<UInt8>::max()> |
| 285 | class FunctionsMultiStringSearch : public IFunction |
| 286 | { |
| 287 | static_assert(LimitArgs > 0); |
| 288 | |
| 289 | public: |
| 290 | static constexpr auto name = Name::name; |
| 291 | static FunctionPtr create(const Context & context) |
| 292 | { |
| 293 | if (Impl::is_using_hyperscan && !context.getSettingsRef().allow_hyperscan) |
| 294 | throw Exception( |
| 295 | "Hyperscan functions are disabled, because setting 'allow_hyperscan' is set to 0" , ErrorCodes::FUNCTION_NOT_ALLOWED); |
| 296 | |
| 297 | return std::make_shared<FunctionsMultiStringSearch>(); |
| 298 | } |
| 299 | |
| 300 | String getName() const override { return name; } |
| 301 | |
| 302 | size_t getNumberOfArguments() const override { return 2; } |
| 303 | bool useDefaultImplementationForConstants() const override { return true; } |
| 304 | ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; } |
| 305 | |
| 306 | DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override |
| 307 | { |
| 308 | if (!isString(arguments[0])) |
| 309 | throw Exception( |
| 310 | "Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); |
| 311 | |
| 312 | const DataTypeArray * array_type = checkAndGetDataType<DataTypeArray>(arguments[1].get()); |
| 313 | if (!array_type || !checkAndGetDataType<DataTypeString>(array_type->getNestedType().get())) |
| 314 | throw Exception( |
| 315 | "Illegal type " + arguments[1]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); |
| 316 | return Impl::ReturnType(); |
| 317 | } |
| 318 | |
| 319 | void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/) override |
| 320 | { |
| 321 | using ResultType = typename Impl::ResultType; |
| 322 | |
| 323 | const ColumnPtr & column_haystack = block.getByPosition(arguments[0]).column; |
| 324 | |
| 325 | const ColumnString * col_haystack_vector = checkAndGetColumn<ColumnString>(&*column_haystack); |
| 326 | |
| 327 | const ColumnPtr & arr_ptr = block.getByPosition(arguments[1]).column; |
| 328 | const ColumnConst * col_const_arr = checkAndGetColumnConst<ColumnArray>(arr_ptr.get()); |
| 329 | |
| 330 | if (!col_const_arr) |
| 331 | throw Exception( |
| 332 | "Illegal column " + block.getByPosition(arguments[1]).column->getName() + ". The array is not const" , |
| 333 | ErrorCodes::ILLEGAL_COLUMN); |
| 334 | |
| 335 | Array src_arr = col_const_arr->getValue<Array>(); |
| 336 | |
| 337 | if (src_arr.size() > LimitArgs) |
| 338 | throw Exception( |
| 339 | "Number of arguments for function " + getName() + " doesn't match: passed " + std::to_string(src_arr.size()) |
| 340 | + ", should be at most " + std::to_string(LimitArgs), |
| 341 | ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); |
| 342 | |
| 343 | std::vector<StringRef> refs; |
| 344 | refs.reserve(src_arr.size()); |
| 345 | |
| 346 | for (const auto & el : src_arr) |
| 347 | refs.emplace_back(el.get<String>()); |
| 348 | |
| 349 | auto col_res = ColumnVector<ResultType>::create(); |
| 350 | auto col_offsets = ColumnArray::ColumnOffsets::create(); |
| 351 | |
| 352 | auto & vec_res = col_res->getData(); |
| 353 | auto & offsets_res = col_offsets->getData(); |
| 354 | |
| 355 | /// The blame for resizing output is for the callee. |
| 356 | if (col_haystack_vector) |
| 357 | Impl::vector_constant(col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), refs, vec_res, offsets_res); |
| 358 | else |
| 359 | throw Exception("Illegal column " + block.getByPosition(arguments[0]).column->getName(), ErrorCodes::ILLEGAL_COLUMN); |
| 360 | |
| 361 | if constexpr (Impl::is_column_array) |
| 362 | block.getByPosition(result).column = ColumnArray::create(std::move(col_res), std::move(col_offsets)); |
| 363 | else |
| 364 | block.getByPosition(result).column = std::move(col_res); |
| 365 | } |
| 366 | }; |
| 367 | |
| 368 | } |
| 369 | |