| 1 | #include "duckdb/function/scalar/string_functions.hpp" |
| 2 | #include "duckdb/common/exception.hpp" |
| 3 | #include "duckdb/common/vector_operations/vector_operations.hpp" |
| 4 | #include "duckdb/execution/expression_executor.hpp" |
| 5 | #include "duckdb/planner/expression/bound_function_expression.hpp" |
| 6 | #include "duckdb/common/vector_operations/unary_executor.hpp" |
| 7 | #include "duckdb/common/vector_operations/binary_executor.hpp" |
| 8 | #include "duckdb/common/vector_operations/ternary_executor.hpp" |
| 9 | #include "utf8proc_wrapper.hpp" |
| 10 | |
| 11 | #include "re2/re2.h" |
| 12 | |
| 13 | using namespace std; |
| 14 | |
| 15 | namespace duckdb { |
| 16 | |
| 17 | RegexpMatchesBindData::RegexpMatchesBindData(unique_ptr<RE2> constant_pattern, string range_min, string range_max, |
| 18 | bool range_success) |
| 19 | : constant_pattern(std::move(constant_pattern)), range_min(range_min), range_max(range_max), |
| 20 | range_success(range_success) { |
| 21 | } |
| 22 | |
| 23 | RegexpMatchesBindData::~RegexpMatchesBindData() { |
| 24 | } |
| 25 | |
| 26 | unique_ptr<FunctionData> RegexpMatchesBindData::Copy() { |
| 27 | return make_unique<RegexpMatchesBindData>(move(constant_pattern), range_min, range_max, range_success); |
| 28 | } |
| 29 | |
| 30 | static inline re2::StringPiece CreateStringPiece(string_t &input) { |
| 31 | return re2::StringPiece(input.GetData(), input.GetSize()); |
| 32 | } |
| 33 | |
| 34 | struct RegexPartialMatch { |
| 35 | static inline bool Operation(const re2::StringPiece &input, RE2 &re) { |
| 36 | return RE2::PartialMatch(input, re); |
| 37 | } |
| 38 | }; |
| 39 | |
| 40 | struct RegexFullMatch { |
| 41 | static inline bool Operation(const re2::StringPiece &input, RE2 &re) { |
| 42 | return RE2::FullMatch(input, re); |
| 43 | } |
| 44 | }; |
| 45 | |
| 46 | template <class OP> static void regexp_matches_function(DataChunk &args, ExpressionState &state, Vector &result) { |
| 47 | auto &strings = args.data[0]; |
| 48 | auto &patterns = args.data[1]; |
| 49 | |
| 50 | auto &func_expr = (BoundFunctionExpression &)state.expr; |
| 51 | auto &info = (RegexpMatchesBindData &)*func_expr.bind_info; |
| 52 | |
| 53 | RE2::Options options; |
| 54 | options.set_log_errors(false); |
| 55 | |
| 56 | if (info.constant_pattern) { |
| 57 | // FIXME: this should be a unary loop |
| 58 | UnaryExecutor::Execute<string_t, bool, true>(strings, result, args.size(), [&](string_t input) { |
| 59 | return OP::Operation(CreateStringPiece(input), *info.constant_pattern); |
| 60 | }); |
| 61 | } else { |
| 62 | BinaryExecutor::Execute<string_t, string_t, bool, true>(strings, patterns, result, args.size(), |
| 63 | [&](string_t input, string_t pattern) { |
| 64 | RE2 re(CreateStringPiece(pattern), options); |
| 65 | if (!re.ok()) { |
| 66 | throw Exception(re.error()); |
| 67 | } |
| 68 | return OP::Operation(CreateStringPiece(input), re); |
| 69 | }); |
| 70 | } |
| 71 | } |
| 72 | |
| 73 | static unique_ptr<FunctionData> regexp_matches_get_bind_function(BoundFunctionExpression &expr, |
| 74 | ClientContext &context) { |
| 75 | // pattern is the second argument. If its constant, we can already prepare the pattern and store it for later. |
| 76 | assert(expr.children.size() == 2); |
| 77 | if (expr.children[1]->IsScalar()) { |
| 78 | Value pattern_str = ExpressionExecutor::EvaluateScalar(*expr.children[1]); |
| 79 | if (!pattern_str.is_null && pattern_str.type == TypeId::VARCHAR) { |
| 80 | RE2::Options options; |
| 81 | options.set_log_errors(false); |
| 82 | auto re = make_unique<RE2>(pattern_str.str_value, options); |
| 83 | if (!re->ok()) { |
| 84 | throw Exception(re->error()); |
| 85 | } |
| 86 | |
| 87 | string range_min, range_max; |
| 88 | auto range_success = re->PossibleMatchRange(&range_min, &range_max, 1000); |
| 89 | return make_unique<RegexpMatchesBindData>(move(re), range_min, range_max, range_success); |
| 90 | } |
| 91 | } |
| 92 | return make_unique<RegexpMatchesBindData>(nullptr, "" , "" , false); |
| 93 | } |
| 94 | |
| 95 | static void regexp_replace_function(DataChunk &args, ExpressionState &state, Vector &result) { |
| 96 | auto &strings = args.data[0]; |
| 97 | auto &patterns = args.data[1]; |
| 98 | auto &replaces = args.data[2]; |
| 99 | |
| 100 | RE2::Options options; |
| 101 | options.set_log_errors(false); |
| 102 | |
| 103 | TernaryExecutor::Execute<string_t, string_t, string_t, string_t>( |
| 104 | strings, patterns, replaces, result, args.size(), [&](string_t input, string_t pattern, string_t replace) { |
| 105 | RE2 re(CreateStringPiece(pattern), options); |
| 106 | std::string sstring(input.GetData(), input.GetSize()); |
| 107 | RE2::Replace(&sstring, re, CreateStringPiece(replace)); |
| 108 | return StringVector::AddString(result, sstring); |
| 109 | }); |
| 110 | } |
| 111 | |
| 112 | void RegexpFun::RegisterFunction(BuiltinFunctions &set) { |
| 113 | set.AddFunction(ScalarFunction("regexp_full_match" , {SQLType::VARCHAR, SQLType::VARCHAR}, SQLType::BOOLEAN, |
| 114 | regexp_matches_function<RegexFullMatch>, false, regexp_matches_get_bind_function)); |
| 115 | set.AddFunction(ScalarFunction("regexp_matches" , {SQLType::VARCHAR, SQLType::VARCHAR}, SQLType::BOOLEAN, |
| 116 | regexp_matches_function<RegexPartialMatch>, false, |
| 117 | regexp_matches_get_bind_function)); |
| 118 | set.AddFunction(ScalarFunction("regexp_replace" , {SQLType::VARCHAR, SQLType::VARCHAR, SQLType::VARCHAR}, |
| 119 | SQLType::VARCHAR, regexp_replace_function)); |
| 120 | } |
| 121 | |
| 122 | } // namespace duckdb |
| 123 | |