diff --git a/lib/filterx/expr-regexp-subst.c b/lib/filterx/expr-regexp-subst.c
index 59e25798d..ba6d2bb95 100644
--- a/lib/filterx/expr-regexp-subst.c
+++ b/lib/filterx/expr-regexp-subst.c
@@ -35,6 +35,7 @@
 #include "filterx/expr-regexp-common.h"
 #include "compat/pcre.h"
 #include "scratch-buffers.h"
+#include <ctype.h>
 
 DEFINE_FUNC_FLAG_NAMES(FilterXRegexpSubstFlags,
                        FILTERX_FUNC_REGEXP_SUBST_FLAG_JIT_NAME,
@@ -53,6 +54,7 @@ DEFINE_FUNC_FLAG_NAMES(FilterXRegexpSubstFlags,
   FILTERX_FUNC_REGEXP_SUBST_FLAG_NEWLINE_NAME"=(boolean)" \
   FILTERX_FUNC_REGEXP_SUBST_FLAG_GROUPS_NAME"=(boolean))" \
 
+#define FILTERX_FUNC_REGEXP_SUBST_GRP_ID_MAX_DIGITS 3
 
 typedef struct FilterXFuncRegexpSubst_
 {
@@ -63,43 +65,100 @@ typedef struct FilterXFuncRegexpSubst_
   FLAGSET flags;
 } FilterXFuncRegexpSubst;
 
+/*
+ * Finds the next match group reference at or after `from`: a backslash
+ * followed by 1 to FILTERX_FUNC_REGEXP_SUBST_GRP_ID_MAX_DIGITS digits.
+ *
+ * Returns a pointer to the backslash and stores the position right after the
+ * last consumed digit in `*to`. Returns NULL (leaving `*to` untouched) when
+ * `from` is NULL, empty, or contains no further reference.
+ */
+static gchar *
+_next_matchgrp_ref(gchar *from, gchar **to)
+{
+  if (from == NULL || *from == '\0')
+    return NULL;
+  g_assert(to);
+  while (*from != '\0')
+    {
+      if ((*from == '\\') && isdigit((guchar) from[1]))
+        {
+          gchar *start = from;
+          from += 2;
+          while (isdigit((guchar) *from) && from - start <= FILTERX_FUNC_REGEXP_SUBST_GRP_ID_MAX_DIGITS)
+            {
+              from++;
+            }
+          *to = from;
+          return start;
+        }
+      from++;
+    }
+  return NULL;
+}
+
+/*
+ * Parses the reference spanning [from, to) -- a backslash followed only by
+ * digits -- and stores its decimal group index in `*value`.
+ *
+ * Returns FALSE if an argument is NULL, the span is empty or longer than 5
+ * characters, or it is not of that shape; `*value` may be modified even then.
+ */
+static gboolean
+_parse_machgrp_ref(const gchar *from, const gchar *to, gint *value)
+{
+  if (!from || !to || !value || from >= to || to > from + 5)
+    {
+      return FALSE;
+    }
+
+  if (*from != '\\')
+    {
+      return FALSE;
+    }
+
+  from++;
+  *value = 0;
+
+  while (from < to && isdigit((guchar) *from))
+    {
+      *value = (*value * 10) + (*from - '0');
+      from++;
+    }
+
+  return from == to;
+}
+
 static gboolean
 _build_replacement_stirng_with_match_groups(const FilterXFuncRegexpSubst *self, FilterXReMatchState *state,
                                             GString *replacement_string)
 {
-  PCRE2_SIZE *ovector = pcre2_get_ovector_pointer(state->match_data);
   g_string_set_size(replacement_string, 0);
-  const gchar *rep_ptr = self->replacement;
-  const gchar *last_ptr = rep_ptr;
   gint num_grps = state->rc;
+  PCRE2_SIZE *ovector = pcre2_get_ovector_pointer(state->match_data);
 
-  while (*rep_ptr)
+  gchar *pos = self->replacement;
+  gchar *last = pos;
+  gchar *close = NULL;
+  gint idx = -1;
+  while ((pos = _next_matchgrp_ref(pos, &close)) != NULL)
     {
-      if (*rep_ptr == '\\')
+      if (_parse_machgrp_ref(pos, close, &idx) && (idx < num_grps))
         {
-          rep_ptr++;
-          if (*rep_ptr >= '1' && *rep_ptr <= '9')
+          PCRE2_SIZE start = ovector[2 * idx];
+          PCRE2_SIZE end = ovector[2 * idx + 1];
+          if (start != PCRE2_UNSET)
             {
-              gint grp_idx = *rep_ptr - '0';
-              if (grp_idx < num_grps)
-                {
-                  PCRE2_SIZE start = ovector[2 * grp_idx];
-                  PCRE2_SIZE end = ovector[2 * grp_idx + 1];
-                  if (start != PCRE2_UNSET)
-                    {
-                      g_string_append_len(replacement_string, last_ptr, rep_ptr - last_ptr - 1);
-                      last_ptr = rep_ptr + 1;
-                      size_t group_len = end - start;
-                      g_string_append_len(replacement_string, state->lhs_str + start, group_len);
-                    }
-                }
+              g_string_append_len(replacement_string, last, pos - last);
+              last = close;
+              size_t group_len = end - start;
+              g_string_append_len(replacement_string, state->lhs_str + start, group_len);
             }
-          rep_ptr++;
         }
-      else
-        rep_ptr++;
+      pos = close;
     }
-  g_string_append_len(replacement_string, last_ptr, rep_ptr - last_ptr);
+  /* pos is always NULL here, so append the NUL terminated tail without pointer arithmetic */
+  g_string_append(replacement_string, last);
   return TRUE;
 }
 
@@ -117,7 +176,6 @@ _replace_matches(const FilterXFuncRegexpSubst *self, FilterXReMatchState *state)
       _build_replacement_stirng_with_match_groups(self, state, rep_str);
       replacement_string = rep_str->str;
     }
-
   do
     {
       ovector = pcre2_get_ovector_pointer(state->match_data);
@@ -253,6 +311,14 @@ _extract_optional_flags(FilterXFuncRegexpSubst *self, FilterXFunctionArgs *args,
   return TRUE;
 }
 
+/* Returns TRUE if `str` contains at least one match group reference. */
+static gboolean
+_contains_match_grp_ref(gchar *str)
+{
+  gchar *close = NULL;
+  return _next_matchgrp_ref(str, &close) != NULL;
+}
+
 static gboolean
 _extract_subst_args(FilterXFuncRegexpSubst *self, FilterXFunctionArgs *args, GError **error)
 {
@@ -277,7 +343,9 @@ _extract_subst_args(FilterXFuncRegexpSubst *self, FilterXFunctionArgs *args, GEr
   self->replacement = _extract_subst_replacement_arg(args, error);
   if (!self->replacement)
     return FALSE;
-
+  // turn off group mode if there is no match grp ref due to its performance impact
+  if (!_contains_match_grp_ref(self->replacement))
+    set_flag(&self->flags, FILTERX_FUNC_REGEXP_SUBST_FLAG_GROUPS, FALSE);
   return TRUE;
 }
 
@@ -322,7 +390,8 @@ filterx_function_regexp_subst_new(FilterXFunctionArgs *args, GError **error)
   self->super.super.deinit = _subst_deinit;
   self->super.super.free_fn = _subst_free;
 
-  reset_flags(&self->flags, FLAG_VAL(FILTERX_FUNC_REGEXP_SUBST_FLAG_JIT));
+  reset_flags(&self->flags, FLAG_VAL(FILTERX_FUNC_REGEXP_SUBST_FLAG_JIT) | FLAG_VAL(
+                FILTERX_FUNC_REGEXP_SUBST_FLAG_GROUPS));
   if (!_extract_subst_args(self, args, error) ||
       !filterx_function_args_check(args, error))
     goto error;
diff --git a/lib/filterx/tests/test_expr_regexp_subst.c b/lib/filterx/tests/test_expr_regexp_subst.c
index 3270cfadd..ad417744b 100644
--- a/lib/filterx/tests/test_expr_regexp_subst.c
+++ b/lib/filterx/tests/test_expr_regexp_subst.c
@@ -69,9 +69,9 @@ _build_subst_func(const gchar *pattern, const gchar *repr, const gchar *str, Fil
   if (opts.utf8)
     args = g_list_append(args, filterx_function_arg_new(FILTERX_FUNC_REGEXP_SUBST_FLAG_UTF8_NAME,
                                                         filterx_literal_new(filterx_boolean_new(TRUE))));
-  if (opts.groups)
+  if (!opts.groups)
     args = g_list_append(args, filterx_function_arg_new(FILTERX_FUNC_REGEXP_SUBST_FLAG_GROUPS_NAME,
-                                                        filterx_literal_new(filterx_boolean_new(TRUE))));
+                                                        filterx_literal_new(filterx_boolean_new(FALSE))));
 
   GError *err = NULL;
   FilterXExpr *func = filterx_function_regexp_subst_new(filterx_function_args_new(args, NULL), &err);
@@ -350,6 +350,39 @@ Test(filterx_expr_regexp_subst, regexp_subst_group_subst_without_ref)
   filterx_object_unref(result);
 }
 
+Test(filterx_expr_regexp_subst, regexp_subst_group_reference_with_multiple_digits)
+{
+  FilterXFuncRegexpSubstOpts opts = {.groups = TRUE};
+  FilterXObject *result =
+    _sub("(\\d{2})(\\d{2})(\\d{2})(\\d{2})(\\d{2})(\\d{2})(\\d{2})(\\d{2})(\\d{2})(\\d{2})(\\d{2})(\\d{2})",
+         "\\12-\\11-\\10-\\9\\8\\7\\6\\5\\4\\3\\2\\1", "010203040506070809101112", opts);
+  cr_assert(filterx_object_is_type(result, &FILTERX_TYPE_NAME(string)));
+  const gchar *res = filterx_string_get_value_ref(result, NULL);
+  cr_assert_str_eq(res, "12-11-10-090807060504030201");
+  filterx_object_unref(result);
+}
+
+Test(filterx_expr_regexp_subst, regexp_subst_group_do_not_replace_unknown_ref)
+{
+  FilterXFuncRegexpSubstOpts opts = {.groups = TRUE};
+  FilterXObject *result = _sub("(\\d{2})(\\d{2})(\\d{2})",
+                               "\\3\\20\\1", "010203", opts);
+  cr_assert(filterx_object_is_type(result, &FILTERX_TYPE_NAME(string)));
+  const gchar *res = filterx_string_get_value_ref(result, NULL);
+  cr_assert_str_eq(res, "03\\2001");
+  filterx_object_unref(result);
+}
+
+Test(filterx_expr_regexp_subst, regexp_subst_group_limited_digits_and_zero_prefixes)
+{
+  FilterXFuncRegexpSubstOpts opts = {.groups = TRUE};
+  FilterXObject *result = _sub("(\\w+),(\\w+),(\\w+)", "\\3\\02\\0013.14", "baz,bar,foo", opts);
+  cr_assert(filterx_object_is_type(result, &FILTERX_TYPE_NAME(string)));
+  const gchar *res = filterx_string_get_value_ref(result, NULL);
+  cr_assert_str_eq(res, "foobarbaz3.14");
+  filterx_object_unref(result);
+}
+
 static void
 setup(void)
 {
diff --git a/tests/light/functional_tests/filterx/test_filterx.py b/tests/light/functional_tests/filterx/test_filterx.py
index 33b7b5e45..6ce1469ce 100644
--- a/tests/light/functional_tests/filterx/test_filterx.py
+++ b/tests/light/functional_tests/filterx/test_filterx.py
@@ -2004,9 +2004,11 @@ def test_regexp_subst(config, syslog_ng):
         $MSG.orgrp_global = regexp_subst("foobarbaz", "(fo|az)", "!", global=true);
         $MSG.ignore_case_control = regexp_subst("FoObArBaz", "(o|a)", "!", global=true);
         $MSG.ignore_case = regexp_subst("FoObArBaz", "(o|a)", "!", ignorecase=true, global=true);
-        $MSG.groups_off = regexp_subst("25-02-2022", /(\d{2})-(\d{2})-(\d{4})/, "\\3-\\2-\\1");;
-        $MSG.groups_on = regexp_subst("25-02-2022", /(\d{2})-(\d{2})-(\d{4})/, "\\3-\\2-\\1", groups=true);
-        $MSG.mixed_grps = regexp_subst("25-02-2022", /(\d{2})-(\d{2})-(\d{4})/, "foo:\\3-\\2-\\1:bar:baz", groups=true);
+        $MSG.groups_off = regexp_subst("25-02-2022", /(\d{2})-(\d{2})-(\d{4})/, "\\3-\\2-\\1", groups=false);
+        $MSG.groups_on = regexp_subst("25-02-2022", /(\d{2})-(\d{2})-(\d{4})/, "\\3-\\2-\\1");
+        $MSG.mixed_grps = regexp_subst("25-02-2022", /(\d{2})-(\d{2})-(\d{4})/, "foo:\\3-\\2-\\1:bar:baz");
+        $MSG.multi_digit_grps = regexp_subst("010203040506070809101112", /(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})/, "\\10-\\11-\\12");
+        $MSG.prefixing_zeros = regexp_subst("foobar", /^(.*)$/, "\\001012345");
     """,
     )
     syslog_ng.start(config)
@@ -2028,7 +2030,9 @@ def test_regexp_subst(config, syslog_ng):
         r""""ignore_case":"F!!b!rB!z","""
         r""""groups_off":"\\3-\\2-\\1","""
         r""""groups_on":"2022-02-25","""
-        r""""mixed_grps":"foo:2022-02-25:bar:baz"}""" + "\n"
+        r""""mixed_grps":"foo:2022-02-25:bar:baz","""
+        r""""multi_digit_grps":"10-11-12","""
+        r""""prefixing_zeros":"foobar012345"}""" + "\n"
     )
 
     assert file_true.read_log() == exp