From b6162a686fe33fd8e651e004493dc770b3956695 Mon Sep 17 00:00:00 2001 From: Michael Matloka Date: Tue, 19 Mar 2024 13:59:01 +0100 Subject: [PATCH] fix(elements): Avoid matching element tag on classes (#20533) * fix(elements): Avoid matching element tag on classes * Update test_property.py * Make test a bit more robust * Update query snapshots * Update query snapshots --------- Co-authored-by: github-actions <41898282+github-actions[bot]@users.noreply.github.com> --- ...ickhouse_experiment_secondary_results.ambr | 2 +- posthog/hogql/test/test_property.py | 18 ++++---- posthog/models/property/util.py | 23 +++++----- posthog/models/test/test_event_model.py | 42 ++++++++++++++++++- 4 files changed, 64 insertions(+), 21 deletions(-) diff --git a/ee/clickhouse/views/test/__snapshots__/test_clickhouse_experiment_secondary_results.ambr b/ee/clickhouse/views/test/__snapshots__/test_clickhouse_experiment_secondary_results.ambr index 3c6b8ef78c385..54b67fa7d359b 100644 --- a/ee/clickhouse/views/test/__snapshots__/test_clickhouse_experiment_secondary_results.ambr +++ b/ee/clickhouse/views/test/__snapshots__/test_clickhouse_experiment_secondary_results.ambr @@ -1,7 +1,7 @@ # serializer version: 1 # name: ClickhouseTestExperimentSecondaryResults.test_basic_secondary_metric_results ''' - /* user_id:107 celery:posthog.tasks.tasks.sync_insight_caching_state */ + /* user_id:108 celery:posthog.tasks.tasks.sync_insight_caching_state */ SELECT team_id, date_diff('second', max(timestamp), now()) AS age FROM events diff --git a/posthog/hogql/test/test_property.py b/posthog/hogql/test/test_property.py index f271ee5e2f4ff..c50615eb6730a 100644 --- a/posthog/hogql/test/test_property.py +++ b/posthog/hogql/test/test_property.py @@ -431,37 +431,39 @@ def test_tag_name_to_expr(self): def test_selector_to_expr(self): self.assertEqual( self._selector_to_expr("div"), - clear_locations(elements_chain_match('div([-_a-zA-Z0-9\\.:"= ]*?)?($|;|:([^;^\\s]*(;|$|\\s)))')), + clear_locations(elements_chain_match('(^|;)div([-_a-zA-Z0-9\\.:"= ]*?)?($|;|:([^;^\\s]*(;|$|\\s)))')), ) self.assertEqual( self._selector_to_expr("div > div"), clear_locations( elements_chain_match( - 'div([-_a-zA-Z0-9\\.:"= ]*?)?($|;|:([^;^\\s]*(;|$|\\s)))div([-_a-zA-Z0-9\\.:"= ]*?)?($|;|:([^;^\\s]*(;|$|\\s))).*' + '(^|;)div([-_a-zA-Z0-9\\.:"= ]*?)?($|;|:([^;^\\s]*(;|$|\\s)))div([-_a-zA-Z0-9\\.:"= ]*?)?($|;|:([^;^\\s]*(;|$|\\s))).*' ) ), ) self.assertEqual( self._selector_to_expr("a[href='boo']"), clear_locations( - elements_chain_match('a.*?href="boo".*?([-_a-zA-Z0-9\\.:"= ]*?)?($|;|:([^;^\\s]*(;|$|\\s)))') + elements_chain_match('(^|;)a.*?href="boo".*?([-_a-zA-Z0-9\\.:"= ]*?)?($|;|:([^;^\\s]*(;|$|\\s)))') ), ) self.assertEqual( self._selector_to_expr(".class"), - clear_locations(elements_chain_match('.*?\\.class([-_a-zA-Z0-9\\.:"= ]*?)?($|;|:([^;^\\s]*(;|$|\\s)))')), + clear_locations( + elements_chain_match('(^|;).*?\\.class([-_a-zA-Z0-9\\.:"= ]*?)?($|;|:([^;^\\s]*(;|$|\\s)))') + ), ) self.assertEqual( self._selector_to_expr("#withid"), clear_locations( - elements_chain_match('.*?attr_id="withid".*?([-_a-zA-Z0-9\\.:"= ]*?)?($|;|:([^;^\\s]*(;|$|\\s)))') + elements_chain_match('(^|;).*?attr_id="withid".*?([-_a-zA-Z0-9\\.:"= ]*?)?($|;|:([^;^\\s]*(;|$|\\s)))') ), ) self.assertEqual( self._selector_to_expr("#with-dashed-id"), clear_locations( elements_chain_match( - '.*?attr_id="with\\-dashed\\-id".*?([-_a-zA-Z0-9\\.:"= ]*?)?($|;|:([^;^\\s]*(;|$|\\s)))' + '(^|;).*?attr_id="with\\-dashed\\-id".*?([-_a-zA-Z0-9\\.:"= ]*?)?($|;|:([^;^\\s]*(;|$|\\s)))' ) ), ) @@ -473,7 +475,7 @@ def test_selector_to_expr(self): self._selector_to_expr("#with\\slashed\\id"), clear_locations( elements_chain_match( - '.*?attr_id="with\\\\slashed\\\\id".*?([-_a-zA-Z0-9\\.:"= ]*?)?($|;|:([^;^\\s]*(;|$|\\s)))' + '(^|;).*?attr_id="with\\\\slashed\\\\id".*?([-_a-zA-Z0-9\\.:"= ]*?)?($|;|:([^;^\\s]*(;|$|\\s)))' ) ), ) @@ -526,7 +528,7 @@ def test_action_to_expr(self): "event = '$autocapture' and elements_chain =~ {regex1} and elements_chain =~ {regex2}", { "regex1": ast.Constant( - value='a.*?\\.active\\..*?nav\\-link([-_a-zA-Z0-9\\.:"= ]*?)?($|;|:([^;^\\s]*(;|$|\\s)))' + value='(^|;)a.*?\\.active\\..*?nav\\-link([-_a-zA-Z0-9\\.:"= ]*?)?($|;|:([^;^\\s]*(;|$|\\s)))' ), "regex2": ast.Constant(value="(^|;)a(\\.|$|;|:)"), }, diff --git a/posthog/models/property/util.py b/posthog/models/property/util.py index bd684283049c7..cae1be3340eac 100644 --- a/posthog/models/property/util.py +++ b/posthog/models/property/util.py @@ -847,21 +847,24 @@ def process_ok_values(ok_values: Any, operator: OperatorType) -> List[str]: def build_selector_regex(selector: Selector) -> str: regex = r"" for tag in selector.parts: - if tag.data.get("tag_name") and isinstance(tag.data["tag_name"], str): - if tag.data["tag_name"] == "*": - regex += ".+" - else: - regex += re.escape(tag.data["tag_name"]) + if tag.data.get("tag_name") and isinstance(tag.data["tag_name"], str) and tag.data["tag_name"] != "*": + # The elements in the elements_chain are separated by the semicolon + regex += re.escape(tag.data["tag_name"]) if tag.data.get("attr_class__contains"): - regex += r".*?\.{}".format(r"\..*?".join([re.escape(s) for s in sorted(tag.data["attr_class__contains"])])) + regex += r".*?\." + r"\..*?".join([re.escape(s) for s in sorted(tag.data["attr_class__contains"])]) if tag.ch_attributes: - regex += ".*?" + regex += r".*?" for key, value in sorted(tag.ch_attributes.items()): - regex += '{}="{}".*?'.format(re.escape(key), re.escape(str(value))) + regex += rf'{re.escape(key)}="{re.escape(str(value))}".*?' regex += r'([-_a-zA-Z0-9\.:"= ]*?)?($|;|:([^;^\s]*(;|$|\s)))' if tag.direct_descendant: - regex += ".*" - return regex + regex += r".*" + if regex: + # Always start matching at the beginning of an element in the chain string + # This is to avoid issues like matching elements with class "foo" when looking for elements with tag name "foo" + return r"(^|;)" + regex + else: + return r"" class HogQLPropertyChecker(TraversingVisitor): diff --git a/posthog/models/test/test_event_model.py b/posthog/models/test/test_event_model.py index e918291ca15e4..cbe6a2bcad70c 100644 --- a/posthog/models/test/test_event_model.py +++ b/posthog/models/test/test_event_model.py @@ -72,8 +72,15 @@ def _setup_action_selector_events(self): attr_class=["one-class"], ), Element(tag_name="button", nth_child=0, nth_of_type=0), - Element(tag_name="div", nth_child=0, nth_of_type=0), - Element(tag_name="div", nth_child=0, nth_of_type=0, attr_id="nested"), + Element( + # Important that in this hierarchy the div is sandwiched between button and section. + # This way makes sure that any conditions which should match this element also work + # if the element is neither first nor last in the hierarchy. + tag_name="div", + nth_child=0, + nth_of_type=0, + ), + Element(tag_name="section", nth_child=0, nth_of_type=0, attr_id="nested"), ], ) @@ -417,6 +424,37 @@ def test_with_class_with_escaped_slashes(self): self.assertEqual(events[0].uuid, event1_uuid) self.assertEqual(len(events), 1) + def test_with_tag_matching_class_selector(self): + _create_person(distinct_ids=["whatever"], team=self.team) + action1 = Action.objects.create(team=self.team) + ActionStep.objects.create( + event="$autocapture", + action=action1, + selector="input", # This should ONLY match the tag, but not a class named `input` + ) + event_matching_tag_uuid = _create_event( + event="$autocapture", + team=self.team, + distinct_id="whatever", + elements=[ + Element(tag_name="span", attr_class=None), + Element(tag_name="input", attr_class=["button"]), # Should match + ], + ) + _create_event( + event="$autocapture", + team=self.team, + distinct_id="whatever", + elements=[ + Element(tag_name="span", attr_class=None), + Element(tag_name="button", attr_class=["input"]), # Cannot match + ], + ) + + events = _get_events_for_action(action1) + self.assertEqual(len(events), 1) + self.assertEqual(events[0].uuid, event_matching_tag_uuid) + def test_attributes(self): _create_person(distinct_ids=["whatever"], team=self.team) event1_uuid = _create_event(