From a9ad05f5c9ebfd8e8720bc054654116733012455 Mon Sep 17 00:00:00 2001 From: Robbie Coomber Date: Wed, 14 Aug 2024 08:26:53 +0100 Subject: [PATCH 01/13] Use NavigatorUAData and navigator.webdriver to improve bot detection --- src/__tests__/utils.test.ts | 49 ++++++++++++++++++++++++++++-- src/posthog-core.ts | 10 +++---- src/utils/blocked-uas.ts | 60 +++++++++++++++++++++++++++++++++++++ src/utils/globals.ts | 2 ++ 4 files changed, 113 insertions(+), 8 deletions(-) diff --git a/src/__tests__/utils.test.ts b/src/__tests__/utils.test.ts index cdbb5e300..de513f126 100644 --- a/src/__tests__/utils.test.ts +++ b/src/__tests__/utils.test.ts @@ -9,7 +9,8 @@ import { _copyAndTruncateStrings, isCrossDomainCookie, _base64Encode } from '../utils' import { Info } from '../utils/event-utils' -import { isBlockedUA, DEFAULT_BLOCKED_UA_STRS } from '../utils/blocked-uas' +import { isLikelyBot, DEFAULT_BLOCKED_UA_STRS, isBlockedUA } from '../utils/blocked-uas' +import { expect } from '@jest/globals' function userAgentFor(botString: string) { const randOne = (Math.random() + 1).toString(36).substring(7) @@ -103,13 +104,13 @@ describe('utils', () => { }) }) - describe('user agent blocking', () => { + describe('isLikelyBot', () => { it.each(DEFAULT_BLOCKED_UA_STRS.concat('testington'))( 'blocks a bot based on the user agent %s', (botString) => { const randomisedUserAgent = userAgentFor(botString) - expect(isBlockedUA(randomisedUserAgent, ['testington'])).toBe(true) + expect(isLikelyBot({ userAgent: randomisedUserAgent } as Navigator, ['testington'])).toBe(true) } ) @@ -125,10 +126,52 @@ describe('utils', () => { [ 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.6422.175 Safari/537.36 (compatible; Google-HotelAdsVerifier/2.0)', ], + [ + 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/122.0.0.0 Safari/537.36', + ], ])('blocks based on user agent', (botString) => { expect(isBlockedUA(botString, [])).toBe(true) expect(isBlockedUA(botString.toLowerCase(), [])).toBe(true) expect(isBlockedUA(botString.toUpperCase(), [])).toBe(true) + expect(isLikelyBot({ userAgent: botString } as Navigator, [])).toBe(true) + expect(isLikelyBot({ userAgent: botString.toLowerCase() } as Navigator, [])).toBe(true) + expect(isLikelyBot({ userAgent: botString.toUpperCase() } as Navigator, [])).toBe(true) + }) + + it.each([ + ['Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:129.0) Gecko/20100101 Firefox/129.0'], + [ + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36', + ], + [ + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 Safari/605.1.15', + ], + ])('does not block based on non-bot user agent', (userAgent) => { + expect(isBlockedUA(userAgent, [])).toBe(false) + expect(isBlockedUA(userAgent.toLowerCase(), [])).toBe(false) + expect(isBlockedUA(userAgent.toUpperCase(), [])).toBe(false) + expect(isLikelyBot({ userAgent } as Navigator, [])).toBe(false) + expect(isLikelyBot({ userAgent: userAgent.toLowerCase() } as Navigator, [])).toBe(false) + expect(isLikelyBot({ userAgent: userAgent.toUpperCase() } as Navigator, [])).toBe(false) + }) + + it('blocks based on the webdriver property being set to true', () => { + expect(isLikelyBot({ webdriver: true } as Navigator, [])).toBe(true) + }) + + it('blocks based on hints in userAgentData', () => { + expect(isLikelyBot({ userAgentData: { brands: ['Googlebot'] } } as Navigator, [])).toBe(true) + expect(isLikelyBot({ userAgentData: { brands: ['HeadlessChrome'] } } as Navigator, [])).toBe(true) + }) + + it('does not crash if the type of navigatorUAData changes', () => { + isLikelyBot({ userAgentData: { brands: [() => 'HeadlessChrome'] } } as Navigator, []) + isLikelyBot({ userAgentData: { brands: () => ['HeadlessChrome'] } } as unknown as Navigator, []) + isLikelyBot({ userAgentData: 'HeadlessChrome' } as unknown as Navigator, []) + isLikelyBot({ userAgentData: {} } as unknown as Navigator, []) + isLikelyBot({ userAgentData: null } as unknown as Navigator, []) + isLikelyBot({ userAgentData: () => ['HeadlessChrome'] } as unknown as Navigator, []) + isLikelyBot({ userAgentData: true } as unknown as Navigator, []) }) }) diff --git a/src/posthog-core.ts b/src/posthog-core.ts index 16f21e721..e8d151f60 100644 --- a/src/posthog-core.ts +++ b/src/posthog-core.ts @@ -10,7 +10,7 @@ import { isCrossDomainCookie, isDistinctIdStringLike, } from './utils' -import { assignableWindow, document, location, userAgent, window } from './utils/globals' +import { assignableWindow, document, location, navigator, userAgent, window } from './utils/globals' import { PostHogFeatureFlags } from './posthog-featureflags' import { PostHogPersistence } from './posthog-persistence' import { @@ -66,7 +66,7 @@ import { import { Info } from './utils/event-utils' import { logger } from './utils/logger' import { SessionPropsManager } from './session-props' -import { isBlockedUA } from './utils/blocked-uas' +import { isLikelyBot } from './utils/blocked-uas' import { extendURLParams, request, SUPPORTS_REQUEST } from './request' import { Heatmaps } from './heatmaps' import { ScrollManager } from './scroll-manager' @@ -772,9 +772,9 @@ export class PostHog { } if ( - userAgent && + navigator && !this.config.opt_out_useragent_filter && - isBlockedUA(userAgent, this.config.custom_blocked_useragents) + isLikelyBot(navigator, this.config.custom_blocked_useragents) ) { return } @@ -930,7 +930,7 @@ export class PostHog { // this is only added when this.config.opt_out_useragent_filter is true, // or it would always add "browser" if (userAgent && this.config.opt_out_useragent_filter) { - properties['$browser_type'] = isBlockedUA(userAgent, this.config.custom_blocked_useragents) + properties['$browser_type'] = isLikelyBot(navigator, this.config.custom_blocked_useragents) ? 'bot' : 'browser' } diff --git a/src/utils/blocked-uas.ts b/src/utils/blocked-uas.ts index d01da0e56..9cfd2fb85 100644 --- a/src/utils/blocked-uas.ts +++ b/src/utils/blocked-uas.ts @@ -1,3 +1,5 @@ +import { isArray } from './type-utils' + export const DEFAULT_BLOCKED_UA_STRS = [ 'ahrefsbot', 'ahrefssiteaudit', @@ -34,6 +36,9 @@ export const DEFAULT_BLOCKED_UA_STRS = [ 'yahoo! slurp', 'yandexbot', + // headless browsers + 'headlesschrome', + // a whole bunch of goog-specific crawlers // https://developers.google.com/search/docs/advanced/crawling/overview-google-crawlers 'Google-HotelAdsVerifier', @@ -66,3 +71,58 @@ export const isBlockedUA = function (ua: string, customBlockedUserAgents: string return uaLower.indexOf(blockedUaLower) !== -1 }) } + +// There's more in the type, but this is all we use. It's currently experimental, see +// https://developer.mozilla.org/en-US/docs/Web/API/Navigator/userAgentData +// if you're reading this in the future, when it's no longer experimental, please remove this type and use an official one. +// Be extremely defensive here to ensure backwards and *forwards* compatibility, and remove this defensiveness in the +// future when it is safe to do so. +export interface NavigatorUAData { + brands?: unknown[] +} +declare global { + interface Navigator { + userAgentData?: NavigatorUAData + } +} + +export const isLikelyBot = function (navigator: Navigator | undefined, customBlockedUserAgents: string[]): boolean { + if (!navigator) { + return false + } + const ua = navigator.userAgent + if (ua) { + if (isBlockedUA(ua, customBlockedUserAgents)) { + return true + } + } + try { + if ('userAgentData' in navigator) { + const uaData = navigator.userAgentData as NavigatorUAData + if ( + uaData.brands && + isArray(uaData.brands) && + uaData.brands.some((brand) => typeof brand === 'string' && isBlockedUA(brand, customBlockedUserAgents)) + ) { + return true + } + } + } catch { + // ignore the error, we were using experimental browser features + } + + if ('webdriver' in navigator && navigator.webdriver) { + return true + } + + // There's some more enhancements we could make in this area, e.g. it's possible to check if Chrome dev tools are + // open, which will detect some bots that are trying to mask themselves and might get past the checks above. + // However, this would give false positives for actual humans who have dev tools open. + + // We could also use the data in navigator.userAgentData.getHighEntropyValues() to detect bots, but we should wait + // until this stops being experimental. The MDN docs imply that this might eventually require user permission. + // See https://developer.mozilla.org/en-US/docs/Web/API/NavigatorUAData/getHighEntropyValues + // It would be very bad if posthog-js caused a permission prompt to appear on every page load. + + return false +} diff --git a/src/utils/globals.ts b/src/utils/globals.ts index 48472df92..f1b744f7c 100644 --- a/src/utils/globals.ts +++ b/src/utils/globals.ts @@ -25,6 +25,8 @@ export const XMLHttpRequest = global?.XMLHttpRequest && 'withCredentials' in new global.XMLHttpRequest() ? global.XMLHttpRequest : undefined export const AbortController = global?.AbortController export const userAgent = navigator?.userAgent + +export const userAgentData = (navigator as any)?.userAgentData //experimental, see https://developer.mozilla.org/en-US/docs/Web/API/Navigator/userAgentData export const assignableWindow: Window & typeof globalThis & Record = win ?? ({} as any) export { win as window } From 31ea43ab447864315510070f50bad2eb7a527ca7 Mon Sep 17 00:00:00 2001 From: Robbie Coomber Date: Wed, 14 Aug 2024 09:51:26 +0100 Subject: [PATCH 02/13] Use real values of brands from headless and regular chrome in tests --- playground/nextjs/package.json | 2 +- playground/nextjs/pages/ua.tsx | 32 +++++++++++++++++++++++++++++ src/__tests__/utils.test.ts | 37 +++++++++++++++++++++++++++++++--- src/utils/blocked-uas.ts | 9 ++++++++- 4 files changed, 75 insertions(+), 5 deletions(-) create mode 100644 playground/nextjs/pages/ua.tsx diff --git a/playground/nextjs/package.json b/playground/nextjs/package.json index 557149376..4811df568 100644 --- a/playground/nextjs/package.json +++ b/playground/nextjs/package.json @@ -4,7 +4,7 @@ "private": true, "scripts": { "clean-react": "cd ../../react && rm -rf ./node_modules/", - "dev": "pnpm run link-posthog-js && pnpm run clean-react && next dev --experimental-https", + "dev": "pnpm run link-posthog-js && pnpm run clean-react && next dev", "dev-crossdomain": "pnpm run link-posthog-js && pnpm run clean-react && NEXT_PUBLIC_CROSSDOMAIN=1 next dev --experimental-https", "build": "pnpm run build-posthog-js && pnpm run link-posthog-js && pnpm run clean-react && next build", "start": "next start", diff --git a/playground/nextjs/pages/ua.tsx b/playground/nextjs/pages/ua.tsx new file mode 100644 index 000000000..96f749112 --- /dev/null +++ b/playground/nextjs/pages/ua.tsx @@ -0,0 +1,32 @@ +import { useEffect, useState } from 'react' + +// Try this page with some of the following commands: +// chrome --headless --disable-gpu --print-to-pdf http://localhost:3000/ua --virtual-time-budget=10000 +// chrome --headless --disable-gpu --print-to-pdf http://localhost:3000/ua --virtual-time-budget=10000 --user-agent="RealHuman" + +export default function Home() { + const [isClient, setIsClient] = useState(false) + useEffect(() => { + setIsClient(true) + }, []) + if (!isClient) { + return
Not client
+ } + return ( +
+
UA
+
+ {navigator.userAgent} +
+
WebDriver
+
+ {String(navigator.webdriver)} +
+
NavigatorUAData brands
+
+ {/* eslint-disable-next-line compat/compat */} + {JSON.stringify((navigator as any).userAgentData?.brands)} +
+
+ ) +} diff --git a/src/__tests__/utils.test.ts b/src/__tests__/utils.test.ts index de513f126..b6216bfee 100644 --- a/src/__tests__/utils.test.ts +++ b/src/__tests__/utils.test.ts @@ -159,12 +159,43 @@ describe('utils', () => { expect(isLikelyBot({ webdriver: true } as Navigator, [])).toBe(true) }) - it('blocks based on hints in userAgentData', () => { - expect(isLikelyBot({ userAgentData: { brands: ['Googlebot'] } } as Navigator, [])).toBe(true) - expect(isLikelyBot({ userAgentData: { brands: ['HeadlessChrome'] } } as Navigator, [])).toBe(true) + it('blocks based on userAgentData', () => { + expect( + isLikelyBot( + { + userAgentData: { + brands: [ + { brand: 'Not)A;Brand', version: '99' }, + { brand: 'HeadlessChrome', version: '127' }, + { brand: 'Chromium', version: '127' }, + ], + }, + } as Navigator, + [] + ) + ).toBe(true) + }) + + it('does not block a normal browser based of userAgentData', () => { + expect( + isLikelyBot( + { + userAgentData: { + brands: [ + { brand: 'Not)A;Brand', version: '99' }, + { brand: 'Google Chrome', version: '127' }, + { brand: 'Chromium', version: '127' }, + ], + }, + } as Navigator, + [] + ) + ).toBe(false) }) it('does not crash if the type of navigatorUAData changes', () => { + // we're not checking the return values of these, only that they don't crash + isLikelyBot({ userAgentData: { brands: ['HeadlessChrome'] } } as Navigator, []) isLikelyBot({ userAgentData: { brands: [() => 'HeadlessChrome'] } } as Navigator, []) isLikelyBot({ userAgentData: { brands: () => ['HeadlessChrome'] } } as unknown as Navigator, []) isLikelyBot({ userAgentData: 'HeadlessChrome' } as unknown as Navigator, []) diff --git a/src/utils/blocked-uas.ts b/src/utils/blocked-uas.ts index 9cfd2fb85..d59fd5242 100644 --- a/src/utils/blocked-uas.ts +++ b/src/utils/blocked-uas.ts @@ -102,7 +102,14 @@ export const isLikelyBot = function (navigator: Navigator | undefined, customBlo if ( uaData.brands && isArray(uaData.brands) && - uaData.brands.some((brand) => typeof brand === 'string' && isBlockedUA(brand, customBlockedUserAgents)) + uaData.brands.some( + (brandObj: unknown) => + typeof brandObj === 'object' && + brandObj && + 'brand' in brandObj && + typeof brandObj.brand === 'string' && + isBlockedUA(brandObj.brand, customBlockedUserAgents) + ) ) { return true } From 4562f64d43494e0a3333446b6dd205902c120abc Mon Sep 17 00:00:00 2001 From: Robbie Coomber Date: Wed, 14 Aug 2024 09:59:16 +0100 Subject: [PATCH 03/13] Reduce bundle size slightly --- src/__tests__/utils.test.ts | 34 +++++++++++++++++++--------------- src/utils/blocked-uas.ts | 19 ++++++------------- 2 files changed, 25 insertions(+), 28 deletions(-) diff --git a/src/__tests__/utils.test.ts b/src/__tests__/utils.test.ts index b6216bfee..de93b314c 100644 --- a/src/__tests__/utils.test.ts +++ b/src/__tests__/utils.test.ts @@ -9,7 +9,7 @@ import { _copyAndTruncateStrings, isCrossDomainCookie, _base64Encode } from '../utils' import { Info } from '../utils/event-utils' -import { isLikelyBot, DEFAULT_BLOCKED_UA_STRS, isBlockedUA } from '../utils/blocked-uas' +import { isLikelyBot, DEFAULT_BLOCKED_UA_STRS, isBlockedUA, NavigatorUAData } from '../utils/blocked-uas' import { expect } from '@jest/globals' function userAgentFor(botString: string) { @@ -160,16 +160,17 @@ describe('utils', () => { }) it('blocks based on userAgentData', () => { + const headlessUserAgentData: NavigatorUAData = { + brands: [ + { brand: 'Not)A;Brand', version: '99' }, + { brand: 'HeadlessChrome', version: '127' }, + { brand: 'Chromium', version: '127' }, + ], + } expect( isLikelyBot( { - userAgentData: { - brands: [ - { brand: 'Not)A;Brand', version: '99' }, - { brand: 'HeadlessChrome', version: '127' }, - { brand: 'Chromium', version: '127' }, - ], - }, + userAgentData: headlessUserAgentData, } as Navigator, [] ) @@ -177,16 +178,17 @@ describe('utils', () => { }) it('does not block a normal browser based of userAgentData', () => { + const realUserAgentData: NavigatorUAData = { + brands: [ + { brand: 'Not)A;Brand', version: '99' }, + { brand: 'Google Chrome', version: '127' }, + { brand: 'Chromium', version: '127' }, + ], + } expect( isLikelyBot( { - userAgentData: { - brands: [ - { brand: 'Not)A;Brand', version: '99' }, - { brand: 'Google Chrome', version: '127' }, - { brand: 'Chromium', version: '127' }, - ], - }, + userAgentData: realUserAgentData, } as Navigator, [] ) @@ -195,7 +197,9 @@ describe('utils', () => { it('does not crash if the type of navigatorUAData changes', () => { // we're not checking the return values of these, only that they don't crash + // @ts-expect-error testing invalid data isLikelyBot({ userAgentData: { brands: ['HeadlessChrome'] } } as Navigator, []) + // @ts-expect-error testing invalid data isLikelyBot({ userAgentData: { brands: [() => 'HeadlessChrome'] } } as Navigator, []) isLikelyBot({ userAgentData: { brands: () => ['HeadlessChrome'] } } as unknown as Navigator, []) isLikelyBot({ userAgentData: 'HeadlessChrome' } as unknown as Navigator, []) diff --git a/src/utils/blocked-uas.ts b/src/utils/blocked-uas.ts index d59fd5242..7e4af1d9d 100644 --- a/src/utils/blocked-uas.ts +++ b/src/utils/blocked-uas.ts @@ -1,5 +1,3 @@ -import { isArray } from './type-utils' - export const DEFAULT_BLOCKED_UA_STRS = [ 'ahrefsbot', 'ahrefssiteaudit', @@ -78,7 +76,10 @@ export const isBlockedUA = function (ua: string, customBlockedUserAgents: string // Be extremely defensive here to ensure backwards and *forwards* compatibility, and remove this defensiveness in the // future when it is safe to do so. export interface NavigatorUAData { - brands?: unknown[] + brands?: { + brand: string + version: string + }[] } declare global { interface Navigator { @@ -100,16 +101,8 @@ export const isLikelyBot = function (navigator: Navigator | undefined, customBlo if ('userAgentData' in navigator) { const uaData = navigator.userAgentData as NavigatorUAData if ( - uaData.brands && - isArray(uaData.brands) && - uaData.brands.some( - (brandObj: unknown) => - typeof brandObj === 'object' && - brandObj && - 'brand' in brandObj && - typeof brandObj.brand === 'string' && - isBlockedUA(brandObj.brand, customBlockedUserAgents) - ) + uaData?.brands && + uaData.brands.some((brandObj) => isBlockedUA(brandObj?.brand, customBlockedUserAgents)) ) { return true } From 8cb7d1e4cfcee8c408c7923fe2996fbc059020e0 Mon Sep 17 00:00:00 2001 From: Robbie Coomber Date: Wed, 14 Aug 2024 10:00:26 +0100 Subject: [PATCH 04/13] Remove global --- src/utils/globals.ts | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/utils/globals.ts b/src/utils/globals.ts index f1b744f7c..48472df92 100644 --- a/src/utils/globals.ts +++ b/src/utils/globals.ts @@ -25,8 +25,6 @@ export const XMLHttpRequest = global?.XMLHttpRequest && 'withCredentials' in new global.XMLHttpRequest() ? global.XMLHttpRequest : undefined export const AbortController = global?.AbortController export const userAgent = navigator?.userAgent - -export const userAgentData = (navigator as any)?.userAgentData //experimental, see https://developer.mozilla.org/en-US/docs/Web/API/Navigator/userAgentData export const assignableWindow: Window & typeof globalThis & Record = win ?? ({} as any) export { win as window } From d9c862448979865415ca744ac5b777a29434b923 Mon Sep 17 00:00:00 2001 From: Robbie Coomber Date: Wed, 14 Aug 2024 10:08:05 +0100 Subject: [PATCH 05/13] Fix test setup --- src/__tests__/posthog-core.ts | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/src/__tests__/posthog-core.ts b/src/__tests__/posthog-core.ts index 2ce103037..0662c9880 100644 --- a/src/__tests__/posthog-core.ts +++ b/src/__tests__/posthog-core.ts @@ -144,24 +144,28 @@ describe('posthog core', () => { }) it('respects opt_out_useragent_filter (default: false)', () => { - const originalUseragent = globals.userAgent - ;(globals as any)['userAgent'] = - 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Googlebot/2.1; +http://www.google.com/bot.html) Chrome/W.X.Y.Z Safari/537.36' - + const originalNavigator = globals.navigator + ;(globals as any).navigator = { + ...globals.navigator, + userAgent: + 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Googlebot/2.1; +http://www.google.com/bot.html) Chrome/W.X.Y.Z Safari/537.36', + } const hook = jest.fn() const posthog = posthogWith(defaultConfig, defaultOverrides) posthog._addCaptureHook(hook) posthog.capture(eventName, {}, {}) expect(hook).not.toHaveBeenCalledWith('$event') - ;(globals as any)['userAgent'] = originalUseragent + ;(globals as any)['navigator'] = originalNavigator }) it('respects opt_out_useragent_filter', () => { - const originalUseragent = globals.userAgent - - ;(globals as any)['userAgent'] = - 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Googlebot/2.1; +http://www.google.com/bot.html) Chrome/W.X.Y.Z Safari/537.36' + const originalNavigator = globals.navigator + ;(globals as any).navigator = { + ...globals.navigator, + userAgent: + 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Googlebot/2.1; +http://www.google.com/bot.html) Chrome/W.X.Y.Z Safari/537.36', + } const hook = jest.fn() const posthog = posthogWith( @@ -184,7 +188,7 @@ describe('posthog core', () => { }) ) expect(event.properties['$browser_type']).toEqual('bot') - ;(globals as any)['userAgent'] = originalUseragent + ;(globals as any)['navigator'] = originalNavigator }) it('truncates long properties', () => { From 1c87ed36f529953f353a718b194f0d0e36c49399 Mon Sep 17 00:00:00 2001 From: Robbie Coomber Date: Wed, 14 Aug 2024 11:07:09 +0100 Subject: [PATCH 06/13] Opt out of bot filtering in cypress --- cypress/support/setup.ts | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cypress/support/setup.ts b/cypress/support/setup.ts index 2e31e880c..655b3e070 100644 --- a/cypress/support/setup.ts +++ b/cypress/support/setup.ts @@ -39,7 +39,10 @@ export const start = ({ cy.visit(url) if (initPosthog) { - cy.posthogInit(options) + cy.posthogInit({ + opt_out_useragent_filter: true, // we ARE a bot, so we need to enable this opt-out + ...options, + }) } if (resetOnInit) { From e6ecd883d487eb6370d20a4d0850910f69a97cb1 Mon Sep 17 00:00:00 2001 From: Robbie Coomber Date: Wed, 14 Aug 2024 12:54:45 +0100 Subject: [PATCH 07/13] Make _is_likely_bot an instance function on posthog, use it in cypress tests --- cypress/e2e/ua.cy.ts | 19 +++++++++++++++++++ cypress/support/commands.ts | 1 + cypress/support/index.ts | 1 + cypress/tsconfig.json | 2 +- package.json | 1 + pnpm-lock.yaml | 7 +++++++ src/posthog-core.ts | 18 ++++++++++-------- 7 files changed, 40 insertions(+), 9 deletions(-) create mode 100644 cypress/e2e/ua.cy.ts diff --git a/cypress/e2e/ua.cy.ts b/cypress/e2e/ua.cy.ts new file mode 100644 index 000000000..cb5a3b966 --- /dev/null +++ b/cypress/e2e/ua.cy.ts @@ -0,0 +1,19 @@ +/// +import { start } from '../support/setup' +import '@cypress/skip-test/support' +import '@cypress/skip-test' + +describe('User Agent Blocking', () => { + it('should pick up that our automated cypress tests are indeed bot traffic', async () => { + // @ts-expect-error skipOn types don't work + cy.skipOn('windows') + start({}) + + // @ts-expect-error awaiting a cypress chainable + const isLikelyBot = await cy.window().then((win) => { + return win.eval('window.posthog._is_likely_bot()') + }) + + expect(isLikelyBot).to.eql(true) + }) +}) diff --git a/cypress/support/commands.ts b/cypress/support/commands.ts index 2c82d7f45..11a219bb2 100644 --- a/cypress/support/commands.ts +++ b/cypress/support/commands.ts @@ -13,6 +13,7 @@ Cypress.Commands.add('posthogInit', (options) => { $captures.push(event) $fullCaptures.push(eventData) }, + opt_out_useragent_filter: true, ...options, }) }) diff --git a/cypress/support/index.ts b/cypress/support/index.ts index 22e18cfb6..82dec1097 100644 --- a/cypress/support/index.ts +++ b/cypress/support/index.ts @@ -2,6 +2,7 @@ import { PostHog } from '../../src/posthog-core' import { PostHogConfig } from '../../src/types' +import '@cypress/skip-test' declare global { // eslint-disable-next-line @typescript-eslint/no-namespace diff --git a/cypress/tsconfig.json b/cypress/tsconfig.json index 02963d41b..3592f862f 100644 --- a/cypress/tsconfig.json +++ b/cypress/tsconfig.json @@ -1,7 +1,7 @@ { "compilerOptions": { "target": "es2015", - "lib": ["es5", "dom"], + "lib": ["es5", "dom", "es2015"], "types": ["cypress", "node"], "moduleResolution": "node" }, diff --git a/package.json b/package.json index 56efc1c7e..61ee98ab2 100644 --- a/package.json +++ b/package.json @@ -43,6 +43,7 @@ "@babel/plugin-transform-react-jsx": "^7.23.4", "@babel/preset-env": "7.18.9", "@babel/preset-typescript": "^7.18.6", + "@cypress/skip-test": "^2.6.1", "@jest/globals": "^27.5.1", "@rollup/plugin-babel": "^6.0.4", "@rollup/plugin-json": "^6.1.0", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index b5384b46a..276578390 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -36,6 +36,9 @@ devDependencies: '@babel/preset-typescript': specifier: ^7.18.6 version: 7.18.6(@babel/core@7.18.9) + '@cypress/skip-test': + specifier: ^2.6.1 + version: 2.6.1 '@jest/globals': specifier: ^27.5.1 version: 27.5.1 @@ -1656,6 +1659,10 @@ packages: uuid: 8.3.2 dev: true + /@cypress/skip-test@2.6.1: + resolution: {integrity: sha512-X+ibefBiuOmC5gKG91wRIT0/OqXeETYvu7zXktjZ3yLeO186Y8ia0K7/gQUpAwuUi28DuqMd1+7tBQVtPkzbPA==} + dev: true + /@cypress/xvfb@1.2.4(supports-color@8.1.1): resolution: {integrity: sha512-skbBzPggOVYCbnGgV+0dmBdW/s77ZkAOXIC1knS8NagwDjBrNC1LuXtQJeiN6l+m7lzmHtaoUw/ctJKdqkG57Q==} dependencies: diff --git a/src/posthog-core.ts b/src/posthog-core.ts index e8d151f60..dcbce88c6 100644 --- a/src/posthog-core.ts +++ b/src/posthog-core.ts @@ -771,11 +771,7 @@ export class PostHog { return } - if ( - navigator && - !this.config.opt_out_useragent_filter && - isLikelyBot(navigator, this.config.custom_blocked_useragents) - ) { + if (!this.config.opt_out_useragent_filter && this._is_likely_bot()) { return } @@ -930,9 +926,7 @@ export class PostHog { // this is only added when this.config.opt_out_useragent_filter is true, // or it would always add "browser" if (userAgent && this.config.opt_out_useragent_filter) { - properties['$browser_type'] = isLikelyBot(navigator, this.config.custom_blocked_useragents) - ? 'bot' - : 'browser' + properties['$browser_type'] = this._is_likely_bot() ? 'bot' : 'browser' } // note: extend writes to the first object, so lets make sure we @@ -2038,6 +2032,14 @@ export class PostHog { this._sync_opt_out_with_persistence() } + _is_likely_bot(): boolean | undefined { + if (navigator) { + return isLikelyBot(navigator, this.config.custom_blocked_useragents) + } else { + return undefined + } + } + debug(debug?: boolean): void { if (debug === false) { window?.console.log("You've disabled debug mode.") From cf3d1eb85453f6960d945429e8aec25865c2f4ec Mon Sep 17 00:00:00 2001 From: Robbie Coomber Date: Wed, 14 Aug 2024 12:55:50 +0100 Subject: [PATCH 08/13] Fix cypress imports --- cypress/e2e/ua.cy.ts | 2 -- cypress/support/e2e.ts | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/cypress/e2e/ua.cy.ts b/cypress/e2e/ua.cy.ts index cb5a3b966..e54d0ad50 100644 --- a/cypress/e2e/ua.cy.ts +++ b/cypress/e2e/ua.cy.ts @@ -1,7 +1,5 @@ /// import { start } from '../support/setup' -import '@cypress/skip-test/support' -import '@cypress/skip-test' describe('User Agent Blocking', () => { it('should pick up that our automated cypress tests are indeed bot traffic', async () => { diff --git a/cypress/support/e2e.ts b/cypress/support/e2e.ts index 429a80322..abff144f2 100644 --- a/cypress/support/e2e.ts +++ b/cypress/support/e2e.ts @@ -1,4 +1,5 @@ import './commands' +import '@cypress/skip-test/support' // Add console errors into cypress logs. Cypress.on('window:before:load', (win) => { From a438f7e1cd592b979eef4d0769ffd9ca27779f91 Mon Sep 17 00:00:00 2001 From: Robbie Coomber Date: Wed, 14 Aug 2024 13:04:38 +0100 Subject: [PATCH 09/13] Code golf --- src/utils/blocked-uas.ts | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/src/utils/blocked-uas.ts b/src/utils/blocked-uas.ts index 7e4af1d9d..a80653c82 100644 --- a/src/utils/blocked-uas.ts +++ b/src/utils/blocked-uas.ts @@ -98,22 +98,16 @@ export const isLikelyBot = function (navigator: Navigator | undefined, customBlo } } try { - if ('userAgentData' in navigator) { - const uaData = navigator.userAgentData as NavigatorUAData - if ( - uaData?.brands && - uaData.brands.some((brandObj) => isBlockedUA(brandObj?.brand, customBlockedUserAgents)) - ) { - return true - } + // eslint-disable-next-line compat/compat + const uaData = navigator?.userAgentData as NavigatorUAData + if (uaData?.brands && uaData.brands.some((brandObj) => isBlockedUA(brandObj?.brand, customBlockedUserAgents))) { + return true } } catch { // ignore the error, we were using experimental browser features } - if ('webdriver' in navigator && navigator.webdriver) { - return true - } + return !!navigator.webdriver // There's some more enhancements we could make in this area, e.g. it's possible to check if Chrome dev tools are // open, which will detect some bots that are trying to mask themselves and might get past the checks above. @@ -123,6 +117,4 @@ export const isLikelyBot = function (navigator: Navigator | undefined, customBlo // until this stops being experimental. The MDN docs imply that this might eventually require user permission. // See https://developer.mozilla.org/en-US/docs/Web/API/NavigatorUAData/getHighEntropyValues // It would be very bad if posthog-js caused a permission prompt to appear on every page load. - - return false } From 5c8e6c2b2dfe75911e5464c5cb135f28dc7a7833 Mon Sep 17 00:00:00 2001 From: Robbie Coomber Date: Wed, 14 Aug 2024 13:06:43 +0100 Subject: [PATCH 10/13] More code golf --- cypress/e2e/ua.cy.ts | 2 +- src/posthog-core.ts | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cypress/e2e/ua.cy.ts b/cypress/e2e/ua.cy.ts index e54d0ad50..4b177730a 100644 --- a/cypress/e2e/ua.cy.ts +++ b/cypress/e2e/ua.cy.ts @@ -9,7 +9,7 @@ describe('User Agent Blocking', () => { // @ts-expect-error awaiting a cypress chainable const isLikelyBot = await cy.window().then((win) => { - return win.eval('window.posthog._is_likely_bot()') + return win.eval('window.posthog._is_bot()') }) expect(isLikelyBot).to.eql(true) diff --git a/src/posthog-core.ts b/src/posthog-core.ts index dcbce88c6..d01ab5e9e 100644 --- a/src/posthog-core.ts +++ b/src/posthog-core.ts @@ -771,7 +771,7 @@ export class PostHog { return } - if (!this.config.opt_out_useragent_filter && this._is_likely_bot()) { + if (!this.config.opt_out_useragent_filter && this._is_bot()) { return } @@ -926,7 +926,7 @@ export class PostHog { // this is only added when this.config.opt_out_useragent_filter is true, // or it would always add "browser" if (userAgent && this.config.opt_out_useragent_filter) { - properties['$browser_type'] = this._is_likely_bot() ? 'bot' : 'browser' + properties['$browser_type'] = this._is_bot() ? 'bot' : 'browser' } // note: extend writes to the first object, so lets make sure we @@ -2032,7 +2032,7 @@ export class PostHog { this._sync_opt_out_with_persistence() } - _is_likely_bot(): boolean | undefined { + _is_bot(): boolean | undefined { if (navigator) { return isLikelyBot(navigator, this.config.custom_blocked_useragents) } else { From b581619f9d1a4804a3a423974043a240295dd7e1 Mon Sep 17 00:00:00 2001 From: Robbie Coomber Date: Wed, 14 Aug 2024 13:14:24 +0100 Subject: [PATCH 11/13] Fix testcafe tests --- testcafe/helpers.js | 1 + 1 file changed, 1 insertion(+) diff --git a/testcafe/helpers.js b/testcafe/helpers.js index a4a239698..9dff4711e 100644 --- a/testcafe/helpers.js +++ b/testcafe/helpers.js @@ -57,6 +57,7 @@ export const initPosthog = (config) => { distinctID: 'automated-tester', // We set this to get around the ingestion delay for new distinctIDs isIdentifiedID: true, }, + opt_out_useragent_filter: true, }) } From 7673f831d72a5fc3da3977643168169b91c8c9c1 Mon Sep 17 00:00:00 2001 From: Robbie Coomber Date: Wed, 14 Aug 2024 13:35:39 +0100 Subject: [PATCH 12/13] Treat cypress as a bot --- src/__tests__/utils.test.ts | 6 ++++++ src/utils/blocked-uas.ts | 2 ++ 2 files changed, 8 insertions(+) diff --git a/src/__tests__/utils.test.ts b/src/__tests__/utils.test.ts index de93b314c..a8ae87d6b 100644 --- a/src/__tests__/utils.test.ts +++ b/src/__tests__/utils.test.ts @@ -129,6 +129,9 @@ describe('utils', () => { [ 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/122.0.0.0 Safari/537.36', ], + [ + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Cypress/13.6.3 Chrome/114.0.5735.289 Electron/25.8.4 Safari/537.36', + ], ])('blocks based on user agent', (botString) => { expect(isBlockedUA(botString, [])).toBe(true) expect(isBlockedUA(botString.toLowerCase(), [])).toBe(true) @@ -146,6 +149,9 @@ describe('utils', () => { [ 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 Safari/605.1.15', ], + [ + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) elec/1.0.0 Chrome/126.0.6478.127 Electron/31.2.1 Safari/537.36', + ], ])('does not block based on non-bot user agent', (userAgent) => { expect(isBlockedUA(userAgent, [])).toBe(false) expect(isBlockedUA(userAgent.toLowerCase(), [])).toBe(false) diff --git a/src/utils/blocked-uas.ts b/src/utils/blocked-uas.ts index a80653c82..fd0223d73 100644 --- a/src/utils/blocked-uas.ts +++ b/src/utils/blocked-uas.ts @@ -36,6 +36,8 @@ export const DEFAULT_BLOCKED_UA_STRS = [ // headless browsers 'headlesschrome', + 'cypress', + // we don't block electron here, as many customers use posthog-js in electron apps // a whole bunch of goog-specific crawlers // https://developers.google.com/search/docs/advanced/crawling/overview-google-crawlers From 2efe0eb4eec747fe891231de5cafde1c33b2eb6a Mon Sep 17 00:00:00 2001 From: Robbie Coomber Date: Wed, 14 Aug 2024 13:56:09 +0100 Subject: [PATCH 13/13] Fix cypress chaining logic --- cypress/e2e/ua.cy.ts | 9 +++------ package.json | 4 ++-- pnpm-lock.yaml | 28 +++++++++++++--------------- 3 files changed, 18 insertions(+), 23 deletions(-) diff --git a/cypress/e2e/ua.cy.ts b/cypress/e2e/ua.cy.ts index 4b177730a..6df99b8ec 100644 --- a/cypress/e2e/ua.cy.ts +++ b/cypress/e2e/ua.cy.ts @@ -3,15 +3,12 @@ import { start } from '../support/setup' describe('User Agent Blocking', () => { it('should pick up that our automated cypress tests are indeed bot traffic', async () => { - // @ts-expect-error skipOn types don't work cy.skipOn('windows') start({}) - // @ts-expect-error awaiting a cypress chainable - const isLikelyBot = await cy.window().then((win) => { - return win.eval('window.posthog._is_bot()') + cy.window().then((win) => { + const isLikelyBot = win.eval('window.posthog._is_bot()') + expect(isLikelyBot).to.eql(true) }) - - expect(isLikelyBot).to.eql(true) }) }) diff --git a/package.json b/package.json index 61ee98ab2..7650e4e18 100644 --- a/package.json +++ b/package.json @@ -65,8 +65,8 @@ "babel-eslint": "10.1.0", "babel-jest": "^26.6.3", "compare-versions": "^6.1.0", - "cypress": "13.6.3", - "cypress-localstorage-commands": "^2.2.5", + "cypress": "13.13.2", + "cypress-localstorage-commands": "^2.2.6", "date-fns": "^3.6.0", "eslint": "8.56.0", "eslint-config-posthog-js": "link:eslint-rules", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 276578390..0a6f9e8f9 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -103,11 +103,11 @@ devDependencies: specifier: ^6.1.0 version: 6.1.0 cypress: - specifier: 13.6.3 - version: 13.6.3 + specifier: 13.13.2 + version: 13.13.2 cypress-localstorage-commands: - specifier: ^2.2.5 - version: 2.2.5(cypress@13.6.3) + specifier: ^2.2.6 + version: 2.2.6(cypress@13.13.2) date-fns: specifier: ^3.6.0 version: 3.6.0 @@ -4481,17 +4481,17 @@ packages: resolution: {integrity: sha512-d4ZVpCW31eWwCMe1YT3ur7mUDnTXbgwyzaL320DrcRT45rfjYxkt5QWLrmOJ+/UEAI2+fQgKe/fCjR8l4TpRgw==} dev: true - /cypress-localstorage-commands@2.2.5(cypress@13.6.3): - resolution: {integrity: sha512-07zpwzWdY+uPi1NEHFhWQNylIJqRxR78Ile05L6WT8h1Gz0OaxgBSZRuzp+pqUni/3Pk4d2ieq/cSh++ZmujEA==} + /cypress-localstorage-commands@2.2.6(cypress@13.13.2): + resolution: {integrity: sha512-l3nZ+Lu6YbBWk4UIfNrRkNK56BkF8zVbCrqzCf35x4Nlx2wA2r0spBOZXnKjbutQZgo6qDqtS8uXoSqV36YM1Q==} engines: {node: '>=14.0.0'} peerDependencies: cypress: '>=2.1.0' dependencies: - cypress: 13.6.3 + cypress: 13.13.2 dev: true - /cypress@13.6.3: - resolution: {integrity: sha512-d/pZvgwjAyZsoyJ3FOsJT5lDsqnxQ/clMqnNc++rkHjbkkiF2h9s0JsZSyyH4QXhVFW3zPFg82jD25roFLOdZA==} + /cypress@13.13.2: + resolution: {integrity: sha512-PvJQU33933NvS1StfzEb8/mu2kMy4dABwCF+yd5Bi7Qly1HOVf+Bufrygee/tlmty/6j5lX+KIi8j9Q3JUMbhA==} engines: {node: ^16.0.0 || ^18.0.0 || >=20.0.0} hasBin: true requiresBuild: true @@ -4535,7 +4535,7 @@ packages: request-progress: 3.0.0 semver: 7.5.4 supports-color: 8.1.1 - tmp: 0.2.1 + tmp: 0.2.3 untildify: 4.0.0 yauzl: 2.10.0 dev: true @@ -10250,11 +10250,9 @@ packages: rimraf: 2.7.1 dev: true - /tmp@0.2.1: - resolution: {integrity: sha512-76SUhtfqR2Ijn+xllcI5P1oyannHNHByD80W1q447gU3mp9G9PSpGdWmjUOHRDPiHYacIk66W7ubDTuPF3BEtQ==} - engines: {node: '>=8.17.0'} - dependencies: - rimraf: 3.0.2 + /tmp@0.2.3: + resolution: {integrity: sha512-nZD7m9iCPC5g0pYmcaxogYKggSfLsdxl8of3Q/oIbqCqLLIO9IAF0GWjX1z9NZRHPiXv8Wex4yDCaZsgEw0Y8w==} + engines: {node: '>=14.14'} dev: true /tmpl@1.0.5: