Skip to content

Commit

Permalink
Preparation for navigational tracking detection.
Browse files Browse the repository at this point in the history
Details:
- pagejs: emits "page events" that trigger on state changes
  like navigations (already dealing with aspects like cross-browser
  support and filtering incognito pages)
- Better support for the slightly different APIs of Firefox on Android
- The module is stand-alone now (no longer requiring external
  triggers on navigations)
- Extended scenario based testing (building on page events)
- New experimental navigational tracking message: "wtm.nav-track-detect.search-ad"
  (generic page navigations are not supported yet)
  • Loading branch information
philipp-classen committed Dec 16, 2024
1 parent 1d029c6 commit 1973859
Show file tree
Hide file tree
Showing 14 changed files with 1,424 additions and 152 deletions.
38 changes: 38 additions & 0 deletions reporting/src/cooldowns.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
/**
* WhoTracks.Me
* https://whotracks.me/
*
* Copyright 2017-present Ghostery GmbH. All rights reserved.
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0
*/

import random from './random';

const HOUR = 60 * 60 * 1000;

/**
 * Computes an expiration timestamp for actions that should not be
 * repeated until the next day (e.g. doublefetch attempts for queries).
 *
 * Waiting for the start of the next day (in UTC time) is recommended,
 * as trying to send messages earlier is a waste of resources. Note that
 * only successful attempts should be counted; failed ones should be
 * rolled back.
 *
 * On top of that, a minimum cooldown is enforced. It is intended for
 * people living in timezones like the US west coast, where UTC midnight
 * happens during the day. Without it, there is the risk of introducing
 * bias in the collected data, as repeated searches would be included
 * with a higher likelihood than in other parts of the world
 * (e.g. Europe).
 *
 * @returns {number} Unix timestamp in milliseconds: the later of
 *   "next UTC midnight" and "now + 8 hours", plus random noise
 *   (up to two hours, assuming random() yields values in [0, 1)).
 */
export function timezoneAgnosticDailyExpireAt() {
  // Last millisecond of the current UTC day, plus one => next UTC midnight.
  const endOfUtcDay = new Date();
  endOfUtcDay.setUTCHours(23, 59, 59, 999);
  const nextUtcMidnight = endOfUtcDay.getTime() + 1;

  const cooldownEnd = Date.now() + 8 * HOUR;
  const jitter = Math.ceil(random() * 2 * HOUR);

  const earliest =
    nextUtcMidnight > cooldownEnd ? nextUtcMidnight : cooldownEnd;
  return earliest + jitter;
}
12 changes: 11 additions & 1 deletion reporting/src/job-scheduler.js
Original file line number Diff line number Diff line change
Expand Up @@ -444,7 +444,7 @@ export default class JobScheduler {
} else {
logger.error('Job failed (unexpected error):', jobEntry.job, e);
}
this.notifyObservers('jobFailed', jobEntry);
this.notifyObservers('jobFailed', jobEntry, e);
} finally {
now = Date.now();
this._removeFromRunningQueue(jobEntry, now);
Expand Down Expand Up @@ -783,6 +783,16 @@ export default class JobScheduler {
return count;
}

getTotalJobsInDlq() {
let count = 0;
for (const queue of Object.values(this.jobQueues)) {
if (queue.dlq) {
count += queue.dlq.length;
}
}
return count;
}

_precomputed() {
if (!this._cachedPrecomputed) {
// Group the different types into set with the same priority and sort
Expand Down
261 changes: 261 additions & 0 deletions reporting/src/nav-tracking-detector.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,261 @@
/**
* WhoTracks.Me
* https://whotracks.me/
*
* Copyright 2017-present Ghostery GmbH. All rights reserved.
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0
*/

import logger from './logger';
import { requireParam, requireString, requireObject, fastHash } from './utils';
import random from './random';
import { timezoneAgnosticDailyExpireAt } from './cooldowns';

const SECOND = 1000;

/**
 * Extracts the hostname of a URL without throwing.
 *
 * Note: this intentionally reads "hostname" (not "host"), so that a
 * non-default port is never included in the result. The value ends up
 * in fields named "hostname", and the port is not part of a hostname.
 *
 * @param {string} url - absolute URL to parse
 * @returns {string|null} the hostname, or null if the URL cannot be parsed
 */
function tryParseHostname(url) {
  try {
    return new URL(url).hostname;
  } catch {
    return null;
  }
}

/**
 * Checks whether all given query parameter names are present in the URL.
 * Only the presence of the keys matters; their values are not inspected.
 *
 * @param {string} url - absolute URL to inspect
 * @param {string[]} params - names of the required query parameters
 * @returns {boolean} true if every name is present; false if at least
 *   one is missing or if anything fails (e.g. the URL cannot be parsed)
 */
function hasQueryParams(url, params) {
  try {
    const query = new URL(url).searchParams;
    const isPresent = (name) => query.has(name);
    return params.every(isPresent);
  } catch (e) {
    return false;
  }
}

// Maps a search engine category (short code) to a predicate that tells
// whether a URL looks like an ad click redirect for that category.
// NOTE(review): the codes presumably stand for Google, Bing, DuckDuckGo,
// Ghostery, Brave and Ecosia - confirm against the place where the
// categories are defined.
const searchAdRedirectByCategory = {
  go(url) {
    if (url.startsWith('https://www.googleadservices.com/')) {
      return true;
    }
    return url.startsWith('https://www.google.com/aclk?');
  },

  bi(url) {
    return url.startsWith('https://www.bing.com/aclk?');
  },

  dd(url) {
    if (url.startsWith('https://www.bing.com/aclick?')) {
      return true;
    }
    // The "y.js" endpoint only counts if all ad parameters are present.
    return (
      url.startsWith('https://duckduckgo.com/y.js?') &&
      hasQueryParams(url, ['ad_domain', 'ad_provider', 'ad_type'])
    );
  },

  gh(url) {
    return url.startsWith('https://tatrck.com/h/');
  },

  br(url) {
    if (!url.startsWith('https://search.brave.com/a/redirect?')) {
      return false;
    }
    return hasQueryParams(url, ['click_url', 'placement_id']);
  },

  ec(url) {
    if (url.startsWith('https://syndicatedsearch.goog/aclk?')) {
      return true;
    }
    return url.startsWith('https://ad.doubleclick.net/searchads/link/click?');
  },
};

/**
 * Checks whether the URL is a search ad redirect for the given search
 * engine category. Unknown categories never match.
 *
 * (exported only for tests)
 *
 * @param {string} url - URL to test
 * @param {string} category - key of "searchAdRedirectByCategory"
 * @returns {boolean} true if the category is known and its check accepts the URL
 */
export function isAdUrlByCategory(url, category) {
  requireString(url);
  requireString(category);

  // Only consider the table's own entries: a plain property lookup would
  // also resolve inherited keys (e.g. "toString" or "constructor"), which
  // would then be called as if they were ad checks.
  const isKnownCategory = Object.prototype.hasOwnProperty.call(
    searchAdRedirectByCategory,
    category,
  );
  if (!isKnownCategory) {
    return false;
  }

  const check = searchAdRedirectByCategory[category];
  return !!(check && check(url));
}

/**
 * Tells whether the URL is considered a tracking URL, i.e. whether the
 * ad redirect check of at least one search engine category accepts it.
 *
 * Note: for bootstrapping, only the tracking ads are covered. Eventually,
 * this could integrate with the adblocker engine or TrackerDB to improve
 * coverage.
 *
 * @param {string} url - URL to test
 * @returns {boolean} true if any category check matches
 */
function isTracking(url) {
  requireString(url);

  for (const matches of Object.values(searchAdRedirectByCategory)) {
    if (matches(url)) {
      return true;
    }
  }
  return false;
}

/**
 * Inspects a redirect chain that started from a search engine result page.
 *
 * The chain counts as an ad if, for the given search engine category,
 * either the first hop or the first hop classified as tracking is an
 * ad redirect URL.
 *
 * @param {string} category - search engine category of the originating page
 * @param {Array<{from: string}>} redirects - redirect hops, in order
 * @returns {{isAd: boolean, trackingUrls: string[]}} the verdict plus all
 *   "from" URLs of the chain that were classified as tracking
 */
function isSearchAdRedirect(category, redirects) {
  const sources = redirects.map((hop) => hop.from);
  const trackingUrls = sources.filter((source) => isTracking(source));

  let isAd = sources.length > 0 && isAdUrlByCategory(sources[0], category);
  if (!isAd) {
    isAd =
      trackingUrls.length > 0 && isAdUrlByCategory(trackingUrls[0], category);
  }
  return { isAd, trackingUrls };
}

/**
 * Wraps a message into a "send-message" job description.
 *
 * @param {string} action - action name of the message
 * @param {object} payload - message payload
 * @param {*} deduplicateBy - passed through unchanged as "args.deduplicateBy"
 * @returns {{type: string, args: {body: object, deduplicateBy: *}}}
 */
function toSendMessageJob(action, payload, deduplicateBy) {
  const antiDuplicates = Math.floor(random() * 10000000);
  return {
    type: 'send-message',
    args: {
      body: {
        action,
        payload,
        ver: 3, // Note: no need to keep this number in sync among messages
        'anti-duplicates': antiDuplicates,
      },
      deduplicateBy,
    },
  };
}

/**
 * Responsible for detecting navigational tracking
 * (https://privacycg.github.io/nav-tracking-mitigations/#navigational-tracking).
 *
 * It observes events emitted by the "Page", so some events will
 * have been filtered already (e.g. "incognito" tabs are filtered out).
 *
 * Detected search ad landings are not sent directly. Instead, a
 * "nav-track-detect:quorum-isAdCheck" job is registered; its handler
 * (installed by the constructor) rate-limits the quorum check and emits
 * a "send-message" job only if quorum was reached.
 */
export default class NavTrackingDetector {
  /**
   * All dependencies are mandatory (enforced through "requireParam").
   *
   * @param {object} deps
   * @param {object} deps.sanitizer - must provide "checkSuspiciousQuery"
   * @param {object} deps.persistedHashes - must provide async "add" and "delete"
   * @param {object} deps.quorumChecker - must provide "sendQuorumIncrement"
   *   and "checkQuorumConsent"
   * @param {object} deps.jobScheduler - must provide "registerHandler"
   *   and "registerJob"
   */
  constructor({ sanitizer, persistedHashes, quorumChecker, jobScheduler }) {
    this.active = false;
    this.sanitizer = requireParam(sanitizer);
    this.persistedHashes = requireParam(persistedHashes);
    this.quorumChecker = requireParam(quorumChecker);
    this.jobScheduler = requireParam(jobScheduler);

    this.jobScheduler.registerHandler(
      'nav-track-detect:quorum-isAdCheck',
      (job) => this._runQuorumIsAdCheck(job),
      {
        priority: -1000,
        cooldownInMs: 3 * SECOND,
        maxJobsTotal: 200,
      },
    );
  }

  /**
   * Starts processing page events.
   */
  async init() {
    this.active = true;
  }

  /**
   * Stops processing page events (events are ignored until the next "init").
   */
  unload() {
    this.active = false;
  }

  /**
   * Entry point for page events. Only "safe-page-navigation" and
   * "safe-search-landing" events are handled; everything else is ignored,
   * as are all events while the detector is not active.
   *
   * @param {object} event - page event; "safe-search-landing" events are
   *   expected to carry the landing information in "event.details"
   */
  onPageEvent(event) {
    if (!this.active) {
      return;
    }

    if (event.type === 'safe-page-navigation') {
      this._analyzeNavigation(event);
    } else if (event.type === 'safe-search-landing') {
      this._analyzeLanding(event.details);
    }
  }

  // general case: page navigation
  _analyzeNavigation(event) {
    // TODO:
    // A difference is that publicly indexed 'search -> host' navigations
    // are less sensitive than arbitrary 'host -> host' navigations. Thus,
    // it is likely that additional checks will be needed to support them.
    // For now, start without it.
    //
    // Goes through the shared logger (like the rest of this module)
    // instead of writing to the console directly.
    logger.debug('[STUB]: general navigation are not yet covered', event);
  }

  // special case: public search engine landings
  _analyzeLanding({ from, to, redirects }) {
    // Open questions:
    // * Is it sufficient to look only at the first hop? For instance, what
    //   about a permanent redirect that is still controlled by the original
    //   site owner, before it hands control over to the tracker?
    // * Currently, we look only at the first hop, even though there are
    //   chains of tracker redirects. Potentially, these could be useful.
    //   The message thus puts the results in an array; but for now, it will
    //   only have a single entry.
    // * Should we use the statusCode? For instance, treat permanent
    //   redirects differently? - Currently, we do not.
    const { category, query: unsafeQuery } = from;
    const { isAd, trackingUrls } = isSearchAdRedirect(category, redirects);
    if (!isAd) {
      return;
    }

    // Since the context is a search engine landing, it is likely
    // that it is a public hostname. Also, hostnames are normally safe
    // to share (e.g. they will be sent as cleartext even in https).
    const hostname = tryParseHostname(to.targetUrl);
    if (!hostname) {
      return;
    }
    const trackingHosts = trackingUrls.map(tryParseHostname);

    // null out the query if there is the risk of leaking information
    const { accept } = this.sanitizer.checkSuspiciousQuery(unsafeQuery);
    const query = accept ? unsafeQuery : null;

    const action = 'wtm.nav-track-detect.search-ad';
    this._registerJob({
      type: 'nav-track-detect:quorum-isAdCheck',
      args: {
        action,
        payload: {
          from: {
            search: {
              category,
              query,
            },
          },
          to: {
            hostname,
          },
          via: {
            redirects: trackingHosts,
          },
        },
        // Note: the query is deliberately not part of the quorum text.
        quorumCheck: JSON.stringify([
          action,
          category,
          hostname,
          trackingHosts,
        ]),
      },
    });
  }

  /**
   * Handler for "nav-track-detect:quorum-isAdCheck" jobs.
   *
   * @param {object} job - job whose "args" hold "action" (string),
   *   "payload" (object), "quorumCheck" (string) and, optionally,
   *   "deduplicateBy"
   * @returns {Promise<object[]>} a single "send-message" job if quorum was
   *   reached; otherwise, an empty list
   * @throws if the arguments are invalid or if the quorum check fails
   *   (in the latter case, the rate limit entry is rolled back first)
   */
  async _runQuorumIsAdCheck(job) {
    const { action, payload, deduplicateBy, quorumCheck } = job.args;
    requireString(action);
    requireObject(payload);
    requireString(quorumCheck);

    // Rate limit the quorum check to once per day.
    //
    // Note: there is another, independent check based on "deduplicateBy"
    // before sending the message. The check here only rate-limits the
    // quorum check.
    const expireAt = timezoneAgnosticDailyExpireAt();
    const dedupHash = fastHash(`nav-track:quorum:${quorumCheck}`, {
      truncate: true,
    });
    const wasAdded = await this.persistedHashes.add(dedupHash, expireAt);
    if (!wasAdded) {
      logger.debug(
        'Dropping before quorum check (already seen):',
        action,
        payload,
      );
      return [];
    }

    try {
      if (await this._passesQuorum(quorumCheck)) {
        return [toSendMessageJob(action, payload, deduplicateBy)];
      }
      logger.debug(
        'Dropping message (failed to reach quorum):',
        action,
        payload,
      );
      return [];
    } catch (e) {
      // unblock the hash to allow retries later
      // (at this point, the error could be caused by a network error,
      // so it is still possible that a retry later could work.)
      try {
        await this.persistedHashes.delete(dedupHash);
      } catch (rollbackError) {
        // Best effort only: never let a failed rollback mask the original
        // error, but leave a trace, since retries will now stay blocked
        // until the hash expires.
        logger.warn(
          'Failed to unblock hash after failed quorum check:',
          rollbackError,
        );
      }
      throw e;
    }
  }

  /**
   * Increments the quorum counter for the given text and then checks
   * whether quorum has been reached.
   *
   * @param {string} quorumCheck - text identifying the observation
   * @returns {Promise<*>} the result of "checkQuorumConsent"
   *   (used as a boolean by the caller)
   */
  async _passesQuorum(quorumCheck) {
    requireString(quorumCheck);

    // TODO: maybe break this also in two independent jobs
    // (not strictly required, but could improve error recovery).
    await this.quorumChecker.sendQuorumIncrement({ text: quorumCheck });
    return this.quorumChecker.checkQuorumConsent({ text: quorumCheck });
  }

  // Fire-and-forget: failures to register the job are logged, not thrown.
  _registerJob(job) {
    this.jobScheduler.registerJob(job).catch((e) => {
      logger.error('Failed to register job', job, e);
    });
  }
}
2 changes: 1 addition & 1 deletion reporting/src/network.js
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ export function isLocalIP(ip) {
export class DnsResolver {
constructor() {
this.dns = new Map();
this._ttlInMs = DAY; // TODO: reconsider (if possible, make it small)
this._ttlInMs = DAY;
}

isPrivateURL(url) {
Expand Down
9 changes: 3 additions & 6 deletions reporting/src/page-aggregator.js
Original file line number Diff line number Diff line change
Expand Up @@ -47,24 +47,21 @@ export default class PageAggregator {
if (event.type === 'full-sync') {
this.fullSync();
} else if (event.type === 'lazy-init' || event.type === 'page-updated') {
// TODO: is there a down-side in increasing the cooldown?
// (especially, if there are many tabs open, it may become a problem)
if (Date.now() > this._lastFullSync + COOLDOWN_FOR_FORCING_A_FULL_SYNC) {
logger.debug('Forcing full sync');
this.fullSync();
} else {
const { tabId } = event;
this.syncTab(tabId);
this.syncTab(event.tabId);
}
} else if (event.type === 'search-landing') {
this.syncTab(event.tabId);
} else if (event.type === 'activity-updated') {
const { urls, activityEstimator } = event;
this._dbExecutor
.run(async () => {
await this.pagedb.updateActivity(urls, activityEstimator);
})
.catch(console.error);
} else {
logger.warn('Unexpected signal:', event);
}
}

Expand Down
Loading

0 comments on commit 1973859

Please sign in to comment.