-
Notifications
You must be signed in to change notification settings - Fork 30
/
Copy pathhref-sanitizer.ts
578 lines (532 loc) · 18.9 KB
/
href-sanitizer.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
import {
observeDOMChanges,
hit,
logMessage,
throttle,
} from '../helpers';
import { type Source } from './scriptlets';
/**
* @scriptlet href-sanitizer
*
* @description
* Set the `href` attribute to a value found in text content of the targeted `a` element,
* or in an attribute of the targeted `a` element,
* or in a URL parameter of the targeted `a` element's `href` attribute.
* This scriptlet runs once when the page loads and after that on DOM tree changes.
*
* Related UBO scriptlet:
* https://github.com/uBlockOrigin/uBlock-issues/wiki/Resources-Library#href-sanitizerjs-
*
* ### Syntax
*
* ```text
* example.org#%#//scriptlet('href-sanitizer', selector[, attribute, [ transform]])
* ```
*
* - `selector` — required, a CSS selector to match the elements to be sanitized,
* which should be anchor elements (`<a>`) with `href` attribute.
* - `attribute` — optional, defaults to `text`:
* - `text` — use the text content of the matched element,
* - `[<attribute-name>]` — copy the value from attribute `attribute-name` on the same element,
* - `?<parameter-name>` — copy the value from URL parameter `parameter-name` of the same element's `href` attribute.
* - `transform` — optional, defaults to no transforming. Possible values:
* - `base64decode` — decode the base64 string from the specified attribute.
* - `removeHash` — remove the hash from the URL.
* - `removeParam[:<parameters>]` — remove the specified parameters from the URL,
* where `<parameters>` is a comma-separated list of parameter names;
* if no parameter is specified, remove all parameters.
*
* > Note that in the case where the discovered value does not correspond to a valid URL with the appropriate
* > http or https protocols, the value will not be set.
*
* ### Examples
*
* 1. Set the `href` attribute to a value found in text content of the targeted `a` element:
*
* ```adblock
* example.org#%#//scriptlet('href-sanitizer', 'a[href*="foo.com"]')
* ```
*
* ```html
* <!-- before -->
* <div>
* <a href="https://foo.com/bar">https://example.org/test?foo</a>
* </div>
*
* <!-- after -->
* <div>
* <a href="https://example.org/test?foo">https://example.org/test?foo</a>
* </div>
* ```
*
* 2. Set the `href` attribute to a value found in an attribute of the targeted `a` element:
*
* ```adblock
* example.org#%#//scriptlet('href-sanitizer', 'a[href*="foo.com"]', '[data-href]')
* ```
*
* ```html
* <!-- before -->
* <div>
* <a href="https://foo.com/bar" data-href="https://example.org/test?foo"></a>
* </div>
*
* <!-- after -->
* <div>
* <a href="https://example.org/test?foo" data-href="https://example.org/test?foo"></a>
* </div>
* ```
*
* 3. Set the `href` attribute to a value found in a URL parameter of the targeted `a` element's `href` attribute:
*
* ```adblock
* example.org#%#//scriptlet('href-sanitizer', 'a[href*="tracker.com"]', '?redirect')
* ```
*
* ```html
* <!-- before -->
* <div>
* <a href="https://tracker.com/foo?redirect=https://example.org/"></a>
* </div>
*
* <!-- after -->
* <div>
* <a href="https://example.org/"></a>
* </div>
* ```
*
* 4. Decode the base64 string from specified attribute:
*
* ```adblock
* example.org#%#//scriptlet('href-sanitizer', 'a[href*="foo.com"]', '[href]', 'base64decode')
* ```
*
* ```html
* <!-- before -->
* <div>
* <a href="http://www.foo.com/out/?aHR0cDovL2V4YW1wbGUuY29tLz92PTEyMw=="></a>
* </div>
*
* <!-- after -->
* <div>
* <a href="http://example.com/?v=123"></a>
* </div>
* ```
*
* 5. Remove the hash from the URL:
*
* ```adblock
* example.org#%#//scriptlet('href-sanitizer', 'a[href*="foo.com"]', '[href]', 'removeHash')
* ```
*
* ```html
* <!-- before -->
* <div>
* <a href="http://www.foo.com/out/#aHR0cDovL2V4YW1wbGUuY29tLz92PTEyMw=="></a>
* </div>
*
* <!-- after -->
* <div>
* <a href="http://www.foo.com/out/"></a>
* </div>
* ```
*
* 6. Remove all parameters from the URL:
*
* ```adblock
* example.org#%#//scriptlet('href-sanitizer', 'a[href*="foo.com"]', '[href]', 'removeParam')
* ```
*
* ```html
* <!-- before -->
* <div>
* <a href="https://foo.com/123123?utm_source=nova&utm_medium=tg&utm_campaign=main"></a>
* </div>
*
* <!-- after -->
* <div>
* <a href="https://foo.com/123123"></a>
* </div>
* ```
*
* 7. Remove the specified parameter(s) from the URL:
*
* ```adblock
* example.org#%#//scriptlet('href-sanitizer', 'a[href*="foo.com"]', '[href]', 'removeParam:utm_source,utm_medium')
* ```
*
* ```html
* <!-- before -->
* <div>
* <a href="https://foo.com/123123?utm_source=nova&utm_medium=tg&utm_campaign=main"></a>
* </div>
*
* <!-- after -->
* <div>
* <a href="https://foo.com/123123?utm_campaign=main"></a>
* </div>
* ```
*
* @added v1.10.25.
*/
export function hrefSanitizer(
source: Source,
selector: string,
attribute = 'text',
transform = '',
) {
if (!selector) {
logMessage(source, 'Selector is required.');
return;
}
// transform markers
const BASE64_DECODE_TRANSFORM_MARKER = 'base64decode';
const REMOVE_HASH_TRANSFORM_MARKER = 'removeHash';
const REMOVE_PARAM_TRANSFORM_MARKER = 'removeParam';
// separator markers
const MARKER_SEPARATOR = ':';
const COMMA = ',';
// Regular expressions matching invalid characters at the beginning and at the end of the string.
// \x21-\x7e is a range that includes the ASCII characters from ! (hex 21) to ~ (hex 7E).
// This range covers numbers, English letters, and common symbols.
// \p{Letter} matches any kind of letter from any language.
// This is required to fix the Twitter case, where the 'textContent' of the link contains '…' at the end,
// so it has to be removed, otherwise the sanitizing will not work properly.
const regexpNotValidAtStart = /^[^\x21-\x7e\p{Letter}]+/u;
const regexpNotValidAtEnd = /[^\x21-\x7e\p{Letter}]+$/u;
/**
 * Extracts the candidate replacement for the `href` attribute from an anchor.
 *
 * @param anchor The anchor element to read the value from.
 * @param attr Where to read the value from:
 * - `text` — the text content of the anchor, with leading and trailing invalid characters stripped,
 * - `?<name>` — the `<name>` query parameter of the anchor's current `href`,
 * - `[<name>]` — the `<name>` attribute of the anchor.
 * @returns The extracted value, or an empty string if nothing could be extracted
 * (including the case when `attr` has an unknown format).
 */
const extractNewHref = (anchor: HTMLAnchorElement, attr: string): string => {
    if (attr === 'text') {
        if (!anchor.textContent) {
            return '';
        }
        return anchor.textContent
            .replace(regexpNotValidAtStart, '')
            .replace(regexpNotValidAtEnd, '');
    }
    if (attr.startsWith('?')) {
        const paramName = attr.slice(1);
        try {
            const url = new URL(anchor.href, document.location.href);
            return url.searchParams.get(paramName) || '';
        } catch (ex) {
            // The closing quote after the URL keeps the message well-formed.
            logMessage(
                source,
                `Cannot retrieve the parameter '${paramName}' from the URL '${anchor.href}'`,
            );
            return '';
        }
    }
    if (attr.startsWith('[') && attr.endsWith(']')) {
        // Strip the surrounding brackets to get the attribute name.
        return anchor.getAttribute(attr.slice(1, -1)) || '';
    }
    return '';
};
/**
 * Tells whether the given string can be parsed as an absolute URL.
 * Any scheme is accepted here; the protocol is not inspected.
 *
 * @param url The string to check.
 * @returns `true` if the `URL` constructor accepts the string, `false` if it throws.
 */
const isValidURL = (url: string): boolean => {
    let isParsable = true;
    try {
        // The instance itself is not needed, only whether the constructor throws.
        // eslint-disable-next-line no-new
        new URL(url);
    } catch {
        isParsable = false;
    }
    return isParsable;
};
/**
 * Normalizes the given text into an absolute `http:` or `https:` URL.
 * The text is resolved against the current document URL, so relative values are accepted.
 *
 * @param text The text to be resolved and validated.
 * @returns The resolved absolute URL, or `null` if the text is empty,
 * cannot be parsed, or resolves to a protocol other than `http:` / `https:`.
 */
const getValidURL = (text: string): string | null => {
    if (!text) {
        return null;
    }
    try {
        const resolved = new URL(text, document.location.href);
        if (resolved.protocol === 'http:' || resolved.protocol === 'https:') {
            return resolved.href;
        }
        logMessage(source, `Protocol not allowed: "${resolved.protocol}", from URL: "${resolved.href}"`);
        return null;
    } catch {
        return null;
    }
};
/**
 * Type guard for elements the scriptlet is able to sanitize:
 * `a` elements that carry an `href` attribute.
 *
 * @param element The element to check.
 * @returns `true` if the element's node name is `a` (case-insensitive)
 * and it has an `href` attribute, `false` otherwise.
 */
const isSanitizableAnchor = (element: Element): element is HTMLAnchorElement => {
    if (element.nodeName.toLowerCase() !== 'a') {
        return false;
    }
    return element.hasAttribute('href');
};
/**
 * Walks the own properties of an object depth-first, in property order,
 * and returns the first string value that is a valid URL.
 *
 * @param obj The (possibly nested) object to search.
 * @returns The first valid URL found, or `null` if there is none.
 */
const extractURLFromObject = (obj: Record<string, unknown>): string | null => {
    for (const key of Object.keys(obj)) {
        const candidate = obj[key];
        if (typeof candidate === 'string') {
            if (isValidURL(candidate)) {
                return candidate;
            }
            // A string that is not a URL has nothing to descend into.
            continue;
        }
        if (candidate !== null && typeof candidate === 'object') {
            const nestedUrl = extractURLFromObject(candidate as Record<string, unknown>);
            if (nestedUrl) {
                return nestedUrl;
            }
        }
    }
    return null;
};
/**
 * Cheap check whether the content looks like a stringified object,
 * i.e. it starts with `{` and ends with `}`. The content is not parsed here.
 *
 * @param content The content to check.
 * @returns `true` if the content is wrapped in curly braces, `false` otherwise.
 */
const isStringifiedObject = (content: string) => {
    const opensWithBrace = content.startsWith('{');
    const closesWithBrace = content.endsWith('}');
    return opensWithBrace && closesWithBrace;
};
/**
 * Decodes a base64 string up to `times` times (to unwrap nested encodings)
 * and tries to get a URL out of the result:
 * - if the decoded result is a valid URL, it is returned;
 * - if the decoded result looks like a stringified object, it is parsed as JSON
 *   and the first valid URL found within the object is returned.
 *
 * @param text The base64 string to decode.
 * @param times The maximum number of decoding rounds.
 * @returns The extracted URL, or an empty string if the input is not base64
 * or no valid URL is found.
 */
const decodeBase64SeveralTimes = (text: string, times: number): string => {
    let result = text;
    for (let i = 0; i < times; i += 1) {
        try {
            result = atob(result);
        } catch {
            // Nothing has been decoded so far — the input is not a valid base64 string.
            if (result === text) {
                return '';
            }
            // A later round failed: `result` can no longer change,
            // so every further round would only throw again.
            break;
        }
    }
    // if found valid URL, return it
    if (isValidURL(result)) {
        return result;
    }
    // if the result is an object, try to extract URL from it
    if (isStringifiedObject(result)) {
        try {
            const parsedResult: unknown = JSON.parse(result);
            if (typeof parsedResult !== 'object' || parsedResult === null) {
                return '';
            }
            // Normalize "no URL inside the object" to the documented empty string.
            return extractURLFromObject(parsedResult as Record<string, unknown>) || '';
        } catch {
            return '';
        }
    }
    logMessage(source, `Failed to decode base64 string: ${text}`);
    return '';
};
// URL components markers
const SEARCH_QUERY_MARKER = '?';
const SEARCH_PARAMS_MARKER = '&';
const HASHBANG_MARKER = '#!';
const ANCHOR_MARKER = '#';
// decode attempts for base64 string
const DECODE_ATTEMPTS_NUMBER = 10;
/**
 * Decodes the search string of a URL as base64.
 * The leading search query marker is removed first. If the search string consists of
 * several `&`-separated parts, each part is decoded on its own and the last part
 * that yields a value wins; otherwise the whole search string is decoded.
 *
 * @param search Search string to decode.
 * @returns Decoded value, or an empty string if nothing could be decoded.
 */
const decodeSearchString = (search: string): string => {
    const searchString = search.replace(SEARCH_QUERY_MARKER, '');
    if (!searchString.includes(SEARCH_PARAMS_MARKER)) {
        return decodeBase64SeveralTimes(searchString, DECODE_ATTEMPTS_NUMBER) || '';
    }
    let lastDecodedParam = '';
    for (const param of searchString.split(SEARCH_PARAMS_MARKER)) {
        const decodedParam = decodeBase64SeveralTimes(param, DECODE_ATTEMPTS_NUMBER);
        if (decodedParam) {
            lastDecodedParam = decodedParam;
        }
    }
    return lastDecodedParam;
};
/**
 * Decodes the hash part of a URL as base64.
 * The first hashbang marker (`#!`) or, if there is none, the first anchor marker (`#`)
 * is removed from the hash, and the remainder is decoded.
 *
 * @param hash Hash string to decode.
 * @returns The decoding result, or an empty string if the hash contains no marker
 * or nothing is left after removing it.
 */
const decodeHashString = (hash: string) => {
    // The hashbang is checked first since it also contains the anchor marker.
    const marker = [HASHBANG_MARKER, ANCHOR_MARKER].find((item) => hash.includes(item));
    if (marker === undefined) {
        return '';
    }
    const encodedHash = hash.replace(marker, '');
    if (!encodedHash) {
        return '';
    }
    return decodeBase64SeveralTimes(encodedHash, DECODE_ATTEMPTS_NUMBER);
};
/**
 * Removes the hash from the URL.
 * Relative URLs are resolved against the current document URL,
 * so that e.g. `other#x` or `#x` keeps pointing to the same page the link pointed to.
 *
 * @param url URL to remove the hash from.
 * @returns Absolute URL without the hash, or an empty string if the URL is empty or has no hash.
 */
const removeHash = (url: string): string => {
    if (!url) {
        return '';
    }
    const urlObj = new URL(url, document.location.href);
    if (!urlObj.hash) {
        return '';
    }
    urlObj.hash = '';
    return urlObj.toString();
};
/**
 * Removes query parameters from the URL.
 * Relative URLs are resolved against the current document URL.
 *
 * @param url URL to remove the parameter(s) from.
 * @param transformValue The transform option: the marker optionally followed by
 * the marker separator and a comma-separated list of parameter names.
 * Without the list, all parameters are removed.
 * @returns Absolute URL without the parameter(s), or an empty string if the URL is empty
 * or none of the listed parameters is present in it.
 */
const removeParam = (url: string, transformValue: string): string => {
    // Without this guard an empty value would resolve to the base URL itself.
    if (!url) {
        return '';
    }
    const urlObj = new URL(url, document.location.href);
    // Everything after the first separator is the list of names,
    // so names which contain the separator themselves are kept intact.
    const separatorIndex = transformValue.indexOf(MARKER_SEPARATOR);
    const paramNamesToRemoveStr = separatorIndex === -1
        ? ''
        : transformValue.slice(separatorIndex + 1);
    if (!paramNamesToRemoveStr) {
        urlObj.search = '';
        return urlObj.toString();
    }
    let isAnyParamRemoved = false;
    for (const param of paramNamesToRemoveStr.split(COMMA)) {
        if (urlObj.searchParams.has(param)) {
            urlObj.searchParams.delete(param);
            isAnyParamRemoved = true;
        }
    }
    // if the parameter(s) is not found, return empty string
    if (!isAnyParamRemoved) {
        return '';
    }
    return urlObj.toString();
};
/**
 * Decodes the base64-encoded part of a URL.
 * The URL is resolved against the current document URL; if it has a search string,
 * the search string is decoded, otherwise the hash is decoded if there is one.
 *
 * @param url URL whose search string or hash should be decoded.
 * @returns The result of decoding the search string or the hash,
 * or `null` if the URL has neither of them.
 */
const decodeBase64URL = (url: string) => {
    const parsedUrl = new URL(url, document.location.href);
    // The search string takes precedence over the hash.
    if (parsedUrl.search.length > 0) {
        return decodeSearchString(parsedUrl.search);
    }
    if (parsedUrl.hash.length > 0) {
        return decodeHashString(parsedUrl.hash);
    }
    logMessage(source, `Failed to execute base64 from URL: ${url}`);
    return null;
};
/**
 * Entry point of the base64 transform.
 * A value that is a valid URL has its search string or hash decoded;
 * any other value is treated as a base64 string itself and decoded several times.
 *
 * @param href The value to decode.
 * @returns The decoded value, or an empty string if decoding yields nothing.
 */
const base64Decode = (href: string): string => {
    const decoded = isValidURL(href)
        ? decodeBase64URL(href)
        : decodeBase64SeveralTimes(href, DECODE_ATTEMPTS_NUMBER);
    return decoded || '';
};
/**
 * Sanitizes the `href` attribute of the elements matching the given selector.
 * For each matched anchor, a new value is extracted according to `attribute`,
 * optionally transformed according to `transform`, validated, and then set as `href`.
 * Elements which cannot be sanitized are skipped with a log message.
 *
 * @param elementSelector The CSS selector to match the elements.
 */
const sanitize = (elementSelector: string): void => {
    let elements: NodeListOf<Element>;
    try {
        elements = document.querySelectorAll(elementSelector);
    } catch {
        logMessage(source, `Invalid selector "${elementSelector}"`);
        return;
    }
    elements.forEach((elem) => {
        try {
            if (!isSanitizableAnchor(elem)) {
                logMessage(source, `${elem} is not a valid element to sanitize`);
                return;
            }
            let newHref = extractNewHref(elem, attribute);
            // apply transform if specified
            if (transform) {
                if (transform === BASE64_DECODE_TRANSFORM_MARKER) {
                    newHref = base64Decode(newHref);
                } else if (transform === REMOVE_HASH_TRANSFORM_MARKER) {
                    newHref = removeHash(newHref);
                } else if (
                    // Only the bare marker or the marker followed by the separator is valid;
                    // a plain prefix match would treat e.g. `removeParamfoo` as "remove all parameters".
                    transform === REMOVE_PARAM_TRANSFORM_MARKER
                    || transform.startsWith(`${REMOVE_PARAM_TRANSFORM_MARKER}${MARKER_SEPARATOR}`)
                ) {
                    newHref = removeParam(newHref, transform);
                } else {
                    logMessage(source, `Invalid transform option: "${transform}"`);
                    return;
                }
            }
            const newValidHref = getValidURL(newHref);
            if (!newValidHref) {
                logMessage(source, `Invalid URL: ${newHref}`);
                return;
            }
            const oldHref = elem.href; // Required to log the original URL.
            elem.setAttribute('href', newValidHref);
            if (newValidHref !== oldHref) {
                logMessage(source, `Sanitized "${oldHref}" to "${newValidHref}".`);
            }
        } catch {
            logMessage(source, `Failed to sanitize ${elem}.`);
        }
    });
    hit(source);
};
// Sanitizes the elements which are already in the document
// and repeats the sanitizing whenever `observeDOMChanges` invokes the callback.
const run = () => {
    const sanitizeMatched = () => sanitize(selector);
    sanitizeMatched();
    observeDOMChanges(sanitizeMatched, true);
};
// Start right away unless the document is still loading;
// in that case wait for the `DOMContentLoaded` event.
if (document.readyState !== 'loading') {
    run();
} else {
    window.addEventListener('DOMContentLoaded', run, { once: true });
}
}
// Names under which the scriptlet can be referenced.
// The first entry is used as the primary name below.
export const hrefSanitizerNames = [
'href-sanitizer',
// aliases are needed for matching the related scriptlet converted into our syntax
'href-sanitizer.js',
'ubo-href-sanitizer.js',
'ubo-href-sanitizer',
];
// eslint-disable-next-line prefer-destructuring
hrefSanitizer.primaryName = hrefSanitizerNames[0];
// Helpers attached to the scriptlet function as its `injections`.
// NOTE(review): presumably these are bundled together with the scriptlet body
// when it is injected into a page — confirm against the build tooling.
hrefSanitizer.injections = [
observeDOMChanges,
hit,
logMessage,
// following helpers should be imported and injected
// because they are used by helpers above
throttle,
];