From adfbcb1f74f3ffce5eae03c7e263a4059b6fa0d4 Mon Sep 17 00:00:00 2001 From: Wilson Lin Date: Mon, 25 Dec 2023 10:28:04 +1100 Subject: [PATCH] Gate possibly noncompliant entity minifications under `allow_optimal_entities` --- CHANGELOG.md | 1 + README.md | 5 +- minhtml/src/main.rs | 5 + minify-html-common/src/tests/mod.rs | 5 +- .../in/wilsonl/minifyhtml/Configuration.java | 9 ++ .../minifyhtml/Configuration.java.gen.js | 2 + minify-html-java/src/main/rust/lib.rs | 1 + minify-html-nodejs/index.d.ts | 34 +++--- minify-html-nodejs/src/lib.rs | 1 + minify-html-python/minify_html.pyi | 1 + minify-html-python/src/lib.rs | 3 + minify-html-ruby/ext/minify_html/src/lib.rs | 1 + minify-html-wasm/src/lib.rs | 1 + minify-html/src/cfg/mod.rs | 10 +- minify-html/src/entity/encode.rs | 18 ++- minify-html/src/minify/attr.rs | 107 ++++++++++++++++-- minify-html/src/minify/content.rs | 21 +++- minify-html/src/minify/rcdata.rs | 21 +++- 18 files changed, 196 insertions(+), 50 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1a566942..b5e859b7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ - `ensure_spec_compliant_unquoted_attribute_values` => `allow_noncompliant_unquoted_attribute_values` - `keep_spaces_between_attributes` => `allow_removing_spaces_between_attributes` - `Cfg::spec_compliant()` => `Cfg::enable_possibly_noncompliant(&mut self)` +- BREAKING: Some entity minifications are now classified as "possibly noncompliant" and can be enabled via the `allow_optimal_entities` option but won't be performed by default. ## 0.15.0 diff --git a/README.md b/README.md index b64b9bb0..0edc7f74 100644 --- a/README.md +++ b/README.md @@ -252,9 +252,12 @@ Note that in all of these syntax, the parsing is "dumb": it will simply look for ### Spec compliance -To minify even further, it's possible to enable options that may output HTML that doesn't fully comply with the specs. 
This is almost 100% safe on all browsers, as browsers have consistent interpretation and rendering behaviour even for such HTML, which can be taken advantage of for better minification. Refer to these options: +WHATWG is the current HTML standard and [obsoletes all previous standards](https://www.w3.org/html/). WHATWG lists suggested validators [here](https://whatwg.org/validator/). + +To minify even further, it's possible to enable options that may output HTML that doesn't fully pass validation, but is still interpreted and rendered correctly according to the [WHATWG parsing specification](https://html.spec.whatwg.org/multipage/parsing.html), which major browser engines (Firefox, Chrome, Safari) implement. Refer to these options: - `allow_noncompliant_unquoted_attribute_values` +- `allow_optimal_entities` - `allow_removing_spaces_between_attributes` - `minify_doctype` diff --git a/minhtml/src/main.rs b/minhtml/src/main.rs index 8c49e290..72a607c4 100644 --- a/minhtml/src/main.rs +++ b/minhtml/src/main.rs @@ -30,6 +30,10 @@ struct Cli { #[structopt(long)] allow_noncompliant_unquoted_attribute_values: bool, + /// Allow some minifications around entities that may not pass validation, but will still be parsed correctly by almost all browsers. + #[structopt(long)] + allow_optimal_entities: bool, + /// Allow removing_spaces between attributes when possible, which may not be spec compliant. These will still be parsed correctly by almost all browsers. 
#[structopt(long)] allow_removing_spaces_between_attributes: bool, @@ -110,6 +114,7 @@ fn main() { #[rustfmt::skip] let cfg = Arc::new(Cfg { allow_noncompliant_unquoted_attribute_values: args.allow_noncompliant_unquoted_attribute_values, + allow_optimal_entities: args.allow_optimal_entities, allow_removing_spaces_between_attributes: args.allow_removing_spaces_between_attributes, keep_closing_tags: args.keep_closing_tags, keep_comments: args.keep_comments, diff --git a/minify-html-common/src/tests/mod.rs b/minify-html-common/src/tests/mod.rs index 3a762158..16a3970a 100644 --- a/minify-html-common/src/tests/mod.rs +++ b/minify-html-common/src/tests/mod.rs @@ -432,7 +432,10 @@ pub fn create_common_js_test_data() -> HashMap<&'static [u8], &'static [u8]> { let mut t = HashMap::<&'static [u8], &'static [u8]>::new(); // intentionally malformed - t.insert(b"", b"", + b"", b""); diff --git a/minify-html-java/src/main/java/in/wilsonl/minifyhtml/Configuration.java b/minify-html-java/src/main/java/in/wilsonl/minifyhtml/Configuration.java index 1279c7cf..c06181da 100644 --- a/minify-html-java/src/main/java/in/wilsonl/minifyhtml/Configuration.java +++ b/minify-html-java/src/main/java/in/wilsonl/minifyhtml/Configuration.java @@ -7,6 +7,7 @@ */ public class Configuration { public final boolean allow_noncompliant_unquoted_attribute_values; + public final boolean allow_optimal_entities; public final boolean allow_removing_spaces_between_attributes; public final boolean keep_closing_tags; public final boolean keep_comments; @@ -23,6 +24,7 @@ public class Configuration { private Configuration( boolean allow_noncompliant_unquoted_attribute_values, + boolean allow_optimal_entities, boolean allow_removing_spaces_between_attributes, boolean keep_closing_tags, boolean keep_comments, @@ -38,6 +40,7 @@ private Configuration( boolean remove_processing_instructions ) { this.allow_noncompliant_unquoted_attribute_values = allow_noncompliant_unquoted_attribute_values; + 
this.allow_optimal_entities = allow_optimal_entities; this.allow_removing_spaces_between_attributes = allow_removing_spaces_between_attributes; this.keep_closing_tags = keep_closing_tags; this.keep_comments = keep_comments; @@ -58,6 +61,7 @@ private Configuration( */ public static class Builder { private boolean allow_noncompliant_unquoted_attribute_values = false; + private boolean allow_optimal_entities = false; private boolean allow_removing_spaces_between_attributes = false; private boolean keep_closing_tags = false; private boolean keep_comments = false; @@ -76,6 +80,10 @@ public Builder setAllowNoncompliantUnquotedAttributeValues(boolean v) { this.allow_noncompliant_unquoted_attribute_values = v; return this; } + public Builder setAllowOptimalEntities(boolean v) { + this.allow_optimal_entities = v; + return this; + } public Builder setAllowRemovingSpacesBetweenAttributes(boolean v) { this.allow_removing_spaces_between_attributes = v; return this; @@ -132,6 +140,7 @@ public Builder setRemoveProcessingInstructions(boolean v) { public Configuration build() { return new Configuration( this.allow_noncompliant_unquoted_attribute_values, + this.allow_optimal_entities, this.allow_removing_spaces_between_attributes, this.keep_closing_tags, this.keep_comments, diff --git a/minify-html-java/src/main/java/in/wilsonl/minifyhtml/Configuration.java.gen.js b/minify-html-java/src/main/java/in/wilsonl/minifyhtml/Configuration.java.gen.js index 6f7bb6d0..c76e5d04 100644 --- a/minify-html-java/src/main/java/in/wilsonl/minifyhtml/Configuration.java.gen.js +++ b/minify-html-java/src/main/java/in/wilsonl/minifyhtml/Configuration.java.gen.js @@ -16,6 +16,8 @@ for (const [_, snake] of cfgRs.matchAll(/^\s*pub ([a-zA-Z0-9_]+): bool,?\s*$/gm) const java = ` package in.wilsonl.minifyhtml; +// WARNING: Do not manually edit, use Configuration.java.gen.js. + /** * Class representing minification configuration. 
*/ diff --git a/minify-html-java/src/main/rust/lib.rs b/minify-html-java/src/main/rust/lib.rs index e72a026d..ead6d2b0 100644 --- a/minify-html-java/src/main/rust/lib.rs +++ b/minify-html-java/src/main/rust/lib.rs @@ -12,6 +12,7 @@ fn build_cfg(env: &JNIEnv, obj: &JObject) -> Cfg { // This is a statement because "attributes on expressions are experimental". let cfg = Cfg { allow_noncompliant_unquoted_attribute_values: env.get_field(*obj, "allow_noncompliant_unquoted_attribute_values", "Z").unwrap().z().unwrap(), + allow_optimal_entities: env.get_field(*obj, "allow_optimal_entities", "Z").unwrap().z().unwrap(), allow_removing_spaces_between_attributes: env.get_field(*obj, "allow_removing_spaces_between_attributes", "Z").unwrap().z().unwrap(), keep_closing_tags: env.get_field(*obj, "keep_closing_tags", "Z").unwrap().z().unwrap(), keep_comments: env.get_field(*obj, "keep_comments", "Z").unwrap().z().unwrap(), diff --git a/minify-html-nodejs/index.d.ts b/minify-html-nodejs/index.d.ts index 7f0da0ed..8730db2b 100644 --- a/minify-html-nodejs/index.d.ts +++ b/minify-html-nodejs/index.d.ts @@ -8,35 +8,35 @@ export function minify( src: Buffer, cfg: { - /** Do not minify DOCTYPEs. Minified DOCTYPEs may not be spec compliant. */ - do_not_minify_doctype?: boolean; - /** Ensure all unquoted attribute values in the output do not contain any characters prohibited by the WHATWG specification. */ - ensure_spec_compliant_unquoted_attribute_values?: boolean; + /** Allow unquoted attribute values in the output to contain characters prohibited by the [WHATWG specification](https://html.spec.whatwg.org/multipage/syntax.html#attributes-2). These will still be parsed correctly by almost all browsers. */ + allow_noncompliant_unquoted_attribute_values?: boolean; + /** Allow some minifications around entities that may not pass validation, but will still be parsed correctly by almost all browsers. 
*/ + allow_optimal_entities?: boolean; + /** Allow removing_spaces between attributes when possible, which may not be spec compliant. These will still be parsed correctly by almost all browsers. */ + allow_removing_spaces_between_attributes?: boolean; /** Do not omit closing tags when possible. */ keep_closing_tags?: boolean; - /** Do not omit `<html>` and `<head>` opening tags when they don't have attributes. */ - keep_html_and_head_opening_tags?: boolean; - /** Keep spaces between attributes when possible to conform to HTML standards. */ - keep_spaces_between_attributes?: boolean; /** Keep all comments. */ keep_comments?: boolean; + /** Do not omit `<html>` and `<head>` opening tags when they don't have attributes. */ + keep_html_and_head_opening_tags?: boolean; + /** Keep `type=text` attribute name and value on `<input>` elements. */ + keep_input_type_text_attr?: boolean; /** Keep SSI comments. */ keep_ssi_comments?: boolean; + /** Minify CSS in `