Skip to content

Commit

Permalink
Gate possibly noncompliant entity minifications under `allow_optimal_…
Browse files Browse the repository at this point in the history
…entities`
  • Loading branch information
wilsonzlin committed Dec 24, 2023
1 parent 53ef28d commit adfbcb1
Show file tree
Hide file tree
Showing 18 changed files with 196 additions and 50 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
- `ensure_spec_compliant_unquoted_attribute_values` => `allow_noncompliant_unquoted_attribute_values`
- `keep_spaces_between_attributes` => `allow_removing_spaces_between_attributes`
- `Cfg::spec_compliant()` => `Cfg::enable_possibly_noncompliant(&mut self)`
- BREAKING: Some entity minifications are now classified as "possibly noncompliant" and can be enabled via the `allow_optimal_entities` option but won't be performed by default.

## 0.15.0

Expand Down
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -252,9 +252,12 @@ Note that in all of these syntax, the parsing is "dumb": it will simply look for

### Spec compliance

To minify even further, it's possible to enable options that may output HTML that doesn't fully comply with the specs. This is almost 100% safe on all browsers, as browsers have consistent interpretation and rendering behaviour even for such HTML, which can be taken advantage of for better minification. Refer to these options:
WHATWG is the current HTML standard and [obsoletes all previous standards](https://www.w3.org/html/). WHATWG lists suggested validators [here](https://whatwg.org/validator/).

To minify even further, it's possible to enable options that may output HTML that doesn't fully pass validation, but is still interpreted and rendered correctly according to the [WHATWG parsing specification](https://html.spec.whatwg.org/multipage/parsing.html), which major browser engines (Firefox, Chrome, Safari) implement. Refer to these options:

- `allow_noncompliant_unquoted_attribute_values`
- `allow_optimal_entities`
- `allow_removing_spaces_between_attributes`
- `minify_doctype`

Expand Down
5 changes: 5 additions & 0 deletions minhtml/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,10 @@ struct Cli {
#[structopt(long)]
allow_noncompliant_unquoted_attribute_values: bool,

/// Allow some minifications around entities that may not pass validation, but will still be parsed correctly by almost all browsers.
#[structopt(long)]
allow_optimal_entities: bool,

/// Allow removing spaces between attributes when possible, which may not be spec compliant. These will still be parsed correctly by almost all browsers.
#[structopt(long)]
allow_removing_spaces_between_attributes: bool,
Expand Down Expand Up @@ -110,6 +114,7 @@ fn main() {
#[rustfmt::skip]
let cfg = Arc::new(Cfg {
allow_noncompliant_unquoted_attribute_values: args.allow_noncompliant_unquoted_attribute_values,
allow_optimal_entities: args.allow_optimal_entities,
allow_removing_spaces_between_attributes: args.allow_removing_spaces_between_attributes,
keep_closing_tags: args.keep_closing_tags,
keep_comments: args.keep_comments,
Expand Down
5 changes: 4 additions & 1 deletion minify-html-common/src/tests/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -432,7 +432,10 @@ pub fn create_common_js_test_data() -> HashMap<&'static [u8], &'static [u8]> {
let mut t = HashMap::<&'static [u8], &'static [u8]>::new();

// intentionally malformed
t.insert(b"<script><script></script></script>", b"<script><script></script><script>");
t.insert(
b"<script><script></script></script>",
b"<script><script></script><script>",
);

// js minification
t.insert(b"<script></script>", b"<script></script>");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
*/
public class Configuration {
public final boolean allow_noncompliant_unquoted_attribute_values;
public final boolean allow_optimal_entities;
public final boolean allow_removing_spaces_between_attributes;
public final boolean keep_closing_tags;
public final boolean keep_comments;
Expand All @@ -23,6 +24,7 @@ public class Configuration {

private Configuration(
boolean allow_noncompliant_unquoted_attribute_values,
boolean allow_optimal_entities,
boolean allow_removing_spaces_between_attributes,
boolean keep_closing_tags,
boolean keep_comments,
Expand All @@ -38,6 +40,7 @@ private Configuration(
boolean remove_processing_instructions
) {
this.allow_noncompliant_unquoted_attribute_values = allow_noncompliant_unquoted_attribute_values;
this.allow_optimal_entities = allow_optimal_entities;
this.allow_removing_spaces_between_attributes = allow_removing_spaces_between_attributes;
this.keep_closing_tags = keep_closing_tags;
this.keep_comments = keep_comments;
Expand All @@ -58,6 +61,7 @@ private Configuration(
*/
public static class Builder {
private boolean allow_noncompliant_unquoted_attribute_values = false;
private boolean allow_optimal_entities = false;
private boolean allow_removing_spaces_between_attributes = false;
private boolean keep_closing_tags = false;
private boolean keep_comments = false;
Expand All @@ -76,6 +80,10 @@ public Builder setAllowNoncompliantUnquotedAttributeValues(boolean v) {
this.allow_noncompliant_unquoted_attribute_values = v;
return this;
}
/**
 * Sets the {@code allow_optimal_entities} option.
 *
 * @param v the new value for the option
 * @return this builder, so that calls can be chained
 */
public Builder setAllowOptimalEntities(boolean v) {
this.allow_optimal_entities = v;
return this;
}
public Builder setAllowRemovingSpacesBetweenAttributes(boolean v) {
this.allow_removing_spaces_between_attributes = v;
return this;
Expand Down Expand Up @@ -132,6 +140,7 @@ public Builder setRemoveProcessingInstructions(boolean v) {
public Configuration build() {
return new Configuration(
this.allow_noncompliant_unquoted_attribute_values,
this.allow_optimal_entities,
this.allow_removing_spaces_between_attributes,
this.keep_closing_tags,
this.keep_comments,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ for (const [_, snake] of cfgRs.matchAll(/^\s*pub ([a-zA-Z0-9_]+): bool,?\s*$/gm)
const java = `
package in.wilsonl.minifyhtml;
// WARNING: Do not manually edit, use Configuration.java.gen.js.
/**
* Class representing minification configuration.
*/
Expand Down
1 change: 1 addition & 0 deletions minify-html-java/src/main/rust/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ fn build_cfg(env: &JNIEnv, obj: &JObject) -> Cfg {
// This is a statement because "attributes on expressions are experimental".
let cfg = Cfg {
allow_noncompliant_unquoted_attribute_values: env.get_field(*obj, "allow_noncompliant_unquoted_attribute_values", "Z").unwrap().z().unwrap(),
allow_optimal_entities: env.get_field(*obj, "allow_optimal_entities", "Z").unwrap().z().unwrap(),
allow_removing_spaces_between_attributes: env.get_field(*obj, "allow_removing_spaces_between_attributes", "Z").unwrap().z().unwrap(),
keep_closing_tags: env.get_field(*obj, "keep_closing_tags", "Z").unwrap().z().unwrap(),
keep_comments: env.get_field(*obj, "keep_comments", "Z").unwrap().z().unwrap(),
Expand Down
34 changes: 17 additions & 17 deletions minify-html-nodejs/index.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,35 +8,35 @@
export function minify(
src: Buffer,
cfg: {
/** Do not minify DOCTYPEs. Minified DOCTYPEs may not be spec compliant. */
do_not_minify_doctype?: boolean;
/** Ensure all unquoted attribute values in the output do not contain any characters prohibited by the WHATWG specification. */
ensure_spec_compliant_unquoted_attribute_values?: boolean;
/** Allow unquoted attribute values in the output to contain characters prohibited by the [WHATWG specification](https://html.spec.whatwg.org/multipage/syntax.html#attributes-2). These will still be parsed correctly by almost all browsers. */
allow_noncompliant_unquoted_attribute_values?: boolean;
/** Allow some minifications around entities that may not pass validation, but will still be parsed correctly by almost all browsers. */
allow_optimal_entities?: boolean;
/** Allow removing spaces between attributes when possible, which may not be spec compliant. These will still be parsed correctly by almost all browsers. */
allow_removing_spaces_between_attributes?: boolean;
/** Do not omit closing tags when possible. */
keep_closing_tags?: boolean;
/** Do not omit `<html>` and `<head>` opening tags when they don't have attributes. */
keep_html_and_head_opening_tags?: boolean;
/** Keep spaces between attributes when possible to conform to HTML standards. */
keep_spaces_between_attributes?: boolean;
/** Keep all comments. */
keep_comments?: boolean;
/** Do not omit `<html>` and `<head>` opening tags when they don't have attributes. */
keep_html_and_head_opening_tags?: boolean;
/** Keep `type=text` attribute name and value on `<input>` elements. */
keep_input_type_text_attr?: boolean;
/** Keep SSI comments. */
keep_ssi_comments?: boolean;
/** Minify CSS in `<style>` tags and `style` attributes using [lightningcss](https://github.com/parcel-bundler/lightningcss). */
minify_css?: boolean;
/** Minify DOCTYPEs. Minified DOCTYPEs may not be spec compliant, but will still be parsed correctly by almost all browsers. */
minify_doctype?: boolean;
/** Minify JavaScript in `<script>` tags using [minify-js](https://github.com/wilsonzlin/minify-js). */
minify_js?: boolean;
/** When `{{`, `{#`, or `{%` are seen in content, all source code until the subsequent matching closing `}}`, `#}`, or `%}` respectively gets piped through untouched. */
preserve_brace_template_syntax?: boolean;
/** When `<%` is seen in content, all source code until the subsequent matching closing `%>` gets piped through untouched. */
preserve_chevron_percent_template_syntax?: boolean;
/**
* If enabled, content in `<script>` tags with a JS or no [MIME type](https://mimesniff.spec.whatwg.org/#javascript-mime-type) will be minified using [minify-js](https://github.com/wilsonzlin/minify-js).
*/
minify_js?: boolean;
/**
* If enabled, CSS in `<style>` tags and `style` attributes will be minified.
*/
minify_css?: boolean;
/** Remove all bangs. */
remove_bangs?: boolean;
/** Remove all processing_instructions. */
/** Remove all processing instructions. */
remove_processing_instructions?: boolean;
}
): Buffer;
1 change: 1 addition & 0 deletions minify-html-nodejs/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ fn minify(mut cx: FunctionContext) -> JsResult<JsBuffer> {
#[rustfmt::skip]
let cfg = minify_html::Cfg {
allow_noncompliant_unquoted_attribute_values: get_bool!(cx, opt, "allow_noncompliant_unquoted_attribute_values"),
allow_optimal_entities: get_bool!(cx, opt, "allow_optimal_entities"),
allow_removing_spaces_between_attributes: get_bool!(cx, opt, "allow_removing_spaces_between_attributes"),
keep_closing_tags: get_bool!(cx, opt, "keep_closing_tags"),
keep_comments: get_bool!(cx, opt, "keep_comments"),
Expand Down
1 change: 1 addition & 0 deletions minify-html-python/minify_html.pyi
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
def minify(
code: str,
allow_noncompliant_unquoted_attribute_values: bool = False,
allow_optimal_entities: bool = False,
allow_removing_spaces_between_attributes: bool = False,
keep_closing_tags: bool = False,
keep_comments: bool = False,
Expand Down
3 changes: 3 additions & 0 deletions minify-html-python/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ use std::string::String;
#[pyfunction(
py_args = "*",
allow_noncompliant_unquoted_attribute_values = "false",
allow_optimal_entities = "false",
allow_removing_spaces_between_attributes = "false",
keep_closing_tags = "false",
keep_comments = "false",
Expand All @@ -25,6 +26,7 @@ use std::string::String;
fn minify(
code: String,
allow_noncompliant_unquoted_attribute_values: bool,
allow_optimal_entities: bool,
allow_removing_spaces_between_attributes: bool,
keep_closing_tags: bool,
keep_comments: bool,
Expand All @@ -42,6 +44,7 @@ fn minify(
let code = code.into_bytes();
let out_code = minify_html_native(&code, &Cfg {
allow_noncompliant_unquoted_attribute_values,
allow_optimal_entities,
allow_removing_spaces_between_attributes,
keep_closing_tags,
keep_comments,
Expand Down
1 change: 1 addition & 0 deletions minify-html-ruby/ext/minify_html/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ fn minify_html(source: String, cfg: RHash) -> String {
#[rustfmt::skip]
let out_code = minify_html_native(source.as_bytes(), &CfgNative {
allow_noncompliant_unquoted_attribute_values: cfg.aref(StaticSymbol::new("allow_noncompliant_unquoted_attribute_values")).unwrap_or_default(),
allow_optimal_entities: cfg.aref(StaticSymbol::new("allow_optimal_entities")).unwrap_or_default(),
allow_removing_spaces_between_attributes: cfg.aref(StaticSymbol::new("allow_removing_spaces_between_attributes")).unwrap_or_default(),
keep_closing_tags: cfg.aref(StaticSymbol::new("keep_closing_tags")).unwrap_or_default(),
keep_comments: cfg.aref(StaticSymbol::new("keep_comments")).unwrap_or_default(),
Expand Down
1 change: 1 addition & 0 deletions minify-html-wasm/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ pub fn minify(code: &[u8], cfg: &JsValue) -> Vec<u8> {
#[rustfmt::skip]
let cfg = minify_html::Cfg {
allow_noncompliant_unquoted_attribute_values: get_prop!(cfg, "allow_noncompliant_unquoted_attribute_values"),
allow_optimal_entities: get_prop!(cfg, "allow_optimal_entities"),
allow_removing_spaces_between_attributes: get_prop!(cfg, "allow_removing_spaces_between_attributes"),
keep_closing_tags: get_prop!(cfg, "keep_closing_tags"),
keep_comments: get_prop!(cfg, "keep_comments"),
Expand Down
10 changes: 4 additions & 6 deletions minify-html/src/cfg/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
pub struct Cfg {
/// Allow unquoted attribute values in the output to contain characters prohibited by the [WHATWG specification](https://html.spec.whatwg.org/multipage/syntax.html#attributes-2). These will still be parsed correctly by almost all browsers.
pub allow_noncompliant_unquoted_attribute_values: bool,
/// Allow some minifications around entities that may not pass validation, but will still be parsed correctly by almost all browsers.
pub allow_optimal_entities: bool,
/// Allow removing spaces between attributes when possible, which may not be spec compliant. These will still be parsed correctly by almost all browsers.
pub allow_removing_spaces_between_attributes: bool,
/// Do not omit closing tags when possible.
Expand All @@ -20,12 +22,7 @@ pub struct Cfg {
pub minify_css: bool,
/// Minify DOCTYPEs. Minified DOCTYPEs may not be spec compliant, but will still be parsed correctly by almost all browsers.
pub minify_doctype: bool,
/// Minify JavaScript in `<script>` tags using
/// [minify-js](https://github.com/wilsonzlin/minify-js).
///
/// Only `<script>` tags with a valid or no
/// [MIME type](https://mimesniff.spec.whatwg.org/#javascript-mime-type) is considered to
/// contain JavaScript, as per the specification.
/// Minify JavaScript in `<script>` tags using [minify-js](https://github.com/wilsonzlin/minify-js).
pub minify_js: bool,
/// When `{{`, `{#`, or `{%` are seen in content, all source code until the subsequent matching closing `}}`, `#}`, or `%}` respectively gets piped through untouched.
pub preserve_brace_template_syntax: bool,
Expand All @@ -44,6 +41,7 @@ impl Cfg {

/// Turns on every option whose output may not fully pass validation:
/// `allow_noncompliant_unquoted_attribute_values`, `allow_optimal_entities`,
/// `allow_removing_spaces_between_attributes`, and `minify_doctype`.
///
/// All other fields are left unchanged.
pub fn enable_possibly_noncompliant(&mut self) {
self.allow_noncompliant_unquoted_attribute_values = true;
self.allow_optimal_entities = true;
self.allow_removing_spaces_between_attributes = true;
self.minify_doctype = true;
}
Expand Down
18 changes: 12 additions & 6 deletions minify-html/src/entity/encode.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,12 @@ lazy_static! {

// Encodes ampersands when necessary, as well as UTF-8 sequences that are shorter encoded.
// Does not handle context-specific escaping e.g. `>`, `'`, `"`.
pub fn encode_entities(mut code: &[u8], in_attr_val: bool) -> Vec<u8> {
// Set `must_end_with_semicolon` to true to encode ampersands as `&amp;` (with the trailing semicolon) rather than `&amp`, which is needed to pass validation.
pub fn encode_entities(
mut code: &[u8],
in_attr_val: bool,
must_end_with_semicolon: bool,
) -> Vec<u8> {
let mut res = Vec::<u8>::new();
while !code.is_empty() {
let (before, matched) = match memchr(b'&', code) {
Expand All @@ -37,17 +42,18 @@ pub fn encode_entities(mut code: &[u8], in_attr_val: bool) -> Vec<u8> {
EntityType::Named(_)
if in_attr_val
&& code[len - 1] != b';'
&& code
.get(len)
.filter(|&&c| ALPHANUMERIC_OR_EQUALS[c])
.is_some() =>
&& code.get(len).is_some_and(|&c| ALPHANUMERIC_OR_EQUALS[c]) =>
{
// A named entity inside an attribute value that doesn't end with semicolon but is followed by an alphanumeric or `=` character is not decoded, so we don't need to encode.
// https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state.
0
}
_ => {
res.extend_from_slice(b"&amp");
if must_end_with_semicolon {
res.extend_from_slice(b"&amp;");
} else {
res.extend_from_slice(b"&amp");
};
// Skip the leading ampersand, as it will be replaced by `&amp` (or `&amp;` when a trailing semicolon is required).
1
}
Expand Down
Loading

0 comments on commit adfbcb1

Please sign in to comment.