diff --git a/Cargo.toml b/Cargo.toml index ae111be..7e6feaa 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,6 +18,7 @@ maplit = "1.0" tendril = "0.4" url = "2" once_cell = "1.10" +cssparser = "0.34.0" [dev-dependencies] version-sync = "0.9" diff --git a/src/lib.rs b/src/lib.rs index 605923c..4ace0bd 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -33,6 +33,8 @@ pub mod rcdom; #[cfg(not(ammonia_unstable))] mod rcdom; +mod style; + use html5ever::interface::Attribute; use html5ever::serialize::{serialize, SerializeOpts}; use html5ever::tree_builder::{NodeOrText, TreeSink}; @@ -367,6 +369,7 @@ pub struct Builder<'a> { strip_comments: bool, id_prefix: Option<&'a str>, generic_attribute_prefixes: Option>, + style_properties: Option>, } impl<'a> Default for Builder<'a> { @@ -487,6 +490,7 @@ impl<'a> Default for Builder<'a> { strip_comments: true, id_prefix: None, generic_attribute_prefixes: None, + style_properties: None, } } } @@ -1651,6 +1655,34 @@ impl<'a> Builder<'a> { self } + /// Only allows the specified properties in `style` attributes. + /// + /// Irrelevant if `style` is not an allowed attribute. + /// + /// Note that if style filtering is enabled style properties will be normalised e.g. + /// invalid declarations and @rules will be removed, with only syntactically valid + /// declarations kept. + /// + /// # Examples + /// + /// use ammonia::Builder; + /// use maplit::hashset; + /// + /// # fn main() { + /// let attributes = hashset!["style"]; + /// let properties = hashset!["color"]; + /// let a = Builder::new() + /// .generic_attributes(attributes) + /// .filter_style_properties(properties) + /// .clean("

my html

") + /// .to_string(); + /// assert_eq!(a, "

my html

"); + /// # } + pub fn filter_style_properties(&mut self, value: HashSet<&'a str>) -> &mut Self { + self.style_properties = Some(value); + self + } + /// Constructs a [`Builder`] instance configured with the [default options]. /// /// # Examples @@ -2048,6 +2080,7 @@ impl<'a> Builder<'a> { /// /// * relative URL rewriting /// * adding `` attributes + /// * filtering out banned style properties /// * filtering out banned classes fn adjust_node_attributes( &self, @@ -2141,6 +2174,13 @@ impl<'a> Builder<'a> { attrs.swap_remove(i); } } + if let Some(allowed_values) = &self.style_properties { + for attr in &mut *attrs.borrow_mut() { + if &attr.name.local == "style" { + attr.value = style::filter_style_attribute(&attr.value, allowed_values).into(); + } + } + } if let Some(allowed_values) = self.allowed_classes.get(&*name.local) { for attr in &mut *attrs.borrow_mut() { if &attr.name.local == "class" { diff --git a/src/style.rs b/src/style.rs new file mode 100644 index 0000000..03c70b1 --- /dev/null +++ b/src/style.rs @@ -0,0 +1,332 @@ +//! HTML living standard 3.2.6.5 The `style` attribute: +//! +//! > All HTML elements may have the `style` content attribute set. +//! > This is a style attribute as defined by [CSS Style Attributes](CSSATTR). +//! +//! CSSATTR 3. Syntax and Parsing +//! +//! The value of the style attribute must match the syntax of the contents of a CSS +//! declaration block (excluding the delimiting braces), whose formal grammar is given +//! below in the terms and conventions of the CSS core grammar: +//! +//! ```yacc +//! style-attribute +//! : S* declaration-list +//! ; +//! +//! declaration-list +//! : declaration [ ';' S* declaration-list ]? +//! | at-rule declaration-list +//! | /* empty */ +//! ; +//! ``` +//! +//! > Note that because there is no open brace delimiting the declaration list in +//! > the CSS style attribute syntax, a close brace (`}`) in the style attribute's +//! > value does not terminate the style data: it is merely an invalid token. +//! +//! > [...] Although the grammar allows it, no at-rule valid in style attributes is +//! > define[d] at the moment. The forward-compatible parsing rules are such that +//! > a declaration following an at-rule is *not* ignored +//! +//! [CSSATTR]: https://w3c.github.io/csswg-drafts/css-style-attr/ +use std::collections::HashSet; + +use cssparser::{BasicParseErrorKind, DeclarationParser, Delimiter::CurlyBracketBlock, ParseError, ParseErrorKind, Parser, ParserInput, ToCss, Token}; + + + +/// Filters `style` to only keep the declarations whose property name are listed in +/// `names`. Also normalises the style attribute by stripping broken declarations +/// and constructs per [CSSATTR] rules. +pub fn filter_style_attribute( + style: &str, + names: &HashSet<&str>, +) -> String { + // add room for the trailing semicolon because we lazy + let mut out = String::with_capacity(style.len() + 1); + + let mut input = ParserInput::new(style); + let mut p = Parser::new(&mut input); + + loop { + match parse_one_declaration(&mut p, names) { + Ok((name, value)) => { + if !name.is_empty() { + out.push_str(&name); + out.push(':'); + out.push_str(&value); + out.push(';'); + } + }, + Err(e) => match e.kind { + ParseErrorKind::Basic(BasicParseErrorKind::EndOfInput) => break, + ParseErrorKind::Basic(BasicParseErrorKind::UnexpectedToken(Token::Semicolon)) => (), + ParseErrorKind::Basic(BasicParseErrorKind::UnexpectedToken(_)) => { + advance(&mut p); + }, + _ => unreachable!( + "parse_one_declaration should only attempt to parse an ident, a colon, \ + or a Declaration, so its only errors should be EOF or an unexpected token" + ), + }, + } + } + if !out.is_empty() { + // remove trailing semicolon (?) + out.pop(); + } + out +} + + +/// The builtin parse_one_declaration errors on a declaration list, that is not what we want. +/// +/// Also we don't need the errorneous slice on failure, since we just skip. +/// +/// Finally, add property filtering directly so we don't need to pay for the +/// `DeclarationParser::parse_value` if the property is not whitelisted. If +/// a property is filtered out, it gets parsed as `("", "")`. +pub fn parse_one_declaration<'i, 't>( + input: &mut Parser<'i, 't>, + valid_properties: &HashSet<&str>, +) -> Result<(cssparser::CowRcStr<'i>, String), ParseError<'i, ()>> +{ + let name = input.expect_ident()?.clone(); + if !valid_properties.contains(&*name) { + advance(input); + return Ok(("".into(), String::new())); + } + input.expect_colon()?; + Declarations.parse_value(name, input) +} + + +struct Declarations; +impl <'i> DeclarationParser<'i> for Declarations { + type Declaration = (cssparser::CowRcStr<'i>, String); + type Error = (); + + fn parse_value<'t>( + &mut self, + name: cssparser::CowRcStr<'i>, + input: &mut Parser<'i, 't>, + ) -> Result> { + let mut value = String::new(); + loop { + let t = match input.next() { + Err(e) if e.kind == cssparser::BasicParseErrorKind::EndOfInput => { + &Token::Semicolon + } + t => t?, + }; + use Token::*; + match t { + Semicolon => { + if value.chars().all(char::is_whitespace) { + return Ok(("".into(), String::new())); + } + break + } + + BadString(_) | BadUrl(_) => { + let err = cssparser::BasicParseErrorKind::UnexpectedToken(t.clone()); + return Err(input.new_error(err)); + } + + Function(_) => { + if !value.is_empty() && value.chars().last() != Some(' ') { + value.push(' '); + } + let Ok(_) = t.to_css(&mut value) else { + let err = cssparser::BasicParseErrorKind::UnexpectedToken(t.clone()); + return Err(input.new_error::<()>(err)); + }; + input.parse_nested_block(|p| { + let mut first = true; + loop { + match p.next() { + Ok(t) => { + if t.is_parse_error() { + let err = cssparser::BasicParseErrorKind::UnexpectedToken(t.clone()); + return Err(p.new_error(err)); + } + if !first && t != &Comma { + value.push(' '); + } + let Ok(_) = t.to_css(&mut value) else { + let err = cssparser::BasicParseErrorKind::UnexpectedToken(t.clone()); + return Err(p.new_error::<()>(err)); + }; + first = false; + } + Err(e) if e.kind == BasicParseErrorKind::EndOfInput => break Ok(()), + Err(e) => return Err(e.into()), + } + } + })?; + value.push(')'); + continue; + } + + _ => (), + } + if !value.is_empty() && value.chars().last() != Some(' ') { + value.push(' '); + } + let Ok(_) = t.to_css(&mut value) else { + let err = cssparser::BasicParseErrorKind::UnexpectedToken(t.clone()); + return Err(input.new_error(err)); + }; + } + if value.chars().all(char::is_whitespace) { + Err(input.new_error(cssparser::BasicParseErrorKind::EndOfInput)) + } else { + Ok((name, value)) + } + } +} + +// find end of declaration (EOF or semicolon) in order to recover +fn advance<'i, 't>(p: &mut Parser<'i, 't>) { + loop { + match p.next() { + Ok(Token::Semicolon) => { return } + // cssparser automatically handles paired delimiters, if we encounter a curly + // bracket the next token is whatever follows the corresponding closing + // bracket, which may be a new declaration + Ok(Token::CurlyBracketBlock) => { return }, + Err(e) if e.kind == cssparser::BasicParseErrorKind::EndOfInput => { return } + _ => () + } + + } +} + +#[cfg(test)] +mod tests { + use super::filter_style_attribute; + use std::{collections::HashSet, sync::LazyLock}; + + #[test] + fn single_declaration() { + assert_eq!( + filter_style_attribute("font-style: italic", &HashSet::from(["font-style"])), + "font-style:italic", + ); + } + + #[test] + fn terminated_declaration() { + assert_eq!( + filter_style_attribute("font-style: italic;", &HashSet::from(["font-style"])), + "font-style:italic", + ); + } + + #[test] + fn complex() { + assert_eq!( + filter_style_attribute( + "background: no-repeat center/80% url(\"../img/image.png\");", + &HashSet::from(["background"]), + ), + "background:no-repeat center / 80% url(\"../img/image.png\")", + ) + } + + /// forward-compatible parsing rules should just skip the unknown / contextually invalid at-rule + #[test] + fn at_rule() { + assert_eq!( + filter_style_attribute( + "@unsupported { splines: reticulating } color: green", + &HashSet::from(["color", "splines"]), + ), + "color:green", + ); + } + + #[test] + fn invalid_at_rules() { + assert_eq!( + filter_style_attribute("@charset 'utf-8'; color: green", &HashSet::from(["color"])), + "color:green", + ); + assert_eq!( + filter_style_attribute("@foo url(https://example.org); color: green", &HashSet::from(["color"])), + "color:green", + ); + assert_eq!( + filter_style_attribute("@media screen { color: red }; color: green", &HashSet::from(["color"])), + "color:green", + ); + + assert_eq!( + filter_style_attribute("@scope (main) { div { color: red } }; color: green", &HashSet::from(["color"])), + "color:green", + ); + } + + #[test] + fn empty_value() { + assert_eq!( + filter_style_attribute("content: ''", &HashSet::from(["content"])), + "content:\"\"", + ) + } + + static ALLOWED: LazyLock> = LazyLock::new(|| HashSet::from(["color", "foo"])); + #[test] + fn multiple() { + assert_eq!(filter_style_attribute("foo: 1; color: green", &ALLOWED), "foo:1;color:green"); + } + + /// https://www.w3.org/TR/CSS21/syndata.html#:~:text=malformed%20declarations + #[test] + fn malformed_declarations() { + let h = &HashSet::from(["color"]); + for decl in [ + "color:green", + "color:green; color", + "color:green; color:", + "color:green; color{;color:maroon}", + ] { + assert_eq!( + filter_style_attribute(decl, h), + "color:green", + "{}", decl, + ); + } + // should we also keep track of properties and remove duplicates? + for decl in [ + "color:red; color; color:green", + "color:red; color:; color:green", + "color:red; color{;color:maroon}; color:green", + ] { + assert_eq!( + filter_style_attribute(decl, h), + "color:red;color:green", + "{}", decl, + ); + } + } + + #[ignore = "can't recover from such a BadString (servo/rust-cssparser#393)"] + #[test] + fn badstring_escaped_newline() { + assert_eq!(filter_style_attribute("foo: '\n'; color: green", &ALLOWED), "color:green"); + } + + #[ignore = "can't recover from such a BadString (servo/rust-cssparser#393)"] + #[test] + fn badstring_literal_newline() { + assert_eq!(filter_style_attribute("foo: ' + '; color: green", &ALLOWED), "color:green"); + } + + #[test] + fn bad_url() { + assert_eq!(filter_style_attribute("foo: url(x'y); color: green", &ALLOWED), "color:green"); + } +} \ No newline at end of file