From cb4a0f744712c82db4dbee90b99851b71523c10e Mon Sep 17 00:00:00 2001
From: rhysd
Date: Mon, 15 Apr 2024 20:20:34 +0900
Subject: [PATCH] ignore UTF-8 BOM on syntax detection (fix #529)

---
 src/parsing/syntax_set.rs | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/src/parsing/syntax_set.rs b/src/parsing/syntax_set.rs
index 5de15fc..3833fed 100644
--- a/src/parsing/syntax_set.rs
+++ b/src/parsing/syntax_set.rs
@@ -213,6 +213,7 @@ impl SyntaxSet {
     /// This uses regexes that come with some sublime syntax grammars for matching things like
     /// shebangs and mode lines like `-*- Mode: C -*-`
     pub fn find_syntax_by_first_line<'a>(&'a self, s: &str) -> Option<&'a SyntaxReference> {
+        let s = s.strip_prefix("\u{feff}").unwrap_or(s); // Strip UTF-8 BOM
         let cache = self.first_line_cache();
         for &(ref reg, i) in cache.regexes.iter().rev() {
             if reg.search(s, 0, s.len(), None) {
@@ -1401,6 +1402,16 @@ mod tests {
         assert_prototype_only_on(&["main"], &rebuilt, &rebuilt.syntaxes()[0]);
     }
 
+    #[test]
+    fn find_syntax_set_from_line_with_bom() {
+        // Regression test for #529
+        let syntax_set = SyntaxSet::load_defaults_newlines();
+        let syntax_ref = syntax_set
+            .find_syntax_by_first_line("\u{feff}<?xml version=\"1.0\" encoding=\"utf-8\"?>")
+            .unwrap();
+        assert_eq!(syntax_ref.name, "XML");
+    }
+
     fn assert_ops_contain(ops: &[(usize, ScopeStackOp)], expected: &(usize, ScopeStackOp)) {
         assert!(
             ops.contains(expected),