diff --git a/CHANGELOG.md b/CHANGELOG.md index 3b77a3d3..1521856e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,19 @@ # Unreleased +* Improve the YAML compile cache to support `UTF-8` symbols. (#398) + [The default `MessagePack` symbol serializer assumes all symbols are ASCII](https://github.com/msgpack/msgpack-ruby/pull/211), + because of this, non-ASCII compatible symbol would be restored with `ASCII_8BIT` encoding (AKA `BINARY`). + Bootsnap now properly cache them in `UTF-8`. + + Note that the above only apply for actual YAML symbols (e..g `--- :foo`). + The issue is still present for string keys parsed with `YAML.load_file(..., symbolize_names: true)`, that is a bug + in `msgpack` that will hopefully be solved soon, see: https://github.com/msgpack/msgpack-ruby/pull/246 + +* Entirely disable the YAML compile cache if `Encoding.default_internal` is set to an encoding not supported by `msgpack`. (#398) + `Psych` coerce strings to `Encoding.default_internal`, but `MessagePack` doesn't. So in this scenario we can't provide + YAML caching at all without returning the strings in the wrong encoding. + This never came up in practice but might as well be safe. + # 1.10.2 * Reduce the `Kernel.require` extra stack frames some more. Now bootsnap should only add one extra frame per `require` call. diff --git a/ext/bootsnap/bootsnap.c b/ext/bootsnap/bootsnap.c index 33e9b76d..ad0cca15 100644 --- a/ext/bootsnap/bootsnap.c +++ b/ext/bootsnap/bootsnap.c @@ -75,7 +75,7 @@ struct bs_cache_key { STATIC_ASSERT(sizeof(struct bs_cache_key) == KEY_SIZE); /* Effectively a schema version. Bumping invalidates all previous caches */ -static const uint32_t current_version = 4; +static const uint32_t current_version = 5; /* hash of e.g. "x86_64-darwin17", invalidating when ruby is recompiled on a * new OS ABI, etc. */ diff --git a/lib/bootsnap/compile_cache/yaml.rb b/lib/bootsnap/compile_cache/yaml.rb index 9ab6707a..98cba35c 100644 --- a/lib/bootsnap/compile_cache/yaml.rb +++ b/lib/bootsnap/compile_cache/yaml.rb @@ -5,7 +5,15 @@ module Bootsnap module CompileCache module YAML - UnsupportedTags = Class.new(StandardError) + Uncompilable = Class.new(StandardError) + UnsupportedTags = Class.new(Uncompilable) + + SUPPORTED_INTERNAL_ENCODINGS = [ + nil, # UTF-8 + Encoding::UTF_8, + Encoding::ASCII, + Encoding::BINARY, + ].freeze class << self attr_accessor(:msgpack_factory, :supported_options) @@ -16,7 +24,9 @@ def cache_dir=(cache_dir) end def precompile(path) - Bootsnap::CompileCache::Native.precompile( + return false unless CompileCache::YAML.supported_internal_encoding? + + CompileCache::Native.precompile( cache_dir, path.to_s, @implementation, @@ -29,6 +39,44 @@ def install!(cache_dir) ::YAML.singleton_class.prepend(@implementation::Patch) end + # Psych coerce strings to `Encoding.default_internal` but Message Pack only support + # UTF-8, US-ASCII and BINARY. So if Encoding.default_internal is set to anything else + # we can't safely use the cache + def supported_internal_encoding? + SUPPORTED_INTERNAL_ENCODINGS.include?(Encoding.default_internal) + end + + module EncodingAwareSymbols + extend self + + if Symbol.method_defined?(:name) + def pack(symbol) + if symbol.encoding == Encoding::UTF_8 + 1.chr << symbol.name + else + 0.chr << symbol.name + end + end + else + def pack(symbol) + if symbol.encoding == Encoding::UTF_8 + 1.chr << symbol.to_s + else + 0.chr << symbol.to_s + end + end + end + + def unpack(payload) + payload.freeze + string = payload.byteslice(1..-1) + if payload.ord == 1 # Encoding::UTF_8 + string.force_encoding(Encoding::UTF_8) + end + string.to_sym + end + end + def init! require("yaml") require("msgpack") @@ -43,7 +91,12 @@ def init! # We want them to roundtrip cleanly, so we use a custom factory. # see: https://github.com/msgpack/msgpack-ruby/pull/122 factory = MessagePack::Factory.new - factory.register_type(0x00, Symbol) + factory.register_type( + 0x00, + Symbol, + packer: EncodingAwareSymbols.method(:pack).to_proc, + unpacker: EncodingAwareSymbols.method(:unpack).to_proc, + ) if defined? MessagePack::Timestamp factory.register_type( @@ -124,7 +177,7 @@ def input_to_storage(contents, _) packer.pack(false) # not safe loaded packer.pack(obj) packer.to_s - rescue NoMethodError, RangeError, UnsupportedTags + rescue NoMethodError, RangeError, Uncompilable UNCOMPILABLE # The object included things that we can't serialize end @@ -179,44 +232,48 @@ def input_to_output(data, kwargs) module Patch def load_file(path, *args) + return super unless CompileCache::YAML.supported_internal_encoding? + return super if args.size > 1 if (kwargs = args.first) return super unless kwargs.is_a?(Hash) - return super unless (kwargs.keys - ::Bootsnap::CompileCache::YAML.supported_options).empty? + return super unless (kwargs.keys - CompileCache::YAML.supported_options).empty? end begin - ::Bootsnap::CompileCache::Native.fetch( - Bootsnap::CompileCache::YAML.cache_dir, + CompileCache::Native.fetch( + CompileCache::YAML.cache_dir, File.realpath(path), - ::Bootsnap::CompileCache::YAML::Psych4::SafeLoad, + CompileCache::YAML::Psych4::SafeLoad, kwargs, ) rescue Errno::EACCES - ::Bootsnap::CompileCache.permission_error(path) + CompileCache.permission_error(path) end end ruby2_keywords :load_file if respond_to?(:ruby2_keywords, true) def unsafe_load_file(path, *args) + return super unless CompileCache::YAML.supported_internal_encoding? + return super if args.size > 1 if (kwargs = args.first) return super unless kwargs.is_a?(Hash) - return super unless (kwargs.keys - ::Bootsnap::CompileCache::YAML.supported_options).empty? + return super unless (kwargs.keys - CompileCache::YAML.supported_options).empty? end begin - ::Bootsnap::CompileCache::Native.fetch( - Bootsnap::CompileCache::YAML.cache_dir, + CompileCache::Native.fetch( + CompileCache::YAML.cache_dir, File.realpath(path), - ::Bootsnap::CompileCache::YAML::Psych4::UnsafeLoad, + CompileCache::YAML::Psych4::UnsafeLoad, kwargs, ) rescue Errno::EACCES - ::Bootsnap::CompileCache.permission_error(path) + CompileCache.permission_error(path) end end @@ -233,7 +290,7 @@ def input_to_storage(contents, _) packer.pack(false) # not safe loaded packer.pack(obj) packer.to_s - rescue NoMethodError, RangeError, UnsupportedTags + rescue NoMethodError, RangeError, Uncompilable UNCOMPILABLE # The object included things that we can't serialize end @@ -253,44 +310,48 @@ def input_to_output(data, kwargs) module Patch def load_file(path, *args) + return super unless CompileCache::YAML.supported_internal_encoding? + return super if args.size > 1 if (kwargs = args.first) return super unless kwargs.is_a?(Hash) - return super unless (kwargs.keys - ::Bootsnap::CompileCache::YAML.supported_options).empty? + return super unless (kwargs.keys - CompileCache::YAML.supported_options).empty? end begin - ::Bootsnap::CompileCache::Native.fetch( - Bootsnap::CompileCache::YAML.cache_dir, + CompileCache::Native.fetch( + CompileCache::YAML.cache_dir, File.realpath(path), - ::Bootsnap::CompileCache::YAML::Psych3, + CompileCache::YAML::Psych3, kwargs, ) rescue Errno::EACCES - ::Bootsnap::CompileCache.permission_error(path) + CompileCache.permission_error(path) end end ruby2_keywords :load_file if respond_to?(:ruby2_keywords, true) def unsafe_load_file(path, *args) + return super unless CompileCache::YAML.supported_internal_encoding? + return super if args.size > 1 if (kwargs = args.first) return super unless kwargs.is_a?(Hash) - return super unless (kwargs.keys - ::Bootsnap::CompileCache::YAML.supported_options).empty? + return super unless (kwargs.keys - CompileCache::YAML.supported_options).empty? end begin - ::Bootsnap::CompileCache::Native.fetch( - Bootsnap::CompileCache::YAML.cache_dir, + CompileCache::Native.fetch( + CompileCache::YAML.cache_dir, File.realpath(path), - ::Bootsnap::CompileCache::YAML::Psych3, + CompileCache::YAML::Psych3, kwargs, ) rescue Errno::EACCES - ::Bootsnap::CompileCache.permission_error(path) + CompileCache.permission_error(path) end end diff --git a/test/compile_cache/yaml_test.rb b/test/compile_cache/yaml_test.rb index 1f6c09c7..5d593394 100644 --- a/test/compile_cache/yaml_test.rb +++ b/test/compile_cache/yaml_test.rb @@ -49,6 +49,27 @@ def test_yaml_tags assert_equal "YAML tags are not supported: !ruby/object", error.message end + def test_symbols_encoding + symbols = [:ascii, :utf8_fée] + Help.set_file("a.yml", YAML.dump(symbols), 100) + + loaded_symbols = FakeYaml.load_file("a.yml") + assert_equal(symbols, loaded_symbols) + assert_equal(symbols.map(&:encoding), loaded_symbols.map(&:encoding)) + end + + def test_custom_symbols_encoding + sym = "壁に耳あり、障子に目あり".to_sym + Help.set_file("a.yml", YAML.dump(sym), 100) + # YAML is limited to UTF-8 and UTF-16 by spec, but Psych does respect Encoding.default_internal + # so strings and symbol can actually be of any encoding. + assert_raises FakeYaml::Fallback do + with_default_encoding_internal(Encoding::EUC_JP) do + FakeYaml.load_file("a.yml") + end + end + end + if YAML::VERSION >= "4" def test_load_psych_4_with_alias Help.set_file("a.yml", "foo: &foo\n bar: 42\nplop:\n <<: *foo", 100) @@ -170,4 +191,20 @@ def test_unsafe_load_file assert_equal({"foo" => {"bar" => 42}, "plop" => {"bar" => 42}}, FakeYaml.unsafe_load_file("a.yml")) end end + + private + + def with_default_encoding_internal(encoding) + original_internal = Encoding.default_internal + $VERBOSE = false + Encoding.default_internal = encoding + $VERBOSE = true + begin + yield + ensure + $VERBOSE = false + Encoding.default_internal = original_internal + $VERBOSE = true + end + end end diff --git a/test/compile_cache_key_format_test.rb b/test/compile_cache_key_format_test.rb index bf7f2e3c..689238e7 100644 --- a/test/compile_cache_key_format_test.rb +++ b/test/compile_cache_key_format_test.rb @@ -21,7 +21,7 @@ class CompileCacheKeyFormatTest < Minitest::Test def test_key_version key = cache_key_for_file(FILE) - exp = [4].pack("L") + exp = [5].pack("L") assert_equal(exp, key[R[:version]]) end