From c6c756087a89f40f4c17476f87b70c0f96fa39f7 Mon Sep 17 00:00:00 2001 From: Jackson Owens Date: Tue, 16 May 2023 11:40:23 -0400 Subject: [PATCH] db: add reserved Pebblev4 sstable format This commit defines but leaves unused a new sstable table format, Pebblev4, that will subsume the Pebblev2 and Pebblev3 sstable formats. Future work (#2465, #2340) will require additional sstable table formats. Stabilization of the table format extensions introduced in the Pebblev3 table format is required before that can happen. The new Pebblev4 format will include Pebblev3's extensions. This commit adjusts the code to not respect the Experimental.EnableValueBlocks setting in future format major versions that make use of the Pebblev4 sstable format. There's some subtlety involved in this change, hence the introduction of the Pebblev4 sstable format before it's used. --- compaction.go | 13 ++++++------- data_test.go | 2 ++ format_major_version.go | 3 +-- options.go | 7 +++++-- sstable/format.go | 12 ++++++++---- sstable/format_test.go | 10 ++++++++-- sstable/reader.go | 6 +++--- sstable/suffix_rewriter.go | 2 +- sstable/suffix_rewriter_test.go | 4 +--- sstable/table.go | 2 +- sstable/testdata/writer_value_blocks | 4 ++-- sstable/writer.go | 2 +- table_cache.go | 2 +- testdata/format_major_version_pebblev1_migration | 2 ++ 14 files changed, 42 insertions(+), 29 deletions(-) diff --git a/compaction.go b/compaction.go index ba8d2cc5d3..fcc840dd39 100644 --- a/compaction.go +++ b/compaction.go @@ -2772,17 +2772,16 @@ func (d *DB) runCompaction( // The table is typically written at the maximum allowable format implied by // the current format major version of the DB. tableFormat := formatVers.MaxTableFormat() - if tableFormat > sstable.TableFormatPebblev3 { - // Since TableFormatPebblev3 does not currently subsume - // TableFormatPebblev2, this panic ensures that we have carefully thought - // through what we are doing before we introduce a format beyond - // TableFormatPebblev3. 
- panic("cannot handle table format beyond TableFormatPebblev3") - } + + // In format major versions with maximum table formats of Pebblev3, value + // blocks were conditional on an experimental setting. In format major + // versions with maximum table formats of Pebblev4 and higher, value blocks + // are always enabled. if tableFormat == sstable.TableFormatPebblev3 && (d.opts.Experimental.EnableValueBlocks == nil || !d.opts.Experimental.EnableValueBlocks()) { tableFormat = sstable.TableFormatPebblev2 } + writerOpts := d.opts.MakeWriterOptions(c.outputLevel.level, tableFormat) if formatVers < FormatBlockPropertyCollector { // Cannot yet write block properties. diff --git a/data_test.go b/data_test.go index 74973ee61f..ce1d06f6a4 100644 --- a/data_test.go +++ b/data_test.go @@ -606,6 +606,8 @@ func runBuildCmd(td *datadriven.TestData, d *DB, fs vfs.FS) error { tableFormat = sstable.TableFormatPebblev2 case "pebblev3": tableFormat = sstable.TableFormatPebblev3 + case "pebblev4": + tableFormat = sstable.TableFormatPebblev4 default: return errors.Errorf("unknown format string %s", cmdArg.Vals[0]) } diff --git a/format_major_version.go b/format_major_version.go index 48b3b2840f..6d96297075 100644 --- a/format_major_version.go +++ b/format_major_version.go @@ -168,8 +168,7 @@ func (v FormatMajorVersion) MaxTableFormat() sstable.TableFormat { case FormatRangeKeys, FormatMinTableFormatPebblev1, FormatPrePebblev1Marked, FormatUnusedPrePebblev1MarkedCompacted: return sstable.TableFormatPebblev2 - case FormatSSTableValueBlocks, FormatFlushableIngest, - FormatPrePebblev1MarkedCompacted: + case FormatSSTableValueBlocks, FormatFlushableIngest, FormatPrePebblev1MarkedCompacted: return sstable.TableFormatPebblev3 default: panic(fmt.Sprintf("pebble: unsupported format major version: %s", v)) diff --git a/options.go b/options.go index 4e99437c07..2bdc9d505c 100644 --- a/options.go +++ b/options.go @@ -613,8 +613,11 @@ type Options struct { PointTombstoneWeight float64 // 
EnableValueBlocks is used to decide whether to enable writing - // TableFormatPebblev3 sstables. WARNING: do not return true yet, since - // support for TableFormatPebblev3 is incomplete and not production ready. + // TableFormatPebblev3 sstables. This setting is only respected by a + // specific subset of format major versions: FormatSSTableValueBlocks, + // FormatFlushableIngest and FormatPrePebblev1MarkedCompacted. In lower + // format major versions, value blocks are never enabled. In higher + // format major versions, value blocks are always enabled. EnableValueBlocks func() bool // ShortAttributeExtractor is used iff EnableValueBlocks() returns true diff --git a/sstable/format.go b/sstable/format.go index f719ec0e73..91909835c2 100644 --- a/sstable/format.go +++ b/sstable/format.go @@ -23,12 +23,10 @@ const ( TableFormatRocksDBv2 TableFormatPebblev1 // Block properties. TableFormatPebblev2 // Range keys. - // TableFormatPebblev3 is not currently intended to subsume v2, as - // supporting value blocks adds a 1 byte prefix to each value. After - // thorough experimentation and some production experience, this may change. TableFormatPebblev3 // Value blocks. + TableFormatPebblev4 // Reserved. 
- TableFormatMax = TableFormatPebblev3 + TableFormatMax = TableFormatPebblev4 ) // ParseTableFormat parses the given magic bytes and version into its @@ -52,6 +50,8 @@ func ParseTableFormat(magic []byte, version uint32) (TableFormat, error) { return TableFormatPebblev2, nil case 3: return TableFormatPebblev3, nil + case 4: + return TableFormatPebblev4, nil default: return TableFormatUnspecified, base.CorruptionErrorf( "pebble/table: unsupported pebble format version %d", errors.Safe(version), @@ -77,6 +77,8 @@ func (f TableFormat) AsTuple() (string, uint32) { return pebbleDBMagic, 2 case TableFormatPebblev3: return pebbleDBMagic, 3 + case TableFormatPebblev4: + return pebbleDBMagic, 4 default: panic("sstable: unknown table format version tuple") } @@ -95,6 +97,8 @@ func (f TableFormat) String() string { return "(Pebble,v2)" case TableFormatPebblev3: return "(Pebble,v3)" + case TableFormatPebblev4: + return "(Pebble,v4)" default: panic("sstable: unknown table format version tuple") } diff --git a/sstable/format_test.go b/sstable/format_test.go index cd1d1735be..f5589c1239 100644 --- a/sstable/format_test.go +++ b/sstable/format_test.go @@ -49,6 +49,12 @@ func TestTableFormat_RoundTrip(t *testing.T) { version: 3, want: TableFormatPebblev3, }, + { + name: "PebbleDBv4", + magic: pebbleDBMagic, + version: 4, + want: TableFormatPebblev4, + }, // Invalid cases. 
{ name: "Invalid RocksDB version", @@ -59,8 +65,8 @@ func TestTableFormat_RoundTrip(t *testing.T) { { name: "Invalid PebbleDB version", magic: pebbleDBMagic, - version: 4, - wantErr: "pebble/table: unsupported pebble format version 4", + version: 5, + wantErr: "pebble/table: unsupported pebble format version 5", }, { name: "Unknown magic string", diff --git a/sstable/reader.go b/sstable/reader.go index 97f027e817..0bbfd628a2 100644 --- a/sstable/reader.go +++ b/sstable/reader.go @@ -435,7 +435,7 @@ func (i *singleLevelIterator) init( return err } i.dataRH = objstorageprovider.UsePreallocatedReadHandle(ctx, r.readable, &i.dataRHPrealloc) - if r.tableFormat == TableFormatPebblev3 { + if r.tableFormat >= TableFormatPebblev3 { if r.Properties.NumValueBlocks > 0 { // NB: we cannot avoid this ~248 byte allocation, since valueBlockReader // can outlive the singleLevelIterator due to be being embedded in a @@ -1874,7 +1874,7 @@ func (i *twoLevelIterator) init( return err } i.dataRH = r.readable.NewReadHandle(ctx) - if r.tableFormat == TableFormatPebblev3 { + if r.tableFormat >= TableFormatPebblev3 { if r.Properties.NumValueBlocks > 0 { i.vbReader = &valueBlockReader{ ctx: ctx, @@ -4067,7 +4067,7 @@ func (l *Layout) Describe( formatIsRestart(iter.data, iter.restarts, iter.numRestarts, iter.offset) if fmtRecord != nil { fmt.Fprintf(w, " ") - if l.Format != TableFormatPebblev3 { + if l.Format < TableFormatPebblev3 { fmtRecord(key, value.InPlaceValue()) } else { // InPlaceValue() will succeed even for data blocks where the diff --git a/sstable/suffix_rewriter.go b/sstable/suffix_rewriter.go index 39089c9884..984e8d139e 100644 --- a/sstable/suffix_rewriter.go +++ b/sstable/suffix_rewriter.go @@ -223,7 +223,7 @@ func rewriteBlocks( // in the block, which includes the 1-byte prefix. This is fine since bw // also does not know about the prefix and will preserve it in bw.add. 
v := val.InPlaceValue() - if invariants.Enabled && r.tableFormat == TableFormatPebblev3 && + if invariants.Enabled && r.tableFormat >= TableFormatPebblev3 && key.Kind() == InternalKeyKindSet { if len(v) < 1 { return errors.Errorf("value has no prefix") diff --git a/sstable/suffix_rewriter_test.go b/sstable/suffix_rewriter_test.go index 47141ff70b..da6a47232c 100644 --- a/sstable/suffix_rewriter_test.go +++ b/sstable/suffix_rewriter_test.go @@ -18,9 +18,7 @@ func TestRewriteSuffixProps(t *testing.T) { from, to := []byte("_212"), []byte("_646") format := TableFormatPebblev2 - if rand.Intn(2) != 0 { - format = TableFormatPebblev3 - } + format += TableFormat(rand.Intn(int(TableFormatPebblev4 - TableFormatPebblev2))) t.Logf("table format: %s\n", format.String()) wOpts := WriterOptions{ FilterPolicy: bloom.FilterPolicy(10), diff --git a/sstable/table.go b/sstable/table.go index f5e6a1c1d9..5267123029 100644 --- a/sstable/table.go +++ b/sstable/table.go @@ -428,7 +428,7 @@ func supportsTwoLevelIndex(format TableFormat) bool { switch format { case TableFormatLevelDB: return false - case TableFormatRocksDBv2, TableFormatPebblev1, TableFormatPebblev2, TableFormatPebblev3: + case TableFormatRocksDBv2, TableFormatPebblev1, TableFormatPebblev2, TableFormatPebblev3, TableFormatPebblev4: return true default: panic("sstable: unspecified table format version") diff --git a/sstable/testdata/writer_value_blocks b/sstable/testdata/writer_value_blocks index 04d664bc34..745cc97ad8 100644 --- a/sstable/testdata/writer_value_blocks +++ b/sstable/testdata/writer_value_blocks @@ -214,7 +214,7 @@ layout 1255 meta: offset=1185, length=64 1258 index: offset=264, length=77 1261 [padding] - 1295 version: 3 + 1295 version: 4 1299 magic number: 0xf09faab3f09faab3 1307 EOF @@ -337,6 +337,6 @@ layout 856 meta: offset=818, length=32 859 index: offset=71, length=22 861 [padding] - 896 version: 3 + 896 version: 4 900 magic number: 0xf09faab3f09faab3 908 EOF diff --git a/sstable/writer.go 
b/sstable/writer.go index aaeda43ea1..d8186881c1 100644 --- a/sstable/writer.go +++ b/sstable/writer.go @@ -2110,7 +2110,7 @@ func NewWriter(writable objstorage.Writable, o WriterOptions, extraOpts ...Write Format: o.Comparer.FormatKey, }, } - if w.tableFormat == TableFormatPebblev3 { + if w.tableFormat >= TableFormatPebblev3 { w.shortAttributeExtractor = o.ShortAttributeExtractor w.requiredInPlaceValueBound = o.RequiredInPlaceValueBound w.valueBlockWriter = newValueBlockWriter( diff --git a/table_cache.go b/table_cache.go index 0b97ce6903..9a3c5b5724 100644 --- a/table_cache.go +++ b/table_cache.go @@ -447,7 +447,7 @@ func (c *tableCacheShard) newIters( return nil, nil, err } var rp sstable.ReaderProvider - if tableFormat == sstable.TableFormatPebblev3 && v.reader.Properties.NumValueBlocks > 0 { + if tableFormat >= sstable.TableFormatPebblev3 && v.reader.Properties.NumValueBlocks > 0 { rp = &tableCacheShardReaderProvider{c: c, file: file, dbOpts: dbOpts} } diff --git a/testdata/format_major_version_pebblev1_migration b/testdata/format_major_version_pebblev1_migration index 52c356e932..c579a53050 100644 --- a/testdata/format_major_version_pebblev1_migration +++ b/testdata/format_major_version_pebblev1_migration @@ -70,6 +70,7 @@ tally-table-formats (Pebble,v1): 1 (Pebble,v2): 2 (Pebble,v3): 0 +(Pebble,v4): 0 # Upgrade the DB to FormatMinTableFormatPebblev1. @@ -166,3 +167,4 @@ tally-table-formats (Pebble,v1): 1 (Pebble,v2): 4 (Pebble,v3): 0 +(Pebble,v4): 0