Skip to content

Commit

Permalink
db: add reserved Pebblev4 sstable format
Browse files Browse the repository at this point in the history
This commit defines but leaves unused a new sstable table format, Pebblev4, that will
subsume Pebblev2 and Pebblev3 sstable formats. Future work (cockroachdb#2465, cockroachdb#2340) will
require additional sstable table formats. Stabilization of the table format
extensions introduced in the Pebblev3 table format is required before that can
happen. The new Pebblev4 format will include Pebblev3's extensions. This commit
adjusts the code to not respect the Experimental.EnableValueBlocks setting in
future format major versions that make use of the Pebblev4 sstable format.

There's some subtlety involved in this change, hence the introduction of the
Pebblev4 sstable format before it's used.
  • Loading branch information
jbowens committed May 17, 2023
1 parent 5a6b91b commit c6c7560
Show file tree
Hide file tree
Showing 14 changed files with 42 additions and 29 deletions.
13 changes: 6 additions & 7 deletions compaction.go
Original file line number Diff line number Diff line change
Expand Up @@ -2772,17 +2772,16 @@ func (d *DB) runCompaction(
// The table is typically written at the maximum allowable format implied by
// the current format major version of the DB.
tableFormat := formatVers.MaxTableFormat()
if tableFormat > sstable.TableFormatPebblev3 {
// Since TableFormatPebblev3 does not currently subsume
// TableFormatPebblev2, this panic ensures that we have carefully thought
// through what we are doing before we introduce a format beyond
// TableFormatPebblev3.
panic("cannot handle table format beyond TableFormatPebblev3")
}

// In format major versions with maximum table formats of Pebblev3, value
// blocks were conditional on an experimental setting. In format major
// versions with maximum table formats of Pebblev4 and higher, value blocks
// are always enabled.
if tableFormat == sstable.TableFormatPebblev3 &&
(d.opts.Experimental.EnableValueBlocks == nil || !d.opts.Experimental.EnableValueBlocks()) {
tableFormat = sstable.TableFormatPebblev2
}

writerOpts := d.opts.MakeWriterOptions(c.outputLevel.level, tableFormat)
if formatVers < FormatBlockPropertyCollector {
// Cannot yet write block properties.
Expand Down
2 changes: 2 additions & 0 deletions data_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -606,6 +606,8 @@ func runBuildCmd(td *datadriven.TestData, d *DB, fs vfs.FS) error {
tableFormat = sstable.TableFormatPebblev2
case "pebblev3":
tableFormat = sstable.TableFormatPebblev3
case "pebblev4":
tableFormat = sstable.TableFormatPebblev4
default:
return errors.Errorf("unknown format string %s", cmdArg.Vals[0])
}
Expand Down
3 changes: 1 addition & 2 deletions format_major_version.go
Original file line number Diff line number Diff line change
Expand Up @@ -168,8 +168,7 @@ func (v FormatMajorVersion) MaxTableFormat() sstable.TableFormat {
case FormatRangeKeys, FormatMinTableFormatPebblev1, FormatPrePebblev1Marked,
FormatUnusedPrePebblev1MarkedCompacted:
return sstable.TableFormatPebblev2
case FormatSSTableValueBlocks, FormatFlushableIngest,
FormatPrePebblev1MarkedCompacted:
case FormatSSTableValueBlocks, FormatFlushableIngest, FormatPrePebblev1MarkedCompacted:
return sstable.TableFormatPebblev3
default:
panic(fmt.Sprintf("pebble: unsupported format major version: %s", v))
Expand Down
7 changes: 5 additions & 2 deletions options.go
Original file line number Diff line number Diff line change
Expand Up @@ -613,8 +613,11 @@ type Options struct {
PointTombstoneWeight float64

// EnableValueBlocks is used to decide whether to enable writing
// TableFormatPebblev3 sstables. WARNING: do not return true yet, since
// support for TableFormatPebblev3 is incomplete and not production ready.
// TableFormatPebblev3 sstables. This setting is only respected by a
// specific subset of format major versions: FormatSSTableValueBlocks,
// FormatFlushableIngest and FormatPrePebblev1MarkedCompacted. In lower
// format major versions, value blocks are never enabled. In higher
// format major versions, value blocks are always enabled.
EnableValueBlocks func() bool

// ShortAttributeExtractor is used iff EnableValueBlocks() returns true
Expand Down
12 changes: 8 additions & 4 deletions sstable/format.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,10 @@ const (
TableFormatRocksDBv2
TableFormatPebblev1 // Block properties.
TableFormatPebblev2 // Range keys.
// TableFormatPebblev3 is not currently intended to subsume v2, as
// supporting value blocks adds a 1 byte prefix to each value. After
// thorough experimentation and some production experience, this may change.
TableFormatPebblev3 // Value blocks.
TableFormatPebblev4 // Reserved.

TableFormatMax = TableFormatPebblev3
TableFormatMax = TableFormatPebblev4
)

// ParseTableFormat parses the given magic bytes and version into its
Expand All @@ -52,6 +50,8 @@ func ParseTableFormat(magic []byte, version uint32) (TableFormat, error) {
return TableFormatPebblev2, nil
case 3:
return TableFormatPebblev3, nil
case 4:
return TableFormatPebblev4, nil
default:
return TableFormatUnspecified, base.CorruptionErrorf(
"pebble/table: unsupported pebble format version %d", errors.Safe(version),
Expand All @@ -77,6 +77,8 @@ func (f TableFormat) AsTuple() (string, uint32) {
return pebbleDBMagic, 2
case TableFormatPebblev3:
return pebbleDBMagic, 3
case TableFormatPebblev4:
return pebbleDBMagic, 4
default:
panic("sstable: unknown table format version tuple")
}
Expand All @@ -95,6 +97,8 @@ func (f TableFormat) String() string {
return "(Pebble,v2)"
case TableFormatPebblev3:
return "(Pebble,v3)"
case TableFormatPebblev4:
return "(Pebble,v4)"
default:
panic("sstable: unknown table format version tuple")
}
Expand Down
10 changes: 8 additions & 2 deletions sstable/format_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,12 @@ func TestTableFormat_RoundTrip(t *testing.T) {
version: 3,
want: TableFormatPebblev3,
},
{
name: "PebbleDBv4",
magic: pebbleDBMagic,
version: 4,
want: TableFormatPebblev4,
},
// Invalid cases.
{
name: "Invalid RocksDB version",
Expand All @@ -59,8 +65,8 @@ func TestTableFormat_RoundTrip(t *testing.T) {
{
name: "Invalid PebbleDB version",
magic: pebbleDBMagic,
version: 4,
wantErr: "pebble/table: unsupported pebble format version 4",
version: 5,
wantErr: "pebble/table: unsupported pebble format version 5",
},
{
name: "Unknown magic string",
Expand Down
6 changes: 3 additions & 3 deletions sstable/reader.go
Original file line number Diff line number Diff line change
Expand Up @@ -435,7 +435,7 @@ func (i *singleLevelIterator) init(
return err
}
i.dataRH = objstorageprovider.UsePreallocatedReadHandle(ctx, r.readable, &i.dataRHPrealloc)
if r.tableFormat == TableFormatPebblev3 {
if r.tableFormat >= TableFormatPebblev3 {
if r.Properties.NumValueBlocks > 0 {
// NB: we cannot avoid this ~248 byte allocation, since valueBlockReader
// can outlive the singleLevelIterator due to being embedded in a
Expand Down Expand Up @@ -1874,7 +1874,7 @@ func (i *twoLevelIterator) init(
return err
}
i.dataRH = r.readable.NewReadHandle(ctx)
if r.tableFormat == TableFormatPebblev3 {
if r.tableFormat >= TableFormatPebblev3 {
if r.Properties.NumValueBlocks > 0 {
i.vbReader = &valueBlockReader{
ctx: ctx,
Expand Down Expand Up @@ -4067,7 +4067,7 @@ func (l *Layout) Describe(
formatIsRestart(iter.data, iter.restarts, iter.numRestarts, iter.offset)
if fmtRecord != nil {
fmt.Fprintf(w, " ")
if l.Format != TableFormatPebblev3 {
if l.Format < TableFormatPebblev3 {
fmtRecord(key, value.InPlaceValue())
} else {
// InPlaceValue() will succeed even for data blocks where the
Expand Down
2 changes: 1 addition & 1 deletion sstable/suffix_rewriter.go
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ func rewriteBlocks(
// in the block, which includes the 1-byte prefix. This is fine since bw
// also does not know about the prefix and will preserve it in bw.add.
v := val.InPlaceValue()
if invariants.Enabled && r.tableFormat == TableFormatPebblev3 &&
if invariants.Enabled && r.tableFormat >= TableFormatPebblev3 &&
key.Kind() == InternalKeyKindSet {
if len(v) < 1 {
return errors.Errorf("value has no prefix")
Expand Down
4 changes: 1 addition & 3 deletions sstable/suffix_rewriter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,7 @@ func TestRewriteSuffixProps(t *testing.T) {
from, to := []byte("_212"), []byte("_646")

format := TableFormatPebblev2
if rand.Intn(2) != 0 {
format = TableFormatPebblev3
}
format += TableFormat(rand.Intn(int(TableFormatPebblev4 - TableFormatPebblev2)))
t.Logf("table format: %s\n", format.String())
wOpts := WriterOptions{
FilterPolicy: bloom.FilterPolicy(10),
Expand Down
2 changes: 1 addition & 1 deletion sstable/table.go
Original file line number Diff line number Diff line change
Expand Up @@ -428,7 +428,7 @@ func supportsTwoLevelIndex(format TableFormat) bool {
switch format {
case TableFormatLevelDB:
return false
case TableFormatRocksDBv2, TableFormatPebblev1, TableFormatPebblev2, TableFormatPebblev3:
case TableFormatRocksDBv2, TableFormatPebblev1, TableFormatPebblev2, TableFormatPebblev3, TableFormatPebblev4:
return true
default:
panic("sstable: unspecified table format version")
Expand Down
4 changes: 2 additions & 2 deletions sstable/testdata/writer_value_blocks
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ layout
1255 meta: offset=1185, length=64
1258 index: offset=264, length=77
1261 [padding]
1295 version: 3
1295 version: 4
1299 magic number: 0xf09faab3f09faab3
1307 EOF

Expand Down Expand Up @@ -337,6 +337,6 @@ layout
856 meta: offset=818, length=32
859 index: offset=71, length=22
861 [padding]
896 version: 3
896 version: 4
900 magic number: 0xf09faab3f09faab3
908 EOF
2 changes: 1 addition & 1 deletion sstable/writer.go
Original file line number Diff line number Diff line change
Expand Up @@ -2110,7 +2110,7 @@ func NewWriter(writable objstorage.Writable, o WriterOptions, extraOpts ...Write
Format: o.Comparer.FormatKey,
},
}
if w.tableFormat == TableFormatPebblev3 {
if w.tableFormat >= TableFormatPebblev3 {
w.shortAttributeExtractor = o.ShortAttributeExtractor
w.requiredInPlaceValueBound = o.RequiredInPlaceValueBound
w.valueBlockWriter = newValueBlockWriter(
Expand Down
2 changes: 1 addition & 1 deletion table_cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -447,7 +447,7 @@ func (c *tableCacheShard) newIters(
return nil, nil, err
}
var rp sstable.ReaderProvider
if tableFormat == sstable.TableFormatPebblev3 && v.reader.Properties.NumValueBlocks > 0 {
if tableFormat >= sstable.TableFormatPebblev3 && v.reader.Properties.NumValueBlocks > 0 {
rp = &tableCacheShardReaderProvider{c: c, file: file, dbOpts: dbOpts}
}

Expand Down
2 changes: 2 additions & 0 deletions testdata/format_major_version_pebblev1_migration
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ tally-table-formats
(Pebble,v1): 1
(Pebble,v2): 2
(Pebble,v3): 0
(Pebble,v4): 0

# Upgrade the DB to FormatMinTableFormatPebblev1.

Expand Down Expand Up @@ -166,3 +167,4 @@ tally-table-formats
(Pebble,v1): 1
(Pebble,v2): 4
(Pebble,v3): 0
(Pebble,v4): 0

0 comments on commit c6c7560

Please sign in to comment.