130867: server/license: use system.migrations to check if cluster is new or old r=rafiss a=spilchen

The grace period when no license is installed varies based on whether the cluster is new or existing: older clusters receive a 30-day grace period, while new clusters get only 7 days. This logic was introduced in a previous commit, but it relied on state being passed down to the license enforcer that was really only intended for debugging purposes.
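
As a rough sketch of that policy (illustrative only; the helper name and layout are made up for this example, while the committed selection lives in the elided portion of maybeWriteClusterInitGracePeriodTS in pkg/server/license/enforcer.go):

package licenseexample

import "time"

// gracePeriodLength returns how long a cluster may run without a license
// before throttling kicks in: 7 days for a newly created cluster, 30 days
// for an existing one.
func gracePeriodLength(isNewCluster bool) time.Duration {
	if isNewCluster {
		return 7 * 24 * time.Hour
	}
	return 30 * 24 * time.Hour
}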

This update changes that approach by letting the enforcer determine on its own whether the cluster is new or old, using the minimum completed_at timestamp from system.migrations. If the enforcer's start time falls within a 2-hour window centered on that earliest timestamp, the cluster is considered new.
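
A minimal, self-contained sketch of that window check (the function and variable names here are invented for illustration; the real implementation is getIsNewClusterEstimate in pkg/server/license/enforcer.go, shown in the diff below):

package main

import (
	"fmt"
	"time"
)

// isNewClusterEstimate treats the cluster as new when the enforcer's start
// time falls within ±1 hour of the earliest completed_at timestamp found in
// system.migrations, i.e. a 2-hour window centered on cluster init.
func isNewClusterEstimate(enforcerStart, minMigrationTime time.Time) bool {
	return enforcerStart.After(minMigrationTime.Add(-1*time.Hour)) &&
		enforcerStart.Before(minMigrationTime.Add(1*time.Hour))
}

func main() {
	clusterInit := time.Date(2024, 9, 19, 12, 0, 0, 0, time.UTC)
	fmt.Println(isNewClusterEstimate(clusterInit.Add(5*time.Minute), clusterInit)) // true: new cluster, 7-day grace period
	fmt.Println(isNewClusterEstimate(clusterInit.Add(48*time.Hour), clusterInit))  // false: existing cluster, 30-day grace period
}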

This change will be backported to 23.1, 23.2, 24.1, and 24.2.

Epic: CRDB-39988
Informs: CRDB-39991
Release note: None

Co-authored-by: Matt Spilchen <[email protected]>
craig[bot] and spilchen committed Sep 19, 2024
2 parents fbed78a + 272869f commit a835697
Showing 9 changed files with 127 additions and 33 deletions.
2 changes: 1 addition & 1 deletion pkg/config/system_test.go
@@ -566,7 +566,7 @@ func TestGetZoneConfigForKey(t *testing.T) {
{roachpb.RKey(keys.TimeseriesPrefix.PrefixEnd()), keys.SystemRangesID},
{roachpb.RKey(keys.TableDataMin), keys.SystemDatabaseID},
{roachpb.RKey(keys.SystemConfigSplitKey), keys.SystemDatabaseID},
{roachpb.RKey(keys.GracePeriodInitTimestamp), keys.SystemRangesID},
{roachpb.RKey(keys.ClusterInitGracePeriodTimestamp), keys.SystemRangesID},

{tkey(keys.ZonesTableID), keys.ZonesTableID},
{roachpb.RKey(keys.SystemZonesTableSpan.Key), keys.ZonesTableID},
4 changes: 2 additions & 2 deletions pkg/keys/constants.go
@@ -301,10 +301,10 @@ var (
// BootstrapVersionKey is the key at which clusters bootstrapped with a version
// > 1.0 persist the version at which they were bootstrapped.
BootstrapVersionKey = roachpb.Key(makeKey(SystemPrefix, roachpb.RKey("bootstrap-version")))
// GracePeriodInitTimestamp is used for license enforcement. It marks the timestamp
// ClusterInitGracePeriodTimestamp is used for license enforcement. It marks the timestamp
// set during cluster initialization, by which a license must be installed to avoid
// throttling. The value is stored as the number of seconds since the Unix epoch.
GracePeriodInitTimestamp = roachpb.Key(makeKey(SystemPrefix, roachpb.RKey("lic-gpi-ts")))
ClusterInitGracePeriodTimestamp = roachpb.Key(makeKey(SystemPrefix, roachpb.RKey("lic-gpi-ts")))
//
// NodeIDGenerator is the global node ID generator sequence.
NodeIDGenerator = roachpb.Key(makeKey(SystemPrefix, roachpb.RKey("node-idgen")))
18 changes: 9 additions & 9 deletions pkg/keys/doc.go
@@ -246,15 +246,15 @@ var _ = [...]interface{}{
// 2. System keys: This is where we store global, system data which is
// replicated across the cluster.
SystemPrefix,
NodeLivenessPrefix, // "\x00liveness-"
BootstrapVersionKey, // "bootstrap-version"
GracePeriodInitTimestamp, // "lic-gpi-ts"
NodeIDGenerator, // "node-idgen"
RangeIDGenerator, // "range-idgen"
StatusPrefix, // "status-"
StatusNodePrefix, // "status-node-"
StoreIDGenerator, // "store-idgen"
StartupMigrationPrefix, // "system-version/"
NodeLivenessPrefix, // "\x00liveness-"
BootstrapVersionKey, // "bootstrap-version"
ClusterInitGracePeriodTimestamp, // "lic-gpi-ts"
NodeIDGenerator, // "node-idgen"
RangeIDGenerator, // "range-idgen"
StatusPrefix, // "status-"
StatusNodePrefix, // "status-node-"
StoreIDGenerator, // "store-idgen"
StartupMigrationPrefix, // "system-version/"
// StartupMigrationLease, // "system-version/lease" - removed in 23.1
TimeseriesPrefix, // "tsd"
SystemSpanConfigPrefix, // "xffsys-scfg"
1 change: 1 addition & 0 deletions pkg/server/license/BUILD.bazel
@@ -14,6 +14,7 @@ go_library(
"//pkg/sql/isql",
"//pkg/sql/pgwire/pgcode",
"//pkg/sql/pgwire/pgerror",
"//pkg/sql/sem/tree",
"//pkg/util/envutil",
"//pkg/util/log",
"//pkg/util/syncutil",
61 changes: 51 additions & 10 deletions pkg/server/license/enforcer.go
@@ -22,6 +22,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/sql/isql"
"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode"
"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror"
"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
"github.com/cockroachdb/cockroach/pkg/util/envutil"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
@@ -110,6 +111,11 @@ type TestingKnobs struct {
// OverrideMaxOpenTransactions if set, overrides the maximum open transactions
// when checking if active throttling.
OverrideMaxOpenTransactions *int64

// OverwriteClusterInitGracePeriodTS, if true, forces the enforcer to
// overwrite the existing cluster initialization grace period timestamp,
// even if one is already set.
OverwriteClusterInitGracePeriodTS bool
}

// TelemetryStatusReporter is the interface we use to find the last ping
@@ -163,9 +169,7 @@ func (e *Enforcer) GetTestingKnobs() *TestingKnobs {
// Start will load the necessary metadata for the enforcer. It reads from the
// KV license metadata and will populate any missing data as needed. The DB
// passed in must have access to the system tenant.
func (e *Enforcer) Start(
ctx context.Context, st *cluster.Settings, db isql.DB, initialStart bool,
) error {
func (e *Enforcer) Start(ctx context.Context, st *cluster.Settings, db isql.DB) error {
// We always start disabled. If an error occurs, the enforcer setup will be
// incomplete, but the server will continue to start. To ensure stability in
// that case, we leave throttling disabled.
@@ -175,7 +179,7 @@ func (e *Enforcer) Start(
e.maybeLogActiveOverrides(ctx)

if !startDisabled {
if err := e.maybeWriteClusterInitGracePeriodTS(ctx, db, initialStart); err != nil {
if err := e.maybeWriteClusterInitGracePeriodTS(ctx, db); err != nil {
return err
}
}
@@ -199,20 +203,24 @@ func (e *Enforcer) Start(

// maybeWriteClusterInitGracePeriodTS checks if the cluster init grace period
// timestamp needs to be written to the KV layer and writes it if needed.
func (e *Enforcer) maybeWriteClusterInitGracePeriodTS(
ctx context.Context, db isql.DB, initialStart bool,
) error {
func (e *Enforcer) maybeWriteClusterInitGracePeriodTS(ctx context.Context, db isql.DB) error {
return db.Txn(ctx, func(ctx context.Context, txn isql.Txn) error {
// We could use a conditional put for this logic. However, we want to read
// and cache the value, and the common case is that the value will be read.
// Only during the initialization of the first node in the cluster will we
// need to write a new timestamp. So, we optimize for the case where the
// timestamp already exists.
val, err := txn.KV().Get(ctx, keys.GracePeriodInitTimestamp)
val, err := txn.KV().Get(ctx, keys.ClusterInitGracePeriodTimestamp)
if err != nil {
return err
}
if val.Value == nil {
tk := e.GetTestingKnobs()
if val.Value == nil || (tk != nil && tk.OverwriteClusterInitGracePeriodTS) {
initialStart, err := e.getIsNewClusterEstimate(ctx, txn)
if err != nil {
return err
}

// The length of the grace period without a license varies based on the
// cluster's creation time. Older databases built when we had a
// CockroachDB core license are given more time.
@@ -224,7 +232,7 @@ func (e *Enforcer) maybeWriteClusterInitGracePeriodTS(
end := e.getStartTime().Add(gracePeriodLength)
log.Infof(ctx, "generated new cluster init grace period end time: %s", end.UTC().String())
e.clusterInitGracePeriodEndTS.Store(end.Unix())
return txn.KV().Put(ctx, keys.GracePeriodInitTimestamp, e.clusterInitGracePeriodEndTS.Load())
return txn.KV().Put(ctx, keys.ClusterInitGracePeriodTimestamp, e.clusterInitGracePeriodEndTS.Load())
}
e.clusterInitGracePeriodEndTS.Store(val.ValueInt())
log.Infof(ctx, "fetched existing cluster init grace period end time: %s", e.GetClusterInitGracePeriodEndTS().String())
@@ -487,3 +495,36 @@ func (e *Enforcer) getInitialIsDisabledValue() bool {
}
return !tk.Enable
}

// getIsNewClusterEstimate is a helper to determine if the cluster is a newly
// created one. This is used in Start processing to help determine the length
// of the grace period when no license is installed.
func (e *Enforcer) getIsNewClusterEstimate(ctx context.Context, txn isql.Txn) (bool, error) {
data, err := txn.QueryRow(ctx, "check if enforcer start is near cluster init time", txn.KV(),
`SELECT min(completed_at) FROM system.migrations`)
if err != nil {
return false, err
}
if len(data) == 0 {
return false, errors.New("no rows found in system.migrations")
}
var ts time.Time
switch t := data[0].(type) {
case *tree.DTimestampTZ:
ts = t.Time
default:
return false, errors.Newf("unexpected data type: %v", t)
}

// We are going to lean on system.migrations to determine if the cluster is
// new or not. If the cluster is new, the minimum value for the completed_at
// column should roughly match the start time of this enforcer. We will query
// that value and see if it's within 2 hours of it. If it's within that range
// we treat it as if it's a new cluster.
st := e.getStartTime()
if st.After(ts.Add(-1*time.Hour)) && st.Before(ts.Add(1*time.Hour)) {
return true, nil
}
log.Infof(ctx, "cluster init is not within the bounds of the enforcer start time: %v", ts)
return false, nil
}
63 changes: 60 additions & 3 deletions pkg/server/license/enforcer_test.go
@@ -40,14 +40,14 @@ func (m mockTelemetryStatusReporter) GetLastSuccessfulTelemetryPing() time.Time
return m.lastPingTime
}

func TestGracePeriodInitTSCache(t *testing.T) {
func TestClusterInitGracePeriod_NoOverwrite(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)

// This is the timestamp that we'll override the grace period init timestamp with.
// This will be set when bringing up the server.
ts1 := timeutil.Unix(1724329716, 0)
ts1End := ts1.Add(7 * 24 * time.Hour) // Calculate the end of the grace period based on ts1
ts1End := ts1.Add(30 * 24 * time.Hour) // Calculate the end of the grace period based on ts1

ctx := context.Background()
srv := serverutils.StartServerOnly(t, base.TestServerArgs{
@@ -76,7 +76,7 @@ func TestGracePeriodInitTSCache(t *testing.T) {
require.Equal(t, ts2End, enforcer.GetClusterInitGracePeriodEndTS())
// Start the enforcer to read the timestamp from the KV.
enforcer.SetTelemetryStatusReporter(&mockTelemetryStatusReporter{lastPingTime: ts1})
err := enforcer.Start(ctx, srv.ClusterSettings(), srv.SystemLayer().InternalDB().(descs.DB), false /* initialStart */)
err := enforcer.Start(ctx, srv.ClusterSettings(), srv.SystemLayer().InternalDB().(descs.DB))
require.NoError(t, err)
require.Equal(t, ts1End, enforcer.GetClusterInitGracePeriodEndTS())

Expand All @@ -86,6 +86,63 @@ func TestGracePeriodInitTSCache(t *testing.T) {
require.Equal(t, ts1End, srv.ApplicationLayer().ExecutorConfig().(sql.ExecutorConfig).LicenseEnforcer.GetClusterInitGracePeriodEndTS())
}

func TestClusterInitGracePeriod_NewClusterEstimation(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)

// This will be the start time of the enforcer for each unit test
ts1 := timeutil.Unix(1631494860, 0)

ctx := context.Background()
srv := serverutils.StartServerOnly(t, base.TestServerArgs{
Knobs: base.TestingKnobs{
Server: &server.TestingKnobs{
LicenseTestingKnobs: license.TestingKnobs{
Enable: true,
OverrideStartTime: &ts1,
},
},
},
})
defer srv.Stopper().Stop(ctx)

for _, tc := range []struct {
desc string
minSysMigrationTime time.Time
expGracePeriodEndTS time.Time
}{
{"init-2020", timeutil.Unix(1577836800, 0), ts1.Add(30 * 24 * time.Hour)},
{"init-5min-ago", ts1.Add(-5 * time.Minute), ts1.Add(7 * 24 * time.Hour)},
{"init-59min-ago", ts1.Add(-59 * time.Minute), ts1.Add(7 * 24 * time.Hour)},
{"init-1h1min-ago", ts1.Add(-61 * time.Minute), ts1.Add(30 * 24 * time.Hour)},
} {
t.Run(tc.desc, func(t *testing.T) {
enforcer := &license.Enforcer{}
enforcer.SetTestingKnobs(&license.TestingKnobs{
Enable: true,
OverrideStartTime: &ts1,
OverwriteClusterInitGracePeriodTS: true,
})

// Set up the min time in system.migrations. This is used by the enforcer
// to figure out if the cluster is new or old. The grace period length is
// adjusted based on this.
db := srv.SystemLayer().InternalDB().(descs.DB)
err := db.Txn(ctx, func(ctx context.Context, txn isql.Txn) error {
_, err := txn.Exec(ctx, "add new min to system.migrations", txn.KV(),
"UPDATE system.migrations SET completed_at=$1 LIMIT 1",
tc.minSysMigrationTime)
return err
})
require.NoError(t, err)

err = enforcer.Start(ctx, srv.ClusterSettings(), srv.SystemLayer().InternalDB().(descs.DB))
require.NoError(t, err)
require.Equal(t, tc.expGracePeriodEndTS, enforcer.GetClusterInitGracePeriodEndTS())
})
}
}

func TestThrottle(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)
1 change: 0 additions & 1 deletion pkg/server/server.go
@@ -2096,7 +2096,6 @@ func (s *topLevelServer) PreStart(ctx context.Context) error {
s.stopper,
s.cfg.TestingKnobs,
orphanedLeasesTimeThresholdNanos,
s.InitialStart(),
); err != nil {
return err
}
9 changes: 3 additions & 6 deletions pkg/server/server_sql.go
@@ -1445,7 +1445,6 @@ func (s *SQLServer) preStart(
stopper *stop.Stopper,
knobs base.TestingKnobs,
orphanedLeasesTimeThresholdNanos int64,
initialStart bool,
) error {
// If necessary, start the tenant proxy first, to ensure all other
// components can properly route to KV nodes. The Start method will block
@@ -1763,7 +1762,7 @@ func (s *SQLServer) preStart(
)
s.execCfg.SyntheticPrivilegeCache.Start(ctx)

s.startLicenseEnforcer(ctx, knobs, initialStart)
s.startLicenseEnforcer(ctx, knobs)

// Report a warning if the server is being shut down via the stopper
// before it was gracefully drained. This warning may be innocuous
@@ -1914,9 +1913,7 @@ func (s *SQLServer) StartDiagnostics(ctx context.Context) {
s.diagnosticsReporter.PeriodicallyReportDiagnostics(ctx, s.stopper)
}

func (s *SQLServer) startLicenseEnforcer(
ctx context.Context, knobs base.TestingKnobs, initialStart bool,
) {
func (s *SQLServer) startLicenseEnforcer(ctx context.Context, knobs base.TestingKnobs) {
// Start the license enforcer. This is only started for the system tenant since
// it requires access to the system keyspace. For secondary tenants, this struct
// is shared to provide access to the values cached from the KV read.
@@ -1930,7 +1927,7 @@ func (s *SQLServer) startLicenseEnforcer(
// diagnostics reporter. This will be handled in CRDB-39991
err := startup.RunIdempotentWithRetry(ctx, s.stopper.ShouldQuiesce(), "license enforcer start",
func(ctx context.Context) error {
return licenseEnforcer.Start(ctx, s.cfg.Settings, s.internalDB, initialStart)
return licenseEnforcer.Start(ctx, s.cfg.Settings, s.internalDB)
})
// This is not a critical component. If it fails to start, we log a warning
// rather than prevent the entire server from starting.
1 change: 0 additions & 1 deletion pkg/server/tenant.go
@@ -844,7 +844,6 @@ func (s *SQLServerWrapper) PreStart(ctx context.Context) error {
s.stopper,
s.sqlServer.cfg.TestingKnobs,
orphanedLeasesTimeThresholdNanos,
false, /* initialStart */
); err != nil {
return err
}
