Skip to content

Commit

Permalink
Add simplified mirror-3dc support in CMS (ydb-platform#11190)
Browse files Browse the repository at this point in the history
  • Loading branch information
pixcc authored Nov 1, 2024
1 parent b3d3a0f commit 509064d
Show file tree
Hide file tree
Showing 7 changed files with 126 additions and 42 deletions.
7 changes: 4 additions & 3 deletions ydb/core/cms/cms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -928,7 +928,7 @@ bool TCms::TryToLockVDisk(const TActionOptions& opts,
return false;
}

auto counters = CreateErasureCounter(ClusterInfo->BSGroup(groupId).Erasure.GetErasure(), vdisk, groupId);
auto counters = CreateErasureCounter(ClusterInfo->BSGroup(groupId).Erasure.GetErasure(), vdisk, groupId, TabletCounters);
counters->CountGroupState(ClusterInfo, State->Config.DefaultRetryTime, duration, error);

switch (opts.AvailabilityMode) {
Expand All @@ -943,10 +943,11 @@ bool TCms::TryToLockVDisk(const TActionOptions& opts,
}
break;
case MODE_FORCE_RESTART:
if ( counters->GroupAlreadyHasLockedDisks() && opts.PartialPermissionAllowed) {
if (counters->GroupAlreadyHasLockedDisks() && !counters->GroupHasMoreThanOneDiskPerNode() && opts.PartialPermissionAllowed) {
TabletCounters->Cumulative()[COUNTER_PARTIAL_PERMISSIONS_OPTIMIZED].Increment(1);
error.Code = TStatus::DISALLOW_TEMP;
error.Reason = "You cannot get two or more disks from the same group at the same time"
" without specifying the PartialPermissionAllowed parameter";
" in partial permissions allowed mode";
error.Deadline = defaultDeadline;
return false;
}
Expand Down
21 changes: 21 additions & 0 deletions ydb/core/cms/cms_maintenance_api_ut.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,27 @@ Y_UNIT_TEST_SUITE(TMaintenanceApiTest) {
UNIT_ASSERT_VALUES_EQUAL(a2.reason(), ActionState::ACTION_REASON_TOO_MANY_UNAVAILABLE_VDISKS);
UNIT_ASSERT(a2.reason_details().Contains("too many unavailable vdisks"));
}

// Creates a maintenance task that locks a single node for ten minutes under
// AVAILABILITY_MODE_WEAK, in an environment configured for mirror-3-dc erasure
// with three data centers, and expects the only action to be performed.
// NOTE(review): TTestEnvOpts(3) presumably requests a three-node cluster — confirm.
Y_UNIT_TEST(SimplifiedMirror3DC) {
    TTestEnvOpts envOpts(3);
    envOpts.UseMirror3dcErasure = true;
    envOpts.DataCenterCount = 3;
    TCmsTestEnv env(envOpts);

    const auto lockDuration = TDuration::Minutes(10);
    const auto targetNodeId = env.GetNodeId(0);

    const auto taskResult = env.CheckMaintenanceTaskCreate(
        "task-1",
        Ydb::StatusIds::SUCCESS,
        Ydb::Maintenance::AVAILABILITY_MODE_WEAK,
        MakeActionGroup(MakeLockAction(targetNodeId, lockDuration))
    );

    // Exactly one action group holding exactly one action is expected back.
    UNIT_ASSERT_VALUES_EQUAL(taskResult.action_group_states().size(), 1);
    const auto &groupState = taskResult.action_group_states(0);
    UNIT_ASSERT_VALUES_EQUAL(groupState.action_states().size(), 1);

    const auto &actionState = groupState.action_states(0);
    UNIT_ASSERT_VALUES_EQUAL(actionState.status(), ActionState::ACTION_STATUS_PERFORMED);
}
}

} // namespace NKikimr::NCmsTest
68 changes: 46 additions & 22 deletions ydb/core/cms/cms_ut_common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -239,20 +239,21 @@ class TFakeTenantPool : public TActorBootstrapped<TFakeTenantPool> {

void GenerateExtendedInfo(TTestActorRuntime &runtime, NKikimrBlobStorage::TBaseConfig *config,
ui32 pdisks, ui32 vdiskPerPdisk = 4, const TNodeTenantsMap &tenants = {}, bool useMirror3dcErasure = false)
{
{
constexpr ui32 MIRROR_3DC_VDISKS_COUNT = 9;
constexpr ui32 BLOCK_4_2_VDISKS_COUNT = 8;

ui32 numNodes = runtime.GetNodeCount();
ui32 numNodeGroups = pdisks * vdiskPerPdisk;
ui32 vdisksPerNode = pdisks * vdiskPerPdisk;
ui32 numGroups;

if (numNodes < 9)
useMirror3dcErasure = false;

if (useMirror3dcErasure)
numGroups = numNodes * numNodeGroups / 9;
else if (numNodes >= 8)
numGroups = numNodes * numNodeGroups / 8;
numGroups = numNodes * vdisksPerNode / MIRROR_3DC_VDISKS_COUNT;
else if (numNodes >= BLOCK_4_2_VDISKS_COUNT)
numGroups = numNodes * vdisksPerNode / BLOCK_4_2_VDISKS_COUNT;
else
numGroups = numNodes * numNodeGroups;
numGroups = numNodes * vdisksPerNode;

ui32 maxOneGroupVdisksPerNode = useMirror3dcErasure && numNodes < MIRROR_3DC_VDISKS_COUNT ? 3 : 1;

auto now = runtime.GetTimeProvider()->Now();
for (ui32 groupId = 0; groupId < numGroups; ++groupId) {
Expand All @@ -261,7 +262,7 @@ void GenerateExtendedInfo(TTestActorRuntime &runtime, NKikimrBlobStorage::TBaseC
group.SetGroupGeneration(1);
if (useMirror3dcErasure)
group.SetErasureSpecies("mirror-3-dc");
else if (numNodes >= 8)
else if (numNodes >= BLOCK_4_2_VDISKS_COUNT)
group.SetErasureSpecies("block-4-2");
else
group.SetErasureSpecies("none");
Expand All @@ -284,12 +285,18 @@ void GenerateExtendedInfo(TTestActorRuntime &runtime, NKikimrBlobStorage::TBaseC
} else {
node.SystemStateInfo.AddRoles("Storage");
}

ui32 groupShift = (nodeIndex / 8) * pdisks * vdiskPerPdisk;
if (numNodes < 8)
groupShift = nodeIndex * numNodeGroups;
if (useMirror3dcErasure)
groupShift = (nodeIndex / 9) * pdisks * vdiskPerPdisk;

ui32 groupsPerNode = vdisksPerNode / maxOneGroupVdisksPerNode;
ui32 groupShift;
if (useMirror3dcErasure) {
ui32 groupNodesSize = MIRROR_3DC_VDISKS_COUNT / maxOneGroupVdisksPerNode;
groupShift = (nodeIndex / groupNodesSize) * groupsPerNode;
} else if (numNodes >= BLOCK_4_2_VDISKS_COUNT) {
ui32 groupNodesSize = BLOCK_4_2_VDISKS_COUNT / maxOneGroupVdisksPerNode;
groupShift = (nodeIndex / groupNodesSize) * groupsPerNode;
} else {
groupShift = nodeIndex * groupsPerNode;
}

for (ui32 pdiskIndex = 0; pdiskIndex < pdisks; ++pdiskIndex) {
auto pdiskId = nodeId * pdisks + pdiskIndex;
Expand All @@ -316,12 +323,28 @@ void GenerateExtendedInfo(TTestActorRuntime &runtime, NKikimrBlobStorage::TBaseC

for (ui8 vdiskIndex = 0; vdiskIndex < vdiskPerPdisk; ++vdiskIndex) {
ui32 vdiskId = pdiskIndex * vdiskPerPdisk + vdiskIndex;
ui32 groupId = groupShift + vdiskId;
ui32 groupId = groupShift + vdiskId / maxOneGroupVdisksPerNode;

if (groupId >= config->GroupSize()) {
break;
}

ui32 failRealm = 0;
if (useMirror3dcErasure)
failRealm = (nodeIndex % 9) / 3;
if (useMirror3dcErasure) {
if (numNodes >= MIRROR_3DC_VDISKS_COUNT) {
failRealm = (nodeIndex % MIRROR_3DC_VDISKS_COUNT) / 3;
} else {
failRealm = nodeIndex % 3;
}
}

TVDiskID id = {(ui8)groupId, 1, (ui8)failRealm, (ui8)(nodeIndex % 8), (ui8)0};
TVDiskID id = {
(ui8)groupId,
1,
(ui8)failRealm,
(ui8)(nodeIndex % BLOCK_4_2_VDISKS_COUNT),
(ui8)(vdiskId % maxOneGroupVdisksPerNode)
};

auto &vdisk = node.VDiskStateInfo[id];
VDiskIDFromVDiskID(id, vdisk.MutableVDiskId());
Expand All @@ -339,7 +362,8 @@ void GenerateExtendedInfo(TTestActorRuntime &runtime, NKikimrBlobStorage::TBaseC
vdiskConfig.SetGroupId(groupId);
vdiskConfig.SetGroupGeneration(1);
vdiskConfig.SetFailRealmIdx(failRealm);
vdiskConfig.SetFailDomainIdx(nodeIndex % 8);
vdiskConfig.SetFailDomainIdx(nodeIndex % BLOCK_4_2_VDISKS_COUNT);
vdiskConfig.SetVDiskIdx(vdiskId % maxOneGroupVdisksPerNode);

config->MutableGroup(groupId)->AddVSlotId()
->CopyFrom(vdiskConfig.GetVSlotId());
Expand Down
12 changes: 11 additions & 1 deletion ydb/core/cms/cms_ut_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -411,14 +411,15 @@ class TCmsTestEnv : public TTestBasicRuntime {
Ydb::Maintenance::MaintenanceTaskResult CheckMaintenanceTaskCreate(
const TString &taskUid,
Ydb::StatusIds::StatusCode code,
Ydb::Maintenance::AvailabilityMode availabilityMode,
const Ts&... actionGroups)
{
auto ev = std::make_unique<NCms::TEvCms::TEvCreateMaintenanceTaskRequest>();
ev->Record.SetUserSID("test-user");

auto *req = ev->Record.MutableRequest();
req->mutable_task_options()->set_task_uid(taskUid);
req->mutable_task_options()->set_availability_mode(Ydb::Maintenance::AVAILABILITY_MODE_STRONG);
req->mutable_task_options()->set_availability_mode(availabilityMode);
AddActionGroups(*req, actionGroups...);

SendToPipe(CmsId, Sender, ev.release(), 0, GetPipeConfigWithRetries());
Expand All @@ -430,6 +431,15 @@ class TCmsTestEnv : public TTestBasicRuntime {
return rec.GetResult();
}

// Convenience overload for callers that do not specify an availability mode:
// forwards to the overload taking an explicit mode, passing
// AVAILABILITY_MODE_STRONG.
template <typename... Ts>
Ydb::Maintenance::MaintenanceTaskResult CheckMaintenanceTaskCreate(
    const TString &taskUid,
    Ydb::StatusIds::StatusCode code,
    const Ts&... actionGroups)
{
    const Ydb::Maintenance::AvailabilityMode defaultMode = Ydb::Maintenance::AVAILABILITY_MODE_STRONG;
    return CheckMaintenanceTaskCreate(taskUid, code, defaultMode, actionGroups...);
}

void EnableBSBaseConfig();
void DisableBSBaseConfig();

Expand Down
38 changes: 28 additions & 10 deletions ydb/core/cms/erasure_checkers.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
#include "erasure_checkers.h"

#include <ydb/core/protos/counters_cms.pb.h>
#include <ydb/core/tablet/tablet_counters.h>

namespace NKikimr::NCms {

bool TErasureCounterBase::IsDown(const TVDiskInfo &vdisk, TClusterInfoPtr info, TDuration &retryTime, TErrorInfo &error) {
Expand Down Expand Up @@ -43,6 +46,10 @@ bool TErasureCounterBase::GroupAlreadyHasLockedDisks() const {
return HasAlreadyLockedDisks;
}

// Returns the HasMoreThanOneDiskPerNode flag. CountGroupState() sets it to
// true when the group has more vdisks than distinct nodes hosting them.
bool TErasureCounterBase::GroupHasMoreThanOneDiskPerNode() const {
    const bool severalDisksOnSomeNode = HasMoreThanOneDiskPerNode;
    return severalDisksOnSomeNode;
}

static TString DumpVDisksInfo(const THashMap<TVDiskID, TString>& vdisks, TClusterInfoPtr info) {
if (vdisks.empty()) {
return "<empty>";
Expand Down Expand Up @@ -121,11 +128,18 @@ bool TErasureCounterBase::CountVDisk(const TVDiskInfo &vdisk, TClusterInfoPtr in
}

void TErasureCounterBase::CountGroupState(TClusterInfoPtr info, TDuration retryTime, TDuration duration, TErrorInfo &error) {
for (const auto &vdId : info->BSGroup(GroupId).VDisks) {
if (vdId != VDisk.VDiskId)
CountVDisk(info->VDisk(vdId), info, retryTime, duration, error);
const auto& group = info->BSGroup(GroupId);

TSet<ui32> groupNodes;
for (const auto &vdId : group.VDisks) {
const auto &vd = info->VDisk(vdId);
if (vd.VDiskId != VDisk.VDiskId)
CountVDisk(vd, info, retryTime, duration, error);
groupNodes.insert(vd.NodeId);
}

HasMoreThanOneDiskPerNode = group.VDisks.size() > groupNodes.size();

if (Locked && error.Code == TStatus::DISALLOW) {
HasAlreadyLockedDisks = true;
}
Expand All @@ -136,10 +150,11 @@ void TErasureCounterBase::CountGroupState(TClusterInfoPtr info, TDuration retryT
bool TDefaultErasureCounter::CheckForKeepAvailability(TClusterInfoPtr info, TErrorInfo &error,
TInstant &defaultDeadline, bool allowPartial) const
{
if (HasAlreadyLockedDisks && allowPartial) {
if (HasAlreadyLockedDisks && !HasMoreThanOneDiskPerNode && allowPartial) {
CmsCounters->Cumulative()[COUNTER_PARTIAL_PERMISSIONS_OPTIMIZED].Increment(1);
error.Code = TStatus::DISALLOW_TEMP;
error.Reason = "You cannot get two or more disks from the same group at the same time"
" without specifying the PartialPermissionAllowed parameter";
" in partial permissions allowed mode";
error.Deadline = defaultDeadline;
return false;
}
Expand Down Expand Up @@ -170,10 +185,11 @@ bool TDefaultErasureCounter::CheckForKeepAvailability(TClusterInfoPtr info, TErr
bool TMirror3dcCounter::CheckForKeepAvailability(TClusterInfoPtr info, TErrorInfo &error,
TInstant &defaultDeadline, bool allowPartial) const
{
if (HasAlreadyLockedDisks && allowPartial) {
if (HasAlreadyLockedDisks && !HasMoreThanOneDiskPerNode && allowPartial) {
CmsCounters->Cumulative()[COUNTER_PARTIAL_PERMISSIONS_OPTIMIZED].Increment(1);
error.Code = TStatus::DISALLOW_TEMP;
error.Reason = "You cannot get two or more disks from the same group at the same time"
" without specifying the PartialPermissionAllowed parameter";
" in partial permissions allowed mode";
error.Deadline = defaultDeadline;
return false;
}
Expand Down Expand Up @@ -237,7 +253,9 @@ void TMirror3dcCounter::CountGroupState(TClusterInfoPtr info, TDuration retryTim
++DataCenterDisabledNodes[VDisk.VDiskId.FailRealm];
}

TSimpleSharedPtr<IErasureCounter> CreateErasureCounter(TErasureType::EErasureSpecies es, const TVDiskInfo &vdisk, ui32 groupId) {
TSimpleSharedPtr<IErasureCounter> CreateErasureCounter(TErasureType::EErasureSpecies es,
const TVDiskInfo &vdisk, ui32 groupId, TTabletCountersBase* cmsCounters)
{
switch (es) {
case TErasureType::ErasureNone:
case TErasureType::ErasureMirror3:
Expand All @@ -257,9 +275,9 @@ TSimpleSharedPtr<IErasureCounter> CreateErasureCounter(TErasureType::EErasureSpe
case TErasureType::Erasure2Plus2Block:
case TErasureType::Erasure2Plus2Stripe:
case TErasureType::ErasureMirror3of4:
return TSimpleSharedPtr<IErasureCounter>(new TDefaultErasureCounter(vdisk, groupId));
return TSimpleSharedPtr<IErasureCounter>(new TDefaultErasureCounter(vdisk, groupId, cmsCounters));
case TErasureType::ErasureMirror3dc:
return TSimpleSharedPtr<IErasureCounter>(new TMirror3dcCounter(vdisk, groupId));
return TSimpleSharedPtr<IErasureCounter>(new TMirror3dcCounter(vdisk, groupId, cmsCounters));
default:
Y_ABORT("Unknown erasure type: %d", es);
}
Expand Down
20 changes: 14 additions & 6 deletions ydb/core/cms/erasure_checkers.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ class IErasureCounter {
virtual ~IErasureCounter() = default;

virtual bool GroupAlreadyHasLockedDisks() const = 0;
virtual bool GroupHasMoreThanOneDiskPerNode() const = 0;
virtual bool CheckForMaxAvailability(TClusterInfoPtr info, TErrorInfo& error, TInstant& defaultDeadline, bool allowPartial) const = 0;
virtual bool CheckForKeepAvailability(TClusterInfoPtr info, TErrorInfo& error, TInstant& defaultDeadline, bool allowPartial) const = 0;
virtual void CountGroupState(TClusterInfoPtr info, TDuration retryTime, TDuration duration, TErrorInfo& error) = 0;
Expand All @@ -33,29 +34,35 @@ class TErasureCounterBase: public IErasureCounter {
const TVDiskInfo& VDisk;
const ui32 GroupId;
bool HasAlreadyLockedDisks;
bool HasMoreThanOneDiskPerNode;

TTabletCountersBase* CmsCounters;

protected:
bool IsDown(const TVDiskInfo& vdisk, TClusterInfoPtr info, TDuration& retryTime, TErrorInfo& error);
bool IsLocked(const TVDiskInfo& vdisk, TClusterInfoPtr info, TDuration& retryTime, TDuration& duration, TErrorInfo& error);
bool CountVDisk(const TVDiskInfo& vdisk, TClusterInfoPtr info, TDuration retryTime, TDuration duration, TErrorInfo& error) override;

public:
TErasureCounterBase(const TVDiskInfo& vdisk, ui32 groupId)
TErasureCounterBase(const TVDiskInfo& vdisk, ui32 groupId, TTabletCountersBase* cmsCounters)
: VDisk(vdisk)
, GroupId(groupId)
, HasAlreadyLockedDisks(false)
, HasMoreThanOneDiskPerNode(false)
, CmsCounters(cmsCounters)
{
}

bool GroupAlreadyHasLockedDisks() const final;
bool GroupHasMoreThanOneDiskPerNode() const final;
bool CheckForMaxAvailability(TClusterInfoPtr info, TErrorInfo& error, TInstant& defaultDeadline, bool allowPartial) const final;
void CountGroupState(TClusterInfoPtr info, TDuration retryTime, TDuration duration, TErrorInfo &error) override;
};

class TDefaultErasureCounter: public TErasureCounterBase {
public:
TDefaultErasureCounter(const TVDiskInfo& vdisk, ui32 groupId)
: TErasureCounterBase(vdisk, groupId)
TDefaultErasureCounter(const TVDiskInfo& vdisk, ui32 groupId, TTabletCountersBase* cmsCounters)
: TErasureCounterBase(vdisk, groupId, cmsCounters)
{
}

Expand All @@ -69,15 +76,16 @@ class TMirror3dcCounter: public TErasureCounterBase {
bool CountVDisk(const TVDiskInfo& vdisk, TClusterInfoPtr info, TDuration retryTime, TDuration duration, TErrorInfo& error) override;

public:
TMirror3dcCounter(const TVDiskInfo& vdisk, ui32 groupId)
: TErasureCounterBase(vdisk, groupId)
TMirror3dcCounter(const TVDiskInfo& vdisk, ui32 groupId, TTabletCountersBase* cmsCounters)
: TErasureCounterBase(vdisk, groupId, cmsCounters)
{
}

bool CheckForKeepAvailability(TClusterInfoPtr info, TErrorInfo& error, TInstant& defaultDeadline, bool allowPartial) const override;
void CountGroupState(TClusterInfoPtr info, TDuration retryTime, TDuration duration, TErrorInfo &error) override;
};

TSimpleSharedPtr<IErasureCounter> CreateErasureCounter(TErasureType::EErasureSpecies es, const TVDiskInfo& vdisk, ui32 groupId);
TSimpleSharedPtr<IErasureCounter> CreateErasureCounter(TErasureType::EErasureSpecies es,
const TVDiskInfo &vdisk, ui32 groupId, TTabletCountersBase* cmsCounters);

} // namespace NKikimr::NCms
2 changes: 2 additions & 0 deletions ydb/core/protos/counters_cms.proto
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ enum ESimpleCounters {

// Cumulative counters of the CMS tablet.
enum ECumulativeCounters {
// Placeholder entry occupying value 0.
COUNTER_CUMULATIVE_IGNORE = 0;

// Incremented each time a disk lock is refused with DISALLOW_TEMP because the
// group already has locked disks, partial permissions are allowed, and the
// group does not have more than one disk per node.
COUNTER_PARTIAL_PERMISSIONS_OPTIMIZED = 1 [(CounterOpts) = {Name: "PartialPermissionsOptimized"}];
}

enum EPercentileCounters {
Expand Down

0 comments on commit 509064d

Please sign in to comment.