From 509064d0f221e21c325670da1aa800f3bb37ce67 Mon Sep 17 00:00:00 2001 From: Ilia Shakhov Date: Fri, 1 Nov 2024 16:12:55 +0300 Subject: [PATCH] Add simplified mirror-3dc support in CMS (#11190) --- ydb/core/cms/cms.cpp | 7 +-- ydb/core/cms/cms_maintenance_api_ut.cpp | 21 ++++++++ ydb/core/cms/cms_ut_common.cpp | 68 +++++++++++++++++-------- ydb/core/cms/cms_ut_common.h | 12 ++++- ydb/core/cms/erasure_checkers.cpp | 38 ++++++++++---- ydb/core/cms/erasure_checkers.h | 20 +++++--- ydb/core/protos/counters_cms.proto | 2 + 7 files changed, 126 insertions(+), 42 deletions(-) diff --git a/ydb/core/cms/cms.cpp b/ydb/core/cms/cms.cpp index aec03a08dea7..f223217fecc9 100644 --- a/ydb/core/cms/cms.cpp +++ b/ydb/core/cms/cms.cpp @@ -928,7 +928,7 @@ bool TCms::TryToLockVDisk(const TActionOptions& opts, return false; } - auto counters = CreateErasureCounter(ClusterInfo->BSGroup(groupId).Erasure.GetErasure(), vdisk, groupId); + auto counters = CreateErasureCounter(ClusterInfo->BSGroup(groupId).Erasure.GetErasure(), vdisk, groupId, TabletCounters); counters->CountGroupState(ClusterInfo, State->Config.DefaultRetryTime, duration, error); switch (opts.AvailabilityMode) { @@ -943,10 +943,11 @@ bool TCms::TryToLockVDisk(const TActionOptions& opts, } break; case MODE_FORCE_RESTART: - if ( counters->GroupAlreadyHasLockedDisks() && opts.PartialPermissionAllowed) { + if (counters->GroupAlreadyHasLockedDisks() && !counters->GroupHasMoreThanOneDiskPerNode() && opts.PartialPermissionAllowed) { + TabletCounters->Cumulative()[COUNTER_PARTIAL_PERMISSIONS_OPTIMIZED].Increment(1); error.Code = TStatus::DISALLOW_TEMP; error.Reason = "You cannot get two or more disks from the same group at the same time" - " without specifying the PartialPermissionAllowed parameter"; + " in partial permissions allowed mode"; error.Deadline = defaultDeadline; return false; } diff --git a/ydb/core/cms/cms_maintenance_api_ut.cpp b/ydb/core/cms/cms_maintenance_api_ut.cpp index 151461525408..52b8b37f81a6 100644 --- a/ydb/core/cms/cms_maintenance_api_ut.cpp +++ b/ydb/core/cms/cms_maintenance_api_ut.cpp @@ -95,6 +95,27 @@ Y_UNIT_TEST_SUITE(TMaintenanceApiTest) { UNIT_ASSERT_VALUES_EQUAL(a2.reason(), ActionState::ACTION_REASON_TOO_MANY_UNAVAILABLE_VDISKS); UNIT_ASSERT(a2.reason_details().Contains("too many unavailable vdisks")); } + + Y_UNIT_TEST(SimplifiedMirror3DC) { + TTestEnvOpts options(3); + options.UseMirror3dcErasure = true; + options.DataCenterCount = 3; + TCmsTestEnv env(options); + + auto response = env.CheckMaintenanceTaskCreate( + "task-1", + Ydb::StatusIds::SUCCESS, + Ydb::Maintenance::AVAILABILITY_MODE_WEAK, + MakeActionGroup( + MakeLockAction(env.GetNodeId(0), TDuration::Minutes(10)) + ) + ); + + UNIT_ASSERT_VALUES_EQUAL(response.action_group_states().size(), 1); + UNIT_ASSERT_VALUES_EQUAL(response.action_group_states(0).action_states().size(), 1); + const auto &a = response.action_group_states(0).action_states(0); + UNIT_ASSERT_VALUES_EQUAL(a.status(), ActionState::ACTION_STATUS_PERFORMED); + } } } // namespace NKikimr::NCmsTest diff --git a/ydb/core/cms/cms_ut_common.cpp b/ydb/core/cms/cms_ut_common.cpp index 4a0f6715e431..c3f0c16eb3d9 100644 --- a/ydb/core/cms/cms_ut_common.cpp +++ b/ydb/core/cms/cms_ut_common.cpp @@ -239,20 +239,21 @@ class TFakeTenantPool : public TActorBootstrapped { void GenerateExtendedInfo(TTestActorRuntime &runtime, NKikimrBlobStorage::TBaseConfig *config, ui32 pdisks, ui32 vdiskPerPdisk = 4, const TNodeTenantsMap &tenants = {}, bool useMirror3dcErasure = false) -{ +{ + constexpr ui32 MIRROR_3DC_VDISKS_COUNT = 9; + constexpr ui32 BLOCK_4_2_VDISKS_COUNT = 8; + ui32 numNodes = runtime.GetNodeCount(); - ui32 numNodeGroups = pdisks * vdiskPerPdisk; + ui32 vdisksPerNode = pdisks * vdiskPerPdisk; ui32 numGroups; - - if (numNodes < 9) - useMirror3dcErasure = false; - if (useMirror3dcErasure) - numGroups = numNodes * numNodeGroups / 9; - else if (numNodes >= 8) - numGroups = numNodes * numNodeGroups / 8; + numGroups = numNodes * vdisksPerNode / MIRROR_3DC_VDISKS_COUNT; + else if (numNodes >= BLOCK_4_2_VDISKS_COUNT) + numGroups = numNodes * vdisksPerNode / BLOCK_4_2_VDISKS_COUNT; else - numGroups = numNodes * numNodeGroups; + numGroups = numNodes * vdisksPerNode; + + ui32 maxOneGroupVdisksPerNode = useMirror3dcErasure && numNodes < MIRROR_3DC_VDISKS_COUNT ? 3 : 1; auto now = runtime.GetTimeProvider()->Now(); for (ui32 groupId = 0; groupId < numGroups; ++groupId) { @@ -261,7 +262,7 @@ void GenerateExtendedInfo(TTestActorRuntime &runtime, NKikimrBlobStorage::TBaseC group.SetGroupGeneration(1); if (useMirror3dcErasure) group.SetErasureSpecies("mirror-3-dc"); - else if (numNodes >= 8) + else if (numNodes >= BLOCK_4_2_VDISKS_COUNT) group.SetErasureSpecies("block-4-2"); else group.SetErasureSpecies("none"); @@ -284,12 +285,18 @@ void GenerateExtendedInfo(TTestActorRuntime &runtime, NKikimrBlobStorage::TBaseC } else { node.SystemStateInfo.AddRoles("Storage"); } - - ui32 groupShift = (nodeIndex / 8) * pdisks * vdiskPerPdisk; - if (numNodes < 8) - groupShift = nodeIndex * numNodeGroups; - if (useMirror3dcErasure) - groupShift = (nodeIndex / 9) * pdisks * vdiskPerPdisk; + + ui32 groupsPerNode = vdisksPerNode / maxOneGroupVdisksPerNode; + ui32 groupShift; + if (useMirror3dcErasure) { + ui32 groupNodesSize = MIRROR_3DC_VDISKS_COUNT / maxOneGroupVdisksPerNode; + groupShift = (nodeIndex / groupNodesSize) * groupsPerNode; + } else if (numNodes >= BLOCK_4_2_VDISKS_COUNT) { + ui32 groupNodesSize = BLOCK_4_2_VDISKS_COUNT / maxOneGroupVdisksPerNode; + groupShift = (nodeIndex / groupNodesSize) * groupsPerNode; + } else { + groupShift = nodeIndex * groupsPerNode; + } for (ui32 pdiskIndex = 0; pdiskIndex < pdisks; ++pdiskIndex) { auto pdiskId = nodeId * pdisks + pdiskIndex; @@ -316,12 +323,28 @@ void GenerateExtendedInfo(TTestActorRuntime &runtime, NKikimrBlobStorage::TBaseC for (ui8 vdiskIndex = 0; vdiskIndex < vdiskPerPdisk; ++vdiskIndex) { ui32 vdiskId = pdiskIndex * vdiskPerPdisk + vdiskIndex; - ui32 groupId = groupShift + vdiskId; + ui32 groupId = groupShift + vdiskId / maxOneGroupVdisksPerNode; + + if (groupId >= config->GroupSize()) { + break; + } + ui32 failRealm = 0; - if (useMirror3dcErasure) - failRealm = (nodeIndex % 9) / 3; + if (useMirror3dcErasure) { + if (numNodes >= MIRROR_3DC_VDISKS_COUNT) { + failRealm = (nodeIndex % MIRROR_3DC_VDISKS_COUNT) / 3; + } else { + failRealm = nodeIndex % 3; + } + } - TVDiskID id = {(ui8)groupId, 1, (ui8)failRealm, (ui8)(nodeIndex % 8), (ui8)0}; + TVDiskID id = { + (ui8)groupId, + 1, + (ui8)failRealm, + (ui8)(nodeIndex % BLOCK_4_2_VDISKS_COUNT), + (ui8)(vdiskId % maxOneGroupVdisksPerNode) + }; auto &vdisk = node.VDiskStateInfo[id]; VDiskIDFromVDiskID(id, vdisk.MutableVDiskId()); @@ -339,7 +362,8 @@ void GenerateExtendedInfo(TTestActorRuntime &runtime, NKikimrBlobStorage::TBaseC vdiskConfig.SetGroupId(groupId); vdiskConfig.SetGroupGeneration(1); vdiskConfig.SetFailRealmIdx(failRealm); - vdiskConfig.SetFailDomainIdx(nodeIndex % 8); + vdiskConfig.SetFailDomainIdx(nodeIndex % BLOCK_4_2_VDISKS_COUNT); + vdiskConfig.SetVDiskIdx(vdiskId % maxOneGroupVdisksPerNode); config->MutableGroup(groupId)->AddVSlotId() ->CopyFrom(vdiskConfig.GetVSlotId()); diff --git a/ydb/core/cms/cms_ut_common.h b/ydb/core/cms/cms_ut_common.h index c719133702f5..9e502d59e21b 100644 --- a/ydb/core/cms/cms_ut_common.h +++ b/ydb/core/cms/cms_ut_common.h @@ -411,6 +411,7 @@ class TCmsTestEnv : public TTestBasicRuntime { Ydb::Maintenance::MaintenanceTaskResult CheckMaintenanceTaskCreate( const TString &taskUid, Ydb::StatusIds::StatusCode code, + Ydb::Maintenance::AvailabilityMode availabilityMode, const Ts&... actionGroups) { auto ev = std::make_unique(); @@ -418,7 +419,7 @@ class TCmsTestEnv : public TTestBasicRuntime { auto *req = ev->Record.MutableRequest(); req->mutable_task_options()->set_task_uid(taskUid); - req->mutable_task_options()->set_availability_mode(Ydb::Maintenance::AVAILABILITY_MODE_STRONG); + req->mutable_task_options()->set_availability_mode(availabilityMode); AddActionGroups(*req, actionGroups...); SendToPipe(CmsId, Sender, ev.release(), 0, GetPipeConfigWithRetries()); @@ -430,6 +431,15 @@ class TCmsTestEnv : public TTestBasicRuntime { return rec.GetResult(); } + template + Ydb::Maintenance::MaintenanceTaskResult CheckMaintenanceTaskCreate( + const TString &taskUid, + Ydb::StatusIds::StatusCode code, + const Ts&... actionGroups) + { + return CheckMaintenanceTaskCreate(taskUid, code, Ydb::Maintenance::AVAILABILITY_MODE_STRONG, actionGroups...); + } + void EnableBSBaseConfig(); void DisableBSBaseConfig(); diff --git a/ydb/core/cms/erasure_checkers.cpp b/ydb/core/cms/erasure_checkers.cpp index 1d753ba32cf0..6b406a7bfa46 100644 --- a/ydb/core/cms/erasure_checkers.cpp +++ b/ydb/core/cms/erasure_checkers.cpp @@ -1,5 +1,8 @@ #include "erasure_checkers.h" +#include +#include + namespace NKikimr::NCms { bool TErasureCounterBase::IsDown(const TVDiskInfo &vdisk, TClusterInfoPtr info, TDuration &retryTime, TErrorInfo &error) { @@ -43,6 +46,10 @@ bool TErasureCounterBase::GroupAlreadyHasLockedDisks() const { return HasAlreadyLockedDisks; } +bool TErasureCounterBase::GroupHasMoreThanOneDiskPerNode() const { + return HasMoreThanOneDiskPerNode; +} + static TString DumpVDisksInfo(const THashMap& vdisks, TClusterInfoPtr info) { if (vdisks.empty()) { return ""; @@ -121,11 +128,18 @@ bool TErasureCounterBase::CountVDisk(const TVDiskInfo &vdisk, TClusterInfoPtr in } void TErasureCounterBase::CountGroupState(TClusterInfoPtr info, TDuration retryTime, TDuration duration, TErrorInfo &error) { - for (const auto &vdId : info->BSGroup(GroupId).VDisks) { - if (vdId != VDisk.VDiskId) - CountVDisk(info->VDisk(vdId), info, retryTime, duration, error); + const auto& group = info->BSGroup(GroupId); + + TSet groupNodes; + for (const auto &vdId : group.VDisks) { + const auto &vd = info->VDisk(vdId); + if (vd.VDiskId != VDisk.VDiskId) + CountVDisk(vd, info, retryTime, duration, error); + groupNodes.insert(vd.NodeId); } + HasMoreThanOneDiskPerNode = group.VDisks.size() > groupNodes.size(); + if (Locked && error.Code == TStatus::DISALLOW) { HasAlreadyLockedDisks = true; } @@ -136,10 +150,11 @@ void TErasureCounterBase::CountGroupState(TClusterInfoPtr info, TDuration retryT bool TDefaultErasureCounter::CheckForKeepAvailability(TClusterInfoPtr info, TErrorInfo &error, TInstant &defaultDeadline, bool allowPartial) const { - if (HasAlreadyLockedDisks && allowPartial) { + if (HasAlreadyLockedDisks && !HasMoreThanOneDiskPerNode && allowPartial) { + CmsCounters->Cumulative()[COUNTER_PARTIAL_PERMISSIONS_OPTIMIZED].Increment(1); error.Code = TStatus::DISALLOW_TEMP; error.Reason = "You cannot get two or more disks from the same group at the same time" - " without specifying the PartialPermissionAllowed parameter"; + " in partial permissions allowed mode"; error.Deadline = defaultDeadline; return false; } @@ -170,10 +185,11 @@ bool TDefaultErasureCounter::CheckForKeepAvailability(TClusterInfoPtr info, TErr bool TMirror3dcCounter::CheckForKeepAvailability(TClusterInfoPtr info, TErrorInfo &error, TInstant &defaultDeadline, bool allowPartial) const { - if (HasAlreadyLockedDisks && allowPartial) { + if (HasAlreadyLockedDisks && !HasMoreThanOneDiskPerNode && allowPartial) { + CmsCounters->Cumulative()[COUNTER_PARTIAL_PERMISSIONS_OPTIMIZED].Increment(1); error.Code = TStatus::DISALLOW_TEMP; error.Reason = "You cannot get two or more disks from the same group at the same time" - " without specifying the PartialPermissionAllowed parameter"; + " in partial permissions allowed mode"; error.Deadline = defaultDeadline; return false; } @@ -237,7 +253,9 @@ void TMirror3dcCounter::CountGroupState(TClusterInfoPtr info, TDuration retryTim ++DataCenterDisabledNodes[VDisk.VDiskId.FailRealm]; } -TSimpleSharedPtr CreateErasureCounter(TErasureType::EErasureSpecies es, const TVDiskInfo &vdisk, ui32 groupId) { +TSimpleSharedPtr CreateErasureCounter(TErasureType::EErasureSpecies es, + const TVDiskInfo &vdisk, ui32 groupId, TTabletCountersBase* cmsCounters) +{ switch (es) { case TErasureType::ErasureNone: case TErasureType::ErasureMirror3: @@ -257,9 +275,9 @@ TSimpleSharedPtr CreateErasureCounter(TErasureType::EErasureSpe case TErasureType::Erasure2Plus2Block: case TErasureType::Erasure2Plus2Stripe: case TErasureType::ErasureMirror3of4: - return TSimpleSharedPtr(new TDefaultErasureCounter(vdisk, groupId)); + return TSimpleSharedPtr(new TDefaultErasureCounter(vdisk, groupId, cmsCounters)); case TErasureType::ErasureMirror3dc: - return TSimpleSharedPtr(new TMirror3dcCounter(vdisk, groupId)); + return TSimpleSharedPtr(new TMirror3dcCounter(vdisk, groupId, cmsCounters)); default: Y_ABORT("Unknown erasure type: %d", es); } diff --git a/ydb/core/cms/erasure_checkers.h b/ydb/core/cms/erasure_checkers.h index 517e473e3ce7..6d701e012682 100644 --- a/ydb/core/cms/erasure_checkers.h +++ b/ydb/core/cms/erasure_checkers.h @@ -20,6 +20,7 @@ class IErasureCounter { virtual ~IErasureCounter() = default; virtual bool GroupAlreadyHasLockedDisks() const = 0; + virtual bool GroupHasMoreThanOneDiskPerNode() const = 0; virtual bool CheckForMaxAvailability(TClusterInfoPtr info, TErrorInfo& error, TInstant& defaultDeadline, bool allowPartial) const = 0; virtual bool CheckForKeepAvailability(TClusterInfoPtr info, TErrorInfo& error, TInstant& defaultDeadline, bool allowPartial) const = 0; virtual void CountGroupState(TClusterInfoPtr info, TDuration retryTime, TDuration duration, TErrorInfo& error) = 0; @@ -33,6 +34,9 @@ class TErasureCounterBase: public IErasureCounter { const TVDiskInfo& VDisk; const ui32 GroupId; bool HasAlreadyLockedDisks; + bool HasMoreThanOneDiskPerNode; + + TTabletCountersBase* CmsCounters; protected: bool IsDown(const TVDiskInfo& vdisk, TClusterInfoPtr info, TDuration& retryTime, TErrorInfo& error); @@ -40,22 +44,25 @@ class TErasureCounterBase: public IErasureCounter { bool CountVDisk(const TVDiskInfo& vdisk, TClusterInfoPtr info, TDuration retryTime, TDuration duration, TErrorInfo& error) override; public: - TErasureCounterBase(const TVDiskInfo& vdisk, ui32 groupId) + TErasureCounterBase(const TVDiskInfo& vdisk, ui32 groupId, TTabletCountersBase* cmsCounters) : VDisk(vdisk) , GroupId(groupId) , HasAlreadyLockedDisks(false) + , HasMoreThanOneDiskPerNode(false) + , CmsCounters(cmsCounters) { } bool GroupAlreadyHasLockedDisks() const final; + bool GroupHasMoreThanOneDiskPerNode() const final; bool CheckForMaxAvailability(TClusterInfoPtr info, TErrorInfo& error, TInstant& defaultDeadline, bool allowPartial) const final; void CountGroupState(TClusterInfoPtr info, TDuration retryTime, TDuration duration, TErrorInfo &error) override; }; class TDefaultErasureCounter: public TErasureCounterBase { public: - TDefaultErasureCounter(const TVDiskInfo& vdisk, ui32 groupId) - : TErasureCounterBase(vdisk, groupId) + TDefaultErasureCounter(const TVDiskInfo& vdisk, ui32 groupId, TTabletCountersBase* cmsCounters) + : TErasureCounterBase(vdisk, groupId, cmsCounters) { } @@ -69,8 +76,8 @@ class TMirror3dcCounter: public TErasureCounterBase { bool CountVDisk(const TVDiskInfo& vdisk, TClusterInfoPtr info, TDuration retryTime, TDuration duration, TErrorInfo& error) override; public: - TMirror3dcCounter(const TVDiskInfo& vdisk, ui32 groupId) - : TErasureCounterBase(vdisk, groupId) + TMirror3dcCounter(const TVDiskInfo& vdisk, ui32 groupId, TTabletCountersBase* cmsCounters) + : TErasureCounterBase(vdisk, groupId, cmsCounters) { } @@ -78,6 +85,7 @@ class TMirror3dcCounter: public TErasureCounterBase { void CountGroupState(TClusterInfoPtr info, TDuration retryTime, TDuration duration, TErrorInfo &error) override; }; -TSimpleSharedPtr CreateErasureCounter(TErasureType::EErasureSpecies es, const TVDiskInfo& vdisk, ui32 groupId); +TSimpleSharedPtr CreateErasureCounter(TErasureType::EErasureSpecies es, + const TVDiskInfo &vdisk, ui32 groupId, TTabletCountersBase* cmsCounters); } // namespace NKikimr::NCms diff --git a/ydb/core/protos/counters_cms.proto b/ydb/core/protos/counters_cms.proto index 6118ea83a249..331e8e0e3353 100644 --- a/ydb/core/protos/counters_cms.proto +++ b/ydb/core/protos/counters_cms.proto @@ -16,6 +16,8 @@ enum ESimpleCounters { enum ECumulativeCounters { COUNTER_CUMULATIVE_IGNORE = 0; + + COUNTER_PARTIAL_PERMISSIONS_OPTIMIZED = 1 [(CounterOpts) = {Name: "PartialPermissionsOptimized"}]; } enum EPercentileCounters {