Skip to content

Commit

Permalink
issue-2715: add critical events for double migration (#2716)
Browse files Browse the repository at this point in the history
* issue-2715: add critical events for NBS-3726

* issue-2715: correct issues

* issue-2715: correct issues

* issue-2715: change report message
  • Loading branch information
vladstepanyuk authored Dec 19, 2024
1 parent 28f9bfc commit ff81788
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 35 deletions.
1 change: 1 addition & 0 deletions cloud/blockstore/libs/diagnostics/critical_events.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ namespace NCloud::NBlockStore {
xxx(DiskRegistryCleanupAgentConfigError) \
xxx(DiskRegistryOccupiedDeviceConfigurationHasChanged) \
xxx(MirroredDiskChecksumMismatchUponRead) \
xxx(DiskRegistryWrongMigratedDeviceOwnership) \
// BLOCKSTORE_CRITICAL_EVENTS

#define BLOCKSTORE_IMPOSSIBLE_EVENTS(xxx) \
Expand Down
52 changes: 17 additions & 35 deletions cloud/blockstore/libs/storage/disk_registry/disk_registry_state.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5032,18 +5032,10 @@ void TDiskRegistryState::ApplyAgentStateChange(
continue;
}

bool found = false;
for (const auto& d: disk.Devices) {
if (d == deviceId) {
found = true;
break;
}
}

if (!found) {
// this device is allocated for this disk but it is not among
// this disk's active device set
// NBS-3726
if (Find(disk.Devices, deviceId) == disk.Devices.end()) {
ReportDiskRegistryWrongMigratedDeviceOwnership(
TStringBuilder() << "ApplyAgentStateChange: device "
<< deviceId << " not found");
continue;
}

Expand Down Expand Up @@ -6145,9 +6137,10 @@ NProto::TError TDiskRegistryState::FinishDeviceMigration(
auto devIt = Find(disk.Devices, sourceId);

if (devIt == disk.Devices.end()) {
// temporarily commented out, see NBS-3726
// return MakeError(E_INVALID_STATE, TStringBuilder() <<
// "device " << sourceId.Quote() << " not found");
auto message = ReportDiskRegistryWrongMigratedDeviceOwnership(
TStringBuilder() << "FinishDeviceMigration: device "
<< sourceId.Quote() << " not found");
return MakeError(E_INVALID_STATE, std::move(message));
}

if (auto it = disk.MigrationTarget2Source.find(targetId);
Expand All @@ -6174,27 +6167,16 @@ NProto::TError TDiskRegistryState::FinishDeviceMigration(
}

const ui64 seqNo = AddReallocateRequest(db, diskId);
// this condition is needed because of NBS-3726
if (devIt != disk.Devices.end()) {
*devIt = targetId;
disk.FinishedMigrations.push_back({
.DeviceId = sourceId,
.SeqNo = seqNo
});
*devIt = targetId;
disk.FinishedMigrations.push_back({.DeviceId = sourceId, .SeqNo = seqNo});

if (disk.MasterDiskId) {
const bool replaced = ReplicaTable.ReplaceDevice(
disk.MasterDiskId,
sourceId,
targetId);
// targetId is actually fully initialized after migration
ReplicaTable.MarkReplacementDevice(
disk.MasterDiskId,
targetId,
false);

Y_DEBUG_ABORT_UNLESS(replaced);
}
if (disk.MasterDiskId) {
const bool replaced =
ReplicaTable.ReplaceDevice(disk.MasterDiskId, sourceId, targetId);
// targetId is actually fully initialized after migration
ReplicaTable.MarkReplacementDevice(disk.MasterDiskId, targetId, false);

Y_DEBUG_ABORT_UNLESS(replaced);
}

*diskStateUpdated = TryUpdateDiskState(db, diskId, disk, timestamp);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -314,6 +314,14 @@ Y_UNIT_TEST_SUITE(TDiskRegistryStateMigrationTest)
UNIT_ASSERT_VALUES_EQUAL(0, affectedDisks.size());
});

NMonitoring::TDynamicCountersPtr counters =
new NMonitoring::TDynamicCounters();
InitCriticalEventsCounter(counters);
auto configCounter = counters->GetCounter(
"AppCriticalEvents/DiskRegistryWrongMigratedDeviceOwnership",
true);
UNIT_ASSERT_VALUES_EQUAL(0, configCounter->Val());

executor.WriteTx([&] (TDiskRegistryDatabase db) mutable {
auto affectedDisks = ChangeAgentState(
state,
Expand All @@ -331,6 +339,9 @@ Y_UNIT_TEST_SUITE(TDiskRegistryStateMigrationTest)
}

UNIT_ASSERT(state.IsMigrationListEmpty());
// Now bug is fixed, but, if it reproduce in future, we must report
// event.
UNIT_ASSERT_VALUES_EQUAL(1, configCounter->Val());
}

Y_UNIT_TEST(ShouldEraseMigrationsForDeletedDisk)
Expand Down

0 comments on commit ff81788

Please sign in to comment.