Skip to content

Commit

Permalink
Print long request history to log #8131
Browse files Browse the repository at this point in the history
  • Loading branch information
VPolka committed Nov 5, 2024
1 parent b985182 commit 5408146
Show file tree
Hide file tree
Showing 31 changed files with 504 additions and 122 deletions.
25 changes: 15 additions & 10 deletions ydb/core/blobstorage/dsproxy/dsproxy.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ const ui32 MaskSizeBits = 32;
constexpr bool DefaultEnablePutBatching = true;
constexpr bool DefaultEnableVPatch = false;

constexpr TDuration DefaultLongRequestThreshold = TDuration::Seconds(50);

constexpr bool WithMovingPatchRequestToStaticNode = true;

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
Expand Down Expand Up @@ -175,7 +177,7 @@ struct TCommonParameters {
TIntrusivePtr<TBlobStorageGroupProxyMon> Mon;
TActorId Source = TActorId{};
ui64 Cookie = 0;
TInstant Now;
TMonotonic Now;
TIntrusivePtr<TStoragePoolCounters>& StoragePoolCounters;
ui32 RestartCounter;
NWilson::TTraceId TraceId = {};
Expand Down Expand Up @@ -210,10 +212,10 @@ class TBlobStorageGroupRequestActor : public TActor<TDerived> {
, Span(std::move(span))
, RestartCounter(params.Common.RestartCounter)
, CostModel(GroupQueues->CostModel)
, RequestStartTime(params.Common.Now)
, Source(params.Common.Source)
, Cookie(params.Common.Cookie)
, LatencyQueueKind(params.Common.LatencyQueueKind)
, RequestStartTime(params.Common.Now)
, RacingDomains(&Info->GetTopology())
, ExecutionRelay(std::move(params.Common.ExecutionRelay))
{
Expand Down Expand Up @@ -572,7 +574,7 @@ class TBlobStorageGroupRequestActor : public TActor<TDerived> {
}

if (LatencyQueueKind) {
SendToProxy(std::make_unique<TEvLatencyReport>(*LatencyQueueKind, now - RequestStartTime));
SendToProxy(std::make_unique<TEvLatencyReport>(*LatencyQueueKind, TActivationContext::Monotonic() - RequestStartTime));
}

// KIKIMR-6737
Expand Down Expand Up @@ -639,6 +641,7 @@ class TBlobStorageGroupRequestActor : public TActor<TDerived> {
bool Dead = false;
const ui32 RestartCounter = 0;
std::shared_ptr<const TCostModel> CostModel;
const TMonotonic RequestStartTime;

private:
const TActorId Source;
Expand All @@ -647,7 +650,6 @@ class TBlobStorageGroupRequestActor : public TActor<TDerived> {
ui32 RequestsInFlight = 0;
std::unique_ptr<IEventBase> Response;
const TMaybe<TGroupStat::EKind> LatencyQueueKind;
const TInstant RequestStartTime;
THPTimer Timer;
std::deque<std::unique_ptr<IEventHandle>> PostponedQ;
TBlobStorageGroupInfo::TGroupFailDomains RacingDomains; // a set of domains we've received RACE from
Expand All @@ -670,8 +672,7 @@ struct TBlobStorageGroupRangeParameters {
TTypeSpecificParameters TypeSpecific = {
.LogComponent = NKikimrServices::BS_PROXY_RANGE,
.Name = "DSProxy.Range",
.Activity = NKikimrServices::TActivity::BS_GROUP_RANGE
,
.Activity = NKikimrServices::TActivity::BS_GROUP_RANGE,
};
};
IActor* CreateBlobStorageGroupRangeRequest(TBlobStorageGroupRangeParameters params);
Expand All @@ -686,6 +687,7 @@ struct TBlobStorageGroupPutParameters {
bool TimeStatsEnabled;
TDiskResponsivenessTracker::TPerDiskStatsPtr Stats;
bool EnableRequestMod3x3ForMinLatency;
TDuration LongRequestThreshold;
};
IActor* CreateBlobStorageGroupPutRequest(TBlobStorageGroupPutParameters params);

Expand All @@ -703,6 +705,7 @@ struct TBlobStorageGroupMultiPutParameters {
NKikimrBlobStorage::EPutHandleClass HandleClass;
TEvBlobStorage::TEvPut::ETactic Tactic;
bool EnableRequestMod3x3ForMinLatency;
TDuration LongRequestThreshold;

static ui32 CalculateRestartCounter(TBatchedVec<TEvBlobStorage::TEvPut::TPtr>& events) {
ui32 maxRestarts = 0;
Expand All @@ -722,6 +725,7 @@ struct TBlobStorageGroupGetParameters {
.Activity = NKikimrServices::TActivity::BS_PROXY_GET_ACTOR,
};
TNodeLayoutInfoPtr NodeLayout;
TDuration LongRequestThreshold;
};
IActor* CreateBlobStorageGroupGetRequest(TBlobStorageGroupGetParameters params);

Expand Down Expand Up @@ -823,11 +827,12 @@ IActor* CreateBlobStorageGroupAssimilateRequest(TBlobStorageGroupAssimilateParam
IActor* CreateBlobStorageGroupEjectedProxy(ui32 groupId, TIntrusivePtr<TDsProxyNodeMon> &nodeMon);

IActor* CreateBlobStorageGroupProxyConfigured(TIntrusivePtr<TBlobStorageGroupInfo>&& info,
bool forceWaitAllDrives, TIntrusivePtr<TDsProxyNodeMon> &nodeMon,
TIntrusivePtr<TStoragePoolCounters>&& storagePoolCounters, const TControlWrapper &enablePutBatching,
const TControlWrapper &enableVPatch);
bool forceWaitAllDrives, TIntrusivePtr<TDsProxyNodeMon> &nodeMon, TIntrusivePtr<TStoragePoolCounters>&& storagePoolCounters,
const TControlWrapper &enablePutBatching, const TControlWrapper &enableVPatch,
const TControlWrapper &longRequestThresholdMs = TControlWrapper(DefaultLongRequestThreshold.MilliSeconds(), 1, 1'000'000));

IActor* CreateBlobStorageGroupProxyUnconfigured(ui32 groupId, TIntrusivePtr<TDsProxyNodeMon> &nodeMon,
const TControlWrapper &enablePutBatching, const TControlWrapper &enableVPatch);
const TControlWrapper &enablePutBatching, const TControlWrapper &enableVPatch,
const TControlWrapper &longRequestThresholdMs = TControlWrapper(DefaultLongRequestThreshold.MilliSeconds(), 1, 1'000'000));

}//NKikimr
1 change: 1 addition & 0 deletions ydb/core/blobstorage/dsproxy/dsproxy_blackboard.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#include "defs.h"

#include "dsproxy.h"
#include "request_history.h"

#include <ydb/core/blobstorage/base/batched_vec.h>
#include <ydb/core/blobstorage/groupinfo/blobstorage_groupinfo.h>
Expand Down
4 changes: 1 addition & 3 deletions ydb/core/blobstorage/dsproxy/dsproxy_block.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ class TBlobStorageGroupBlockRequest : public TBlobStorageGroupRequestActor<TBlob
const ui32 Generation;
const TInstant Deadline;
const ui64 IssuerGuid;
TInstant StartTime;
bool SeenAlready = false;

TGroupQuorumTracker QuorumTracker;
Expand Down Expand Up @@ -100,7 +99,7 @@ class TBlobStorageGroupBlockRequest : public TBlobStorageGroupRequestActor<TBlob
std::unique_ptr<TEvBlobStorage::TEvBlockResult> result(new TEvBlobStorage::TEvBlockResult(status));
result->ErrorReason = ErrorReason;
A_LOG_LOG_S(true, PriorityForStatusResult(status), "DSPB04", "Result# " << result->Print(false));
Mon->CountBlockResponseTime(TActivationContext::Now() - StartTime);
Mon->CountBlockResponseTime(TActivationContext::Monotonic() - RequestStartTime);
return SendResponseAndDie(std::move(result));
}

Expand Down Expand Up @@ -138,7 +137,6 @@ class TBlobStorageGroupBlockRequest : public TBlobStorageGroupRequestActor<TBlob
, Generation(params.Common.Event->Generation)
, Deadline(params.Common.Event->Deadline)
, IssuerGuid(params.Common.Event->IssuerGuid)
, StartTime(params.Common.Now)
, QuorumTracker(Info.Get())
{}

Expand Down
2 changes: 0 additions & 2 deletions ydb/core/blobstorage/dsproxy/dsproxy_collect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ class TBlobStorageGroupCollectGarbageRequest : public TBlobStorageGroupRequestAc
const bool Decommission;

TGroupQuorumTracker QuorumTracker;
TInstant StartTime;

ui32 RequestsSent = 0;
ui32 ResponsesReceived = 0;
Expand Down Expand Up @@ -156,7 +155,6 @@ class TBlobStorageGroupCollectGarbageRequest : public TBlobStorageGroupRequestAc
, Collect(params.Common.Event->Collect)
, Decommission(params.Common.Event->Decommission)
, QuorumTracker(Info.Get())
, StartTime(params.Common.Now)
{}

void Bootstrap() {
Expand Down
4 changes: 1 addition & 3 deletions ydb/core/blobstorage/dsproxy/dsproxy_discover.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,6 @@ class TBlobStorageGroupDiscoverRequest : public TBlobStorageGroupRequestActor<TB
const bool ReadBody;
const bool DiscoverBlockedGeneration;
const TInstant Deadline;
const TInstant StartTime;

TGroupResponseTracker GroupResponseTracker;
std::unique_ptr<TEvBlobStorage::TEvDiscoverResult> PendingResult;
Expand All @@ -295,7 +294,7 @@ class TBlobStorageGroupDiscoverRequest : public TBlobStorageGroupRequestActor<TB
template<typename TPtr>
void SendResult(TPtr& result) {
Y_ABORT_UNLESS(result);
const TDuration duration = TActivationContext::Now() - StartTime;
const TDuration duration = TActivationContext::Monotonic() - RequestStartTime;
Mon->CountDiscoverResponseTime(duration);
const bool success = result->Status == NKikimrProto::OK;
LWPROBE(DSProxyRequestDuration, TEvBlobStorage::EvDiscover, 0, duration.SecondsFloat() * 1000.0,
Expand Down Expand Up @@ -884,7 +883,6 @@ class TBlobStorageGroupDiscoverRequest : public TBlobStorageGroupRequestActor<TB
, ReadBody(params.Common.Event->ReadBody)
, DiscoverBlockedGeneration(params.Common.Event->DiscoverBlockedGeneration)
, Deadline(params.Common.Event->Deadline)
, StartTime(params.Common.Now)
, GroupResponseTracker(Info)
, IsGetBlockDone(!DiscoverBlockedGeneration)
, ForceBlockedGeneration(params.Common.Event->ForceBlockedGeneration)
Expand Down
6 changes: 2 additions & 4 deletions ydb/core/blobstorage/dsproxy/dsproxy_discover_m3dc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -424,7 +424,6 @@ class TDiscoverWorker {
class TBlobStorageGroupMirror3dcDiscoverRequest : public TBlobStorageGroupRequestActor<TBlobStorageGroupMirror3dcDiscoverRequest>{
const ui64 TabletId;
const ui32 MinGeneration;
const TInstant StartTime;
const TInstant Deadline;
const bool ReadBody;
const bool DiscoverBlockedGeneration;
Expand Down Expand Up @@ -461,7 +460,6 @@ class TBlobStorageGroupMirror3dcDiscoverRequest : public TBlobStorageGroupReques
: TBlobStorageGroupRequestActor(params, NWilson::TSpan(TWilson::BlobStorage, std::move(params.Common.TraceId), "DSProxy.Discover(mirror-3-dc)"))
, TabletId(params.Common.Event->TabletId)
, MinGeneration(params.Common.Event->MinGeneration)
, StartTime(params.Common.Now)
, Deadline(params.Common.Event->Deadline)
, ReadBody(params.Common.Event->ReadBody)
, DiscoverBlockedGeneration(params.Common.Event->DiscoverBlockedGeneration)
Expand Down Expand Up @@ -653,7 +651,7 @@ class TBlobStorageGroupMirror3dcDiscoverRequest : public TBlobStorageGroupReques
R_LOG_DEBUG_S("DSPDM03", "Response# " << response->ToString());

Y_ABORT_UNLESS(!Responded);
const TDuration duration = TActivationContext::Now() - StartTime;
const TDuration duration = TActivationContext::Monotonic() - RequestStartTime;
LWPROBE(DSProxyRequestDuration, TEvBlobStorage::EvDiscover, 0, duration.SecondsFloat() * 1000.0,
TabletId, Info->GroupID.GetRawId(), TLogoBlobID::MaxChannel, "", true);
SendResponseAndDie(std::move(response));
Expand All @@ -666,7 +664,7 @@ class TBlobStorageGroupMirror3dcDiscoverRequest : public TBlobStorageGroupReques

Y_ABORT_UNLESS(!Responded);
Y_ABORT_UNLESS(status != NKikimrProto::OK);
const TDuration duration = TActivationContext::Now() - StartTime;
const TDuration duration = TActivationContext::Monotonic() - RequestStartTime;
LWPROBE(DSProxyRequestDuration, TEvBlobStorage::EvDiscover, 0, duration.SecondsFloat() * 1000.0,
TabletId, Info->GroupID.GetRawId(), TLogoBlobID::MaxChannel, "", false);
std::unique_ptr<TEvBlobStorage::TEvDiscoverResult> response(new TEvBlobStorage::TEvDiscoverResult(
Expand Down
2 changes: 0 additions & 2 deletions ydb/core/blobstorage/dsproxy/dsproxy_discover_m3of4.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ class TBlobStorageGroupMirror3of4DiscoverRequest
{
const ui64 TabletId;
const ui32 MinGeneration;
const TInstant StartTime;
const TInstant Deadline;
const bool ReadBody;
const bool DiscoverBlockedGeneration;
Expand All @@ -33,7 +32,6 @@ class TBlobStorageGroupMirror3of4DiscoverRequest
: TBlobStorageGroupRequestActor(params, NWilson::TSpan(TWilson::BlobStorage, std::move(params.Common.TraceId), "DSProxy.Discover(mirror-3of4)"))
, TabletId(params.Common.Event->TabletId)
, MinGeneration(params.Common.Event->MinGeneration)
, StartTime(params.Common.Now)
, Deadline(params.Common.Event->Deadline)
, ReadBody(params.Common.Event->ReadBody)
, DiscoverBlockedGeneration(params.Common.Event->DiscoverBlockedGeneration)
Expand Down
44 changes: 32 additions & 12 deletions ydb/core/blobstorage/dsproxy/dsproxy_get.cpp
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
#include "dsproxy.h"
#include "dsproxy_mon.h"
#include "root_cause.h"
#include <ydb/core/blobstorage/dsproxy/dsproxy_request_reporting.h>
#include <ydb/core/blobstorage/vdisk/common/vdisk_events.h>
#include <ydb/core/blobstorage/lwtrace_probes/blobstorage_probes.h>
#include <ydb/core/util/stlog.h>
#include <library/cpp/containers/stack_vector/stack_vec.h>
#include <library/cpp/digest/crc32c/crc32c.h>
#include <util/generic/set.h>
Expand Down Expand Up @@ -36,8 +38,7 @@ class TBlobStorageGroupGetRequest : public TBlobStorageGroupRequestActor<TBlobSt
TRootCause RootCauseTrack;
NLWTrace::TOrbit Orbit;
const TInstant Deadline;
TInstant StartTime;
TInstant StartTimePut;
TMonotonic StartTimePut;
ui32 RequestsSent = 0;
ui32 ResponsesReceived = 0;
ui32 GroupSize;
Expand All @@ -57,6 +58,8 @@ class TBlobStorageGroupGetRequest : public TBlobStorageGroupRequestActor<TBlobSt
ui32 PutsAccelerated = 0;
bool IsPutAccelerateScheduled = false;

TDuration LongRequestThreshold;

void Handle(TEvAccelerateGet::TPtr &ev) {
IsGetAccelerateScheduled = false;
RootCauseTrack.OnAccelerate(ev->Get()->CauseIdx);
Expand All @@ -75,6 +78,7 @@ class TBlobStorageGroupGetRequest : public TBlobStorageGroupRequestActor<TBlobSt
}
GetsAccelerated++;

GetImpl.History.AddAcceleration(false);
TDeque<std::unique_ptr<TEvBlobStorage::TEvVGet>> vGets;
TDeque<std::unique_ptr<TEvBlobStorage::TEvVPut>> vPuts;
GetImpl.AccelerateGet(LogCtx, GetUnresponsiveDisksMask(), vGets, vPuts);
Expand All @@ -90,6 +94,7 @@ class TBlobStorageGroupGetRequest : public TBlobStorageGroupRequestActor<TBlobSt
}
PutsAccelerated++;

GetImpl.History.AddAcceleration(true);
TDeque<std::unique_ptr<TEvBlobStorage::TEvVGet>> vGets;
TDeque<std::unique_ptr<TEvBlobStorage::TEvVPut>> vPuts;
GetImpl.AcceleratePut(LogCtx, GetUnresponsiveDisksMask(), vGets, vPuts);
Expand All @@ -107,7 +112,7 @@ class TBlobStorageGroupGetRequest : public TBlobStorageGroupRequestActor<TBlobSt
if (vPuts.size()) {
if (!IsPutStarted) {
IsPutStarted = true;
StartTimePut = TActivationContext::Now();
StartTimePut = TActivationContext::Monotonic();
}
}
for (size_t i = 0; i < vGets.size(); ++i) {
Expand Down Expand Up @@ -138,6 +143,7 @@ class TBlobStorageGroupGetRequest : public TBlobStorageGroupRequestActor<TBlobSt
}
SendToQueues(vGets, false);
SendToQueues(vPuts, false);
GetImpl.History.AddAllWaiting();
}

ui32 CountDisksWithActiveRequests() {
Expand Down Expand Up @@ -316,8 +322,8 @@ class TBlobStorageGroupGetRequest : public TBlobStorageGroupRequestActor<TBlobSt
// Count VDisks that have requests in flight, if there is no more than 2 such VDisks, Accelerate
if (CountDisksWithActiveRequests() <= 2) {
ui64 timeToAccelerateUs = GetImpl.GetTimeToAccelerateGetNs(LogCtx, GetsAccelerated) / 1000;
TInstant now = TActivationContext::Now();
TDuration timeSinceStart = (now > StartTime) ? (now - StartTime) : TDuration::MilliSeconds(0);
TMonotonic now = TActivationContext::Monotonic();
TDuration timeSinceStart = (now > RequestStartTime) ? (now - RequestStartTime) : TDuration::MilliSeconds(0);
if (timeSinceStart.MicroSeconds() < timeToAccelerateUs) {
ui64 causeIdx = RootCauseTrack.RegisterAccelerate();
Schedule(TDuration::MicroSeconds(timeToAccelerateUs - timeSinceStart.MicroSeconds()),
Expand All @@ -335,7 +341,7 @@ class TBlobStorageGroupGetRequest : public TBlobStorageGroupRequestActor<TBlobSt
// Count VDisks that have requests in flight, if there is no more than 2 such VDisks, Accelerate
if (CountDisksWithActiveRequests() <= 2) {
ui64 timeToAccelerateUs = GetImpl.GetTimeToAcceleratePutNs(LogCtx, PutsAccelerated) / 1000;
TInstant now = TActivationContext::Now();
TMonotonic now = TActivationContext::Monotonic();
TDuration timeSinceStart = (now > StartTimePut) ? (now - StartTimePut) : TDuration::MilliSeconds(0);
if (timeSinceStart.MicroSeconds() < timeToAccelerateUs) {
ui64 causeIdx = RootCauseTrack.RegisterAccelerate();
Expand All @@ -350,9 +356,10 @@ class TBlobStorageGroupGetRequest : public TBlobStorageGroupRequestActor<TBlobSt
}

void SendReplyAndDie(TAutoPtr<TEvBlobStorage::TEvGetResult> &evResult) {
const TInstant now = TActivationContext::Now();
const TDuration duration = (now > StartTime) ? (now - StartTime) : TDuration::MilliSeconds(0);
Mon->CountGetResponseTime(Info->GetDeviceType(), GetImpl.GetHandleClass(), evResult->PayloadSizeBytes(), duration);
const TMonotonic now = TActivationContext::Monotonic();
const TDuration duration = now - RequestStartTime;
NKikimrBlobStorage::EGetHandleClass handleClass = GetImpl.GetHandleClass();
Mon->CountGetResponseTime(Info->GetDeviceType(), handleClass, evResult->PayloadSizeBytes(), duration);
*Mon->ActiveGetCapacity -= ReportedBytes;
ReportedBytes = 0;
bool success = evResult->Status == NKikimrProto::OK;
Expand All @@ -368,9 +375,22 @@ class TBlobStorageGroupGetRequest : public TBlobStorageGroupRequestActor<TBlobSt
LWTRACK(DSProxyGetReply, Orbit);
evResult->Orbit = std::move(Orbit);
LWPROBE(DSProxyRequestDuration, TEvBlobStorage::EvGet, requestSize, duration.SecondsFloat() * 1000.0, tabletId,
evResult->GroupId, channel, NKikimrBlobStorage::EGetHandleClass_Name(GetImpl.GetHandleClass()),
evResult->GroupId, channel, NKikimrBlobStorage::EGetHandleClass_Name(handleClass),
success);
A_LOG_LOG_S(true, success ? NLog::PRI_INFO : NLog::PRI_NOTICE, "BPG68", "Result# " << evResult->Print(false));

if (TActivationContext::Monotonic() - RequestStartTime >= LongRequestThreshold) {
if (AllowToReport(GetImpl.GetHandleClass())) {
STLOG(PRI_WARN, BS_PROXY_GET, BPG71, "Long TEvGet request detected", \
(LongRequestThreshold, LongRequestThreshold), \
(GroupId, Info->GroupID), \
(SubrequestsCount, evResult->ResponseSz), \
(RequestTotalSize, requestSize), \
(HandleClass, NKikimrBlobStorage::EGetHandleClass_Name(handleClass)), \
(RestartCounter, RestartCounter), \
(History, GetImpl.PrintHistory()));
}
}
return SendResponseAndDie(std::unique_ptr<TEvBlobStorage::TEvGetResult>(evResult.Release()));
}

Expand Down Expand Up @@ -398,10 +418,10 @@ class TBlobStorageGroupGetRequest : public TBlobStorageGroupRequestActor<TBlobSt
LogCtx.RequestPrefix)
, Orbit(std::move(params.Common.Event->Orbit))
, Deadline(params.Common.Event->Deadline)
, StartTime(params.Common.Now)
, StartTimePut(StartTime)
, StartTimePut(RequestStartTime)
, GroupSize(Info->Type.BlobSubgroupSize())
, ReportedBytes(0)
, LongRequestThreshold(params.LongRequestThreshold)
{
ReportBytes(sizeof(*this));
MaxSaneRequests = params.Common.Event->QuerySize * Info->Type.TotalPartCount() *
Expand Down
Loading

0 comments on commit 5408146

Please sign in to comment.