From 016f5d52b1e86a8132489b4a210c4cfa6712ec6d Mon Sep 17 00:00:00 2001 From: tbs60 Date: Mon, 8 Jul 2024 17:37:47 +0800 Subject: [PATCH] fix: avoid deadlock when sending files to worker, issue: #262 --- .../worker/pkg/client/bkcommondist_handler.go | 24 +++++++------- .../client/bkcommondist_handler_long_tcp.go | 2 +- .../bk_dist/worker/pkg/client/slots.go | 31 ++++++++++--------- 3 files changed, 29 insertions(+), 28 deletions(-) diff --git a/src/backend/booster/bk_dist/worker/pkg/client/bkcommondist_handler.go b/src/backend/booster/bk_dist/worker/pkg/client/bkcommondist_handler.go index f1fecb48..ac703b6f 100644 --- a/src/backend/booster/bk_dist/worker/pkg/client/bkcommondist_handler.go +++ b/src/backend/booster/bk_dist/worker/pkg/client/bkcommondist_handler.go @@ -379,7 +379,7 @@ func (r *CommonRemoteHandler) ExecuteSendFile( defer func() { r.updateJobStatsFunc() }() - blog.Debugf("start send files to server %s", server) + blog.Infof("start send files to server %s", server) r.recordStats.RemoteWorker = server.Server if len(req.Files) == 0 { @@ -499,6 +499,12 @@ func (r *CommonRemoteHandler) ExecuteSendFile( client := NewTCPClient(r.ioTimeout) if err := client.Connect(getRealServer(server.Server)); err != nil { blog.Warnf("error: %v", err) + + if memorylocked { + r.slot.Unlock(locksize) + blog.Debugf("remotehandle: succeed to release one memory lock") + } + return nil, err } d := time.Now().Sub(t) @@ -516,14 +522,13 @@ func (r *CommonRemoteHandler) ExecuteSendFile( blog.Debugf("success connect to server %s", server) err = SendMessages(client, messages) + if memorylocked { + r.slot.Unlock(locksize) + blog.Debugf("remotehandle: succeed to release one memory lock") + } + if err != nil { blog.Warnf("error: %v", err) - - if memorylocked { - r.slot.Unlock(locksize) - blog.Debugf("remotehandle: succeed to release one memory lock") - } - return nil, err } @@ -536,11 +541,6 @@ func (r *CommonRemoteHandler) ExecuteSendFile( debug.FreeOSMemory() // free memory anyway - if 
memorylocked { - r.slot.Unlock(locksize) - blog.Debugf("remotehandle: succeed to release one memory lock") - } - blog.Debugf("success sent to server %s", server) // receive result data, err := receiveSendFileRsp(client) diff --git a/src/backend/booster/bk_dist/worker/pkg/client/bkcommondist_handler_long_tcp.go b/src/backend/booster/bk_dist/worker/pkg/client/bkcommondist_handler_long_tcp.go index 29bd16e1..82e136e9 100644 --- a/src/backend/booster/bk_dist/worker/pkg/client/bkcommondist_handler_long_tcp.go +++ b/src/backend/booster/bk_dist/worker/pkg/client/bkcommondist_handler_long_tcp.go @@ -167,7 +167,7 @@ func (r *CommonRemoteHandler) ExecuteSendFileLongTCP( defer func() { r.updateJobStatsFunc() }() - blog.Debugf("start send files to server %s", server) + blog.Infof("start send files to server %s", server) r.recordStats.RemoteWorker = server.Server if len(req.Files) == 0 { diff --git a/src/backend/booster/bk_dist/worker/pkg/client/slots.go b/src/backend/booster/bk_dist/worker/pkg/client/slots.go index d9e7000f..ad993992 100644 --- a/src/backend/booster/bk_dist/worker/pkg/client/slots.go +++ b/src/backend/booster/bk_dist/worker/pkg/client/slots.go @@ -163,22 +163,23 @@ func (lr *slot) handleLock(ctx context.Context) { } func (lr *slot) hasEnoughtSlots(pairChan chanPair) bool { + // 这儿的内存判断,可能导致一直拿不到锁,先屏蔽掉 // 如果已经锁定的内存超过了largeLocked,再次申请内存时,需要关注当前系统内存情况 - if lr.occupiedSlots > largeLocked { - v, err := mem.VirtualMemory() - if err == nil { - if v.Available < leastFree || - v.Available < uint64(pairChan.weight) || - v.UsedPercent > maxMemPercent { - blog.Infof("send slot: request size:%d,locked size:%d,Available:%d,UsedPercent:%f", - pairChan.weight, - lr.occupiedSlots, - v.Available, - v.UsedPercent) - return false - } - } - } + // if lr.occupiedSlots > largeLocked { + // v, err := mem.VirtualMemory() + // if err == nil { + // if v.Available < leastFree || + // v.Available < uint64(pairChan.weight) || + // v.UsedPercent > maxMemPercent { + // blog.Infof("send slot: 
request size:%d,locked size:%d,Available:%d,UsedPercent:%f", + // pairChan.weight, + // lr.occupiedSlots, + // v.Available, + // v.UsedPercent) + // return false + // } + // } + // } if lr.occupiedSlots+pairChan.weight < lr.totalSlots { return true