diff --git a/hk/housekeeper.go b/hk/housekeeper.go index 12d3886ac5..cd03c5a13a 100644 --- a/hk/housekeeper.go +++ b/hk/housekeeper.go @@ -1,14 +1,13 @@ // Package hk provides mechanism for registering cleanup // functions which are invoked at specified intervals. /* - * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2018-2025, NVIDIA CORPORATION. All rights reserved. */ package hk import ( "container/heap" "os" - "os/signal" "syscall" "time" @@ -75,7 +74,7 @@ func _init(mustRun bool) { if mustRun { HK.running.Store(false) } else { - HK.running.Store(true) // tests only + HK.running.Store(true) // mustRun == false: tests only } heap.Init(HK.actions) } @@ -121,11 +120,8 @@ func (hk *hk) terminate() { func (*hk) Stop(error) { HK.stopCh.Close() } func (hk *hk) Run() (err error) { - signal.Notify(hk.sigCh, - syscall.SIGINT, // kill -SIGINT (Ctrl-C) - syscall.SIGTERM, // kill -SIGTERM - syscall.SIGQUIT, // kill -SIGQUIT - ) + hk.setSignal() // SIGINT, et al. - see handleSignal() in sigprod.go + hk.timer = time.NewTimer(time.Hour) hk.running.Store(true) err = hk._run() @@ -198,10 +194,9 @@ func (hk *hk) _run() error { case s, ok := <-hk.sigCh: if ok { - signal.Stop(hk.sigCh) - err := cos.NewSignalError(s.(syscall.Signal)) - hk.Stop(err) - return err + if err := hk.handleSignal(s.(syscall.Signal)); err != nil { + return err + } } } } diff --git a/hk/sig_darwin.go b/hk/sig_darwin.go new file mode 100644 index 0000000000..f256809e5d --- /dev/null +++ b/hk/sig_darwin.go @@ -0,0 +1,12 @@ +// Package hk provides mechanism for registering cleanup +// functions which are invoked at specified intervals. +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+ */
+package hk
+
+import "errors"
+
+func numOpenFiles() (int, error) {
+	return 0, errors.New("num-open-files not implemented yet")
+}
diff --git a/hk/sig_linux.go b/hk/sig_linux.go
new file mode 100644
index 0000000000..e20cbfe7dc
--- /dev/null
+++ b/hk/sig_linux.go
@@ -0,0 +1,33 @@
+// Package hk provides mechanism for registering cleanup
+// functions which are invoked at specified intervals.
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ */
+package hk
+
+import (
+	"os"
+	"path/filepath"
+	"strconv"
+)
+
+// numOpenFiles returns the number of file descriptors this process has open,
+// obtained by listing /proc/<pid>/fd.
+// TODO: consider moving to `cos` and logging (`stats`) every 4h or so
+func numOpenFiles() (int, error) {
+	procdir := filepath.Join("/proc", strconv.Itoa(os.Getpid()), "fd")
+	dir, err := os.Open(procdir)
+	if err != nil {
+		return 0, err
+	}
+	defer dir.Close()
+
+	// read just the names; n <= 0 returns all entries in a single slice
+	names, err := dir.Readdirnames(0)
+	if err != nil {
+		return 0, err
+	}
+	// the listing also contains the descriptor of `dir` itself, which is open
+	// only for the duration of this call - do not count it
+	return len(names) - 1, nil
+}
diff --git a/hk/sigprod.go b/hk/sigprod.go
new file mode 100644
index 0000000000..2c355305cf
--- /dev/null
+++ b/hk/sigprod.go
@@ -0,0 +1,49 @@
+// Package hk provides mechanism for registering cleanup
+// functions which are invoked at specified intervals.
+/*
+ * Copyright (c) 2018-2025, NVIDIA CORPORATION. All rights reserved.
+ */
+package hk
+
+import (
+	"os/signal"
+	"runtime"
+	"strings"
+	"syscall"
+
+	"github.com/NVIDIA/aistore/cmn/cos"
+	"github.com/NVIDIA/aistore/cmn/nlog"
+	"github.com/NVIDIA/aistore/sys"
+)
+
+// setSignal subscribes hk.sigCh to the signals that handleSignal processes.
+func (hk *hk) setSignal() {
+	signal.Notify(hk.sigCh,
+		syscall.SIGHUP,  // kill -SIGHUP: ignore (log only)
+		syscall.SIGINT,  // kill -SIGINT (Ctrl-C): terminate
+		syscall.SIGTERM, // kill -SIGTERM: terminate
+		syscall.SIGQUIT, // kill -SIGQUIT: terminate
+	)
+}
+
+// handleSignal logs runtime stats and returns nil upon SIGHUP; for any other
+// signal it calls signal.Stop and hk.Stop, and returns the signal error.
+func (hk *hk) handleSignal(s syscall.Signal) error {
+	if s != syscall.SIGHUP {
+		signal.Stop(hk.sigCh)
+		err := cos.NewSignalError(s)
+		hk.Stop(err)
+		return err
+	}
+	// SIGHUP is a no-op: show up in the log with some useful info
+	var (
+		stats strings.Builder
+		mem   sys.MemStat
+	)
+	numGor := runtime.NumGoroutine()
+	errMem := mem.Get()
+	mem.Str(&stats)
+	numFD, errFD := numOpenFiles()
+	nlog.Infoln("ngr [", numGor, sys.NumCPU(), "] mem [", stats.String(), errMem, "]", "num-fd [", numFD, errFD, "]")
+	return nil
+}