From 8b3ff0d2b4fe332577e24574a404fe25f976c096 Mon Sep 17 00:00:00 2001 From: Albert Louis Rossi <arossi@fnal.gov> Date: Mon, 2 Oct 2023 08:22:03 -0500 Subject: [PATCH] dcache-qos: fix scanner operation completion logic Motivation: When system scan operations run, the operation map fails to understand when they have all completed. This is due to a faulty way of computing the highest index of the targets processed. Modification: Fix the issue by implementing a `hasNext` method which is properly global rather than checking indices on individual operations (which is flawed because it assumes they run in serial order). The patch also corrects/improves several logging statements. Result: Scans are properly marked as having completed and the end timestamps as reported by `info` are correct. Target: master Request: 9.2 Patch: https://rb.dcache.org/r/14097 Requires-notes: yes (for 9.2) Acked-by: Tigran --- .../scanner/data/ScanOperationMap.java | 8 +- .../scanner/data/SystemOperationMap.java | 111 +++++++++++++----- .../scanner/data/SystemScanOperation.java | 5 - 3 files changed, 84 insertions(+), 40 deletions(-) diff --git a/modules/dcache-qos/src/main/java/org/dcache/qos/services/scanner/data/ScanOperationMap.java b/modules/dcache-qos/src/main/java/org/dcache/qos/services/scanner/data/ScanOperationMap.java index f28ca14ee1c..e491133499f 100644 --- a/modules/dcache-qos/src/main/java/org/dcache/qos/services/scanner/data/ScanOperationMap.java +++ b/modules/dcache-qos/src/main/java/org/dcache/qos/services/scanner/data/ScanOperationMap.java @@ -108,17 +108,17 @@ public void run() { break; } - LOGGER.trace("Pool watchdog initiating scan."); + LOGGER.trace("calling run scans."); runScans(); - LOGGER.trace("Pool watchdog scan completed."); + LOGGER.trace("run scans returned."); recordSweep(start, System.currentTimeMillis()); } - LOGGER.info("Exiting pool operation consumer."); + LOGGER.info("Exiting operation consumer."); clear(); - LOGGER.info("Pool operation queues cleared."); + LOGGER.info("operation queues cleared."); } public void runNow() { diff --git a/modules/dcache-qos/src/main/java/org/dcache/qos/services/scanner/data/SystemOperationMap.java b/modules/dcache-qos/src/main/java/org/dcache/qos/services/scanner/data/SystemOperationMap.java index 8867df72558..446de2c505c 100644 --- a/modules/dcache-qos/src/main/java/org/dcache/qos/services/scanner/data/SystemOperationMap.java +++ b/modules/dcache-qos/src/main/java/org/dcache/qos/services/scanner/data/SystemOperationMap.java @@ -112,6 +112,36 @@ public class SystemOperationMap extends ScanOperationMap { ALL_IDLE_ENABLED_POOLS = filter; } + static class ScanIndices { + final long start; + final long end; + long currentOffset = 0L; + long nextOffset = 0L; + + public ScanIndices(long start, long end) { + this.start = start; + this.end = end; + } + + long currentOffset() { + return currentOffset; + } + + long nextOffset() { + return nextOffset; + } + + boolean hasNext() { + return nextOffset < end; + } + + boolean setNextOffset(long from, long to, long batchSize) { + currentOffset = start + (from * batchSize); + nextOffset = Math.min(start + (to * batchSize), end); + return currentOffset < nextOffset; + } + } + private final Map<String, SystemScanOperation> online = new HashMap<>(); private final Map<String, SystemScanOperation> qosNearline = new HashMap<>(); private final EvictingQueue<String> history = EvictingQueue.create(100); @@ -135,6 +165,9 @@ public class SystemOperationMap extends ScanOperationMap { private long lastPoolScanStart; private long nextPoolScanStart; + private ScanIndices qosIndices; + private ScanIndices onlineIndices; + private SysOpHandler handler; private QoSScannerCounters counters; @@ -190,7 +223,7 @@ public void cancelAll(boolean qos) { public String configSettings() { return String.format("system online scan window %s %s\n" + "system online scan is %s\n" - + "system qosNearline scan window %s %s\n" + + "system qos (nearline) scan window %s %s\n" + "max concurrent operations %s\n" + "period set to %s %s\n\n", onlineRescanWindow, @@ -218,8 +251,8 @@ public void getInfo(PrintWriter pw) { seconds = 0L; } counters.appendDHMSElapsedTime(seconds, SCAN_DURATION, builder); - builder.append(String.format("last qosNearline (nearline) scan start %s\n" - + "last qosNearline (nearline) scan end %s\n", + builder.append(String.format("last qos (nearline) scan start %s\n" + + "last qos (nearline) scan end %s\n", new Date(lastQosNearlineScanStart), new Date(lastQosNearlineScanEnd))); seconds = TimeUnit.MILLISECONDS.toSeconds(lastQosNearlineScanEnd - lastQosNearlineScanStart); @@ -270,7 +303,7 @@ public void runScans() { lock.lock(); try { if (!isQosNearlineRunning() && isQosNearlinePastExpiration()) { - LOGGER.info("runScans: starting qosNearline system scans"); + LOGGER.info("runScans: starting qos (nearline) system scans"); start(true); } @@ -441,22 +474,29 @@ private void handleDone(SystemScanOperation operation) { remove(operation.id); history.add(operation.toString()); - boolean isQosPermanent = operation.qos; + boolean hasNext = hasNext(operation.qos); + + LOGGER.info("handleDone, hasNext {}, qos {}, online {}, qos {}.", hasNext, + operation.qos, online.size(), qosNearline.size()); - if (operation.isFinal()) { - if (isQosPermanent && qosNearline.isEmpty()) { - state &= (~QOS_NEARLINE); - lastQosNearlineScanEnd = System.currentTimeMillis(); + if (!hasNext) { + if (operation.qos) { + if (qosNearline.isEmpty()) { + state &= (~QOS_NEARLINE); + lastQosNearlineScanEnd = System.currentTimeMillis(); + qosIndices = null; + } } else if (online.isEmpty()) { state &= (~ONLINE); lastOnlineScanEnd = System.currentTimeMillis(); + onlineIndices = null; } } else { int loopWidth = maxConcurrentRunning; - int batchSize = getBatchSize(isQosPermanent); + int batchSize = getBatchSize(operation.qos); long fromIndex = (operation.from / batchSize) + loopWidth; long toIndex = (operation.to / batchSize) + loopWidth; - submit(fromIndex, toIndex, operation.minMaxIndices, isQosPermanent); + submit(fromIndex, toIndex, operation.qos); } } @@ -493,6 +533,9 @@ private SystemScanOperation remove(String id) { SystemScanOperation operation = online.remove(id); if (operation == null) { operation = qosNearline.remove(id); + LOGGER.info("removed qos operation {}.", operation); + } else { + LOGGER.info("removed online operation {}.", operation); } return operation; } @@ -502,24 +545,25 @@ private SystemScanOperation remove(String id) { */ @GuardedBy("lock") private void start(boolean qos) throws CacheException { - if (!onlineScanEnabled && !qos) { - LOGGER.info("start: overriding disabled flag to run online scan"); - } - - long[] indices = handler.getMinMaxIndices(qos); + long[] minMaxIndices = handler.getMinMaxIndices(qos); int count = maxConcurrentRunning; - if (indices[1] == 0) { - LOGGER.info("start: no {} entries to scan.", qos ? "QOS_NEARLINE" : "ONLINE"); + if (minMaxIndices[1] == 0) { + LOGGER.info("start: no {} entries to scan.", qos ? "qos (nearline)" : "online"); return; } + ScanIndices indices = new ScanIndices(minMaxIndices[0], minMaxIndices[1]); + + if (qos) { + qosIndices = indices; + } else { + onlineIndices = indices; + } + LOGGER.info("start: loop count {}.", count); - for (int i = 0; i < count; ++i) { - LOGGER.info("start: submitting {} scan {}.", qos ? "QOS_NEARLINE" : "ONLINE", i); - if (submit(i, i + 1, indices, qos) > indices[1]) { - break; - } + for (int i = 0; i < count && hasNext(qos); ++i) { + submit(i, i + 1, qos); } if (qos) { @@ -540,18 +584,18 @@ private void startPoolScans() { } @GuardedBy("lock") - private long submit(long fromIndex, long toIndex, long[] minmax, boolean qos) { + private void submit(long fromIndex, long toIndex, boolean qos) { int batchSize = getBatchSize(qos); - long start = minmax[0] + (fromIndex * batchSize); - long end = Math.min(minmax[0] + (toIndex * batchSize), minmax[1]); - if (start > end) { - return end; + ScanIndices indices = qos ? qosIndices : onlineIndices; + if (!indices.setNextOffset(fromIndex, toIndex, batchSize)) { + return; } - SystemScanOperation operation = new SystemScanOperation(start, end, qos); - operation.minMaxIndices = minmax; + SystemScanOperation operation = new SystemScanOperation(indices.currentOffset(), + indices.nextOffset(), qos); + LOGGER.info("submitting scan for start {}, next {}, qos {}.", indices.currentOffset(), + indices.nextOffset(), qos); operation.lastScan = System.currentTimeMillis(); submit(operation); - return end; } @GuardedBy("lock") @@ -565,4 +609,9 @@ private void submit(SystemScanOperation operation) { put(operation); operation.task.submit(); } + + @GuardedBy("lock") + private boolean hasNext(boolean qos) { + return qos ? qosIndices.hasNext() : onlineIndices.hasNext(); + } } \ No newline at end of file diff --git a/modules/dcache-qos/src/main/java/org/dcache/qos/services/scanner/data/SystemScanOperation.java b/modules/dcache-qos/src/main/java/org/dcache/qos/services/scanner/data/SystemScanOperation.java index 206c39748af..5a39f5df1df 100644 --- a/modules/dcache-qos/src/main/java/org/dcache/qos/services/scanner/data/SystemScanOperation.java +++ b/modules/dcache-qos/src/main/java/org/dcache/qos/services/scanner/data/SystemScanOperation.java @@ -87,7 +87,6 @@ public final class SystemScanOperation extends ScanOperation<SystemScanTask> { long lastUpdate; long lastScan; - long[] minMaxIndices; SystemScanTask task; CacheException exception; @@ -163,10 +162,6 @@ protected boolean isComplete() { return isComplete; } - boolean isFinal() { - return to >= minMaxIndices[1]; - } - boolean isQos() { return qos; }