diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRule.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRule.java index 09480009455d..1e0b6c59a840 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRule.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRule.java @@ -25,6 +25,7 @@ import java.util.List; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.Collectors; import org.apache.hadoop.hdds.conf.ConfigurationSource; @@ -39,6 +40,7 @@ import org.apache.hadoop.hdds.scm.server.SCMDatanodeProtocolServer.NodeRegistrationContainerReport; import org.apache.hadoop.hdds.server.events.EventQueue; import org.apache.hadoop.hdds.server.events.TypedEvent; +import org.apache.hadoop.util.Time; /** * Abstract class for Container Safe mode exit rule. 
@@ -135,8 +137,15 @@ public double getCurrentContainerThreshold() {
 
   @Override
   public synchronized void refresh(boolean forceRefresh) {
-    if (forceRefresh || !validate()) {
+    if (!forceRefresh) {
+      return;
+    }
+    final long startNanos = Time.monotonicNowNanos();
+    try {
       initializeRule();
+    } finally {
+      long durationMs = TimeUnit.NANOSECONDS.toMillis(Time.monotonicNowNanos() - startNanos);
+      getSafeModeMetrics().setLastContainerSafeModeRuleRefreshDurationMs(getContainerType(), durationMs);
     }
   }
 
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java
index 2c9173b2bf09..65e52ec42723 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java
@@ -40,6 +40,7 @@
 import org.apache.hadoop.hdds.scm.node.NodeManager;
 import org.apache.hadoop.hdds.scm.pipeline.PipelineManager;
 import org.apache.hadoop.hdds.server.events.EventQueue;
+import org.apache.hadoop.util.Time;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -89,6 +90,13 @@ public class SCMSafeModeManager implements SafeModeManager {
   private ScheduledExecutorService safeModeLogExecutor;
   private ScheduledFuture safeModeLogTask;
 
+  /**
+   * Monotonic time when SCM entered safe mode; used to report exit duration.
+   * {@code Long.MIN_VALUE} means no entry time is recorded: monotonic nano
+   * timestamps may be negative, so the sign cannot mark the unset state.
+   */
+  private long safeModeEnteredAtNanos = Long.MIN_VALUE;
+
   public SCMSafeModeManager(final ConfigurationSource conf,
       final NodeManager nodeManager,
       final PipelineManager pipelineManager,
@@ -120,6 +128,9 @@ public SCMSafeModeManager(final ConfigurationSource conf,
   }
 
   public void start() {
+    if (getInSafeMode()) {
+      safeModeEnteredAtNanos = Time.monotonicNowNanos();
+    }
     emitSafeModeStatus();
     startSafeModePeriodicLogger();
   }
@@ -177,13 +188,18 @@ public synchronized void validateSafeModeExitRules(String ruleName) {
       LOG.info("ScmSafeModeManager, all rules are successfully validated");
       LOG.info("SCM exiting safe mode.");
       emitSafeModeStatus();
+      recordSafeModeExitDuration();
     }
   }
 
   public void forceExitSafeMode() {
+    boolean wasInSafeMode = getInSafeMode();
     LOG.info("SCM force-exiting safe mode.");
     status.set(SafeModeStatus.OUT_OF_SAFE_MODE);
     emitSafeModeStatus();
+    if (wasInSafeMode) {
+      recordSafeModeExitDuration();
+    }
   }
 
   /**
@@ -308,6 +324,21 @@ private synchronized void logSafeModeStatus() {
     }
   }
 
+  /**
+   * Publishes the time elapsed since start() recorded the safe mode entry
+   * and clears that entry so a repeated call does nothing.
+   */
+  private void recordSafeModeExitDuration() {
+    if (safeModeEnteredAtNanos == Long.MIN_VALUE) {
+      return;
+    }
+    long durationMs =
+        TimeUnit.NANOSECONDS.toMillis(Time.monotonicNowNanos() - safeModeEnteredAtNanos);
+    safeModeEnteredAtNanos = Long.MIN_VALUE;
+    safeModeMetrics.setScmSafeModeExitDurationMs(durationMs);
+    LOG.info("SCM safe mode exit duration {} ms (since start() while in safe mode)", durationMs);
+  }
+
   /**
    * Stops the periodic safe mode logger.
    * Called when safe mode exits.
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeMetrics.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeMetrics.java
index ae65eafcb910..39ed3fc77047 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeMetrics.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeMetrics.java
@@ -59,6 +59,13 @@ public class SafeModeMetrics {
   @Metric private MutableGaugeLong numRequiredDatanodesThreshold;
   @Metric private MutableCounterLong currentRegisteredDatanodesCount;
 
+  @Metric("Wall-clock time (ms) SCM spent in safe mode for the last exit")
+  private MutableGaugeLong scmSafeModeExitDurationMs;
+  @Metric("Duration (ms) of the last forced refresh of the Ratis container safe mode rule")
+  private MutableGaugeLong lastRatisContainerSafeModeRuleRefreshDurationMs;
+  @Metric("Duration (ms) of the last forced refresh of the EC container safe mode rule")
+  private MutableGaugeLong lastEcContainerSafeModeRuleRefreshDurationMs;
+
   public static SafeModeMetrics create() {
     final MetricsSystem ms = DefaultMetricsSystem.instance();
     return ms.register(SOURCE_NAME, "SCM Safemode Metrics", new SafeModeMetrics());
@@ -113,6 +120,30 @@ public void incCurrentRegisteredDatanodesCount() {
     this.currentRegisteredDatanodesCount.incr();
   }
 
+  /** Sets the gauge holding the duration (ms) of the last safe mode exit. */
+  public void setScmSafeModeExitDurationMs(long durationMs) {
+    this.scmSafeModeExitDurationMs.set(durationMs);
+  }
+
+  /**
+   * Stores the duration (ms) of the last container safe mode rule refresh in
+   * the gauge matching the replication type; types other than RATIS and EC
+   * are ignored.
+   */
+  public void setLastContainerSafeModeRuleRefreshDurationMs(
+      HddsProtos.ReplicationType type, long durationMs) {
+    switch (type) {
+    case RATIS:
+      this.lastRatisContainerSafeModeRuleRefreshDurationMs.set(durationMs);
+      break;
+    case EC:
+      this.lastEcContainerSafeModeRuleRefreshDurationMs.set(durationMs);
+      break;
+    default:
+      break;
+    }
+  }
+
   MutableGaugeLong getNumHealthyPipelinesThreshold() {
     return numHealthyPipelinesThreshold;
   }