Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
import org.apache.hadoop.hdds.conf.ConfigurationSource;
Expand All @@ -39,6 +40,7 @@
import org.apache.hadoop.hdds.scm.server.SCMDatanodeProtocolServer.NodeRegistrationContainerReport;
import org.apache.hadoop.hdds.server.events.EventQueue;
import org.apache.hadoop.hdds.server.events.TypedEvent;
import org.apache.hadoop.util.Time;

/**
* Abstract class for Container Safe mode exit rule.
Expand Down Expand Up @@ -135,8 +137,15 @@ public double getCurrentContainerThreshold() {

@Override
public synchronized void refresh(boolean forceRefresh) {
if (forceRefresh || !validate()) {
if (!forceRefresh) {
return;
}
final long startNanos = Time.monotonicNowNanos();
try {
initializeRule();
} finally {
long durationMs = TimeUnit.NANOSECONDS.toMillis(Time.monotonicNowNanos() - startNanos);
getSafeModeMetrics().setLastContainerSafeModeRuleRefreshDurationMs(getContainerType(), durationMs);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
import org.apache.hadoop.hdds.scm.node.NodeManager;
import org.apache.hadoop.hdds.scm.pipeline.PipelineManager;
import org.apache.hadoop.hdds.server.events.EventQueue;
import org.apache.hadoop.util.Time;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand Down Expand Up @@ -89,6 +90,9 @@ public class SCMSafeModeManager implements SafeModeManager {
private ScheduledExecutorService safeModeLogExecutor;
private ScheduledFuture<?> safeModeLogTask;

/**
 * Monotonic time when SCM entered safe mode; used to report exit duration.
 * The value {@code -1} marks "no entry time recorded": it is the initial value, and
 * {@code recordSafeModeExitDuration()} writes it back after publishing a duration.
 * {@code start()} stores the current monotonic time here when SCM is in safe mode.
 */
private long safeModeEnteredAtNanos = -1L;

public SCMSafeModeManager(final ConfigurationSource conf,
final NodeManager nodeManager,
final PipelineManager pipelineManager,
Expand Down Expand Up @@ -120,6 +124,9 @@ public SCMSafeModeManager(final ConfigurationSource conf,
}

/**
 * Starts safe mode tracking: remembers the monotonic start time when SCM is currently
 * in safe mode, emits the current safe mode status and starts the periodic safe mode
 * logger.
 */
public void start() {
  final boolean startingInSafeMode = getInSafeMode();
  if (startingInSafeMode) {
    // Baseline for the exit-duration measurement.
    safeModeEnteredAtNanos = Time.monotonicNowNanos();
  }

  emitSafeModeStatus();
  startSafeModePeriodicLogger();
}
Expand Down Expand Up @@ -177,13 +184,18 @@ public synchronized void validateSafeModeExitRules(String ruleName) {
LOG.info("ScmSafeModeManager, all rules are successfully validated");
LOG.info("SCM exiting safe mode.");
emitSafeModeStatus();
recordSafeModeExitDuration();
}
}

/**
 * Sets the status to {@code OUT_OF_SAFE_MODE} regardless of rule validation, emits the
 * new status, and records the exit duration if SCM was in safe mode when this method
 * was called.
 */
public void forceExitSafeMode() {
  // Must be read before the status is overwritten below.
  final boolean exitingFromSafeMode = getInSafeMode();

  LOG.info("SCM force-exiting safe mode.");
  status.set(SafeModeStatus.OUT_OF_SAFE_MODE);
  emitSafeModeStatus();

  if (!exitingFromSafeMode) {
    return;
  }
  recordSafeModeExitDuration();
}

/**
Expand Down Expand Up @@ -308,6 +320,17 @@ private synchronized void logSafeModeStatus() {
}
}

/**
 * Publishes how long SCM stayed in safe mode, measured from the timestamp stored in
 * {@code safeModeEnteredAtNanos}, then resets that timestamp to the {@code -1} sentinel
 * so that a later call without a newly recorded start time is a no-op.
 */
private void recordSafeModeExitDuration() {
  // Compare against the exact "unset" sentinel instead of testing the sign:
  // Time.monotonicNowNanos() is based on System.nanoTime(), whose origin is arbitrary,
  // so a legitimately recorded start time may be negative and must not be skipped.
  if (safeModeEnteredAtNanos == -1L) {
    return;
  }
  long durationMs =
      TimeUnit.NANOSECONDS.toMillis(Time.monotonicNowNanos() - safeModeEnteredAtNanos);
  safeModeEnteredAtNanos = -1L;
  safeModeMetrics.setScmSafeModeExitDurationMs(durationMs);
  LOG.info("SCM safe mode exit duration {} ms (since start() while in safe mode)", durationMs);
}

/**
* Stops the periodic safe mode logger.
* Called when safe mode exits.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,13 @@ public class SafeModeMetrics {
@Metric private MutableGaugeLong numRequiredDatanodesThreshold;
@Metric private MutableCounterLong currentRegisteredDatanodesCount;

// Timing gauges. Each one holds only the most recent measurement: it is overwritten via
// setScmSafeModeExitDurationMs / setLastContainerSafeModeRuleRefreshDurationMs.
@Metric("Wall-clock time (ms) SCM spent in safe mode for the last exit")
private MutableGaugeLong scmSafeModeExitDurationMs;
// Written when the replication type passed to the setter is RATIS.
@Metric("Duration (ms) of the last Ratis container safe mode rule incremental refresh")
private MutableGaugeLong lastRatisContainerSafeModeRuleRefreshDurationMs;
// Written when the replication type passed to the setter is EC.
@Metric("Duration (ms) of the last EC container safe mode rule incremental refresh")
private MutableGaugeLong lastEcContainerSafeModeRuleRefreshDurationMs;

public static SafeModeMetrics create() {
final MetricsSystem ms = DefaultMetricsSystem.instance();
return ms.register(SOURCE_NAME, "SCM Safemode Metrics", new SafeModeMetrics());
Expand Down Expand Up @@ -113,6 +120,24 @@ public void incCurrentRegisteredDatanodesCount() {
this.currentRegisteredDatanodesCount.incr();
}

/**
 * Overwrites the safe mode exit duration gauge.
 *
 * @param durationMs duration in milliseconds to publish
 */
public void setScmSafeModeExitDurationMs(long durationMs) {
  scmSafeModeExitDurationMs.set(durationMs);
}

/**
 * Overwrites the refresh-duration gauge that belongs to the given replication type.
 * Types other than {@code RATIS} and {@code EC} are ignored.
 *
 * @param type replication type of the container safe mode rule that was refreshed
 * @param durationMs duration of the refresh in milliseconds
 */
public void setLastContainerSafeModeRuleRefreshDurationMs(
    HddsProtos.ReplicationType type, long durationMs) {
  final MutableGaugeLong target;
  switch (type) {
  case RATIS:
    target = lastRatisContainerSafeModeRuleRefreshDurationMs;
    break;
  case EC:
    target = lastEcContainerSafeModeRuleRefreshDurationMs;
    break;
  default:
    // No gauge is kept for any other replication type.
    return;
  }
  target.set(durationMs);
}

/**
 * Package-private accessor for the healthy pipelines threshold gauge.
 *
 * @return the {@code numHealthyPipelinesThreshold} gauge
 */
MutableGaugeLong getNumHealthyPipelinesThreshold() {
  return this.numHealthyPipelinesThreshold;
}
Expand Down