From fbf67b592df8f02d6cb2136f6d7ad03720b9dfc3 Mon Sep 17 00:00:00 2001
From: arafat
Date: Mon, 30 Mar 2026 14:18:09 +0530
Subject: [PATCH] HDDS-14927. Add Quasi-Closed Container Tracking in Recon.

---
 .../ozone/recon/api/ContainerEndpoint.java    | 186 ++++++++++++++++++
 .../DatanodeUnhealthyContainersResponse.java  |  71 +++++++++
 .../api/types/DatanodeUnhealthySummary.java   |  75 +++++++++
 .../src/v2/pages/datanodes/datanodes.tsx      | 128 +++++++++++++++-
 .../src/v2/types/datanode.types.ts            |  12 ++
 5 files changed, 470 insertions(+), 2 deletions(-)
 create mode 100644 hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/api/types/DatanodeUnhealthyContainersResponse.java
 create mode 100644 hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/api/types/DatanodeUnhealthySummary.java

diff --git a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/api/ContainerEndpoint.java b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/api/ContainerEndpoint.java
index 4cf6ca85f6f7..0a9af7ff85e2 100644
--- a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/api/ContainerEndpoint.java
+++ b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/api/ContainerEndpoint.java
@@ -72,6 +72,8 @@
 import org.apache.hadoop.ozone.recon.api.types.KeysResponse;
 import org.apache.hadoop.ozone.recon.api.types.MissingContainerMetadata;
 import org.apache.hadoop.ozone.recon.api.types.MissingContainersResponse;
+import org.apache.hadoop.ozone.recon.api.types.DatanodeUnhealthyContainersResponse;
+import org.apache.hadoop.ozone.recon.api.types.DatanodeUnhealthySummary;
 import org.apache.hadoop.ozone.recon.api.types.UnhealthyContainerMetadata;
 import org.apache.hadoop.ozone.recon.api.types.UnhealthyContainersResponse;
 import org.apache.hadoop.ozone.recon.api.types.UnhealthyContainersSummary;
@@ -812,4 +814,188 @@ public Response getOmContainersDeletedInSCM(
     response.put("containerDiscrepancyInfo", containerDiscrepancyInfoList);
     return Response.ok(response).build();
   }
+
+  /**
+   * Return a summary of unhealthy containers grouped by DataNode.
+   * For each DataNode, the response includes a breakdown of unhealthy
+   * container counts by state (MISSING, UNDER_REPLICATED, etc.).
+   *
+   * <p>Only the records returned by the bounded scan are aggregated. A
+   * container whose lookup throws an {@link IOException} is logged and
+   * left out of the counts.
+   *
+   * @param state Optional filter: only count containers in this state.
+   * @param limit Max number of unhealthy containers to scan for aggregation.
+   *              Values {@code <= 0} or above {@code maxCsvExportRecords}
+   *              fall back to {@code maxCsvExportRecords}.
+   * @return {@link Response} whose {@code datanodes} entry is a list of
+   *         {@link DatanodeUnhealthySummary}, highest total first.
+   * @throws WebApplicationException with status 400 when {@code state} is
+   *         not a known unhealthy container state.
+   */
+  @GET
+  @Path("/unhealthy/byDatanode")
+  public Response getUnhealthyContainersByDatanode(
+      @QueryParam("state") String state,
+      @DefaultValue("0") @QueryParam(RECON_QUERY_LIMIT) int limit) {
+
+    // Fetch unhealthy containers from the DB
+    List<ContainerHealthSchemaManager.UnhealthyContainerRecord> records =
+        containerHealthSchemaManager.getUnhealthyContainers(
+            parseUnhealthyStateFilter(state), 0L, 0L,
+            resolveUnhealthyScanLimit(limit));
+
+    // Build DataNode -> Summary map
+    Map<String, DatanodeUnhealthySummary> dnSummaryMap = new HashMap<>();
+
+    for (ContainerHealthSchemaManager.UnhealthyContainerRecord record : records) {
+      long containerID = record.getContainerId();
+      String containerStateStr = record.getContainerState();
+      try {
+        ContainerInfo containerInfo =
+            containerManager.getContainer(ContainerID.valueOf(containerID));
+        List<ContainerHistory> datanodes =
+            containerManager.getLatestContainerHistory(containerID,
+                containerInfo.getReplicationConfig().getRequiredNodes());
+
+        for (ContainerHistory dn : datanodes) {
+          String uuid = dn.getDatanodeUuid();
+          DatanodeUnhealthySummary summary = dnSummaryMap.computeIfAbsent(
+              uuid, k -> new DatanodeUnhealthySummary(uuid,
+                  dn.getDatanodeHost()));
+          summary.incrementStateCount(containerStateStr);
+        }
+      } catch (IOException e) {
+        LOG.warn("Failed to get container info/history for container {}",
+            containerID, e);
+      }
+    }
+
+    // Sort by total unhealthy count descending
+    List<DatanodeUnhealthySummary> summaries = dnSummaryMap.values().stream()
+        .sorted(Comparator.comparingInt(
+            DatanodeUnhealthySummary::getTotalUnhealthyContainers).reversed())
+        .collect(Collectors.toList());
+
+    Map<String, Object> response = new HashMap<>();
+    response.put("datanodes", summaries);
+    return Response.ok(response).build();
+  }
+
+  /**
+   * Return unhealthy containers for a specific DataNode.
+   * The response includes the full container metadata for each unhealthy
+   * container whose replica history includes the given DataNode.
+   *
+   * <p>{@code limit} bounds how many unhealthy container records are scanned,
+   * not how many matches are returned, so the reported total only covers
+   * matches found within that scan.
+   *
+   * @param datanodeUuid The UUID of the DataNode to query.
+   * @param state Optional filter by unhealthy state.
+   * @param limit Max number of containers to scan. Values {@code <= 0} or
+   *              above {@code maxCsvExportRecords} fall back to
+   *              {@code maxCsvExportRecords}.
+   * @return {@link Response} containing
+   *         {@link DatanodeUnhealthyContainersResponse}.
+   * @throws WebApplicationException with status 400 when {@code state} is
+   *         not a known unhealthy container state.
+   */
+  @GET
+  @Path("/unhealthy/byDatanode/{uuid}")
+  public Response getUnhealthyContainersForDatanode(
+      @PathParam("uuid") String datanodeUuid,
+      @QueryParam("state") String state,
+      @DefaultValue(DEFAULT_FETCH_COUNT) @QueryParam(RECON_QUERY_LIMIT)
+      int limit) {
+
+    // Fetch unhealthy containers, bounded by the scan limit
+    List<ContainerHealthSchemaManager.UnhealthyContainerRecord> records =
+        containerHealthSchemaManager.getUnhealthyContainers(
+            parseUnhealthyStateFilter(state), 0L, 0L,
+            resolveUnhealthyScanLimit(limit));
+
+    // Filter to containers that have this DataNode in their replica history
+    List<UnhealthyContainerMetadata> matchingContainers = new ArrayList<>();
+    String datanodeHost = "N/A";
+
+    for (ContainerHealthSchemaManager.UnhealthyContainerRecord record : records) {
+      try {
+        UnhealthyContainerMetadata meta = toUnhealthyMetadata(record);
+        ContainerHistory replica = findReplicaOnDatanode(meta, datanodeUuid);
+        if (replica == null) {
+          continue;
+        }
+        matchingContainers.add(meta);
+        // Take the hostname from the first match that carries one; a match
+        // without a hostname keeps the placeholder so a later one can fill it.
+        if ("N/A".equals(datanodeHost) && replica.getDatanodeHost() != null) {
+          datanodeHost = replica.getDatanodeHost();
+        }
+      } catch (UncheckedIOException e) {
+        LOG.warn("Failed to get metadata for container {}",
+            record.getContainerId(), e);
+      }
+    }
+
+    DatanodeUnhealthyContainersResponse response =
+        new DatanodeUnhealthyContainersResponse(
+            datanodeUuid, datanodeHost,
+            matchingContainers.size(), matchingContainers);
+    return Response.ok(response).build();
+  }
+
+  /**
+   * Parse the optional {@code state} query parameter.
+   *
+   * @param state raw query parameter value; may be null or empty.
+   * @return the matching unhealthy state, or null when no filter was given.
+   * @throws WebApplicationException with status 400 when the value is not a
+   *         known unhealthy container state.
+   */
+  private static ContainerSchemaDefinition.UnHealthyContainerStates
+      parseUnhealthyStateFilter(String state) {
+    if (StringUtils.isNotEmpty(state)) {
+      try {
+        return ContainerSchemaDefinition
+            .UnHealthyContainerStates.valueOf(state);
+      } catch (IllegalArgumentException e) {
+        throw new WebApplicationException(e, Response.Status.BAD_REQUEST);
+      }
+    }
+    return null;
+  }
+
+  /**
+   * Clamp the requested scan size to {@code maxCsvExportRecords}.
+   *
+   * @param limit requested number of records to scan.
+   * @return {@code limit} when it lies in (0, maxCsvExportRecords],
+   *         otherwise {@code maxCsvExportRecords}.
+   */
+  private int resolveUnhealthyScanLimit(int limit) {
+    return (limit <= 0 || limit > maxCsvExportRecords)
+        ? maxCsvExportRecords : limit;
+  }
+
+  /**
+   * Find the replica-history entry of a container that belongs to a DataNode.
+   *
+   * @param meta unhealthy container metadata to search.
+   * @param datanodeUuid UUID of the DataNode to look for.
+   * @return the first entry for that DataNode, or null if the container has
+   *         no replica list or none of its entries match.
+   */
+  private static ContainerHistory findReplicaOnDatanode(
+      UnhealthyContainerMetadata meta, String datanodeUuid) {
+    if (meta.getReplicas() == null) {
+      return null;
+    }
+    for (ContainerHistory replica : meta.getReplicas()) {
+      if (datanodeUuid.equals(replica.getDatanodeUuid())) {
+        return replica;
+      }
+    }
+    return null;
+  }
 }
diff --git
a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/api/types/DatanodeUnhealthyContainersResponse.java b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/api/types/DatanodeUnhealthyContainersResponse.java
new file mode 100644
index 000000000000..db64c4ec4d17
--- /dev/null
+++ b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/api/types/DatanodeUnhealthyContainersResponse.java
@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.ozone.recon.api.types;
+
+import java.util.List;
+import javax.xml.bind.annotation.XmlAccessType;
+import javax.xml.bind.annotation.XmlAccessorType;
+import javax.xml.bind.annotation.XmlElement;
+
+/**
+ * Response object for the drill-down of unhealthy containers
+ * on a specific DataNode.
+ */
+@XmlAccessorType(XmlAccessType.FIELD)
+public class DatanodeUnhealthyContainersResponse {
+
+  /** UUID of the DataNode this response was built for. */
+  @XmlElement(name = "datanodeUuid")
+  private String datanodeUuid;
+
+  /** Hostname supplied for that DataNode. */
+  @XmlElement(name = "datanodeHost")
+  private String datanodeHost;
+
+  /** Count supplied by the caller alongside {@link #containers}. */
+  @XmlElement(name = "totalCount")
+  private int totalCount;
+
+  /** Metadata of the unhealthy containers included in this response. */
+  @XmlElement(name = "containers")
+  private List<UnhealthyContainerMetadata> containers;
+
+  /** No-argument constructor; leaves every field at its default value. */
+  public DatanodeUnhealthyContainersResponse() {
+  }
+
+  /**
+   * Create a populated response; the arguments are stored as given.
+   *
+   * @param datanodeUuid UUID of the DataNode.
+   * @param datanodeHost hostname of the DataNode.
+   * @param totalCount count to report alongside {@code containers}.
+   * @param containers unhealthy containers to return for the DataNode.
+   */
+  public DatanodeUnhealthyContainersResponse(
+      String datanodeUuid, String datanodeHost,
+      int totalCount, List<UnhealthyContainerMetadata> containers) {
+    this.datanodeUuid = datanodeUuid;
+    this.datanodeHost = datanodeHost;
+    this.totalCount = totalCount;
+    this.containers = containers;
+  }
+
+  public String getDatanodeUuid() {
+    return datanodeUuid;
+  }
+
+  public String getDatanodeHost() {
+    return datanodeHost;
+  }
+
+  public int getTotalCount() {
+    return totalCount;
+  }
+
+  public List<UnhealthyContainerMetadata> getContainers() {
+    return containers;
+  }
+}
diff --git a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/api/types/DatanodeUnhealthySummary.java b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/api/types/DatanodeUnhealthySummary.java
new file mode 100644
index 000000000000..0c67b6a69dc4
--- /dev/null
+++ b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/api/types/DatanodeUnhealthySummary.java
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.ozone.recon.api.types;
+
+import java.util.HashMap;
+import java.util.Map;
+import javax.xml.bind.annotation.XmlAccessType;
+import javax.xml.bind.annotation.XmlAccessorType;
+import javax.xml.bind.annotation.XmlElement;
+
+/**
+ * Summary of unhealthy containers for a specific DataNode.
+ */
+@XmlAccessorType(XmlAccessType.FIELD)
+public class DatanodeUnhealthySummary {
+
+  @XmlElement(name = "datanodeUuid")
+  private String datanodeUuid;
+
+  @XmlElement(name = "datanodeHost")
+  private String datanodeHost;
+
+  /** Running total; advanced by one on every incrementStateCount call. */
+  @XmlElement(name = "totalUnhealthyContainers")
+  private int totalUnhealthyContainers;
+
+  /** Count per state, keyed by the state string that was counted. */
+  @XmlElement(name = "stateCounts")
+  private Map<String, Integer> stateCounts;
+
+  /** Create a summary with no DataNode identity and empty counts. */
+  public DatanodeUnhealthySummary() {
+    this(null, null);
+  }
+
+  /**
+   * Create an empty summary for one DataNode.
+   *
+   * @param datanodeUuid UUID of the DataNode.
+   * @param datanodeHost hostname of the DataNode.
+   */
+  public DatanodeUnhealthySummary(String datanodeUuid, String datanodeHost) {
+    this.datanodeUuid = datanodeUuid;
+    this.datanodeHost = datanodeHost;
+    this.totalUnhealthyContainers = 0;
+    this.stateCounts = new HashMap<>();
+  }
+
+  /**
+   * Record one more unhealthy container in the given state.
+   *
+   * @param state name of the unhealthy state to count.
+   */
+  public void incrementStateCount(String state) {
+    stateCounts.merge(state, 1, Integer::sum);
+    totalUnhealthyContainers++;
+  }
+
+  public String getDatanodeUuid() {
+    return datanodeUuid;
+  }
+
+  public String getDatanodeHost() {
+    return datanodeHost;
+  }
+
+  public int getTotalUnhealthyContainers() {
+    return totalUnhealthyContainers;
+  }
+
+  public Map<String, Integer> getStateCounts() {
+    return stateCounts;
+  }
+}
diff --git a/hadoop-ozone/recon/src/main/resources/webapps/recon/ozone-recon-web/src/v2/pages/datanodes/datanodes.tsx
b/hadoop-ozone/recon/src/main/resources/webapps/recon/ozone-recon-web/src/v2/pages/datanodes/datanodes.tsx index 0283a1872f1f..3734552b665c 100644 --- a/hadoop-ozone/recon/src/main/resources/webapps/recon/ozone-recon-web/src/v2/pages/datanodes/datanodes.tsx +++ b/hadoop-ozone/recon/src/main/resources/webapps/recon/ozone-recon-web/src/v2/pages/datanodes/datanodes.tsx @@ -24,11 +24,16 @@ import React, { import moment from 'moment'; import { Button, - Modal + Card, + Modal, + Table, + Tag, + Tooltip } from 'antd'; import { DeleteOutlined, WarningFilled, + ExclamationCircleOutlined, } from '@ant-design/icons'; import { ValueType } from 'react-select'; @@ -45,7 +50,9 @@ import { DatanodeDecomissionInfo, DatanodeResponse, DatanodesResponse, - DatanodesState + DatanodesState, + DatanodeUnhealthySummary, + DatanodeUnhealthyByDatanodeResponse } from '@/v2/types/datanode.types'; import './datanodes.less' @@ -77,6 +84,22 @@ const SearchableColumnOpts = [{ let decommissionUuids: string | string[] = []; const COLUMN_UPDATE_DECOMMISSIONING = 'DECOMMISSIONING'; +const STATE_COLORS: Record = { + 'MISSING': '#ff4d4f', + 'UNDER_REPLICATED': '#fa8c16', + 'OVER_REPLICATED': '#1890ff', + 'MIS_REPLICATED': '#722ed1', + 'REPLICA_MISMATCH': '#eb2f96' +}; + +const STATE_LABELS: Record = { + 'MISSING': 'Missing', + 'UNDER_REPLICATED': 'Under-Replicated', + 'OVER_REPLICATED': 'Over-Replicated', + 'MIS_REPLICATED': 'Mis-Replicated', + 'REPLICA_MISMATCH': 'Mismatched' +}; + const Datanodes: React.FC<{}> = () => { const [state, setState] = useState({ @@ -117,6 +140,16 @@ const Datanodes: React.FC<{}> = () => { } } ); + + // Unhealthy containers by DataNode API + const unhealthyByDnAPI = useApiData( + '/api/v1/containers/unhealthy/byDatanode', + { datanodes: [] }, + { + initialFetch: false, + onError: (error) => showDataFetchError(error) + } + ); const loading = decommissionAPI.loading || datanodesAPI.loading || removeDatanodesAPI.loading; const [selectedColumns, setSelectedColumns] = 
useState(defaultColumns); @@ -143,6 +176,7 @@ const Datanodes: React.FC<{}> = () => { // Trigger both API hooks to refetch data decommissionAPI.refetch(); datanodesAPI.refetch(); + unhealthyByDnAPI.refetch(); }; // Process data when both APIs have loaded @@ -216,6 +250,58 @@ const Datanodes: React.FC<{}> = () => { const { dataSource, lastUpdated, columnOptions } = state; + // Columns for the unhealthy containers by DataNode table + const unhealthyDnColumns = [ + { + title: 'DataNode Host', + dataIndex: 'datanodeHost', + key: 'datanodeHost', + sorter: (a: DatanodeUnhealthySummary, b: DatanodeUnhealthySummary) => + a.datanodeHost.localeCompare(b.datanodeHost), + }, + { + title: 'Total Unhealthy', + dataIndex: 'totalUnhealthyContainers', + key: 'totalUnhealthyContainers', + defaultSortOrder: 'descend' as const, + sorter: (a: DatanodeUnhealthySummary, b: DatanodeUnhealthySummary) => + a.totalUnhealthyContainers - b.totalUnhealthyContainers, + render: (count: number) => ( + 0 ? '#ff4d4f' : '#52c41a' }}> + {count} + + ), + }, + ...Object.keys(STATE_LABELS).map(stateKey => ({ + title: STATE_LABELS[stateKey], + key: stateKey, + sorter: (a: DatanodeUnhealthySummary, b: DatanodeUnhealthySummary) => + (a.stateCounts[stateKey] ?? 0) - (b.stateCounts[stateKey] ?? 0), + render: (_: any, record: DatanodeUnhealthySummary) => { + const count = record.stateCounts[stateKey] ?? 0; + return count > 0 + ? {count} + : 0; + }, + })), + { + title: 'UUID', + dataIndex: 'datanodeUuid', + key: 'datanodeUuid', + ellipsis: true, + width: 180, + render: (uuid: string) => ( + + + {uuid.substring(0, 8)}… + + + ), + }, + ]; + + const unhealthyDnData = unhealthyByDnAPI.data?.datanodes ?? []; + return ( <>
@@ -277,6 +363,44 @@ const Datanodes: React.FC<{}> = () => { handleSelectionChange={handleSelectionChange} decommissionUuids={decommissionUuids}/>
+ + {/* Unhealthy Containers by DataNode Section */} +
+ + + Unhealthy Containers by DataNode + + } + loading={unhealthyByDnAPI.loading} + extra={ + + {unhealthyDnData.length} DataNode(s) with unhealthy containers + + } + > + {unhealthyDnData.length === 0 && !unhealthyByDnAPI.loading ? ( +
+ No unhealthy containers found across DataNodes. +
+ ) : ( + + `${range[0]}-${range[1]} of ${total} DataNodes` + }} + /> + )} + + + void; } + +// Types for Unhealthy Container-to-DataNode mapping +export type DatanodeUnhealthySummary = { + datanodeUuid: string; + datanodeHost: string; + totalUnhealthyContainers: number; + stateCounts: Record; +} + +export type DatanodeUnhealthyByDatanodeResponse = { + datanodes: DatanodeUnhealthySummary[]; +}