From 0bc23aebd1c5c66f7337786099b530e9fd1b0091 Mon Sep 17 00:00:00 2001 From: richard_kuo Date: Tue, 7 Apr 2026 18:39:21 +0800 Subject: [PATCH] [Accton][AS7927-50X] implement XCVR thermal monitoring, OTP, and BMC reporting This commit introduces a comprehensive thermal management policy for transceivers (XCVR) and integrates it with the BMC. Key changes include: - build: Add `ipmitool` to base packages. - sys: Introduce `bmc_thermal_data` sysfs attribute to send thermal data (temperature and port number) to the BMC via IPMI cmd 0x13. - onlp: Implement temperature and high-alarm getters for SFF-8472, SFF-8436, and CMIS compatible modules. - onlp: Add Over-Temperature Protection (OTP) logic to disable e-fuse for SFPs or reset QSFPs when critical temperatures are exceeded. - onlp: Spawn a dedicated thermal policy `pthread`, triggered during fan management, to evaluate OTP conditions and sync the highest transceiver temperature to the BMC. --- .../buster/common/all-base-packages.yml | 1 + .../builds/src/x86-64-accton-as7927-50x-sys.c | 64 ++- .../module/src/platform_lib.h | 3 + .../module/src/sysi.c | 536 +++++++++++++++++- 4 files changed, 590 insertions(+), 14 deletions(-) diff --git a/builds/any/rootfs/buster/common/all-base-packages.yml b/builds/any/rootfs/buster/common/all-base-packages.yml index b8e9df5f3a..36a5d200c9 100644 --- a/builds/any/rootfs/buster/common/all-base-packages.yml +++ b/builds/any/rootfs/buster/common/all-base-packages.yml @@ -85,3 +85,4 @@ - htop - tree - memtester +- ipmitool diff --git a/packages/platforms/accton/x86-64/as7927-50x/modules/builds/src/x86-64-accton-as7927-50x-sys.c b/packages/platforms/accton/x86-64/as7927-50x/modules/builds/src/x86-64-accton-as7927-50x-sys.c index 4bb7519f50..26be078c38 100644 --- a/packages/platforms/accton/x86-64/as7927-50x/modules/builds/src/x86-64-accton-as7927-50x-sys.c +++ b/packages/platforms/accton/x86-64/as7927-50x/modules/builds/src/x86-64-accton-as7927-50x-sys.c @@ -45,10 +45,13 @@ #define 
IPMI_CPLD_DCSCM_CMD 0x06 // Since addr conflicts with FPGA, replaced by 0x06 #define IPMI_CPLD_FPGA_CMD 0x60 #define IPMI_CPLD_SYS_CMD 0x61 +#define IPMI_SEND_THERMAL_DATA_CMD 0x13 static int as7927_50x_sys_probe(struct platform_device *pdev); static int as7927_50x_sys_remove(struct platform_device *pdev); static ssize_t show_cpld_version(struct device *dev, struct device_attribute *da, char *buf); +static ssize_t set_bmc_thermal_data(struct device *dev, struct device_attribute *da, + const char *buf, size_t count); struct as7927_50x_sys_data { struct platform_device *pdev; @@ -58,7 +61,7 @@ struct as7927_50x_sys_data { struct ipmi_data ipmi; unsigned char ipmi_resp_eeprom[EEPROM_SIZE]; unsigned char ipmi_resp_cpld[4]; - unsigned char ipmi_tx_data[2]; + unsigned char ipmi_tx_data[3]; struct bin_attribute eeprom; /* eeprom data */ }; @@ -78,7 +81,8 @@ enum as5916_54xks_sys_sysfs_attrs { FPGA_CPLD, FAN_CPLD, DCSCM_CPLD, - SYS_CPLD + SYS_CPLD, + THERMAL_DATA }; /* Functions to talk to the IPMI layer */ static SENSOR_DEVICE_ATTR(come_e_cpld_ver, S_IRUGO, show_cpld_version, NULL, COM_E_CPLD); @@ -86,6 +90,7 @@ static SENSOR_DEVICE_ATTR(fpga_cpld_ver, S_IRUGO, show_cpld_version, NULL, FPGA_ static SENSOR_DEVICE_ATTR(fan_cpld_ver, S_IRUGO, show_cpld_version, NULL, FAN_CPLD); static SENSOR_DEVICE_ATTR(dcscm_cpld_ver, S_IRUGO, show_cpld_version, NULL, DCSCM_CPLD); static SENSOR_DEVICE_ATTR(sys_cpld_ver, S_IRUGO, show_cpld_version, NULL, SYS_CPLD); +static SENSOR_DEVICE_ATTR(bmc_thermal_data, S_IWUSR, NULL, set_bmc_thermal_data, THERMAL_DATA); static struct attribute *as7927_50x_sys_attributes[] = { &sensor_dev_attr_come_e_cpld_ver.dev_attr.attr, @@ -93,6 +98,7 @@ static struct attribute *as7927_50x_sys_attributes[] = { &sensor_dev_attr_fan_cpld_ver.dev_attr.attr, &sensor_dev_attr_dcscm_cpld_ver.dev_attr.attr, &sensor_dev_attr_sys_cpld_ver.dev_attr.attr, + &sensor_dev_attr_bmc_thermal_data.dev_attr.attr, NULL }; @@ -113,7 +119,7 @@ static ssize_t sys_eeprom_read(loff_t off, 
char *buf, size_t count)
         data->ipmi_tx_data[0] = (off & 0xff);
         data->ipmi_tx_data[1] = length;
         status = ipmi_send_message(&data->ipmi, IPMI_SYSEEPROM_READ_CMD,
-                                   data->ipmi_tx_data, sizeof(data->ipmi_tx_data),
+                                   data->ipmi_tx_data, 2, /* offset + length only; the buffer now holds 3 bytes */
                                    data->ipmi_resp_eeprom + off, length);
         if (unlikely(status != 0)) {
             goto exit;
@@ -267,6 +273,58 @@ static ssize_t show_cpld_version(struct device *dev, struct device_attribute *da
     return error;
 }
 
+/*
+ * Store handler for "bmc_thermal_data": takes exactly three space-separated
+ * decimal bytes (0..255) and sends them as the payload of
+ * IPMI_SEND_THERMAL_DATA_CMD. Returns count on success, -EINVAL on malformed
+ * input or a non-zero rx_result, else the ipmi_send_message() error.
+ */
+static ssize_t set_bmc_thermal_data(struct device *dev, struct device_attribute *da,
+                                    const char *buf, size_t count)
+{
+    int status;
+    int args = 0;
+    char tmp[32];
+    char *cursor, *opt;
+    u8 input[3] = {0};
+
+    /* Refuse over-long input rather than parse a silently truncated copy. */
+    if (count >= sizeof(tmp))
+        return -EINVAL;
+    memcpy(tmp, buf, count);
+    tmp[count] = '\0';
+
+    cursor = strim(tmp);
+    while ((opt = strsep(&cursor, " ")) != NULL) {
+        if (*opt == '\0')
+            continue; /* empty token from repeated spaces */
+        /* A bad or surplus token fails the write; skipping it would shift fields. */
+        if (args >= 3 || kstrtou8(opt, 10, &input[args]) != 0)
+            return -EINVAL;
+        args++;
+    }
+    if (args != 3)
+        return -EINVAL;
+
+    mutex_lock(&data->update_lock);
+    memcpy(data->ipmi_tx_data, input, sizeof(input));
+    status = ipmi_send_message(&data->ipmi, IPMI_SEND_THERMAL_DATA_CMD,
+                               data->ipmi_tx_data, sizeof(input),
+                               NULL, 0);
+    if (unlikely(status != 0))
+        goto exit;
+
+    if (unlikely(data->ipmi.rx_result != 0)) {
+        status = -EINVAL;
+        goto exit;
+    }
+
+    status = count;
+exit:
+    mutex_unlock(&data->update_lock);
+    return status;
+}
+
 static int as7927_50x_sys_probe(struct platform_device *pdev)
 {
     int status = -1;
diff --git a/packages/platforms/accton/x86-64/as7927-50x/onlp/builds/x86_64_accton_as7927_50x/module/src/platform_lib.h b/packages/platforms/accton/x86-64/as7927-50x/onlp/builds/x86_64_accton_as7927_50x/module/src/platform_lib.h
index 1d0f659a04..b0be39b4ed 100644
--- a/packages/platforms/accton/x86-64/as7927-50x/onlp/builds/x86_64_accton_as7927_50x/module/src/platform_lib.h
+++ b/packages/platforms/accton/x86-64/as7927-50x/onlp/builds/x86_64_accton_as7927_50x/module/src/platform_lib.h @@ -26,6 +26,8 @@ #ifndef __PLATFORM_LIB_H__ #define __PLATFORM_LIB_H__ +#include +#include #include "x86_64_accton_as7927_50x_log.h" #define CHASSIS_FAN_COUNT 14 @@ -46,6 +48,7 @@ #define BIOS_VER_PATH "/sys/devices/virtual/dmi/id/bios_version" #define BMC_VER1_PATH "/sys/devices/platform/ipmi_bmc.0/firmware_revision" #define BMC_VER2_PATH "/sys/devices/platform/ipmi_bmc.0/aux_firmware_revision" +#define BMC_THERMAL_DATA_PATH "/sys/devices/platform/as7927_50x_sys/bmc_thermal_data" enum onlp_thermal_id { THERMAL_RESERVED = 0, diff --git a/packages/platforms/accton/x86-64/as7927-50x/onlp/builds/x86_64_accton_as7927_50x/module/src/sysi.c b/packages/platforms/accton/x86-64/as7927-50x/onlp/builds/x86_64_accton_as7927_50x/module/src/sysi.c index 5a3b288283..9e058a8d8d 100644 --- a/packages/platforms/accton/x86-64/as7927-50x/onlp/builds/x86_64_accton_as7927_50x/module/src/sysi.c +++ b/packages/platforms/accton/x86-64/as7927-50x/onlp/builds/x86_64_accton_as7927_50x/module/src/sysi.c @@ -32,13 +32,65 @@ #include #include #include +#include #include "platform_lib.h" #include "x86_64_accton_as7927_50x_int.h" #include "x86_64_accton_as7927_50x_log.h" +#define NUM_OF_CPLD_VER 7 +#define PORT_NUM 50 +#define LAST_OF_SFP_PORT 48 +#define BMC_FILE_RETRY_COUNT 3 // Number of attempts for the BMC sysfs write +#define BMC_FILE_RETRY_DELAY_S 1 // Delay between retries, in seconds -#define NUM_OF_CPLD_VER 7 +#define MODULE_EFUSE_FORMAT "/sys/bus/platform/devices/as7927_50x_fpga/module_efuse_%d" + +typedef struct port_thermal_data { + int present; /* 1 when a module is detected in the port, else 0 */ + int temp; /* milli-degrees Celsius, or ONLP_STATUS_E_MISSING when unreadable */ + int high_alarm; /* high-alarm threshold in milli-degrees Celsius; ONLP_STATUS_E_MISSING until a temperature was read */ +} port_thermal_data_t; + +typedef struct temp_reader_data { + port_thermal_data_t ports[PORT_NUM + 1]; /* indexed by port number 1..PORT_NUM; slot 0 is never filled */ +} temp_reader_data_t; + +int onlp_sysi_get_xcvr_temp(temp_reader_data_t *temp); +int onlp_sysi_xcvr_over_temp_protector(int port); + +enum temp_sensors { + TEMP_SENSOR_XCVR, + TEMP_SENSOR_COUNT 
+}; + +typedef int (*temp_getter_t)(temp_reader_data_t *temp); +typedef int (*ot_protector_t)(int port); + +typedef struct temp_handler { + temp_getter_t temp_readers[TEMP_SENSOR_COUNT]; +} temp_handler_t; + +/* Over-temperature protection (OTP): action run for a single port */ +typedef struct otp_handler { + ot_protector_t otp_writer; +} otp_handler_t; + +struct thermal_policy_manager { + temp_handler_t temp_hdlr; + otp_handler_t otp_hdlr; /* over-temperature protector */ +}; + +struct thermal_policy_manager tp_mgr = { + .temp_hdlr = { + .temp_readers = { + [TEMP_SENSOR_XCVR] = onlp_sysi_get_xcvr_temp + } + }, + .otp_hdlr = { + .otp_writer = onlp_sysi_xcvr_over_temp_protector + } +}; static char* cpld_ver_path[NUM_OF_CPLD_VER] = { "/sys/bus/platform/devices/as7927_50x_sys/come_e_cpld_ver", /* CPU CPLD */ @@ -154,14 +206,12 @@ onlp_sysi_platform_info_get(onlp_platform_info_t* pi) "\r\n\t Carrier(0x60):%s" "\r\n\t Fan(0x33):%s" , v[0], v[1], v[2], v[3], v[4], v[5]); - - - -pi->other_versions = aim_fstrdup("\r\n\t FPGA(0x60):%s" - "\r\n\t BIOS: %s" - "\r\n\t ONIE: %s" - "\r\n\t BMC: %s", - v[6], bios_ver, onie.onie_version, bmc_ver); + + pi->other_versions = aim_fstrdup("\r\n\t FPGA(0x60):%s" + "\r\n\t BIOS: %s" + "\r\n\t ONIE: %s" + "\r\n\t BMC: %s", + v[6], bios_ver, onie.onie_version, bmc_ver); for (i = 0; i < AIM_ARRAYSIZE(v); i++) { AIM_FREE_IF_PTR(v[i]); @@ -175,6 +225,15 @@ pi->other_versions = aim_fstrdup("\r\n\t FPGA(0x60):%s" return ret; } +int +onlp_sysi_get_xcvr_presence(void) +{ + onlp_sfp_bitmap_t bitmap; + onlp_sfp_bitmap_t_init(&bitmap); + onlp_sfp_presence_bitmap_get(&bitmap); /* NOTE(review): return value is ignored; confirm the bitmap stays empty on failure */ + return !(AIM_BITMAP_COUNT(&bitmap) == 0); /* 1 if at least one port reports a module, else 0 */ +} + void onlp_sysi_platform_info_free(onlp_platform_info_t* pi) { @@ -182,9 +241,465 @@ onlp_sysi_platform_info_free(onlp_platform_info_t* pi) aim_free(pi->other_versions); } +// ======================================================= +// SFF-8472 (SFP) Temperature and Alarm Getters +// ======================================================= +int +onlp_sysi_get_sff8472_temp(int 
port, int *temp) +{ + int value; + int16_t port_temp; + + /* SFF-8472 DDM support check is at Address A0h (0x50), Offset 92; bit 6 (0x40) must be set, otherwise the module is reported as missing */ + value = onlp_sfpi_dev_readb(port, 0x50, 92); + if (value < 0 || !(value & 0x40)) { + *temp = ONLP_STATUS_E_MISSING; + return ONLP_STATUS_E_MISSING; + } + + /* SFF-8472 Temperature is at Address A2h (0x51), Offset 96-97 (MSB first) */ + value = onlp_sfpi_dev_readb(port, 0x51, 96); + if (value < 0) { + *temp = ONLP_STATUS_E_MISSING; + return ONLP_STATUS_E_MISSING; + } + port_temp = (int16_t)((value & 0xFF) << 8); + + value = onlp_sfpi_dev_readb(port, 0x51, 97); + if (value < 0) { + *temp = ONLP_STATUS_E_MISSING; + return ONLP_STATUS_E_MISSING; + } + port_temp = (port_temp | (int16_t)(value & 0xFF)); + + *temp = (int)port_temp * 1000 / 256; /* signed raw value in 1/256 degree steps -> milli-degrees Celsius */ + return ONLP_STATUS_OK; +} + +int +onlp_sysi_get_sff8472_temp_alarm(int port, int *alarm) +{ + int value; + int16_t port_alarm; + + /* SFF-8472 Temp High Alarm threshold is at Address A2h (0x51), Offset 00-01 (MSB first) */ + value = onlp_sfpi_dev_readb(port, 0x51, 0); + if (value < 0) { + *alarm = ONLP_STATUS_E_MISSING; + return ONLP_STATUS_E_MISSING; + } + port_alarm = (int16_t)((value & 0xFF) << 8); + + value = onlp_sfpi_dev_readb(port, 0x51, 1); + if (value < 0) { + *alarm = ONLP_STATUS_E_MISSING; + return ONLP_STATUS_E_MISSING; + } + port_alarm = (port_alarm | (int16_t)(value & 0xFF)); + + *alarm = (int)port_alarm * 1000 / 256; + return ONLP_STATUS_OK; +} + +// ======================================================= +// SFF-8436 (QSFP+) Temperature and Alarm Getters +// ======================================================= +int +onlp_sysi_get_sff8436_temp(int port, int *temp) +{ + int value; + int16_t port_temp; + + /* QSFP+ Temperature is at Address A0h (0x50), Page 00h, Offset 22-23 (MSB first) */ + value = onlp_sfpi_dev_readb(port, 0x50, 22); + if (value < 0) { + *temp = ONLP_STATUS_E_MISSING; + return ONLP_STATUS_E_MISSING; + } + port_temp = (int16_t)((value & 0xFF) << 8); + + value = onlp_sfpi_dev_readb(port, 0x50, 23); + if (value < 0) { + 
*temp = ONLP_STATUS_E_MISSING; + return ONLP_STATUS_E_MISSING; + } + port_temp = (port_temp | (int16_t)(value & 0xFF)); + + *temp = (int)port_temp * 1000 / 256; + return ONLP_STATUS_OK; +} + +int +onlp_sysi_get_sff8436_temp_alarm(int port, int *alarm) +{ + int value; + int16_t port_alarm; + + /* QSFP+ memory model check is at Address A0h (0x50), Offset 2; when bit 2 (0x04) is set the threshold page is not read (presumably the flat-memory flag - confirm against SFF-8636) */ + value = onlp_sfpi_dev_readb(port, 0x50, 0x2); + if (value < 0 || (value & 0x04)) { + *alarm = ONLP_STATUS_E_MISSING; + return ONLP_STATUS_E_MISSING; + } + + /* QSFP+ Temp High Alarm is at Page 03h, Offset 128-129; the page is selected through byte 127 and set back to 00h before returning */ + if (onlp_sfpi_dev_writeb(port, 0x50, 127, 0x03) < 0) { + *alarm = ONLP_STATUS_E_MISSING; + return ONLP_STATUS_E_MISSING; + } + + value = onlp_sfpi_dev_readb(port, 0x50, 128); + if (value < 0) { + *alarm = ONLP_STATUS_E_MISSING; + onlp_sfpi_dev_writeb(port, 0x50, 127, 0x00); + return ONLP_STATUS_E_MISSING; + } + port_alarm = (int16_t)((value & 0xFF) << 8); + + value = onlp_sfpi_dev_readb(port, 0x50, 129); + if (value < 0) { + *alarm = ONLP_STATUS_E_MISSING; + onlp_sfpi_dev_writeb(port, 0x50, 127, 0x00); + return ONLP_STATUS_E_MISSING; + } + port_alarm = (port_alarm | (int16_t)(value & 0xFF)); + *alarm = (int)port_alarm * 1000 / 256; + + onlp_sfpi_dev_writeb(port, 0x50, 127, 0x00); + return ONLP_STATUS_OK; +} + +// ======================================================= +// CMIS (QSFP-DD) Temperature and Alarm Getters +// ======================================================= +int onlp_sysi_get_cmis_temp(int port, int *temp) +{ + int value; + int16_t port_temp; + + + /* CMIS Temperature is at Address A0h (0x50), Page 00h, Offset 14-15 (MSB first) */ + value = onlp_sfpi_dev_readb(port, 0x50, 14); + if (value < 0) { + *temp = ONLP_STATUS_E_MISSING; + return ONLP_STATUS_E_MISSING; + } + port_temp = (int16_t)((value & 0xFF) << 8); + + value = onlp_sfpi_dev_readb(port, 0x50, 15); + if (value < 0) { + *temp = ONLP_STATUS_E_MISSING; + return ONLP_STATUS_E_MISSING; + } + port_temp = (port_temp | (int16_t)(value & 
0xFF)); + + *temp = (int)port_temp * 1000 / 256; + return ONLP_STATUS_OK; +} + +int onlp_sysi_get_cmis_temp_alarm(int port, int *alarm) +{ + int value; + int16_t port_alarm; + + /* CMIS memory model check is at Address A0h (0x50), Offset 2; when bit 7 (0x80) is set the threshold page is not read (presumably the flat-memory flag - confirm against CMIS) */ + value = onlp_sfpi_dev_readb(port, 0x50, 0x2); + if (value < 0 || (value & 0x80)) { + *alarm = ONLP_STATUS_E_MISSING; + return ONLP_STATUS_E_MISSING; + } + + /* CMIS Temp High Alarm is at Page 02h, Offset 128-129; the page is selected through byte 127 and set back to 00h before returning */ + if (onlp_sfpi_dev_writeb(port, 0x50, 127, 0x02) < 0) { + *alarm = ONLP_STATUS_E_MISSING; + return ONLP_STATUS_E_MISSING; + } + + value = onlp_sfpi_dev_readb(port, 0x50, 128); + if (value < 0) { + *alarm = ONLP_STATUS_E_MISSING; + onlp_sfpi_dev_writeb(port, 0x50, 127, 0x00); + return ONLP_STATUS_E_MISSING; + } + port_alarm = (int16_t)((value & 0xFF) << 8); + + value = onlp_sfpi_dev_readb(port, 0x50, 129); + if (value < 0) { + *alarm = ONLP_STATUS_E_MISSING; + onlp_sfpi_dev_writeb(port, 0x50, 127, 0x00); + return ONLP_STATUS_E_MISSING; + } + port_alarm = (port_alarm | (int16_t)(value & 0xFF)); + *alarm = (int)port_alarm * 1000 / 256; + + onlp_sfpi_dev_writeb(port, 0x50, 127, 0x00); + return ONLP_STATUS_OK; +} + +int +onlp_sysi_get_xcvr_temp(temp_reader_data_t *temp) +{ + int ret = ONLP_STATUS_OK; + int value, port; + int port_temp = ONLP_STATUS_E_MISSING, port_alarm = ONLP_STATUS_E_MISSING; + + memset(temp, 0, sizeof(temp_reader_data_t)); + + if (!onlp_sysi_get_xcvr_presence()) { + return ONLP_STATUS_OK; + } + + for (port = 1; port <= PORT_NUM; port++) { + temp->ports[port].temp = ONLP_STATUS_E_MISSING; + temp->ports[port].high_alarm = ONLP_STATUS_E_MISSING; + temp->ports[port].present = 0; + + port_temp = ONLP_STATUS_E_MISSING; + port_alarm = ONLP_STATUS_E_MISSING; + + if (onlp_sfpi_is_present(port) != 1) { + continue; + } + + temp->ports[port].present = 1; + + value = onlp_sfpi_dev_readb(port, 0x50, 0); + if (value < 0) { + AIM_LOG_ERROR("Unable to get read port(%d) eeprom\r\n", port); + continue; + } + + if 
(value == 0x18 || value == 0x19 || value == 0x1E) { /* identifier bytes handled with the CMIS getters */ + ret = onlp_sysi_get_cmis_temp(port, &port_temp); + if (ret != ONLP_STATUS_OK) { + continue; + } + onlp_sysi_get_cmis_temp_alarm(port, &port_alarm); + } + else if (value == 0x0C || value == 0x0D || value == 0x11 || value == 0xE1) { /* identifier bytes handled with the SFF-8436 getters; 0xE1 is presumably vendor-specific - TODO confirm */ + ret = onlp_sysi_get_sff8436_temp(port, &port_temp); + if (ret != ONLP_STATUS_OK) { + continue; + } + onlp_sysi_get_sff8436_temp_alarm(port, &port_alarm); + } + else if(value == 0x03 || value == 0x0b) { /* identifier bytes handled with the SFF-8472 getters */ + ret = onlp_sysi_get_sff8472_temp(port, &port_temp); + if (ret != ONLP_STATUS_OK) { + continue; + } + onlp_sysi_get_sff8472_temp_alarm(port, &port_alarm); + } + else { + continue; + } + + temp->ports[port].temp = port_temp; + temp->ports[port].high_alarm = (port_alarm != ONLP_STATUS_E_MISSING) ? port_alarm : 75000; /* no readable threshold: fall back to 75000 milli-degrees */ + } + + return ONLP_STATUS_OK; +} + +int +onlp_sysi_xcvr_over_temp_protector(int port) +{ + int ret = ONLP_STATUS_E_INTERNAL; + AIM_SYSLOG_CRIT("Temperature critical", "OTP Action", + "Critical temperature detected on port %d; performing OTP protect action!", port); + + // SFP ports 1..LAST_OF_SFP_PORT: write 0 to the port's module_efuse node + if (port > 0 && port <= LAST_OF_SFP_PORT){ + ret = onlp_file_write_int(0, MODULE_EFUSE_FORMAT, port); + if (ret != ONLP_STATUS_OK){ + AIM_LOG_ERROR("Unable to write e-fuse status from port(%d)\r\n", port); + } + } + // QSFP ports up to PORT_NUM: set the reset-state control to 1 + else if (port > LAST_OF_SFP_PORT && port <= PORT_NUM){ + ret = onlp_sfpi_control_set(port, ONLP_SFP_CONTROL_RESET_STATE, 1); + if (ret != ONLP_STATUS_OK){ + AIM_LOG_ERROR("Unable to write reset status to port(%d)\r\n", port); + } + } + + return ret; +} + +/* + * Send thermal data (MAC temp, XCVR temp, port number) to BMC. + * + * This writes to the BMC thermal policy interface, equivalent to: + * ipmitool raw 0x34 0x13 MAC_TEMP XCVR_TEMP PORT (three data bytes) + * + * Temperatures are in millidegree Celsius and converted to degrees Celsius before sending. 
+ *
+ * @param mac_temp MAC sensor temperature in milli-degrees Celsius
+ * @param xcvr_temp Transceiver temperature in milli-degrees Celsius
+ * @param xcvr_num Transceiver port number (forced to 0 when xcvr_temp is 0)
+ *
+ * @return ONLP_STATUS_OK on success
+ *         ONLP_STATUS_E_INTERNAL if formatting fails
+ *         ONLP_STATUS_E_MISSING if writing to BMC fails
+ */
+int
+send_thermal_data_to_bmc(int mac_temp, int xcvr_temp, int xcvr_num)
+{
+    /* bmc_thermal_data only parses bytes 0..255, so clamp rather than send e.g. "-5". */
+    int mac_c = (mac_temp < 0) ? 0 : ((mac_temp > 255000) ? 255 : mac_temp / 1000);
+    int xcvr_c = (xcvr_temp < 0) ? 0 : ((xcvr_temp > 255000) ? 255 : xcvr_temp / 1000);
+    char data[64];
+    int ret;
+
+    if (xcvr_temp == 0) {
+        xcvr_num = 0;
+    }
+    ret = snprintf(data, sizeof(data), "%d %d %d", mac_c, xcvr_c, xcvr_num);
+    if (ret < 0 || ret >= (int)sizeof(data)) {
+        AIM_LOG_WARN("snprintf failed or truncated: mac=%d xcvr=%d port=%d (ret=%d)\n",
+                     mac_temp, xcvr_temp, xcvr_num, ret);
+        return ONLP_STATUS_E_INTERNAL;
+    }
+
+    for (int i = 0; i < BMC_FILE_RETRY_COUNT; i++) {
+        if (i > 0) {
+            sleep(BMC_FILE_RETRY_DELAY_S); /* wait only between attempts, not after the last one */
+        }
+        if (onlp_file_write_str(data, BMC_THERMAL_DATA_PATH) == ONLP_STATUS_OK) {
+            return ONLP_STATUS_OK;
+        }
+    }
+    AIM_LOG_ERROR("Failed to write '%s' to %s", data , BMC_THERMAL_DATA_PATH);
+    return ONLP_STATUS_E_MISSING;
+}
+
+/*
+ * Control BMC thermal policy by collecting and sending temperature data.
+ *
+ * This function performs the following:
+ * . Reads transceiver temperatures via registered readers.
+ * . Sends 0 as the MAC temperature because it is not being used.
+ * . Sends the collected data to the BMC for thermal management.
+ *
+ * @return ONLP_STATUS_OK on success,
+ *         ONLP_STATUS_E_MISSING if sending thermal data fails.
+ */
+int
+control_thermal_policy_via_bmc(void)
+{
+    /* Per-port latch so the OTP action runs once per over-temperature episode. */
+    static bool port_otp_triggered[PORT_NUM + 1] = {false};
+    temp_reader_data_t temp[TEMP_SENSOR_COUNT] = {0};
+    const port_thermal_data_t *xcvr = temp[TEMP_SENSOR_XCVR].ports;
+    int hottest_temp = ONLP_STATUS_E_MISSING;
+    int hottest_port = ONLP_STATUS_E_MISSING;
+    int idx, port;
+
+    for (idx = 0; idx < AIM_ARRAYSIZE(temp); idx++) {
+        tp_mgr.temp_hdlr.temp_readers[idx](&temp[idx]);
+    }
+
+    for (port = 1; port <= PORT_NUM; port++) {
+        int reading = xcvr[port].temp;
+        int limit = xcvr[port].high_alarm;
+        bool over_limit = (limit != ONLP_STATUS_E_MISSING) && (reading >= limit);
+
+        /* Absent or unreadable module: skip it and re-arm its latch. */
+        if (!xcvr[port].present || reading == ONLP_STATUS_E_MISSING) {
+            port_otp_triggered[port] = false;
+            continue;
+        }
+
+        if (hottest_temp == ONLP_STATUS_E_MISSING || reading > hottest_temp) {
+            hottest_temp = reading;
+            hottest_port = port;
+        }
+
+        if (!over_limit) { /* below the threshold, or no threshold known */
+            port_otp_triggered[port] = false;
+            continue;
+        }
+        if (port_otp_triggered[port]) {
+            continue; /* already handled during this episode */
+        }
+
+        AIM_LOG_WARN("Port %d temperature (%d mC) exceeded high alarm (%d mC)! Triggering OTP.\n",
+                     port, reading, limit);
+        tp_mgr.otp_hdlr.otp_writer(port);
+        port_otp_triggered[port] = true;
+    }
+
+    /* No readable module at all: report temperature 0 on port 0. */
+    if (hottest_temp == ONLP_STATUS_E_MISSING) {
+        hottest_temp = 0;
+        hottest_port = 0;
+    }
+
+    /* The MAC temperature is not required, so 0 is sent in its place. */
+    return send_thermal_data_to_bmc(0, hottest_temp, hottest_port);
+}
+
+static pthread_mutex_t thermal_lock = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t thermal_cond = PTHREAD_COND_INITIALIZER;
+static bool thermal_thread_started = false;
+static bool thermal_thread_waiting = false;
+static pthread_t thermal_thread;
+
+/*
+ * Thermal policy thread loop.
+ *
+ * This background thread waits for a condition signal and runs the thermal policy.
+ * It uses a condition variable to sleep until triggered, and only one instance runs at a time.
+ *
+ * Note: Signals are ignored if the thread is busy. No event queueing.
+ */
+void
+*thermal_policy_thread_loop(void *arg)
+{
+    pthread_mutex_lock(&thermal_lock);
+    while (1) {
+        /* Sleep until a trigger clears the flag; safe against spurious or early signals. */
+        while (thermal_thread_waiting) {
+            pthread_cond_wait(&thermal_cond, &thermal_lock);
+        }
+        pthread_mutex_unlock(&thermal_lock);
+        control_thermal_policy_via_bmc(); /* runs unlocked so callers never block on it */
+        pthread_mutex_lock(&thermal_lock);
+        thermal_thread_waiting = true; /* idle again, ready for the next trigger */
+    }
+    return NULL;
+}
+
+/* Creates the detached policy thread once; a failed creation is retried on the next call. */
+void
+start_thermal_policy_thread_once(void)
+{
+    pthread_mutex_lock(&thermal_lock);
+    if (!thermal_thread_started) {
+        if (pthread_create(&thermal_thread, NULL, thermal_policy_thread_loop, NULL) == 0) {
+            pthread_detach(thermal_thread);
+            thermal_thread_started = true;
+            thermal_thread_waiting = true; /* the worker cannot run until we unlock */
+            AIM_LOG_INFO("Thermal policy thread started.");
+        } else {
+            AIM_LOG_ERROR("Failed to start thermal policy thread.");
+        }
+    }
+    pthread_mutex_unlock(&thermal_lock);
+}
+
 int
 onlp_sysi_platform_manage_fans(void)
 {
+    start_thermal_policy_thread_once();
+    pthread_mutex_lock(&thermal_lock);
+    if (thermal_thread_waiting) {
+        thermal_thread_waiting = false; /* the trigger itself; consumed by the worker's wait loop */
+        pthread_cond_signal(&thermal_cond);
+    } else {
+        AIM_LOG_INFO("Thermal policy thread is busy; skipping this trigger.");
+    }
+    pthread_mutex_unlock(&thermal_lock);
     return ONLP_STATUS_OK;
 }
 
@@ -192,5 +707,4 @@ int
 onlp_sysi_platform_manage_leds(void)
 {
     return ONLP_STATUS_E_UNSUPPORTED;
-}
-
+}