From 3a0610548118fb81b8881249cf56729b0e1f64bb Mon Sep 17 00:00:00 2001 From: XkunW Date: Wed, 8 Apr 2026 16:39:00 -0400 Subject: [PATCH] Add retry loop for writing server address to job json file, rename temp json file to avoid name collision --- vec_inf/client/_slurm_templates.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/vec_inf/client/_slurm_templates.py b/vec_inf/client/_slurm_templates.py index 52d59989..1a7bf496 100644 --- a/vec_inf/client/_slurm_templates.py +++ b/vec_inf/client/_slurm_templates.py @@ -186,10 +186,15 @@ class SlurmScriptTemplate(TypedDict): ], "write_to_json": [ '\njson_path="{log_dir}/{model_name}.$SLURM_JOB_ID/{model_name}.$SLURM_JOB_ID.json"', - 'jq --arg server_addr "$server_address" \\', - " '. + {{\"server_address\": $server_addr}}' \\", - ' "$json_path" > temp.json \\', - ' && mv temp.json "$json_path"', + 'tmp_json="${{json_path}}.tmp.$$"', + "for _attempt in 1 2 3 4 5; do", + ' jq --arg server_addr "$server_address" \\', + " '. + {{\"server_address\": $server_addr}}' \\", + ' "$json_path" > "$tmp_json" \\', + ' && mv "$tmp_json" "$json_path" \\', + " && break", + " sleep 2", + "done", ], "launch_cmd": { "vllm": [ @@ -303,10 +308,15 @@ class BatchModelLaunchScriptTemplate(TypedDict): "write_to_json": [ "het_job_id=$(($SLURM_JOB_ID+{het_group_id}))", 'json_path="{log_dir}/{slurm_job_name}.$het_job_id/{model_name}.$het_job_id.json"', - 'jq --arg server_addr "$server_address" \\', - " '. + {{\"server_address\": $server_addr}}' \\", - ' "$json_path" > temp_{model_name}.json \\', - ' && mv temp_{model_name}.json "$json_path"\n', + 'tmp_json="${{json_path}}.tmp.$$"', + "for _attempt in 1 2 3 4 5; do", + ' jq --arg server_addr "$server_address" \\', + " '. + {{\"server_address\": $server_addr}}' \\", + ' "$json_path" > "$tmp_json" \\', + ' && mv "$tmp_json" "$json_path" \\', + " && break", + " sleep 2", + "done\n", ], "container_command": f"{CONTAINER_MODULE_NAME} exec --nv --containall {{image_path}} \\", "launch_cmd": {