diff --git a/.github/workflows/_xpu_4cards_case_test.yml b/.github/workflows/_xpu_4cards_case_test.yml index 2c4d6deb9e1..f3c97f40dc6 100644 --- a/.github/workflows/_xpu_4cards_case_test.yml +++ b/.github/workflows/_xpu_4cards_case_test.yml @@ -193,13 +193,29 @@ jobs: echo "============================开始运行pytest测试============================" export PYTHONPATH=/workspace/FastDeploy/ export PYTHONPATH=$(pwd)/tests/xpu_ci:$PYTHONPATH + mkdir -p case_logs + set +e python -m pytest -v -s --tb=short tests/xpu_ci/4cards_cases/ exit_code=$? + set -e + + # 修改case_logs权限,确保Docker外部的runner用户可以读取并上传 + chmod -R a+rX case_logs/ 2>/dev/null || true if [ $exit_code -eq 0 ]; then echo "============================4卡cases测试通过!============================" + exit $exit_code else echo "============================4卡cases测试失败,请检查日志!============================" exit $exit_code fi ' + + - name: Upload case logs + if: always() + uses: actions/upload-artifact@v6 + with: + name: xpu-4cards-case-logs + path: FastDeploy/case_logs/ + retention-days: 7 + if-no-files-found: ignore diff --git a/.github/workflows/_xpu_8cards_case_test.yml b/.github/workflows/_xpu_8cards_case_test.yml index b4e5b85083f..c9ed0fa2314 100644 --- a/.github/workflows/_xpu_8cards_case_test.yml +++ b/.github/workflows/_xpu_8cards_case_test.yml @@ -182,8 +182,14 @@ jobs: echo "============================开始运行pytest测试============================" export PYTHONPATH=/workspace/FastDeploy/ export PYTHONPATH=$(pwd)/tests/xpu_ci:$PYTHONPATH + mkdir -p case_logs + set +e python -m pytest -v -s --tb=short tests/xpu_ci/8cards_cases/ exit_code=$? 
+ set -e + + # 修改case_logs权限,确保Docker外部的runner用户可以读取并上传 + chmod -R a+rX case_logs/ 2>/dev/null || true if [ $exit_code -eq 0 ]; then echo "============================8卡cases测试通过!============================" @@ -192,3 +198,12 @@ jobs: exit $exit_code fi ' + + - name: Upload case logs + if: always() + uses: actions/upload-artifact@v6 + with: + name: xpu-8cards-case-logs + path: FastDeploy/case_logs/ + retention-days: 7 + if-no-files-found: ignore diff --git a/tests/xpu_ci/8cards_cases/test_pd_21b_ep4tp1.py b/tests/xpu_ci/8cards_cases/test_pd_21b_ep4tp1.py index d7d75090561..2429b8c1458 100644 --- a/tests/xpu_ci/8cards_cases/test_pd_21b_ep4tp1.py +++ b/tests/xpu_ci/8cards_cases/test_pd_21b_ep4tp1.py @@ -109,7 +109,7 @@ def print_pd_logs_on_failure(): log_dirs = ["log_router", "log_prefill", "log_decode"] for log_dir in log_dirs: - nohup_path = os.path.join(log_dir, "log_0/worklog.0") + nohup_path = os.path.join(log_dir, "log_0/workerlog.0") if os.path.exists(nohup_path): print(f"\n========== {nohup_path} ==========") with open(nohup_path, "r") as f: diff --git a/tests/xpu_ci/8cards_cases/test_pd_21b_ep4tp4.py b/tests/xpu_ci/8cards_cases/test_pd_21b_ep4tp4.py index 919a5c39942..c1f804e6466 100644 --- a/tests/xpu_ci/8cards_cases/test_pd_21b_ep4tp4.py +++ b/tests/xpu_ci/8cards_cases/test_pd_21b_ep4tp4.py @@ -109,7 +109,7 @@ def print_pd_logs_on_failure(): log_dirs = ["log_router", "log_prefill", "log_decode"] for log_dir in log_dirs: - nohup_path = os.path.join(log_dir, "log_0/worklog.0") + nohup_path = os.path.join(log_dir, "log_0/workerlog.0") if os.path.exists(nohup_path): print(f"\n========== {nohup_path} ==========") with open(nohup_path, "r") as f: diff --git a/tests/xpu_ci/8cards_cases/test_pd_21b_ep4tp4_cudagraph.py b/tests/xpu_ci/8cards_cases/test_pd_21b_ep4tp4_cudagraph.py index 14fcb358f8f..280870f0a55 100644 --- a/tests/xpu_ci/8cards_cases/test_pd_21b_ep4tp4_cudagraph.py +++ b/tests/xpu_ci/8cards_cases/test_pd_21b_ep4tp4_cudagraph.py @@ -109,7 +109,7 @@ def 
print_pd_logs_on_failure(): log_dirs = ["log_router", "log_prefill", "log_decode"] for log_dir in log_dirs: - nohup_path = os.path.join(log_dir, "log_0/worklog.0") + nohup_path = os.path.join(log_dir, "log_0/workerlog.0") if os.path.exists(nohup_path): print(f"\n========== {nohup_path} ==========") with open(nohup_path, "r") as f: diff --git a/tests/xpu_ci/8cards_cases/test_pd_p_tp4ep4_d_tp1ep4.py b/tests/xpu_ci/8cards_cases/test_pd_p_tp4ep4_d_tp1ep4.py index 2904749a10c..936bc8b6371 100644 --- a/tests/xpu_ci/8cards_cases/test_pd_p_tp4ep4_d_tp1ep4.py +++ b/tests/xpu_ci/8cards_cases/test_pd_p_tp4ep4_d_tp1ep4.py @@ -110,7 +110,7 @@ def print_pd_logs_on_failure(): log_dirs = ["log_router", "log_prefill", "log_decode"] for log_dir in log_dirs: - nohup_path = os.path.join(log_dir, "log_0/worklog.0") + nohup_path = os.path.join(log_dir, "log_0/workerlog.0") if os.path.exists(nohup_path): print(f"\n========== {nohup_path} ==========") with open(nohup_path, "r") as f: diff --git a/tests/xpu_ci/conftest.py b/tests/xpu_ci/conftest.py index 62496bff16f..ae0c95d727a 100644 --- a/tests/xpu_ci/conftest.py +++ b/tests/xpu_ci/conftest.py @@ -23,6 +23,7 @@ 4. 
环境配置 - 设置XPU相关环境变量
 """
 
+import glob
 import json
 import os
 import shutil
@@ -31,6 +32,8 @@
 
 import pytest
 
+CASE_LOGS_DIR = os.path.join(os.getcwd(), "case_logs")
+
 
 def get_xpu_id():
     """获取XPU_ID环境变量"""
@@ -457,3 +460,52 @@ def setup_logprobs_zmq_env():
         os.environ[key] = value
         print(f"设置环境变量: {key}={value}")
     return original_values
+
+
+# ============ Log archiving pytest hook ============
+
+
+def _archive_case_logs(test_name):
+    """
+    Copy the current run's logs into ``case_logs/{test_name}/``.
+
+    Every ``log*`` entry in the current working directory (directories
+    are merged recursively, regular files are copied) plus ``server.log``
+    is archived. Existing archived files are overwritten so the archive
+    always reflects the most recent test of the file.
+    """
+    dest_dir = os.path.join(CASE_LOGS_DIR, test_name)
+    os.makedirs(dest_dir, exist_ok=True)
+
+    for entry in glob.glob("log*"):
+        target = os.path.join(dest_dir, entry)
+        if os.path.isdir(entry):
+            shutil.copytree(entry, target, dirs_exist_ok=True)
+        elif os.path.isfile(entry):
+            # Regular files whose names start with "log".
+            shutil.copy2(entry, target)
+
+    # "server.log" never matches "log*", so copy it explicitly. Always
+    # overwrite: several tests share one archive directory and a stale
+    # copy from an earlier test must not mask the latest server output.
+    if os.path.isfile("server.log"):
+        shutil.copy2("server.log", os.path.join(dest_dir, "server.log"))
+
+
+@pytest.hookimpl(hookwrapper=True, trylast=True)
+def pytest_runtest_makereport(item, call):
+    """Archive logs once the ``call`` phase of a test has been reported."""
+    outcome = yield
+    report = outcome.get_result()
+
+    if report.when != "call":
+        return
+
+    # The archive directory is named after the test file (without ".py").
+    test_name = os.path.splitext(os.path.basename(item.fspath))[0]
+    try:
+        _archive_case_logs(test_name)
+    except Exception as exc:
+        # Archiving is best-effort and must never fail the test run, but
+        # the reason is reported so missing artifacts can be diagnosed.
+        print(f"[case_logs] failed to archive logs for {test_name}: {exc!r}")