From 992ddd24fb63dee94ee3549d9c9f81bb5e168cff Mon Sep 17 00:00:00 2001 From: Sydney Lister Date: Thu, 2 Apr 2026 11:55:35 -0400 Subject: [PATCH] fix: track errored/undetermined objectives in red team result counts Previously, objectives that failed during attack execution or risk categories with zero prepared objectives were silently dropped from the pipeline. The result_counts.errored field always showed 0 because _compute_result_count only counted existing output items. Changes: - _execution_manager.py: Record 0-objective categories as failed in red_team_info instead of silently skipping. Add expected_count to all red_team_info entries to track expected vs actual objectives. - _result_processor.py: Add _extract_expected_total() to compute total expected objectives from red_team_info (de-duplicated by risk category). Pass expected_total to _compute_result_count() which now computes errored as the delta between expected and actual items. Add partial_failure to _determine_run_status failure detection. - test_result_processor_errored.py: 31 new unit tests covering _compute_result_count with expected_total, _extract_expected_total de-duplication logic, and _determine_run_status failure detection. - test_foundry.py: 3 new tests for 0-objective recording and expected_count propagation in FoundryExecutionManager. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../red_team/_foundry/_execution_manager.py | 42 ++- .../evaluation/red_team/_result_processor.py | 61 +++- .../unittests/test_redteam/test_foundry.py | 99 +++++- .../test_result_processor_errored.py | 328 ++++++++++++++++++ 4 files changed, 520 insertions(+), 10 deletions(-) create mode 100644 sdk/evaluation/azure-ai-evaluation/tests/unittests/test_redteam/test_result_processor_errored.py diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_foundry/_execution_manager.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_foundry/_execution_manager.py index 61c01c56b352..7a4892b96637 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_foundry/_execution_manager.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_foundry/_execution_manager.py @@ -122,7 +122,37 @@ async def execute_attacks( objectives = objectives_by_risk.get(risk_value, []) if not objectives: - self.logger.info(f"No objectives for {risk_value}, skipping") + self.logger.info(f"No objectives for {risk_value}, recording as failed") + # Record zero-objective categories for every requested strategy + # so _determine_run_status detects the failure and errored + # counts reflect the gap. 
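+            # For illustration (example keys, mirroring ``failed_entry`` below):
+            # requesting only AttackStrategy.Baseline with no "violence"
+            # objectives yields
+            #   red_team_info["baseline"]["violence"] == {
+            #       "data_file": "", "status": "failed",
+            #       "error": "No attack objectives could be prepared ...",
+            #       "asr": 0.0, "expected_count": 0}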
+ from .._utils.formatting_utils import get_strategy_name + + failed_entry = { + "data_file": "", + "status": "failed", + "error": "No attack objectives could be prepared for this risk category", + "asr": 0.0, + "expected_count": 0, + } + foundry_strats, special_strats = StrategyMapper.filter_for_foundry(attack_strategies) + for strategy in foundry_strats: + strategy_key = get_strategy_name(strategy) + if strategy_key not in red_team_info: + red_team_info[strategy_key] = {} + red_team_info[strategy_key][risk_value] = {**failed_entry} + for strategy in special_strats: + flat = strategy if not isinstance(strategy, list) else strategy[0] + if flat != AttackStrategy.Baseline: + strategy_key = get_strategy_name(strategy) + if strategy_key not in red_team_info: + red_team_info[strategy_key] = {} + red_team_info[strategy_key][risk_value] = {**failed_entry} + if include_baseline: + strategy_key = get_strategy_name(AttackStrategy.Baseline) + if strategy_key not in red_team_info: + red_team_info[strategy_key] = {} + red_team_info[strategy_key][risk_value] = {**failed_entry} continue self.logger.info(f"Processing {len(objectives)} objectives for {risk_value}") @@ -186,6 +216,7 @@ async def execute_attacks( "error": str(e), "partial_failure": True, "asr": 0.0, + "expected_count": len(objectives), } else: self.logger.error(f"Error executing attacks for {risk_value}: {e}") @@ -197,6 +228,7 @@ async def execute_attacks( "status": "failed", "error": str(e), "asr": 0.0, + "expected_count": len(objectives), } continue @@ -223,6 +255,7 @@ async def execute_attacks( output_path=output_path, attack_strategies=attack_strategies, include_baseline=include_baseline, + num_objectives=len(objectives), ) for strategy_name, strategy_data in strategy_results.items(): @@ -357,6 +390,7 @@ def _group_results_by_strategy( output_path: str, attack_strategies: List[Union[AttackStrategy, List[AttackStrategy]]], include_baseline: bool, + num_objectives: int = 0, ) -> Dict[str, Dict[str, Any]]: """Group attack results by strategy for red_team_info format. 
@@ -375,6 +409,8 @@ def _group_results_by_strategy( :type attack_strategies: List[Union[AttackStrategy, List[AttackStrategy]]] :param include_baseline: Whether baseline was included in execution :type include_baseline: bool + :param num_objectives: Number of objectives sent for this risk category + :type num_objectives: int :return: Dictionary mapping strategy name to result data :rtype: Dict[str, Dict[str, Any]] """ @@ -395,6 +431,7 @@ def _group_results_by_strategy( "data_file": output_path, "status": "completed", "asr": overall_asr, + "expected_count": num_objectives, } # Add entries for special strategies that were executed (e.g., IndirectJailbreak via XPIA) @@ -407,6 +444,7 @@ def _group_results_by_strategy( "data_file": output_path, "status": "completed", "asr": overall_asr, + "expected_count": num_objectives, } # Add baseline entry if it was included @@ -415,6 +453,7 @@ def _group_results_by_strategy( "data_file": output_path, "status": "completed", "asr": overall_asr, + "expected_count": num_objectives, } # Fallback if no strategies produced results @@ -423,6 +462,7 @@ def _group_results_by_strategy( "data_file": output_path, "status": "completed", "asr": overall_asr, + "expected_count": num_objectives, } return results diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_result_processor.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_result_processor.py index 18db09313476..1f028b31dcd2 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_result_processor.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_result_processor.py @@ -1424,16 +1424,55 @@ def _format_thresholds_for_output(self) -> Dict[str, Any]: return formatted_thresholds @staticmethod - def _compute_result_count(output_items: List[Dict[str, Any]]) -> Dict[str, int]: + def _extract_expected_total(red_team_info: Optional[Dict]) -> Optional[int]: + """Extract the total expected objective count from red_team_info. + + Each strategy/risk-category entry may carry an ``expected_count`` field + set by the execution layer. Since the same risk category can appear + under multiple strategies (e.g. baseline + crescendo), we de-duplicate + by risk category to avoid double-counting. + + :param red_team_info: The red_team_info tracking dictionary + :return: Total expected objectives, or None if information is unavailable + """ + if not isinstance(red_team_info, dict): + return None + + # Collect the max expected_count per risk category across strategies + per_risk: Dict[str, int] = {} + for risk_data in red_team_info.values(): + if not isinstance(risk_data, dict): + continue + for risk_category, details in risk_data.items(): + if not isinstance(details, dict): + continue + count = details.get("expected_count") + if count is not None: + try: + per_risk[risk_category] = max(per_risk.get(risk_category, 0), int(count)) + except (ValueError, TypeError): + continue + + if not per_risk: + return None + + return sum(per_risk.values()) + + @staticmethod + def _compute_result_count( + output_items: List[Dict[str, Any]], + expected_total: Optional[int] = None, + ) -> Dict[str, int]: """Aggregate run-level pass/fail counts from individual output items. 
Counts reflect attack success rate (ASR) semantics: - passed: attacks that were unsuccessful (system defended successfully) - failed: attacks that were successful (system was compromised) - - errored: rows that failed to process due to errors + - errored: rows that failed to process due to errors, plus any + objectives that never produced output items (expected vs actual delta) """ - total = len(output_items) + actual_total = len(output_items) passed = failed = errored = 0 for item in output_items: @@ -1474,6 +1513,15 @@ def _compute_result_count(output_items: List[Dict[str, Any]]) -> Dict[str, int]: else: errored += 1 + # Account for objectives that never produced output items (e.g., attack + # execution errors that were caught and swallowed, or risk categories + # with zero objectives prepared). + if expected_total is not None and expected_total > actual_total: + missing = expected_total - actual_total + errored += missing + + total = expected_total if expected_total is not None and expected_total > actual_total else actual_total + return { "total": total, "passed": passed, @@ -1675,7 +1723,7 @@ def _determine_run_status( if not isinstance(details, dict): continue status = details.get("status", "").lower() - if status in ("incomplete", "failed", "timeout", "pending", "running"): + if status in ("incomplete", "failed", "timeout", "pending", "running", "partial_failure"): return "failed" return "completed" @@ -1769,7 +1817,10 @@ def _build_results_payload( if run_name is None: run_name = scan_name or f"redteam-run-{run_id[:8]}" - result_count = self._compute_result_count(output_items) + result_count = self._compute_result_count( + output_items, + expected_total=self._extract_expected_total(red_team_info), + ) per_testing_results = self._compute_per_testing_criteria(output_items) data_source = self._build_data_source_section(parameters, red_team_info) status = self._determine_run_status(scan_result, red_team_info, output_items) diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_redteam/test_foundry.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_redteam/test_foundry.py index 00f82e5ba09b..5b45c9fae592 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_redteam/test_foundry.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_redteam/test_foundry.py @@ -1749,7 +1749,7 @@ def test_group_results_by_strategy_with_indirect_jailbreak( @pytest.mark.asyncio async def test_execute_attacks_empty_objectives(self, mock_credential, mock_azure_ai_project, mock_logger): - """Test execute_attacks with no objectives.""" + """Test execute_attacks with no objectives for any risk category.""" manager = FoundryExecutionManager( credential=mock_credential, azure_ai_project=mock_azure_ai_project, @@ -1762,12 +1762,103 @@ async def test_execute_attacks_empty_objectives(self, mock_credential, mock_azur result = await manager.execute_attacks( objective_target=mock_target, risk_categories=[RiskCategory.Violence], - attack_strategies=[AttackStrategy.Base64], + attack_strategies=[AttackStrategy.Baseline], objectives_by_risk={}, # No objectives ) - # Should return empty dict when no objectives - assert result == {} + # When no objectives are available at all the category should be + # recorded as failed in red_team_info so _determine_run_status can + # detect the gap. 
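+        # (Downstream, _extract_expected_total() reads expected_count == 0 for
+        # this category and _determine_run_status() reports the run as failed.)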
+        assert "baseline" in result
+        assert "violence" in result["baseline"]
+        entry = result["baseline"]["violence"]
+        assert entry["status"] == "failed"
+        assert entry["expected_count"] == 0
+        assert "error" in entry
+
+    @pytest.mark.asyncio
+    async def test_execute_attacks_zero_objectives_records_failed(
+        self, mock_credential, mock_azure_ai_project, mock_logger
+    ):
+        """When a risk category has zero objectives, it should be recorded as failed
+        in red_team_info so that _determine_run_status marks the run as failed."""
+        manager = FoundryExecutionManager(
+            credential=mock_credential,
+            azure_ai_project=mock_azure_ai_project,
+            logger=mock_logger,
+            output_dir="/test/output",
+        )
+
+        mock_target = MagicMock()
+
+        result = await manager.execute_attacks(
+            objective_target=mock_target,
+            risk_categories=[RiskCategory.Violence, RiskCategory.SelfHarm],
+            attack_strategies=[AttackStrategy.Baseline],
+            objectives_by_risk={
+                "violence": [],  # explicitly empty
+                # self_harm not present at all
+            },
+        )
+
+        # Both risk categories should be recorded as failed
+        assert "baseline" in result
+        assert result["baseline"]["violence"]["status"] == "failed"
+        assert result["baseline"]["self_harm"]["status"] == "failed"
+
+    @pytest.mark.asyncio
+    async def test_execute_attacks_zero_objectives_records_all_strategies(
+        self, mock_credential, mock_azure_ai_project, mock_logger
+    ):
+        """When a risk category has zero objectives with multiple strategies,
+        failed entries should be created for every strategy, not just baseline."""
+        manager = FoundryExecutionManager(
+            credential=mock_credential,
+            azure_ai_project=mock_azure_ai_project,
+            logger=mock_logger,
+            output_dir="/test/output",
+        )
+
+        mock_target = MagicMock()
+
+        result = await manager.execute_attacks(
+            objective_target=mock_target,
+            risk_categories=[RiskCategory.Violence],
+            attack_strategies=[AttackStrategy.Base64, AttackStrategy.Baseline],
+            objectives_by_risk={},  # No objectives
+        )
+
+        # Both base64 and baseline strategies should have a failed entry
+        assert "base64" in result
+        assert result["base64"]["violence"]["status"] == "failed"
+        assert "baseline" in result
+        assert result["baseline"]["violence"]["status"] == "failed"
+
+    def test_group_results_by_strategy_includes_expected_count(
+        self, mock_credential, mock_azure_ai_project, mock_logger
+    ):
+        """Verify _group_results_by_strategy includes expected_count in entries."""
+        manager = FoundryExecutionManager(
+            credential=mock_credential,
+            azure_ai_project=mock_azure_ai_project,
+            logger=mock_logger,
+            output_dir="/test/output",
+        )
+
+        mock_orchestrator = MagicMock()
+        mock_orchestrator.calculate_asr.return_value = 0.5
+
+        results = manager._group_results_by_strategy(
+            orchestrator=mock_orchestrator,
+            risk_value="violence",
+            output_path="/test/output.jsonl",
+            attack_strategies=[AttackStrategy.Baseline],
+            include_baseline=True,
+            num_objectives=32,
+        )
+
+        assert "baseline" in results
+        assert results["baseline"]["expected_count"] == 32
 
     @pytest.mark.asyncio
     async def test_execute_attacks_filters_multi_turn_without_adversarial(
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_redteam/test_result_processor_errored.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_redteam/test_result_processor_errored.py
new file mode 100644
index 000000000000..b561feb4c99b
--- /dev/null
+++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_redteam/test_result_processor_errored.py
@@ -0,0 +1,328 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+"""Tests for ResultProcessor._compute_result_count, _determine_run_status, and _extract_expected_total."""
+
+import pytest
+from azure.ai.evaluation.red_team._result_processor import ResultProcessor
+
+
+@pytest.mark.unittest
+class TestComputeResultCount:
+    """Tests for ResultProcessor._compute_result_count — errored tracking."""
+
+    def test_empty_output_items(self):
+        """Empty input returns all-zero counts."""
+        result = ResultProcessor._compute_result_count([])
+        assert result == {"total": 0, "passed": 0, "failed": 0, "errored": 0}
+
+    def test_all_passed(self):
+        """All items with passed=True count as passed."""
+        items = [
+            {"sample": {}, "results": [{"passed": True, "name": "violence"}]},
+            {"sample": {}, "results": [{"passed": True, "name": "self_harm"}]},
+        ]
+        result = ResultProcessor._compute_result_count(items)
+        assert result == {"total": 2, "passed": 2, "failed": 0, "errored": 0}
+
+    def test_all_failed(self):
+        """All items with passed=False count as failed."""
+        items = [
+            {"sample": {}, "results": [{"passed": False, "name": "violence"}]},
+            {"sample": {}, "results": [{"passed": False, "name": "self_harm"}]},
+        ]
+        result = ResultProcessor._compute_result_count(items)
+        assert result == {"total": 2, "passed": 0, "failed": 2, "errored": 0}
+
+    def test_mixed_results(self):
+        """Mix of passed and failed items."""
+        items = [
+            {"sample": {}, "results": [{"passed": True, "name": "violence"}]},
+            {"sample": {}, "results": [{"passed": False, "name": "self_harm"}]},
+            {"sample": {}, "results": [{"passed": True, "name": "sexual"}]},
+        ]
+        result = ResultProcessor._compute_result_count(items)
+        assert result == {"total": 3, "passed": 2, "failed": 1, "errored": 0}
+
+    def test_sample_error_counts_as_errored(self):
+        """Items with error in sample count as errored."""
+        items = [
+            {"sample": {"error": {"message": "Some error"}}, "results": [{"passed": True}]},
+            {"sample": {}, "results": [{"passed": True, "name": "violence"}]},
+        ]
+        result = ResultProcessor._compute_result_count(items)
+        assert result == {"total": 2, "passed": 1, "failed": 0, "errored": 1}
+
+    def test_missing_results_counts_as_errored(self):
+        """Items with no results array count as errored."""
+        items = [
+            {"sample": {}, "results": []},
+            {"sample": {}, "results": [{"passed": True, "name": "violence"}]},
+        ]
+        result = ResultProcessor._compute_result_count(items)
+        assert result == {"total": 2, "passed": 1, "failed": 0, "errored": 1}
+
+    def test_null_passed_counts_as_errored(self):
+        """Items where all results have passed=None count as errored."""
+        items = [
+            {"sample": {}, "results": [{"passed": None, "name": "violence"}]},
+            {"sample": {}, "results": [{"passed": True, "name": "self_harm"}]},
+        ]
+        result = ResultProcessor._compute_result_count(items)
+        assert result == {"total": 2, "passed": 1, "failed": 0, "errored": 1}
+
+    def test_expected_total_none_no_effect(self):
+        """When expected_total is None, behaviour is unchanged."""
+        items = [
+            {"sample": {}, "results": [{"passed": True, "name": "violence"}]},
+        ]
+        result = ResultProcessor._compute_result_count(items, expected_total=None)
+        assert result == {"total": 1, "passed": 1, "failed": 0, "errored": 0}
+
+    def test_expected_total_equals_actual_no_extra_errored(self):
+        """When expected_total equals actual item count, no extra errored added."""
+        items = [
+            {"sample": {}, "results": [{"passed": True, "name": "v"}]},
+            {"sample": {}, "results": [{"passed": False, "name": "s"}]},
+        ]
+        result = ResultProcessor._compute_result_count(items, expected_total=2)
+        assert result == {"total": 2, "passed": 1, "failed": 1, "errored": 0}
+
+    def test_expected_total_greater_than_actual_adds_errored(self):
+        """Missing objectives (expected > actual) are counted as errored."""
+        items = [
+            {"sample": {}, "results": [{"passed": True, "name": "violence"}]},
+        ]
+        result = ResultProcessor._compute_result_count(items, expected_total=5)
+        assert result["total"] == 5
+        assert result["passed"] == 1
+        assert result["failed"] == 0
+        assert result["errored"] == 4  # 5 expected - 1 actual = 4 missing
+
+    def test_expected_total_with_existing_errors(self):
+        """Missing objectives add to already-errored items."""
+        items = [
+            {"sample": {}, "results": [{"passed": True, "name": "v"}]},
+            {"sample": {"error": {"message": "eval failed"}}, "results": []},
+        ]
+        # 2 actual items: 1 passed, 1 errored from sample error
+        # expected_total=5 means 3 more missing → errored = 1 + 3 = 4
+        result = ResultProcessor._compute_result_count(items, expected_total=5)
+        assert result["total"] == 5
+        assert result["passed"] == 1
+        assert result["failed"] == 0
+        assert result["errored"] == 4
+
+    def test_expected_total_less_than_actual_no_negative_errored(self):
+        """If expected_total < actual (shouldn't happen), don't add negative errored."""
+        items = [
+            {"sample": {}, "results": [{"passed": True, "name": "v"}]},
+            {"sample": {}, "results": [{"passed": True, "name": "s"}]},
+        ]
+        result = ResultProcessor._compute_result_count(items, expected_total=1)
+        assert result["total"] == 2  # actual count used when expected < actual
+        assert result["errored"] == 0
+
+    def test_entirely_missing_scenario(self):
+        """When zero output items exist but expected_total > 0, all are errored."""
+        result = ResultProcessor._compute_result_count([], expected_total=32)
+        assert result == {"total": 32, "passed": 0, "failed": 0, "errored": 32}
+
+
+@pytest.mark.unittest
+class TestExtractExpectedTotal:
+    """Tests for ResultProcessor._extract_expected_total."""
+
+    def test_none_red_team_info(self):
+        """None input returns None."""
+        assert ResultProcessor._extract_expected_total(None) is None
+
+    def test_empty_red_team_info(self):
+        """Empty dict returns None."""
+        assert ResultProcessor._extract_expected_total({}) is None
+
+    def test_no_expected_count_fields(self):
+        """red_team_info without expected_count returns None."""
+        info = {
+            "baseline": {
+                "violence": {"data_file": "v.jsonl", "status": "completed"},
+            }
+        }
+        assert ResultProcessor._extract_expected_total(info) is None
+
+    def test_single_strategy_single_risk(self):
+        """Simple case: one strategy, one risk category."""
+        info = {
+            "baseline": {
+                "violence": {"status": "completed", "expected_count": 32},
+            }
+        }
+        assert ResultProcessor._extract_expected_total(info) == 32
+
+    def test_single_strategy_multiple_risks(self):
+        """Multiple risk categories sum their expected counts."""
+        info = {
+            "baseline": {
+                "violence": {"status": "completed", "expected_count": 32},
+                "self_harm": {"status": "completed", "expected_count": 32},
+                "task_adherence": {"status": "failed", "expected_count": 1},
+            }
+        }
+        assert ResultProcessor._extract_expected_total(info) == 65  # 32+32+1
+
+    def test_duplicate_risk_across_strategies_deduplicates(self):
+        """Same risk category under multiple strategies is counted once (max)."""
+        info = {
+            "baseline": {
+                "violence": {"status": "completed", "expected_count": 32},
+            },
+            "crescendo": {
+                "violence": {"status": "completed", "expected_count": 32},
+            },
+        }
+        # violence appears in both strategies but should only count once
+        assert ResultProcessor._extract_expected_total(info) == 32
+
+    def test_different_counts_across_strategies_takes_max(self):
+        """If counts differ across strategies (unlikely), take the max."""
+        info = {
+            "baseline": {
+                "violence": {"status": "completed", "expected_count": 10},
+            },
+            "crescendo": {
+                "violence": {"status": "completed", "expected_count": 32},
+            },
+        }
+        assert ResultProcessor._extract_expected_total(info) == 32
+
+    def test_zero_objective_category_included(self):
+        """Categories with expected_count=0 (failed to prepare) are included."""
+        info = {
+            "baseline": {
+                "violence": {"status": "completed", "expected_count": 32},
+                "sensitive_data_leakage": {"status": "failed", "expected_count": 0},
+            }
+        }
+        assert ResultProcessor._extract_expected_total(info) == 32  # 32 + 0
+
+    def test_non_dict_values_skipped(self):
+        """Non-dict entries in red_team_info are gracefully skipped."""
+        info = {
+            "baseline": {
+                "violence": {"status": "completed", "expected_count": 32},
+            },
+            "_metadata": "not a dict",
+        }
+        assert ResultProcessor._extract_expected_total(info) == 32
+
+    def test_invalid_expected_count_value_skipped(self):
+        """Non-numeric expected_count values are gracefully skipped."""
+        info = {
+            "baseline": {
+                "violence": {"status": "completed", "expected_count": 32},
+                "self_harm": {"status": "completed", "expected_count": "invalid"},
+            }
+        }
+        assert ResultProcessor._extract_expected_total(info) == 32
+
+
+@pytest.mark.unittest
+class TestDetermineRunStatus:
+    """Tests for ResultProcessor._determine_run_status."""
+
+    def _make_processor(self):
+        return ResultProcessor.__new__(ResultProcessor)
+
+    def test_completed_when_all_ok(self):
+        """Run is completed when all categories succeeded."""
+        proc = self._make_processor()
+        info = {
+            "baseline": {
+                "violence": {"status": "completed"},
+                "self_harm": {"status": "completed"},
+            }
+        }
+        assert proc._determine_run_status({}, info, []) == "completed"
+
+    def test_failed_when_category_failed(self):
+        """Run is failed when any category has status 'failed'."""
+        proc = self._make_processor()
+        info = {
+            "baseline": {
+                "violence": {"status": "completed"},
+                "task_adherence": {"status": "failed"},
+            }
+        }
+        assert proc._determine_run_status({}, info, []) == "failed"
+
+    def test_failed_when_category_incomplete(self):
+        """Run is failed when any category has status 'incomplete'."""
+        proc = self._make_processor()
+        info = {
+            "baseline": {
+                "violence": {"status": "incomplete"},
+            }
+        }
+        assert proc._determine_run_status({}, info, []) == "failed"
+
+    def test_failed_when_category_pending(self):
+        """Run is failed when any category is still pending (was never executed)."""
+        proc = self._make_processor()
+        info = {
+            "baseline": {
+                "violence": {"status": "pending"},
+            }
+        }
+        assert proc._determine_run_status({}, info, []) == "failed"
+
+    def test_failed_when_category_partial_failure(self):
+        """Run is failed when any category has status 'partial_failure'."""
+        proc = self._make_processor()
+        info = {
+            "baseline": {
+                "violence": {"status": "completed"},
+                "self_harm": {"status": "partial_failure"},
+            }
+        }
+        assert proc._determine_run_status({}, info, []) == "failed"
+
+    def test_failed_when_category_timeout(self):
+        """Run is failed when any category has status 'timeout'."""
+        proc = self._make_processor()
+        info = {
+            "baseline": {
+                "violence": {"status": "timeout"},
+            }
+        }
+        assert proc._determine_run_status({}, info, []) == "failed"
+
+    def test_completed_when_no_red_team_info(self):
+        """Run is completed when red_team_info is None."""
+        proc = self._make_processor()
+        assert proc._determine_run_status({}, None, []) == "completed"
+
+    def test_non_dict_entries_skipped(self):
+        """Non-dict entries in red_team_info are ignored."""
+        proc = self._make_processor()
+        info = {
+            "baseline": {
+                "violence": {"status": "completed"},
+            },
+            "_metadata": "some string",
+        }
+        assert proc._determine_run_status({}, info, []) == "completed"
+
+    def test_zero_objective_category_recorded_as_failed_triggers_failure(self):
+        """A risk category with 0 objectives that was recorded as failed makes the run fail."""
+        proc = self._make_processor()
+        info = {
+            "baseline": {
+                "violence": {"status": "completed", "expected_count": 32},
+                "sensitive_data_leakage": {
+                    "status": "failed",
+                    "error": "No attack objectives could be prepared for this risk category",
+                    "expected_count": 0,
+                },
+            }
+        }
+        assert proc._determine_run_status({}, info, []) == "failed"
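+
+    # Editorial sketch, not part of the original patch (hypothetical test
+    # name): shows how _extract_expected_total feeds _compute_result_count,
+    # mirroring the wiring added in _build_results_payload above.
+    def test_expected_total_feeds_result_count_sketch(self):
+        """3 expected objectives, 1 output item: the 2 missing become errored."""
+        info = {
+            "baseline": {
+                "violence": {"status": "completed", "expected_count": 3},
+            }
+        }
+        expected = ResultProcessor._extract_expected_total(info)
+        assert expected == 3
+        items = [
+            {"sample": {}, "results": [{"passed": True, "name": "violence"}]},
+        ]
+        result = ResultProcessor._compute_result_count(items, expected_total=expected)
+        assert result == {"total": 3, "passed": 1, "failed": 0, "errored": 2}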