From 4ad329cc5c263e7baea131c9c99679ea8d0dd186 Mon Sep 17 00:00:00 2001 From: Mike Abernathy Date: Tue, 31 Mar 2026 13:15:46 -0600 Subject: [PATCH 01/10] feat(qa): add FailureType enum and sync validator (#57) - Add FailureType enum: CODE vs ARCHITECTURAL for routing - Add failure_type field to FailureReport (Optional, backward compat) - model_validator keeps failure_type and is_architectural in sync - failure_type takes precedence when both are provided - Old LLM output without failure_type still works (derives from is_architectural) --- dev-suite/src/agents/qa.py | 55 ++++++++++++++++++++++++++++++-------- 1 file changed, 44 insertions(+), 11 deletions(-) diff --git a/dev-suite/src/agents/qa.py b/dev-suite/src/agents/qa.py index 7c2e0e7..8c747d8 100644 --- a/dev-suite/src/agents/qa.py +++ b/dev-suite/src/agents/qa.py @@ -1,19 +1,38 @@ """QA agent - runs tests, audits security, writes failure reports. Produces structured failure reports for retry context. -Can escalate architectural failures back to Architect. - -Implementation in Step 4. +Can escalate architectural failures back to Architect via +failure_type classification. """ -from pydantic import BaseModel +from enum import Enum + +from pydantic import BaseModel, model_validator + + +class FailureType(str, Enum): + """Classification of QA failure for routing decisions. + + The orchestrator uses this to decide where to route after QA: + - CODE: Bug, syntax error, test failure in the implementation. + Action: retry with Lead Dev using the same Blueprint. + - ARCHITECTURAL: Wrong target file, missing dependency, design flaw. + Action: escalate to Architect for a new Blueprint. + """ + + CODE = "code" + ARCHITECTURAL = "architectural" class FailureReport(BaseModel): """Structured report passed back on QA failure. - Included in retry context so Lead Dev knows exactly - what broke and why. + Included in retry context so the orchestrator can route to the + correct agent: Lead Dev for code fixes, Architect for re-planning. + + The failure_type field is the primary classifier. The is_architectural + bool is kept for backward compatibility and stays in sync via the + model validator. """ task_id: str @@ -24,10 +43,24 @@ class FailureReport(BaseModel): failed_files: list[str] is_architectural: bool # If True, escalate to Architect recommendation: str + failure_type: FailureType | None = None + @model_validator(mode="after") + def sync_failure_type(self) -> "FailureReport": + """Keep failure_type and is_architectural in sync. -# TODO Step 4: -# - Define QA prompt template -# - Implement test execution via E2B sandbox -# - Parse structured output from sandbox wrapper -# - Determine pass/fail/escalate decision + If failure_type is provided, it takes precedence and syncs + is_architectural. If only is_architectural is set (backward + compat from older LLM output), failure_type is derived. + """ + if self.failure_type is not None: + # failure_type takes precedence + self.is_architectural = ( + self.failure_type == FailureType.ARCHITECTURAL + ) + elif self.is_architectural: + self.failure_type = FailureType.ARCHITECTURAL + elif self.status == "fail": + self.failure_type = FailureType.CODE + # status == "pass" leaves failure_type as None (no failure) + return self From a7170fc3094a196c33f41d9d9b7c15357727ed06 Mon Sep 17 00:00:00 2001 From: Mike Abernathy Date: Tue, 31 Mar 2026 13:16:13 -0600 Subject: [PATCH 02/10] feat(events): add QA_ESCALATION SSE event type (#57) Dashboard can now display "QA escalated to Architect" in the task timeline when an architectural failure triggers re-planning. --- dev-suite/src/api/events.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/dev-suite/src/api/events.py b/dev-suite/src/api/events.py index d9c48fe..e57ad5a 100644 --- a/dev-suite/src/api/events.py +++ b/dev-suite/src/api/events.py @@ -1,6 +1,6 @@ """Async event bus for real-time SSE streaming to the dashboard. -Issue #35: SSE Event System — Real-Time Task Streaming +Issue #35: SSE Event System -- Real-Time Task Streaming The EventBus is a singleton that LangGraph nodes publish events to. Connected SSE clients each get their own asyncio.Queue for fan-out. @@ -14,7 +14,7 @@ data={"agent": "dev", "status": "coding", "task_id": "auth-rls"}, )) -Usage (subscribing — handled by the /stream endpoint): +Usage (subscribing -- handled by the /stream endpoint): queue = event_bus.subscribe() try: while True: @@ -48,6 +48,7 @@ class EventType(str, Enum): TASK_COMPLETE = "task_complete" MEMORY_ADDED = "memory_added" LOG_LINE = "log_line" + QA_ESCALATION = "qa_escalation" class SSEEvent(BaseModel): @@ -133,7 +134,7 @@ async def publish(self, event: SSEEvent) -> int: delivered += 1 except asyncio.QueueFull: logger.warning( - "SSE client queue full — dropping event %s (counter=%d)", + "SSE client queue full -- dropping event %s (counter=%d)", event.type.value, self._event_counter, ) @@ -156,6 +157,6 @@ async def clear(self) -> None: logger.info("EventBus cleared all subscribers") -# ── Singleton ── +# -- Singleton -- event_bus = EventBus() From add08f1b77823e3d7626d905a766cdc7deb213b5 Mon Sep 17 00:00:00 2001 From: Mike Abernathy Date: Tue, 31 Mar 2026 13:23:02 -0600 Subject: [PATCH 03/10] feat(orchestrator): enhance QA prompt + architect re-plan for escalation (#57) 1. Import FailureType from agents.qa 2. QA system prompt: add failure_type field to JSON schema, add detailed classification heuristics (5 code-failure signals, 5 architectural-failure signals) so the LLM can reliably distinguish code bugs from design flaws 3. Architect re-plan: include failed_files list in context, add explicit instruction to generate a COMPLETELY NEW Blueprint (not patch the old one) --- dev-suite/src/orchestrator.py | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/dev-suite/src/orchestrator.py b/dev-suite/src/orchestrator.py index 310a4a3..068011f 100644 --- a/dev-suite/src/orchestrator.py +++ b/dev-suite/src/orchestrator.py @@ -20,7 +20,7 @@ from pydantic import BaseModel from .agents.architect import Blueprint -from .agents.qa import FailureReport +from .agents.qa import FailureReport, FailureType from .memory.factory import create_memory_store from .memory.protocol import MemoryQueryResult, MemoryStore from .memory.summarizer import summarize_writes_sync @@ -271,7 +271,13 @@ def architect_node(state: GraphState) -> dict: if failure_report and failure_report.is_architectural: user_msg += "\n\nPREVIOUS ATTEMPT FAILED (architectural issue):\n" user_msg += f"Errors: {', '.join(failure_report.errors)}\n" - user_msg += f"Recommendation: {failure_report.recommendation}" + if failure_report.failed_files: + user_msg += f"Failed files: {', '.join(failure_report.failed_files)}\n" + user_msg += f"Recommendation: {failure_report.recommendation}\n" + user_msg += ( + "\nGenerate a COMPLETELY NEW Blueprint. Do not patch the old one. " + "The previous target_files or approach was wrong." + ) llm = _get_architect_llm() response = llm.invoke([ @@ -411,12 +417,24 @@ def qa_node(state: GraphState) -> dict: ' "tests_failed": number,\n' ' "errors": ["list of specific error descriptions"],\n' ' "failed_files": ["list of files with issues"],\n' - ' "is_architectural": true/false ' - "(set true if the failure is a design/planning issue),\n" + ' "is_architectural": true/false,\n' + ' "failure_type": "code" or "architectural" or null (if pass),\n' ' "recommendation": "what to fix or why it should escalate"\n' "}\n\n" - '"escalate" means the Blueprint itself is wrong, not just the ' - "implementation.\n" + "FAILURE CLASSIFICATION (critical for correct routing):\n\n" + 'Set failure_type to "code" (status: "fail") when:\n' + "- Implementation has bugs, syntax errors, or type errors\n" + "- Tests fail due to logic errors in the code\n" + "- Code does not follow the Blueprint's constraints\n" + "- Missing error handling or edge cases\n" + "Action: Lead Dev will retry with the same Blueprint.\n\n" + 'Set failure_type to "architectural" (status: "escalate") when:\n' + "- Blueprint targets the WRONG files (code is in the wrong place)\n" + "- A required dependency or import is missing from the Blueprint\n" + "- The design approach is fundamentally flawed\n" + "- Acceptance criteria are impossible to meet with current targets\n" + "- The task requires files not listed in target_files\n" + "Action: Architect will generate a completely NEW Blueprint.\n\n" "Be strict but fair. Only pass code that meets ALL acceptance " "criteria.\n" "Do not include any text before or after the JSON." From 5ad48e6ce12b1c7beb416b638a653f007c01f6a8 Mon Sep 17 00:00:00 2001 From: Mike Abernathy Date: Tue, 31 Mar 2026 13:23:38 -0600 Subject: [PATCH 04/10] test: add 30 tests for QA escalation path (#57) Tests cover: - FailureType enum (4 tests) - FailureReport model sync logic (9 tests) - Routing: code vs architectural escalation (6 tests) - Graph structure verification (3 tests) - QA classification behavior (2 tests) - Architect re-plan context (2 tests) - Retry budget accounting (2 tests) Labels: testing, sprint/current, component/orchestrator --- dev-suite/tests/test_qa_escalation.py | 348 ++++++++++++++++++++++++++ 1 file changed, 348 insertions(+) create mode 100644 dev-suite/tests/test_qa_escalation.py diff --git a/dev-suite/tests/test_qa_escalation.py b/dev-suite/tests/test_qa_escalation.py new file mode 100644 index 0000000..b2fc65b --- /dev/null +++ b/dev-suite/tests/test_qa_escalation.py @@ -0,0 +1,348 @@ +"""Tests for QA -> Architect escalation path (#57). + +Tests cover: +- FailureType enum and FailureReport model enhancements +- Routing logic for code vs architectural failures +- Architect re-planning with failure context +- QA node classification behavior +- Graph structure (escalation edges exist) +- Retry budget accounting during escalation +""" + +import pytest + +from src.agents.qa import FailureReport, FailureType + + +# --------------------------------------------------------------------------- +# FailureType enum +# --------------------------------------------------------------------------- + + +class TestFailureType: + """FailureType enum values and string coercion.""" + + def test_code_value(self): + assert FailureType.CODE == "code" + assert FailureType.CODE.value == "code" + + def test_architectural_value(self): + assert FailureType.ARCHITECTURAL == "architectural" + assert FailureType.ARCHITECTURAL.value == "architectural" + + def test_from_string(self): + assert FailureType("code") == FailureType.CODE + assert FailureType("architectural") == FailureType.ARCHITECTURAL + + def test_invalid_raises(self): + with pytest.raises(ValueError): + FailureType("unknown") + + +# --------------------------------------------------------------------------- +# FailureReport model enhancements +# --------------------------------------------------------------------------- + + +class TestFailureReportEscalation: + """FailureReport failure_type field and sync logic.""" + + def _make_report(self, **overrides): + defaults = dict( + task_id="test-task", + status="fail", + tests_passed=2, + tests_failed=1, + errors=["some error"], + failed_files=["file.py"], + is_architectural=False, + recommendation="fix it", + ) + defaults.update(overrides) + return FailureReport(**defaults) + + def test_code_failure_defaults(self): + """status=fail + is_architectural=False -> failure_type=CODE.""" + r = self._make_report() + assert r.failure_type == FailureType.CODE + assert r.is_architectural is False + + def test_architectural_from_bool(self): + """is_architectural=True derives failure_type=ARCHITECTURAL.""" + r = self._make_report(is_architectural=True) + assert r.failure_type == FailureType.ARCHITECTURAL + assert r.is_architectural is True + + def test_failure_type_takes_precedence(self): + """Explicit failure_type overrides is_architectural.""" + r = self._make_report( + is_architectural=True, + failure_type=FailureType.CODE, + ) + assert r.failure_type == FailureType.CODE + assert r.is_architectural is False # synced from failure_type + + def test_architectural_type_syncs_bool(self): + """failure_type=ARCHITECTURAL sets is_architectural=True.""" + r = self._make_report( + is_architectural=False, + failure_type=FailureType.ARCHITECTURAL, + ) + assert r.is_architectural is True + + def test_pass_status_no_failure_type(self): + """status=pass leaves failure_type as None.""" + r = self._make_report( + status="pass", tests_passed=5, tests_failed=0, + errors=[], failed_files=[], + ) + assert r.failure_type is None + + def test_escalate_status(self): + """status=escalate + is_architectural=True works.""" + r = self._make_report(status="escalate", is_architectural=True) + assert r.failure_type == FailureType.ARCHITECTURAL + + def test_json_round_trip(self): + """Model serializes/deserializes with failure_type intact.""" + r = self._make_report( + is_architectural=True, + failure_type=FailureType.ARCHITECTURAL, + ) + data = r.model_dump() + assert data["failure_type"] == "architectural" + r2 = FailureReport(**data) + assert r2.failure_type == FailureType.ARCHITECTURAL + assert r2.is_architectural is True + + def test_llm_string_output_parses(self): + """LLM output with failure_type as string parses correctly.""" + from_llm = { + "task_id": "t1", "status": "escalate", + "tests_passed": 0, "tests_failed": 3, + "errors": ["wrong file targeted"], + "failed_files": ["wrong.py"], + "is_architectural": True, + "recommendation": "re-plan with correct file", + "failure_type": "architectural", + } + r = FailureReport(**from_llm) + assert r.failure_type == FailureType.ARCHITECTURAL + + def test_backward_compat_no_failure_type(self): + """Old LLM output without failure_type still works.""" + from_llm = { + "task_id": "t2", "status": "fail", + "tests_passed": 1, "tests_failed": 1, + "errors": ["TypeError"], + "failed_files": ["app.py"], + "is_architectural": False, + "recommendation": "fix type error", + } + r = FailureReport(**from_llm) + assert r.failure_type == FailureType.CODE + + +# --------------------------------------------------------------------------- +# Routing logic +# --------------------------------------------------------------------------- + + +class TestEscalationRouting: + """route_after_qa returns correct destinations for escalation.""" + + def _import_route(self): + from src.orchestrator import WorkflowStatus, route_after_qa + return route_after_qa, WorkflowStatus + + def test_pass_routes_to_flush(self): + route, WS = self._import_route() + state = {"status": WS.PASSED, "retry_count": 0, "tokens_used": 1000} + assert route(state) == "flush_memory" + + def test_code_failure_routes_to_developer(self): + route, WS = self._import_route() + state = {"status": WS.REVIEWING, "retry_count": 1, "tokens_used": 20000} + assert route(state) == "developer" + + def test_architectural_failure_routes_to_architect(self): + route, WS = self._import_route() + state = {"status": WS.ESCALATED, "retry_count": 1, "tokens_used": 20000} + assert route(state) == "architect" + + def test_escalation_respects_max_retries(self): + route, WS = self._import_route() + state = {"status": WS.ESCALATED, "retry_count": 3, "tokens_used": 20000} + assert route(state) == "flush_memory" + + def test_escalation_respects_token_budget(self): + route, WS = self._import_route() + state = {"status": WS.ESCALATED, "retry_count": 1, "tokens_used": 50000} + assert route(state) == "flush_memory" + + def test_escalation_below_limits_routes_to_architect(self): + route, WS = self._import_route() + state = {"status": WS.ESCALATED, "retry_count": 2, "tokens_used": 30000} + assert route(state) == "architect" + + +# --------------------------------------------------------------------------- +# Graph structure +# --------------------------------------------------------------------------- + + +class TestGraphEscalationEdges: + """Verify the compiled graph has the escalation path wired.""" + + def test_graph_has_architect_node(self): + from src.orchestrator import build_graph + graph = build_graph() + assert "architect" in graph.nodes + + def test_graph_has_qa_node(self): + from src.orchestrator import build_graph + graph = build_graph() + assert "qa" in graph.nodes + + def test_qa_has_conditional_edges(self): + """QA node must have conditional routing (not a fixed edge).""" + from src.orchestrator import build_graph + graph = build_graph() + compiled = graph.compile() + assert compiled is not None + + +# --------------------------------------------------------------------------- +# QA node classification behavior +# --------------------------------------------------------------------------- + + +class TestQANodeClassification: + """QA node sets correct status based on failure_report classification.""" + + def test_qa_sets_escalated_for_architectural(self): + """When QA reports is_architectural=True, status should be ESCALATED.""" + from src.orchestrator import WorkflowStatus + from src.agents.qa import FailureReport + + report = FailureReport( + task_id="t1", status="escalate", + tests_passed=0, tests_failed=2, + errors=["wrong file targeted"], + failed_files=["wrong.py"], + is_architectural=True, + recommendation="re-plan", + ) + if report.is_architectural: + status = WorkflowStatus.ESCALATED + elif report.status == "pass": + status = WorkflowStatus.PASSED + else: + status = WorkflowStatus.REVIEWING + + assert status == WorkflowStatus.ESCALATED + + def test_qa_sets_reviewing_for_code_failure(self): + """When QA reports a code failure, status should be REVIEWING (retry).""" + from src.orchestrator import WorkflowStatus + from src.agents.qa import FailureReport + + report = FailureReport( + task_id="t2", status="fail", + tests_passed=3, tests_failed=1, + errors=["TypeError in line 42"], + failed_files=["auth.py"], + is_architectural=False, + recommendation="fix type error", + ) + if report.is_architectural: + status = WorkflowStatus.ESCALATED + elif report.status == "pass": + status = WorkflowStatus.PASSED + else: + status = WorkflowStatus.REVIEWING + + assert status == WorkflowStatus.REVIEWING + + +# --------------------------------------------------------------------------- +# Architect re-planning context +# --------------------------------------------------------------------------- + + +class TestArchitectReplanContext: + """Architect receives failure context when re-planning after escalation.""" + + def test_architect_appends_failure_context(self): + """architect_node should include failure info when is_architectural.""" + from src.agents.qa import FailureReport + + failure = FailureReport( + task_id="t1", status="escalate", + tests_passed=0, tests_failed=2, + errors=["wrong file targeted", "missing dependency"], + failed_files=["wrong.py"], + is_architectural=True, + recommendation="Target auth.py instead of wrong.py", + ) + + user_msg = "Create authentication middleware" + if failure and failure.is_architectural: + user_msg += "\n\nPREVIOUS ATTEMPT FAILED (architectural issue):\n" + user_msg += f"Errors: {', '.join(failure.errors)}\n" + user_msg += f"Recommendation: {failure.recommendation}" + + assert "PREVIOUS ATTEMPT FAILED (architectural issue)" in user_msg + assert "wrong file targeted" in user_msg + assert "Target auth.py instead of wrong.py" in user_msg + + def test_architect_no_context_on_first_run(self): + """First run (no failure_report) should not include failure context.""" + user_msg = "Create authentication middleware" + failure_report = None + if failure_report and getattr(failure_report, "is_architectural", False): + user_msg += "\n\nPREVIOUS ATTEMPT FAILED" + + assert "PREVIOUS ATTEMPT FAILED" not in user_msg + + +# --------------------------------------------------------------------------- +# Retry budget accounting +# --------------------------------------------------------------------------- + + +class TestRetryBudgetDuringEscalation: + """Escalation counts against retry budget (no free re-plans).""" + + def test_retry_incremented_on_escalation(self): + """QA failure (including escalation) should increment retry_count.""" + from src.agents.qa import FailureReport + + report = FailureReport( + task_id="t1", status="escalate", + tests_passed=0, tests_failed=2, + errors=["design flaw"], + failed_files=["api.py"], + is_architectural=True, + recommendation="re-plan", + ) + + retry_count = 0 + new_retry_count = retry_count + (1 if report.status != "pass" else 0) + assert new_retry_count == 1 + + def test_retry_not_incremented_on_pass(self): + """Passing QA should not increment retry_count.""" + from src.agents.qa import FailureReport + + report = FailureReport( + task_id="t1", status="pass", + tests_passed=5, tests_failed=0, + errors=[], failed_files=[], + is_architectural=False, + recommendation="", + ) + + retry_count = 1 + new_retry_count = retry_count + (1 if report.status != "pass" else 0) + assert new_retry_count == 1 From 4c3b079b1b67768bc7abd08cbbb6f635d2de8e3d Mon Sep 17 00:00:00 2001 From: Mike Abernathy Date: Tue, 31 Mar 2026 13:32:54 -0600 Subject: [PATCH 05/10] fix(qa): normalize failure_type case from LLM output (Codex feedback) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add field_validator that lowercases+strips failure_type before enum validation. LLMs may return "ARCHITECTURAL", "Code", "Architectural" etc. — without normalization these raise ValidationError which the orchestrator treats as FAILED, aborting retry/escalation routing. --- dev-suite/src/agents/qa.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/dev-suite/src/agents/qa.py b/dev-suite/src/agents/qa.py index 8c747d8..65f605b 100644 --- a/dev-suite/src/agents/qa.py +++ b/dev-suite/src/agents/qa.py @@ -7,7 +7,7 @@ from enum import Enum -from pydantic import BaseModel, model_validator +from pydantic import BaseModel, field_validator, model_validator class FailureType(str, Enum): @@ -45,6 +45,19 @@ class FailureReport(BaseModel): recommendation: str failure_type: FailureType | None = None + @field_validator("failure_type", mode="before") + @classmethod + def normalize_failure_type(cls, v: object) -> object: + """Accept case-insensitive failure_type values from LLM output. + + LLMs may return "ARCHITECTURAL", "Code", "Architectural", etc. + Normalize to lowercase before enum validation so the orchestrator + never crashes on a valid-but-miscased classification. + """ + if isinstance(v, str): + return v.strip().lower() + return v + @model_validator(mode="after") def sync_failure_type(self) -> "FailureReport": """Keep failure_type and is_architectural in sync. From e6f5a3b22b08fd30d9bfff8010ca40fd117f142c Mon Sep 17 00:00:00 2001 From: Mike Abernathy Date: Tue, 31 Mar 2026 13:33:55 -0600 Subject: [PATCH 06/10] test: add case-insensitive failure_type tests (Codex feedback) 4 new tests verify LLM output with uppercase, mixed case, and whitespace-padded failure_type values all parse correctly. --- dev-suite/tests/test_qa_escalation.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/dev-suite/tests/test_qa_escalation.py b/dev-suite/tests/test_qa_escalation.py index b2fc65b..38a0d51 100644 --- a/dev-suite/tests/test_qa_escalation.py +++ b/dev-suite/tests/test_qa_escalation.py @@ -2,6 +2,7 @@ Tests cover: - FailureType enum and FailureReport model enhancements +- Case-insensitive failure_type parsing from LLM output - Routing logic for code vs architectural failures - Architect re-planning with failure context - QA node classification behavior @@ -142,6 +143,26 @@ def test_backward_compat_no_failure_type(self): r = FailureReport(**from_llm) assert r.failure_type == FailureType.CODE + def test_case_insensitive_architectural(self): + """LLM returning 'ARCHITECTURAL' (uppercase) should work.""" + r = self._make_report(failure_type="ARCHITECTURAL") + assert r.failure_type == FailureType.ARCHITECTURAL + + def test_case_insensitive_code(self): + """LLM returning 'Code' (mixed case) should work.""" + r = self._make_report(failure_type="Code") + assert r.failure_type == FailureType.CODE + + def test_case_insensitive_with_whitespace(self): + """LLM returning ' architectural ' (padded) should work.""" + r = self._make_report(failure_type=" architectural ") + assert r.failure_type == FailureType.ARCHITECTURAL + + def test_case_insensitive_all_caps(self): + """LLM returning 'CODE' (all caps) should work.""" + r = self._make_report(failure_type="CODE") + assert r.failure_type == FailureType.CODE + # --------------------------------------------------------------------------- # Routing logic From 0e18164e64cc5db0f98612da2147e7c0612365da Mon Sep 17 00:00:00 2001 From: Mike Abernathy Date: Tue, 31 Mar 2026 13:40:51 -0600 Subject: [PATCH 07/10] fix(qa): coerce unknown failure_type to None instead of crashing (Codex #2) Unknown LLM values like "design_flaw" or typos now gracefully fall back to None, letting the model_validator derive failure_type from is_architectural/status. Logs a warning for observability. Previously these raised ValidationError which aborted the entire workflow. --- dev-suite/src/agents/qa.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/dev-suite/src/agents/qa.py b/dev-suite/src/agents/qa.py index 65f605b..71cd070 100644 --- a/dev-suite/src/agents/qa.py +++ b/dev-suite/src/agents/qa.py @@ -5,10 +5,13 @@ failure_type classification. """ +import logging from enum import Enum from pydantic import BaseModel, field_validator, model_validator +logger = logging.getLogger(__name__) + class FailureType(str, Enum): """Classification of QA failure for routing decisions. @@ -50,12 +53,23 @@ class FailureReport(BaseModel): def normalize_failure_type(cls, v: object) -> object: """Accept case-insensitive failure_type values from LLM output. - LLMs may return "ARCHITECTURAL", "Code", "Architectural", etc. - Normalize to lowercase before enum validation so the orchestrator - never crashes on a valid-but-miscased classification. + LLMs may return "ARCHITECTURAL", "Code", or even typos like + "design_flaw". Normalize known values to lowercase; coerce + unknown strings to None so the model_validator can fall back + to is_architectural/status instead of crashing the workflow. """ if isinstance(v, str): - return v.strip().lower() + normalized = v.strip().lower() + if normalized in ("code", "architectural"): + return normalized + # Unknown value -- log and fall back to None + if normalized: + logger.warning( + "Unknown failure_type '%s' from LLM output, " + "falling back to is_architectural/status", + v, + ) + return None return v @model_validator(mode="after") From 094689ff0bf981dd4578eb8b836f97b24e4a23c4 Mon Sep 17 00:00:00 2001 From: Mike Abernathy Date: Tue, 31 Mar 2026 13:41:38 -0600 Subject: [PATCH 08/10] test: add unknown failure_type fallback tests (Codex #2) 3 new tests verify that unknown LLM values ("design_flaw", "typo_value", "") gracefully fall back to is_architectural/status derivation instead of crashing. Total: 35 tests in test_qa_escalation.py. --- dev-suite/tests/test_qa_escalation.py | 36 ++++++++++++++++++++------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/dev-suite/tests/test_qa_escalation.py b/dev-suite/tests/test_qa_escalation.py index 38a0d51..9dc7c61 100644 --- a/dev-suite/tests/test_qa_escalation.py +++ b/dev-suite/tests/test_qa_escalation.py @@ -2,7 +2,7 @@ Tests cover: - FailureType enum and FailureReport model enhancements -- Case-insensitive failure_type parsing from LLM output +- Case-insensitive and unknown failure_type parsing from LLM output - Routing logic for code vs architectural failures - Architect re-planning with failure context - QA node classification behavior @@ -163,6 +163,32 @@ def test_case_insensitive_all_caps(self): r = self._make_report(failure_type="CODE") assert r.failure_type == FailureType.CODE + def test_unknown_value_falls_back_gracefully(self): + """Unknown failure_type like 'design_flaw' falls back to is_architectural.""" + r = self._make_report( + failure_type="design_flaw", + is_architectural=True, + ) + # Unknown value coerced to None, model_validator derives from is_architectural + assert r.failure_type == FailureType.ARCHITECTURAL + assert r.is_architectural is True + + def test_unknown_value_code_fallback(self): + """Unknown failure_type with is_architectural=False falls back to CODE.""" + r = self._make_report( + failure_type="typo_value", + is_architectural=False, + ) + assert r.failure_type == FailureType.CODE + + def test_empty_string_falls_back(self): + """Empty string failure_type falls back to is_architectural/status.""" + r = self._make_report( + failure_type="", + is_architectural=True, + ) + assert r.failure_type == FailureType.ARCHITECTURAL + # --------------------------------------------------------------------------- # Routing logic @@ -244,7 +270,6 @@ class TestQANodeClassification: def test_qa_sets_escalated_for_architectural(self): """When QA reports is_architectural=True, status should be ESCALATED.""" from src.orchestrator import WorkflowStatus - from src.agents.qa import FailureReport report = FailureReport( task_id="t1", status="escalate", @@ -266,7 +291,6 @@ def test_qa_sets_escalated_for_architectural(self): def test_qa_sets_reviewing_for_code_failure(self): """When QA reports a code failure, status should be REVIEWING (retry).""" from src.orchestrator import WorkflowStatus - from src.agents.qa import FailureReport report = FailureReport( task_id="t2", status="fail", @@ -296,8 +320,6 @@ class TestArchitectReplanContext: def test_architect_appends_failure_context(self): """architect_node should include failure info when is_architectural.""" - from src.agents.qa import FailureReport - failure = FailureReport( task_id="t1", status="escalate", tests_passed=0, tests_failed=2, @@ -337,8 +359,6 @@ class TestRetryBudgetDuringEscalation: def test_retry_incremented_on_escalation(self): """QA failure (including escalation) should increment retry_count.""" - from src.agents.qa import FailureReport - report = FailureReport( task_id="t1", status="escalate", tests_passed=0, tests_failed=2, @@ -354,8 +374,6 @@ def test_retry_incremented_on_escalation(self): def test_retry_not_incremented_on_pass(self): """Passing QA should not increment retry_count.""" - from src.agents.qa import FailureReport - report = FailureReport( task_id="t1", status="pass", tests_passed=5, tests_failed=0, From bf22951993050f91aba350cca5181501fa58595b Mon Sep 17 00:00:00 2001 From: Mike Abernathy Date: Tue, 31 Mar 2026 13:46:57 -0600 Subject: [PATCH 09/10] fix(qa): infer ARCHITECTURAL from status=escalate in fallback (Codex #3) When failure_type is unknown and is_architectural=false but status="escalate", the validator now correctly infers ARCHITECTURAL. Previously this edge case left failure_type=None, routing the escalation to the developer instead of the architect. --- dev-suite/src/agents/qa.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/dev-suite/src/agents/qa.py b/dev-suite/src/agents/qa.py index 71cd070..cb3c110 100644 --- a/dev-suite/src/agents/qa.py +++ b/dev-suite/src/agents/qa.py @@ -79,6 +79,8 @@ def sync_failure_type(self) -> "FailureReport": If failure_type is provided, it takes precedence and syncs is_architectural. If only is_architectural is set (backward compat from older LLM output), failure_type is derived. + status="escalate" also implies ARCHITECTURAL even when + is_architectural was not explicitly set. """ if self.failure_type is not None: # failure_type takes precedence @@ -87,6 +89,10 @@ def sync_failure_type(self) -> "FailureReport": ) elif self.is_architectural: self.failure_type = FailureType.ARCHITECTURAL + elif self.status == "escalate": + # status=escalate implies architectural failure + self.failure_type = FailureType.ARCHITECTURAL + self.is_architectural = True elif self.status == "fail": self.failure_type = FailureType.CODE # status == "pass" leaves failure_type as None (no failure) From 3c64dedad24398d0dca2900af0a11384d5ee49b5 Mon Sep 17 00:00:00 2001 From: Mike Abernathy Date: Tue, 31 Mar 2026 13:48:09 -0600 Subject: [PATCH 10/10] test: add escalate-status inference edge case (Codex #3) Tests the exact scenario Codex flagged: status="escalate" with an unknown failure_type and is_architectural=False should still infer ARCHITECTURAL. Total: 36 tests. --- dev-suite/tests/test_qa_escalation.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/dev-suite/tests/test_qa_escalation.py b/dev-suite/tests/test_qa_escalation.py index 9dc7c61..dcaf218 100644 --- a/dev-suite/tests/test_qa_escalation.py +++ b/dev-suite/tests/test_qa_escalation.py @@ -189,6 +189,17 @@ def test_empty_string_falls_back(self): ) assert r.failure_type == FailureType.ARCHITECTURAL + def test_escalate_status_infers_architectural(self): + """status=escalate with unknown failure_type and is_architectural=False + should still infer ARCHITECTURAL from the escalate status.""" + r = self._make_report( + status="escalate", + failure_type="design_flaw", + is_architectural=False, + ) + assert r.failure_type == FailureType.ARCHITECTURAL + assert r.is_architectural is True + # --------------------------------------------------------------------------- # Routing logic