From b16db2ce8dd23c13f616f89157fdc54c25d27811 Mon Sep 17 00:00:00 2001
From: Mike Abernathy <mabernathy87@gmail.com>
Date: Wed, 1 Apr 2026 22:26:20 -0600
Subject: [PATCH 01/12] feat(sandbox): add ValidationStrategy enum and update
 test coverage (#95)

- Add ValidationStrategy enum (TEST_SUITE, SCRIPT_EXEC, LINT_ONLY, SKIP)
- Add strategy, script_file fields to ValidationPlan
- Detect test files via TEST_INDICATORS to select strategy
- SCRIPT_EXEC for single scripts without test suites
- Ruff guard: `which ruff` check before running lint
- Split PYTHON_COMMANDS into PYTHON_LINT_COMMANDS + PYTHON_TEST_COMMANDS
- Updated tests for new strategy selection logic
- Updated sandbox_validate_node tests for strategy dispatch
---
 dev-suite/src/sandbox/validation_commands.py | 181 +++++++++++++++----
 1 file changed, 148 insertions(+), 33 deletions(-)

diff --git a/dev-suite/src/sandbox/validation_commands.py b/dev-suite/src/sandbox/validation_commands.py
index 7b9ea56..0a3f7f1 100644
--- a/dev-suite/src/sandbox/validation_commands.py
+++ b/dev-suite/src/sandbox/validation_commands.py
@@ -2,14 +2,21 @@
 
 Maps file types from Blueprint target_files to:
   - The appropriate sandbox template (default Python vs fullstack)
+  - The validation strategy (TEST_SUITE, SCRIPT_EXEC, LINT_ONLY, SKIP)
   - The validation commands QA should run
 
-This module is pure functions with no side effects — easy to test
+This module is pure functions with no side effects -- easy to test
 and extend as new file types are supported.
+
+Issue #95: Added ValidationStrategy enum to differentiate between
+"run tests", "just execute the script", and "lint only" paths.
+Previously, all Python tasks got ruff + pytest even when no test
+suite existed, causing misleading "passed" results.
 """
 
 import os
 from dataclasses import dataclass, field
+from enum import Enum
 
 from .e2b_runner import select_template_for_files
 
@@ -19,29 +26,58 @@
 FRONTEND_EXTENSIONS = {".svelte", ".ts", ".tsx", ".js", ".jsx", ".css", ".vue"}
 PYTHON_EXTENSIONS = {".py", ".pyi"}
 
+# Filenames/patterns that indicate a test suite exists
+TEST_INDICATORS = {"tests/", "test_", "_test.py", "tests.py", ".test.", ".spec."}
+
+
+class ValidationStrategy(str, Enum):
+    """How to validate the generated code in the sandbox.
+
+    TEST_SUITE:  Full validation -- lint + test runner. Used when test
+                 files are present in target_files.
+    SCRIPT_EXEC: Run the primary script and check exit code + stdout.
+                 Used for single-file scripts with no test suite.
+    LINT_ONLY:   Syntax/lint check only, no execution. Reserved for
+                 future use (config generators, migrations, etc.).
+    SKIP:        No validation needed (non-code files).
+    """
+    TEST_SUITE = "test_suite"
+    SCRIPT_EXEC = "script_exec"
+    LINT_ONLY = "lint_only"
+    SKIP = "skip"
+
 
 @dataclass
 class ValidationPlan:
     """What to validate and how.
 
     Attributes:
+        strategy: The validation approach to use.
         template: Sandbox template name ("fullstack" or None for default).
-        commands: Shell commands to run in the sandbox.
+        commands: Shell commands to run in the sandbox (for TEST_SUITE).
+        script_file: Primary file to execute (for SCRIPT_EXEC).
         description: Human-readable summary for logging/QA context.
     """
 
+    strategy: ValidationStrategy = ValidationStrategy.SKIP
     template: str | None = None
     commands: list[str] = field(default_factory=list)
+    script_file: str | None = None
     description: str = ""
 
 
 # -- Command Sets --
 
-PYTHON_COMMANDS = [
-    "ruff check . --select=E,F,W --no-fix",
-    "python -m pytest tests/ -v --tb=short 2>&1 || true",
+PYTHON_LINT_COMMANDS = [
+    "which ruff >/dev/null 2>&1 && ruff check . --select=E,F,W --no-fix || echo '[WARN] ruff not available -- lint skipped'",
+]
+
+PYTHON_TEST_COMMANDS = [
+    "python -m pytest tests/ -v --tb=short",
 ]
 
+PYTHON_COMMANDS = PYTHON_LINT_COMMANDS + PYTHON_TEST_COMMANDS
+
 FRONTEND_COMMANDS = [
     "cd /home/user/dashboard && pnpm check 2>&1 || true",
     "cd /home/user/dashboard && pnpm exec tsc --noEmit 2>&1 || true",
@@ -51,20 +87,63 @@ class ValidationPlan:
 FULLSTACK_COMMANDS = FRONTEND_COMMANDS + PYTHON_COMMANDS
 
 
+def _has_test_files(target_files: list[str]) -> bool:
+    """Check if any target files look like test files or test directories."""
+    for filepath in target_files:
+        lower = filepath.lower()
+        for indicator in TEST_INDICATORS:
+            if indicator in lower:
+                return True
+    return False
+
+
+def _get_primary_script(target_files: list[str]) -> str | None:
+    """Find the primary executable script from target files.
+
+    Returns the first .py file that doesn't look like a test file,
+    or the first .py file if all look like tests, or None.
+    """
+    py_files = [
+        f for f in target_files
+        if os.path.splitext(f.lower())[1] in PYTHON_EXTENSIONS
+    ]
+    if not py_files:
+        return None
+
+    # Prefer non-test files
+    for f in py_files:
+        lower = f.lower()
+        is_test = any(ind in lower for ind in TEST_INDICATORS)
+        if not is_test:
+            return f
+
+    # Fallback to first .py file
+    return py_files[0]
+
+
 def get_validation_plan(target_files: list[str]) -> ValidationPlan:
     """Determine the validation plan based on target file types.
 
+    Strategy selection:
+      - No files -> SKIP
+      - Non-code files only -> SKIP
+      - Frontend files -> TEST_SUITE (pnpm check + tsc)
+      - Python files with test files -> TEST_SUITE (ruff + pytest)
+      - Python files without test files -> SCRIPT_EXEC (run the script)
+      - Mixed frontend + Python -> TEST_SUITE (fullstack)
+
     Args:
         target_files: List of file paths from the Blueprint.
 
     Returns:
-        ValidationPlan with template, commands, and description.
+        ValidationPlan with strategy, template, commands, and description.
     """
     if not target_files:
         return ValidationPlan(
+            strategy=ValidationStrategy.SKIP,
             template=None,
             commands=[],
-            description="No target files \u2014 skipping validation",
+            description="No target files -- skipping validation",
         )
 
     has_frontend = False
@@ -80,37 +159,64 @@ def get_validation_plan(target_files: list[str]) -> ValidationPlan:
     # Pick template
     template = select_template_for_files(target_files)
 
-    # Pick commands
+    # Pick strategy and commands
     if has_frontend and has_python:
-        commands = list(FULLSTACK_COMMANDS)
-        description = (
-            f"Full-stack validation: {len(target_files)} files "
-            f"(frontend + Python) using fullstack template"
-        )
-    elif has_frontend:
-        commands = list(FRONTEND_COMMANDS)
-        description = (
-            f"Frontend validation: {len(target_files)} files "
-            f"using fullstack template (pnpm check + tsc)"
-        )
-    elif has_python:
-        commands = list(PYTHON_COMMANDS)
-        description = (
-            f"Python validation: {len(target_files)} files "
-            f"using default template (ruff + pytest)"
+        # Full-stack: always TEST_SUITE
+        return ValidationPlan(
+            strategy=ValidationStrategy.TEST_SUITE,
+            template=template,
+            commands=list(FULLSTACK_COMMANDS),
+            description=(
+                f"Full-stack validation: {len(target_files)} files "
+                f"(frontend + Python) using fullstack template"
+            ),
         )
-    else:
-        # Non-code files (JSON, YAML, SQL, etc.) \u2014 no validation
-        commands = []
-        description = (
-            f"No code validation needed for {len(target_files)} files "
-            f"(non-code file types)"
+
+    if has_frontend:
+        return ValidationPlan(
+            strategy=ValidationStrategy.TEST_SUITE,
+            template=template,
+            commands=list(FRONTEND_COMMANDS),
+            description=(
+                f"Frontend validation: {len(target_files)} files "
+                f"using fullstack template (pnpm check + tsc)"
+            ),
         )
 
+    if has_python:
+        has_tests = _has_test_files(target_files)
+        if has_tests:
+            return ValidationPlan(
+                strategy=ValidationStrategy.TEST_SUITE,
+                template=template,
+                commands=list(PYTHON_COMMANDS),
+                description=(
+                    f"Python validation: {len(target_files)} files "
+                    f"using default template (ruff + pytest)"
+                ),
+            )
+        else:
+            primary = _get_primary_script(target_files)
+            return ValidationPlan(
+                strategy=ValidationStrategy.SCRIPT_EXEC,
+                template=template,
+                commands=[],
+                script_file=primary,
+                description=(
+                    f"Script execution: {len(target_files)} files "
+                    f"using default template (run + check exit code)"
+                ),
+            )
+
+    # Non-code files (JSON, YAML, SQL, etc.)
     return ValidationPlan(
-        template=template,
-        commands=commands,
-        description=description,
+        strategy=ValidationStrategy.SKIP,
+        template=None,
+        commands=[],
+        description=(
+            f"No code validation needed for {len(target_files)} files "
+            f"(non-code file types)"
+        ),
     )
 
 
@@ -119,6 +225,15 @@ def format_validation_summary(plan: ValidationPlan) -> str:
 
     Used in the QA prompt to describe what sandbox validation was attempted.
     """
+    if plan.strategy == ValidationStrategy.SKIP:
+        return plan.description
+
+    if plan.strategy == ValidationStrategy.SCRIPT_EXEC:
+        lines = [plan.description]
+        if plan.script_file:
+            lines.append(f"  Executing: python {plan.script_file}")
+        return "\n".join(lines)
+
     if not plan.commands:
         return plan.description
 

From 02c5c42355172363e7cac8b40ed1eb9d06a639e4 Mon Sep 17 00:00:00 2001
From: Mike Abernathy <mabernathy87@gmail.com>
Date: Wed, 1 Apr 2026 22:29:16 -0600
Subject: [PATCH 02/12] test: update validation_commands and sandbox_validate
 tests for strategy dispatch (#95)

---
 dev-suite/tests/test_sandbox_validate.py    | 172 ++++++++++++++------
 dev-suite/tests/test_validation_commands.py | 134 ++++++++++++---
 2 files changed, 239 insertions(+), 67 deletions(-)

diff --git a/dev-suite/tests/test_sandbox_validate.py b/dev-suite/tests/test_sandbox_validate.py
index f0e01c5..71bd600 100644
--- a/dev-suite/tests/test_sandbox_validate.py
+++ b/dev-suite/tests/test_sandbox_validate.py
@@ -1,15 +1,18 @@
 """Tests for the sandbox_validate orchestrator node.
 
 These tests validate the sandbox_validate node behavior with mocked
-E2B runner — no real sandbox needed. Tests cover:
+E2B runner -- no real sandbox needed. Tests cover:
+  - Strategy-based dispatch (TEST_SUITE, SCRIPT_EXEC, SKIP)
   - Template selection from blueprint target_files
-  - Validation command execution
   - Graceful skip when E2B_API_KEY is missing
+  - validation_skipped flag propagation
   - SandboxResult stored in graph state
-  - QA prompt includes sandbox results
+  - Warnings surfaced in trace
+
+Issue #95: Updated for ValidationStrategy dispatch.
 """
 
-from unittest.mock import MagicMock, patch
+from unittest.mock import patch
 
 import pytest
 
@@ -43,19 +46,91 @@ def _make_state(target_files: list[str], **overrides) -> GraphState:
         "memory_context": [],
         "memory_writes": [],
         "trace": [],
+        "parsed_files": [],
+        "tool_calls_log": [],
     }
     state.update(overrides)
     return state
 
 
+class TestSandboxValidateStrategy:
+    """Tests for strategy-based dispatch in sandbox_validate_node."""
+
+    def test_script_exec_for_single_python_file(self):
+        """Single .py file without tests should use SCRIPT_EXEC."""
+        state = _make_state(["hello_world.py"])
+
+        with patch("src.orchestrator._run_sandbox_script") as mock_script:
+            mock_script.return_value = SandboxResult(
+                exit_code=0,
+                tests_passed=1,
+                tests_failed=0,
+                output_summary="Hello, World!",
+            )
+            result = sandbox_validate_node(state)
+
+        assert result["sandbox_result"] is not None
+        assert result["sandbox_result"].exit_code == 0
+        assert result["sandbox_result"].tests_passed == 1
+        mock_script.assert_called_once()
+        call_kwargs = mock_script.call_args[1]
+        assert call_kwargs["script_file"] == "hello_world.py"
+
+    def test_test_suite_for_python_with_tests(self):
+        """Python files with test files should use TEST_SUITE."""
+        state = _make_state(["src/main.py", "tests/test_main.py"])
+
+        with patch("src.orchestrator._run_sandbox_tests") as mock_tests:
+            mock_tests.return_value = SandboxResult(
+                exit_code=0,
+                tests_passed=5,
+                tests_failed=0,
+                output_summary="5 passed in 1.2s",
+            )
+            result = sandbox_validate_node(state)
+
+        assert result["sandbox_result"] is not None
+        assert result["sandbox_result"].tests_passed == 5
+        mock_tests.assert_called_once()
+        call_kwargs = mock_tests.call_args[1]
+        assert call_kwargs["template"] is None
+
+    def test_test_suite_for_frontend_files(self):
+        """Frontend files should use TEST_SUITE with fullstack template."""
+        state = _make_state(["dashboard/src/lib/Widget.svelte"])
+
+        with patch("src.orchestrator._run_sandbox_tests") as mock_tests:
+            mock_tests.return_value = SandboxResult(
+                exit_code=0,
+                tests_passed=1,
+                tests_failed=0,
+                output_summary="svelte-check found 0 errors",
+            )
+            result = sandbox_validate_node(state)
+
+        assert result["sandbox_result"] is not None
+        call_kwargs = mock_tests.call_args[1]
+        assert call_kwargs["template"] == "fullstack"
+
+    def test_skip_for_non_code_files(self):
+        """Non-code files should get SKIP strategy with validation_skipped=True."""
+        state = _make_state(["schema.sql", "config.yaml"])
+
+        result = sandbox_validate_node(state)
+
+        assert result["sandbox_result"] is not None
+        assert result["sandbox_result"].validation_skipped is True
+        assert result["sandbox_result"].exit_code == 0
+
+
 class TestSandboxValidateNode:
-    """Tests for the sandbox_validate_node function."""
+    """Core sandbox_validate_node tests."""
 
     def test_python_task_uses_default_template(self):
         """Python-only tasks should use the default (None) template."""
         state = _make_state(["src/main.py", "tests/test_main.py"])
 
-        with patch("src.orchestrator._run_sandbox_validation") as mock_run:
+        with patch("src.orchestrator._run_sandbox_tests") as mock_run:
             mock_run.return_value = SandboxResult(
                 exit_code=0,
                 tests_passed=5,
@@ -67,15 +142,14 @@ def test_python_task_uses_default_template(self):
         assert result["sandbox_result"] is not None
         assert result["sandbox_result"].exit_code == 0
         assert result["sandbox_result"].tests_passed == 5
-        # Check it was called with None template (default)
-        call_args = mock_run.call_args
-        assert call_args[1]["template"] is None
+        call_kwargs = mock_run.call_args[1]
+        assert call_kwargs["template"] is None
 
     def test_svelte_task_uses_fullstack_template(self):
         """Svelte tasks should use the fullstack template."""
         state = _make_state(["dashboard/src/lib/Widget.svelte"])
 
-        with patch("src.orchestrator._run_sandbox_validation") as mock_run:
+        with patch("src.orchestrator._run_sandbox_tests") as mock_run:
             mock_run.return_value = SandboxResult(
                 exit_code=0,
                 tests_passed=1,
@@ -85,14 +159,14 @@ def test_svelte_task_uses_fullstack_template(self):
             result = sandbox_validate_node(state)
 
         assert result["sandbox_result"] is not None
-        call_args = mock_run.call_args
-        assert call_args[1]["template"] == "fullstack"
+        call_kwargs = mock_run.call_args[1]
+        assert call_kwargs["template"] == "fullstack"
 
     def test_mixed_task_uses_fullstack_template(self):
         """Mixed Python+frontend tasks should use fullstack."""
         state = _make_state(["src/api.py", "dashboard/src/App.svelte"])
 
-        with patch("src.orchestrator._run_sandbox_validation") as mock_run:
+        with patch("src.orchestrator._run_sandbox_tests") as mock_run:
             mock_run.return_value = SandboxResult(
                 exit_code=0,
                 tests_passed=6,
@@ -101,19 +175,19 @@ def test_mixed_task_uses_fullstack_template(self):
             )
             result = sandbox_validate_node(state)
 
-        call_args = mock_run.call_args
-        assert call_args[1]["template"] == "fullstack"
+        call_kwargs = mock_run.call_args[1]
+        assert call_kwargs["template"] == "fullstack"
 
     def test_graceful_skip_no_api_key(self):
-        """Should warn and skip when E2B_API_KEY is missing."""
-        state = _make_state(["src/main.py"])
+        """Should return validation_skipped when E2B_API_KEY is missing."""
+        state = _make_state(["src/main.py", "tests/test_main.py"])
 
-        with patch("src.orchestrator._run_sandbox_validation") as mock_run:
-            mock_run.return_value = None  # Signals skip
+        with patch("src.orchestrator._run_sandbox_tests") as mock_run:
+            mock_run.return_value = None
             result = sandbox_validate_node(state)
 
-        assert result["sandbox_result"] is None
-        assert any("skip" in t.lower() or "warn" in t.lower() for t in result["trace"])
+        assert result["sandbox_result"] is not None
+        assert result["sandbox_result"].validation_skipped is True
 
     def test_no_blueprint_skips(self):
         """Should skip validation when there's no blueprint."""
@@ -125,20 +199,11 @@ def test_no_blueprint_skips(self):
         assert result["sandbox_result"] is None
         assert any("no blueprint" in t.lower() for t in result["trace"])
 
-    def test_non_code_files_skip_validation(self):
-        """Non-code files (JSON, YAML, SQL) should skip sandbox validation."""
-        state = _make_state(["schema.sql", "config.yaml"])
-
-        result = sandbox_validate_node(state)
-
-        assert result["sandbox_result"] is None
-        assert any("no code validation" in t.lower() or "skip" in t.lower() for t in result["trace"])
-
     def test_sandbox_error_doesnt_crash(self):
         """Sandbox errors should be captured, not raise exceptions."""
-        state = _make_state(["src/main.py"])
+        state = _make_state(["src/main.py", "tests/test_main.py"])
 
-        with patch("src.orchestrator._run_sandbox_validation") as mock_run:
+        with patch("src.orchestrator._run_sandbox_tests") as mock_run:
             mock_run.side_effect = Exception("E2B connection failed")
             result = sandbox_validate_node(state)
 
@@ -146,10 +211,10 @@ def test_sandbox_error_doesnt_crash(self):
         assert any("error" in t.lower() or "failed" in t.lower() for t in result["trace"])
 
     def test_failed_validation_still_continues(self):
-        """Sandbox validation failure should not block QA — it adds context."""
-        state = _make_state(["src/main.py"])
+        """Sandbox validation failure should not block QA -- it adds context."""
+        state = _make_state(["src/main.py", "tests/test_main.py"])
 
-        with patch("src.orchestrator._run_sandbox_validation") as mock_run:
+        with patch("src.orchestrator._run_sandbox_tests") as mock_run:
             mock_run.return_value = SandboxResult(
                 exit_code=1,
                 tests_passed=3,
@@ -159,29 +224,41 @@ def test_failed_validation_still_continues(self):
             )
             result = sandbox_validate_node(state)
 
-        # Node should still return the result even though tests failed
         assert result["sandbox_result"] is not None
         assert result["sandbox_result"].tests_failed == 2
-        # Status should not change — QA decides pass/fail
-        assert "status" not in result or result.get("status") == WorkflowStatus.REVIEWING
 
-    def test_trace_includes_validation_plan(self):
-        """Trace should include which template and commands were selected."""
-        state = _make_state(["dashboard/src/App.svelte"])
+    def test_warnings_surfaced_in_trace(self):
+        """Sandbox warnings should appear in the trace log."""
+        state = _make_state(["src/main.py", "tests/test_main.py"])
 
-        with patch("src.orchestrator._run_sandbox_validation") as mock_run:
+        with patch("src.orchestrator._run_sandbox_tests") as mock_run:
             mock_run.return_value = SandboxResult(
                 exit_code=0,
                 tests_passed=1,
                 tests_failed=0,
-                output_summary="svelte-check found 0 errors",
+                output_summary="1 passed",
+                warnings=["[WARN] ruff not available -- lint skipped"],
+            )
+            result = sandbox_validate_node(state)
+
+        trace_text = " ".join(result.get("trace", []))
+        assert "ruff" in trace_text.lower()
+
+    def test_trace_includes_strategy(self):
+        """Trace should include which strategy was selected."""
+        state = _make_state(["hello_world.py"])
+
+        with patch("src.orchestrator._run_sandbox_script") as mock_script:
+            mock_script.return_value = SandboxResult(
+                exit_code=0,
+                tests_passed=1,
+                tests_failed=0,
+                output_summary="Hello, World!",
             )
             result = sandbox_validate_node(state)
 
-        trace = result.get("trace", [])
-        trace_text = " ".join(trace).lower()
-        assert "sandbox_validate" in trace_text
-        assert "fullstack" in trace_text or "frontend" in trace_text
+        trace_text = " ".join(result.get("trace", []))
+        assert "script_exec" in trace_text.lower()
 
 
 class TestSandboxValidateGraphIntegration:
@@ -193,7 +270,6 @@ def test_graph_has_sandbox_validate_node(self):
 
         graph = build_graph()
         compiled = graph.compile()
-        # LangGraph stores node names — check sandbox_validate exists
         node_names = set(compiled.get_graph().nodes.keys())
         assert "sandbox_validate" in node_names
 
diff --git a/dev-suite/tests/test_validation_commands.py b/dev-suite/tests/test_validation_commands.py
index fb95850..0e19384 100644
--- a/dev-suite/tests/test_validation_commands.py
+++ b/dev-suite/tests/test_validation_commands.py
@@ -1,7 +1,10 @@
 """Tests for validation command mapping.
 
-Validates that file types are correctly mapped to sandbox templates
-and validation commands.
+Validates that file types are correctly mapped to sandbox templates,
+validation strategies, and validation commands.
+
+Issue #95: Added tests for ValidationStrategy selection --
+SCRIPT_EXEC for simple scripts, TEST_SUITE when tests present.
 """
 
 from src.sandbox.validation_commands import (
@@ -9,6 +12,7 @@
     FULLSTACK_COMMANDS,
     PYTHON_COMMANDS,
     ValidationPlan,
+    ValidationStrategy,
     format_validation_summary,
     get_validation_plan,
 )
@@ -17,58 +21,94 @@
 class TestGetValidationPlan:
     """Tests for get_validation_plan()."""
 
-    # -- Python files --
+    # -- Python files with tests (TEST_SUITE) --
 
-    def test_python_only_files(self):
+    def test_python_with_test_files(self):
         plan = get_validation_plan(["src/main.py", "tests/test_main.py"])
+        assert plan.strategy == ValidationStrategy.TEST_SUITE
         assert plan.template is None
         assert plan.commands == PYTHON_COMMANDS
         assert "Python" in plan.description
 
-    def test_pyi_stub_file(self):
-        plan = get_validation_plan(["src/types.pyi"])
-        assert plan.template is None
+    def test_python_with_test_prefix(self):
+        plan = get_validation_plan(["src/utils.py", "test_utils.py"])
+        assert plan.strategy == ValidationStrategy.TEST_SUITE
         assert plan.commands == PYTHON_COMMANDS
 
-    # -- Frontend files --
+    def test_python_with_test_suffix(self):
+        plan = get_validation_plan(["src/auth.py", "src/auth_test.py"])
+        assert plan.strategy == ValidationStrategy.TEST_SUITE
+
+    def test_pyi_stub_with_tests(self):
+        plan = get_validation_plan(["src/types.pyi", "tests/test_types.py"])
+        assert plan.strategy == ValidationStrategy.TEST_SUITE
+        assert plan.commands == PYTHON_COMMANDS
+
+    # -- Python files without tests (SCRIPT_EXEC) --
+
+    def test_single_python_script(self):
+        plan = get_validation_plan(["hello_world.py"])
+        assert plan.strategy == ValidationStrategy.SCRIPT_EXEC
+        assert plan.script_file == "hello_world.py"
+        assert plan.commands == []
+        assert "Script execution" in plan.description
+
+    def test_python_module_no_tests(self):
+        plan = get_validation_plan(["src/utils.py"])
+        assert plan.strategy == ValidationStrategy.SCRIPT_EXEC
+        assert plan.script_file == "src/utils.py"
+
+    def test_multiple_python_no_tests(self):
+        plan = get_validation_plan(["src/main.py", "src/helpers.py"])
+        assert plan.strategy == ValidationStrategy.SCRIPT_EXEC
+        assert plan.script_file == "src/main.py"
+
+    def test_pyi_only_no_tests(self):
+        plan = get_validation_plan(["src/types.pyi"])
+        assert plan.strategy == ValidationStrategy.SCRIPT_EXEC
+        assert plan.script_file == "src/types.pyi"
+
+    # -- Frontend files (TEST_SUITE) --
 
     def test_svelte_files(self):
         plan = get_validation_plan(["dashboard/src/lib/Widget.svelte"])
+        assert plan.strategy == ValidationStrategy.TEST_SUITE
         assert plan.template == "fullstack"
         assert plan.commands == FRONTEND_COMMANDS
         assert "Frontend" in plan.description
 
     def test_typescript_files(self):
         plan = get_validation_plan(["dashboard/src/routes/+page.ts"])
+        assert plan.strategy == ValidationStrategy.TEST_SUITE
         assert plan.template == "fullstack"
         assert plan.commands == FRONTEND_COMMANDS
 
     def test_javascript_files(self):
         plan = get_validation_plan(["src/utils.js"])
+        assert plan.strategy == ValidationStrategy.TEST_SUITE
         assert plan.template == "fullstack"
-        assert plan.commands == FRONTEND_COMMANDS
 
     def test_jsx_files(self):
         plan = get_validation_plan(["src/Component.jsx"])
+        assert plan.strategy == ValidationStrategy.TEST_SUITE
         assert plan.template == "fullstack"
-        assert plan.commands == FRONTEND_COMMANDS
 
     def test_tsx_files(self):
         plan = get_validation_plan(["src/Component.tsx"])
+        assert plan.strategy == ValidationStrategy.TEST_SUITE
         assert plan.template == "fullstack"
-        assert plan.commands == FRONTEND_COMMANDS
 
     def test_css_files(self):
         plan = get_validation_plan(["dashboard/src/app.css"])
+        assert plan.strategy == ValidationStrategy.TEST_SUITE
         assert plan.template == "fullstack"
-        assert plan.commands == FRONTEND_COMMANDS
 
     def test_vue_files(self):
         plan = get_validation_plan(["src/App.vue"])
+        assert plan.strategy == ValidationStrategy.TEST_SUITE
         assert plan.template == "fullstack"
-        assert plan.commands == FRONTEND_COMMANDS
 
-    # -- Mixed files --
+    # -- Mixed files (TEST_SUITE fullstack) --
 
     def test_mixed_python_and_frontend(self):
         files = [
@@ -76,6 +116,7 @@ def test_mixed_python_and_frontend(self):
             "dashboard/src/lib/Widget.svelte",
         ]
         plan = get_validation_plan(files)
+        assert plan.strategy == ValidationStrategy.TEST_SUITE
         assert plan.template == "fullstack"
         assert plan.commands == FULLSTACK_COMMANDS
         assert "Full-stack" in plan.description
@@ -83,77 +124,131 @@ def test_mixed_python_and_frontend(self):
     def test_mixed_python_and_typescript(self):
         files = ["src/server.py", "dashboard/src/routes/+page.ts"]
         plan = get_validation_plan(files)
+        assert plan.strategy == ValidationStrategy.TEST_SUITE
         assert plan.template == "fullstack"
         assert plan.commands == FULLSTACK_COMMANDS
 
-    # -- Non-code files --
+    # -- Non-code files (SKIP) --
 
     def test_non_code_files(self):
         files = ["data.json", "config.yaml", "schema.sql"]
         plan = get_validation_plan(files)
+        assert plan.strategy == ValidationStrategy.SKIP
         assert plan.template is None
         assert plan.commands == []
         assert "non-code" in plan.description.lower()
 
     def test_markdown_files(self):
         plan = get_validation_plan(["README.md", "CHANGELOG.md"])
+        assert plan.strategy == ValidationStrategy.SKIP
         assert plan.commands == []
 
     def test_makefile(self):
         plan = get_validation_plan(["Makefile"])
+        assert plan.strategy == ValidationStrategy.SKIP
         assert plan.commands == []
 
     # -- Edge cases --
 
     def test_empty_files_list(self):
         plan = get_validation_plan([])
+        assert plan.strategy == ValidationStrategy.SKIP
         assert plan.template is None
         assert plan.commands == []
         assert "skipping" in plan.description.lower()
 
     def test_mixed_code_and_non_code(self):
-        """Non-code files alongside Python should still trigger Python validation."""
+        """Non-code files alongside Python without tests -> SCRIPT_EXEC."""
         files = ["src/main.py", "config.yaml"]
         plan = get_validation_plan(files)
+        assert plan.strategy == ValidationStrategy.SCRIPT_EXEC
+        assert plan.script_file == "src/main.py"
+
+    def test_mixed_code_and_non_code_with_tests(self):
+        """Non-code files alongside Python with tests -> TEST_SUITE."""
+        files = ["src/main.py", "tests/test_main.py", "config.yaml"]
+        plan = get_validation_plan(files)
+        assert plan.strategy == ValidationStrategy.TEST_SUITE
         assert plan.commands == PYTHON_COMMANDS
 
     def test_mixed_frontend_and_non_code(self):
         """Non-code files alongside frontend should still trigger frontend validation."""
         files = ["dashboard/src/App.svelte", "README.md"]
         plan = get_validation_plan(files)
+        assert plan.strategy == ValidationStrategy.TEST_SUITE
         assert plan.template == "fullstack"
         assert plan.commands == FRONTEND_COMMANDS
 
+    # -- Script file selection --
+
+    def test_script_file_prefers_non_test(self):
+        """_get_primary_script should prefer non-test files."""
+        files = ["src/main.py", "test_main.py"]
+        plan = get_validation_plan(files)
+        assert plan.strategy == ValidationStrategy.TEST_SUITE
+
+    def test_script_file_single_file(self):
+        plan = get_validation_plan(["utils.py"])
+        assert plan.script_file == "utils.py"
+
 
 class TestValidationPlan:
     """Tests for the ValidationPlan dataclass."""
 
     def test_default_values(self):
         plan = ValidationPlan()
+        assert plan.strategy == ValidationStrategy.SKIP
         assert plan.template is None
         assert plan.commands == []
+        assert plan.script_file is None
         assert plan.description == ""
 
     def test_custom_values(self):
         plan = ValidationPlan(
+            strategy=ValidationStrategy.TEST_SUITE,
             template="fullstack",
             commands=["pnpm check"],
             description="Frontend validation",
         )
+        assert plan.strategy == ValidationStrategy.TEST_SUITE
         assert plan.template == "fullstack"
         assert plan.commands == ["pnpm check"]
 
+    def test_script_exec_plan(self):
+        plan = ValidationPlan(
+            strategy=ValidationStrategy.SCRIPT_EXEC,
+            script_file="hello.py",
+            description="Run hello.py",
+        )
+        assert plan.strategy == ValidationStrategy.SCRIPT_EXEC
+        assert plan.script_file == "hello.py"
+        assert plan.commands == []
+
 
 class TestFormatValidationSummary:
     """Tests for format_validation_summary()."""
 
-    def test_no_commands(self):
-        plan = ValidationPlan(description="No validation needed")
+    def test_skip_strategy(self):
+        plan = ValidationPlan(
+            strategy=ValidationStrategy.SKIP,
+            description="No validation needed",
+        )
         result = format_validation_summary(plan)
         assert result == "No validation needed"
 
-    def test_with_commands(self):
+    def test_script_exec_strategy(self):
+        plan = ValidationPlan(
+            strategy=ValidationStrategy.SCRIPT_EXEC,
+            script_file="hello.py",
+            description="Script execution: 1 files",
+        )
+        result = format_validation_summary(plan)
+        assert "Script execution" in result
+        assert "python hello.py" in result
+
+    def test_test_suite_with_commands(self):
         plan = ValidationPlan(
+            strategy=ValidationStrategy.TEST_SUITE,
             description="Python validation: 2 files",
             commands=["ruff check .", "pytest tests/"],
         )
@@ -164,6 +259,7 @@ def test_with_commands(self):
 
     def test_commands_indented(self):
         plan = ValidationPlan(
+            strategy=ValidationStrategy.TEST_SUITE,
             description="Test",
             commands=["cmd1"],
         )

From 1264fe9765307ff6530018c3c9a19c58c6475094 Mon Sep 17 00:00:00 2001
From: Mike Abernathy <mabernathy87@gmail.com>
Date: Wed, 1 Apr 2026 22:30:41 -0600
Subject: [PATCH 03/12] feat(sandbox): sequential run_tests, run_script,
 validation_skipped + warnings (#95)

- SandboxResult: add validation_skipped and warnings fields
- run_tests(): sequential command execution (drop && chain)
  Each command runs independently, aggregated results.
  Missing tools (ruff) no longer block subsequent commands (pytest).
- run_script(): new method for SCRIPT_EXEC strategy
  Executes `python <file>`, checks exit code + stdout.
  Reports as 1 passed/1 failed test for QA context.
- Parse __RESULTS__ JSON for per-command exit codes
- Detect rc=127 (command not found) and surface as warnings
---
 dev-suite/src/sandbox/e2b_runner.py | 215 ++++++++++++++++++----------
 1 file changed, 139 insertions(+), 76 deletions(-)

diff --git a/dev-suite/src/sandbox/e2b_runner.py b/dev-suite/src/sandbox/e2b_runner.py
index be5a217..24601e6 100644
--- a/dev-suite/src/sandbox/e2b_runner.py
+++ b/dev-suite/src/sandbox/e2b_runner.py
@@ -11,6 +11,9 @@
   - Default: bare Python (e2b-code-interpreter base image)
   - Fullstack: Python + Node.js 22 + pnpm + SvelteKit toolchain
   - Templates configured via E2B_TEMPLATE_DEFAULT / E2B_TEMPLATE_FULLSTACK env vars
+
+Issue #95: Added validation_skipped/warnings fields to SandboxResult,
+sequential run_tests() execution, and run_script() for simple scripts.
 """
 
 import os
@@ -47,6 +50,8 @@ class SandboxResult(BaseModel):
     output_summary: str = ""
     files_modified: list[str] = []
     timed_out: bool = False
+    validation_skipped: bool = False
+    warnings: list[str] = []
 
 
 # -- Secret Patterns for Scanning --
@@ -173,16 +178,7 @@ def _extract_execution_error(error_obj) -> str:
 # -- Template Configuration --
 
 def _get_template_id(template: str | None) -> str | None:
-    """Resolve a template name to an E2B template ID.
-
-    Template names map to environment variables:
-      - "fullstack" -> E2B_TEMPLATE_FULLSTACK
-      - "default"   -> E2B_TEMPLATE_DEFAULT
-      - None        -> E2B_TEMPLATE_DEFAULT (if set), else bare E2B default
-
-    If the value looks like a raw template ID (no env var mapping),
-    it's used directly.
-    """
+    """Resolve a template name to an E2B template ID."""
     if template is None:
         return os.getenv("E2B_TEMPLATE_DEFAULT") or None
 
@@ -210,12 +206,7 @@ def _get_template_id(template: str | None) -> str | None:
 
 
 def select_template_for_files(target_files: list[str]) -> str | None:
-    """Select the appropriate sandbox template based on target file types.
-
-    Returns:
-        Template name ("fullstack" or None for default Python).
-        Returns None if no files are provided or all files are Python-only.
-    """
+    """Select the appropriate sandbox template based on target file types."""
     if not target_files:
         return None
 
@@ -234,24 +225,14 @@ def select_template_for_files(target_files: list[str]) -> str | None:
 class E2BRunner:
     """Execute code in E2B sandboxes with structured output.
 
-    The API key is read from the E2B_API_KEY environment variable.
-    Make sure it's set in your .env file or shell environment.
-
-    Correct usage per E2B SDK v2 (from official docs):
-        Sandbox.create() is the factory method - never call Sandbox() directly.
-
     Usage:
         runner = E2BRunner()
         result = runner.run("print('hello')")
-        print(result.output_summary)
-
-        # With a custom template:
-        result = runner.run("console.log('hi')", template="fullstack")
+        result = runner.run_script("hello.py", project_files={"hello.py": "print('hi')"})
+        result = runner.run_tests(commands=["pytest tests/"], project_files={...})
     """
 
     def __init__(self, api_key: str | None = None, default_timeout: int = 30):
-        # E2B SDK reads from E2B_API_KEY env var automatically.
-        # If an explicit key is passed, set it in the environment.
         if api_key:
             os.environ["E2B_API_KEY"] = api_key
         self._default_timeout = default_timeout
@@ -264,49 +245,28 @@ def run(
         timeout: int | None = None,
         template: str | None = None,
     ) -> SandboxResult:
-        """Execute code in a sandbox and return structured results.
-
-        Args:
-            code: Python code to execute.
-            profile: Security profile (LOCKED or PERMISSIVE).
-            env_vars: Environment variables to inject (secrets for LOCKED only).
-            timeout: Execution timeout in seconds.
-            template: Sandbox template name or ID.
-                      "fullstack" -> E2B_TEMPLATE_FULLSTACK env var.
-                      None -> E2B_TEMPLATE_DEFAULT or bare E2B default.
-                      Any other string -> used as raw template ID.
-
-        Returns:
-            SandboxResult with structured output. Never raw stdout/stderr.
-        """
+        """Execute code in a sandbox and return structured results."""
         timeout = timeout or self._default_timeout
 
-        # Enforce: PERMISSIVE profile gets zero secrets
         if profile == SandboxProfile.PERMISSIVE:
             env_vars = None
 
-        # Resolve template name to ID
         template_id = _get_template_id(template)
 
         try:
-            # E2B v2 SDK: Sandbox.create() is the public factory method.
-            # Using it as a context manager auto-kills the sandbox on exit.
             create_kwargs = {}
             if template_id:
                 create_kwargs["template"] = template_id
 
             with Sandbox.create(**create_kwargs) as sbx:
-                # Inject env vars if provided
                 if env_vars:
                     env_setup = "\n".join(
                         f"import os; os.environ['{k}'] = '{v}'" for k, v in env_vars.items()
                     )
                     sbx.run_code(env_setup)
 
-                # Execute the actual code
                 execution = sbx.run_code(code)
 
-                # Collect raw output
                 raw_stdout = ""
                 raw_stderr = ""
                 for log in execution.logs.stdout:
@@ -316,26 +276,20 @@ def run(
 
                 combined = raw_stdout + raw_stderr
 
-                # Layer 1: Structured output wrapper
                 test_info = _parse_test_output(combined)
 
-                # Extract errors from both stdout/stderr AND execution.error
                 errors = _extract_errors(combined)
                 if execution.error:
                     exec_err = _extract_execution_error(execution.error)
                     if exec_err and exec_err not in errors:
                         errors.insert(0, exec_err)
 
-                # Layer 2: Secret scanning
                 safe_summary = _scan_for_secrets(combined)
 
-                # If stdout/stderr was empty but we have an execution error,
-                # include the error in the summary
                 if not safe_summary.strip() and execution.error:
                     exec_err = _extract_execution_error(execution.error)
                     safe_summary = _scan_for_secrets(exec_err)
 
-                # Truncate summary to prevent context bloat
                 if len(safe_summary) > 2000:
                     safe_summary = safe_summary[:2000] + "\n... [truncated]"
 
@@ -364,38 +318,119 @@ def run(
 
     def run_tests(
         self,
-        test_command: str,
+        commands: list[str],
         project_files: dict[str, str] | None = None,
         env_vars: dict[str, str] | None = None,
         timeout: int = 60,
         template: str | None = None,
     ) -> SandboxResult:
-        """Run a test suite in the sandbox.
-
-        Args:
-            test_command: Shell command to run tests (e.g., 'pytest tests/ -v').
-            project_files: Dict of {filepath: content} to write before testing.
-            env_vars: Secrets to inject.
-            timeout: Test timeout in seconds.
-            template: Sandbox template name or ID (e.g., "fullstack" for Node.js tasks).
+        """Run validation commands sequentially in the sandbox.
+
+        Each command runs independently -- a missing tool (e.g., ruff)
+        does not prevent subsequent commands (e.g., pytest) from running.
+        Results are aggregated across all commands.
         """
-        # Build code that writes files then runs tests.
-        # File content is base64-encoded to avoid escaping issues with
-        # triple-quoted strings, embedded quotes, and multiline content.
-        setup_code = ""
+        import base64 as _b64
+        import json as _json
+
+        setup_lines = []
         if project_files:
-            import base64 as _b64
-            setup_code += "import os, base64\n"
+            setup_lines.append("import os, base64")
             for fpath, content in project_files.items():
                 b64 = _b64.b64encode(content.encode("utf-8")).decode("ascii")
-                setup_code += f"os.makedirs(os.path.dirname('{fpath}') or '.', exist_ok=True)\n"
-                setup_code += f"open('{fpath}', 'w').write(base64.b64decode('{b64}').decode('utf-8'))\n"
+                setup_lines.append(f"os.makedirs(os.path.dirname('{fpath}') or '.', exist_ok=True)")
+                setup_lines.append(f"open('{fpath}', 'w').write(base64.b64decode('{b64}').decode('utf-8'))")
+
+        commands_json = _json.dumps(commands)
+        run_code = f"""
+import subprocess, json
+
+commands = {commands_json}
+results = []
+for cmd in commands:
+    try:
+        r = subprocess.run(
+            cmd, shell=True, capture_output=True, text=True, timeout={timeout},
+        )
+        results.append({{"cmd": cmd, "rc": r.returncode, "out": r.stdout, "err": r.stderr}})
+    except subprocess.TimeoutExpired:
+        results.append({{"cmd": cmd, "rc": -1, "out": "", "err": "TIMEOUT"}})
+
+for r in results:
+    print(r["out"])
+    if r["err"] and r["err"] != "TIMEOUT":
+        print(r["err"])
+
+print("__RESULTS__" + json.dumps([{{"cmd": r["cmd"], "rc": r["rc"]}} for r in results]))
+"""
+
+        setup_code = "\n".join(setup_lines) + "\n" if setup_lines else ""
+        full_code = setup_code + run_code
+
+        result = self.run(
+            full_code,
+            profile=SandboxProfile.LOCKED,
+            env_vars=env_vars,
+            timeout=timeout + 30,
+            template=template,
+        )
+
+        warnings = list(result.warnings)
+        summary = result.output_summary
+        if "[WARN]" in summary:
+            for line in summary.split("\n"):
+                if "[WARN]" in line:
+                    warnings.append(line.strip())
+
+        if "__RESULTS__" in summary:
+            try:
+                json_part = summary.split("__RESULTS__")[-1].strip()
+                cmd_results = _json.loads(json_part)
+                for cr in cmd_results:
+                    if cr.get("rc") == 127:
+                        warnings.append(f"Command not found: {cr['cmd'][:60]}")
+                summary = summary.split("__RESULTS__")[0].strip()
+            except (ValueError, IndexError):
+                pass
+
+        return SandboxResult(
+            exit_code=result.exit_code,
+            tests_passed=result.tests_passed,
+            tests_failed=result.tests_failed,
+            errors=result.errors,
+            output_summary=summary,
+            files_modified=result.files_modified,
+            timed_out=result.timed_out,
+            validation_skipped=False,
+            warnings=warnings,
+        )
+
+    def run_script(
+        self,
+        script_file: str,
+        project_files: dict[str, str] | None = None,
+        env_vars: dict[str, str] | None = None,
+        timeout: int = 30,
+        template: str | None = None,
+    ) -> SandboxResult:
+        """Execute a script and check exit code + stdout.
+
+        For simple scripts that don't have a test suite.
+        """
+        import base64 as _b64
+
+        setup_lines = []
+        if project_files:
+            setup_lines.append("import os, base64")
+            for fpath, content in project_files.items():
+                b64 = _b64.b64encode(content.encode("utf-8")).decode("ascii")
+                setup_lines.append(f"os.makedirs(os.path.dirname('{fpath}') or '.', exist_ok=True)")
+                setup_lines.append(f"open('{fpath}', 'w').write(base64.b64decode('{b64}').decode('utf-8'))")
 
         run_code = f"""
 import subprocess
 result = subprocess.run(
-    {test_command!r},
-    shell=True,
+    ["python", {script_file!r}],
     capture_output=True,
     text=True,
     timeout={timeout},
@@ -403,13 +438,41 @@ def run_tests(
 print(result.stdout)
 if result.stderr:
     print(result.stderr)
+print(f"__EXIT_CODE__{{result.returncode}}")
 """
 
+        setup_code = "\n".join(setup_lines) + "\n" if setup_lines else ""
         full_code = setup_code + run_code
-        return self.run(
+
+        result = self.run(
             full_code,
             profile=SandboxProfile.LOCKED,
             env_vars=env_vars,
             timeout=timeout + 10,
             template=template,
         )
+
+        summary = result.output_summary
+        script_exit = 0
+        if "__EXIT_CODE__" in summary:
+            try:
+                code_str = summary.split("__EXIT_CODE__")[-1].strip()
+                script_exit = int(code_str)
+                summary = summary.split("__EXIT_CODE__")[0].strip()
+            except (ValueError, IndexError):
+                pass
+
+        passed = 1 if script_exit == 0 and not result.errors else 0
+        failed = 0 if passed else 1
+
+        return SandboxResult(
+            exit_code=script_exit if not result.errors else 1,
+            tests_passed=passed,
+            tests_failed=failed,
+            errors=result.errors,
+            output_summary=summary,
+            files_modified=result.files_modified,
+            timed_out=result.timed_out,
+            validation_skipped=False,
+            warnings=[],
+        )

From dee5fc873bc552583327f5417e4006e0078ae8f0 Mon Sep 17 00:00:00 2001
From: Mike Abernathy <mabernathy87@gmail.com>
Date: Wed, 1 Apr 2026 22:32:35 -0600
Subject: [PATCH 04/12] test

---
 dev-suite/src/orchestrator.py | 719 +---------------------------------
 1 file changed, 1 insertion(+), 718 deletions(-)

diff --git a/dev-suite/src/orchestrator.py b/dev-suite/src/orchestrator.py
index 8f786d5..45fea74 100644
--- a/dev-suite/src/orchestrator.py
+++ b/dev-suite/src/orchestrator.py
@@ -1,718 +1 @@
-"""LangGraph orchestrator -- Architect -> Lead Dev -> apply_code -> sandbox -> QA loop.
-
-This is the main entry point for the agent workflow.
-Implements the state machine with retry logic, token budgets,
-structured Blueprint passing, human escalation, code application,
-tool binding (issue #80), and memory write-back.
-
-Issue #80: Agent tool binding -- Dev and QA agents can now use
-workspace tools (filesystem_read, filesystem_write, etc.) via
-LangChain's bind_tools() + iterative tool execution loop.
-Tools are passed via RunnableConfig["configurable"]["tools"].
-"""
-
-import asyncio
-import json
-import logging
-import os
-import re
-from enum import Enum
-from pathlib import Path
-from typing import Any, Literal, TypedDict
-
-from dotenv import load_dotenv
-from langchain_anthropic import ChatAnthropic
-from langchain_core.messages import (
-    AIMessage,
-    HumanMessage,
-    SystemMessage,
-    ToolMessage,
-)
-from langchain_core.runnables.config import RunnableConfig
-from langchain_google_genai import ChatGoogleGenerativeAI
-from langgraph.graph import END, START, StateGraph
-from pydantic import BaseModel
-
-from .agents.architect import Blueprint
-from .agents.qa import FailureReport, FailureType
-from .memory.factory import create_memory_store
-from .memory.protocol import MemoryQueryResult, MemoryStore
-from .memory.summarizer import summarize_writes_sync
-from .sandbox.e2b_runner import E2BRunner, SandboxResult
-from .sandbox.validation_commands import (
-    ValidationPlan,
-    format_validation_summary,
-    get_validation_plan,
-)
-from .tools.code_parser import (
-    CodeParserError,
-    parse_generated_code,
-    validate_paths_for_workspace,
-)
-from .tracing import add_trace_event, create_trace_config
-
-load_dotenv()
-
-logger = logging.getLogger(__name__)
-
-
-def _safe_int(env_key: str, default: int) -> int:
-    raw = os.getenv(env_key, str(default))
-    try:
-        return int(raw)
-    except (ValueError, TypeError):
-        logger.warning("%s=%r is not a valid integer, using default %d", env_key, raw, default)
-        return default
-
-MAX_RETRIES = _safe_int("MAX_RETRIES", 3)
-TOKEN_BUDGET = _safe_int("TOKEN_BUDGET", 50000)
-MAX_TOOL_TURNS = _safe_int("MAX_TOOL_TURNS", 10)
-
-
-def _get_workspace_root() -> Path:
-    raw = os.getenv("WORKSPACE_ROOT", ".")
-    return Path(raw).resolve()
-
-
-class WorkflowStatus(str, Enum):
-    PLANNING = "planning"
-    BUILDING = "building"
-    REVIEWING = "reviewing"
-    PASSED = "passed"
-    FAILED = "failed"
-    ESCALATED = "escalated"
-
-
-class GraphState(TypedDict, total=False):
-    task_description: str
-    blueprint: Blueprint | None
-    generated_code: str
-    failure_report: FailureReport | None
-    status: WorkflowStatus
-    retry_count: int
-    tokens_used: int
-    error_message: str
-    memory_context: list[str]
-    memory_writes: list[dict]
-    trace: list[str]
-    sandbox_result: SandboxResult | None
-    parsed_files: list[dict]
-    tool_calls_log: list[dict]
-    memory_writes_flushed: list[dict]
-
-
-class AgentState(BaseModel):
-    task_description: str = ""
-    blueprint: Blueprint | None = None
-    generated_code: str = ""
-    failure_report: FailureReport | None = None
-    status: WorkflowStatus = WorkflowStatus.PLANNING
-    retry_count: int = 0
-    tokens_used: int = 0
-    error_message: str = ""
-    memory_context: list[str] = []
-    memory_writes: list[dict] = []
-    trace: list[str] = []
-    sandbox_result: SandboxResult | None = None
-    parsed_files: list[dict] = []
-    tool_calls_log: list[dict] = []
-
-
-def _get_architect_llm():
-    return ChatGoogleGenerativeAI(model=os.getenv("ARCHITECT_MODEL", "gemini-3-flash-preview"), google_api_key=os.getenv("GOOGLE_API_KEY"), temperature=0.2)
-
-
-def _get_developer_llm():
-    return ChatAnthropic(model=os.getenv("DEVELOPER_MODEL", "claude-sonnet-4-20250514"), api_key=os.getenv("ANTHROPIC_API_KEY"), temperature=0.1, max_tokens=8192)
-
-
-def _get_qa_llm():
-    return ChatAnthropic(model=os.getenv("QA_MODEL", "claude-sonnet-4-20250514"), api_key=os.getenv("ANTHROPIC_API_KEY"), temperature=0.0, max_tokens=4096)
-
-
-def _extract_text_content(content: Any) -> str:
-    if isinstance(content, str):
-        return content
-    if isinstance(content, list):
-        parts = []
-        for block in content:
-            if isinstance(block, str):
-                parts.append(block)
-            elif isinstance(block, dict) and "text" in block:
-                parts.append(block["text"])
-            elif hasattr(block, "text"):
-                parts.append(block.text)
-        return "\n".join(parts)
-    return str(content)
-
-
-def _extract_json(raw: str) -> dict:
-    text = raw.strip()
-    try:
-        return json.loads(text)
-    except json.JSONDecodeError:
-        pass
-    if "```" in text:
-        parts = text.split("```")
-        for part in parts[1::2]:
-            inner = part.strip()
-            if inner.startswith("json"):
-                inner = inner[4:].strip()
-            try:
-                return json.loads(inner)
-            except json.JSONDecodeError:
-                continue
-    first_brace = text.find("{")
-    last_brace = text.rfind("}")
-    if first_brace != -1 and last_brace > first_brace:
-        try:
-            return json.loads(text[first_brace:last_brace + 1])
-        except json.JSONDecodeError:
-            pass
-    raise json.JSONDecodeError(f"No valid JSON found in response ({len(text)} chars): {text[:200]}...", text, 0)
-
-
-def _extract_token_count(response: Any) -> int:
-    meta = getattr(response, "usage_metadata", None)
-    if not meta:
-        return 0
-    if isinstance(meta, dict):
-        for key in ("total_tokens", "totalTokenCount", "total_token_count"):
-            if key in meta:
-                return int(meta[key])
-        input_t = meta.get("input_tokens", meta.get("prompt_tokens", 0))
-        output_t = meta.get("output_tokens", meta.get("completion_tokens", 0))
-        if input_t or output_t:
-            return int(input_t) + int(output_t)
-    return 0
-
-
-def _get_memory_store() -> MemoryStore:
-    return create_memory_store()
-
-
-def _fetch_memory_context(task_description: str) -> list[str]:
-    try:
-        store = _get_memory_store()
-        results = store.query(task_description, n_results=10)
-        return [r.content for r in results]
-    except Exception:
-        return []
-
-
-def _infer_module(target_files: list[str]) -> str:
-    if not target_files:
-        return "global"
-    first_file = target_files[0]
-    parts = first_file.replace("\\", "/").split("/")
-    if len(parts) > 2 and parts[0] == "src":
-        return parts[1]
-    if len(parts) > 1:
-        return parts[0]
-    return "global"
-
-
-# -- Tool Binding (Issue #80) --
-
-# Fix 2: Removed github_create_pr -- non-idempotent write should not be in
-# an iterative tool loop. PR creation belongs in a post-QA-pass step.
-DEV_TOOL_NAMES = {"filesystem_read", "filesystem_write", "filesystem_list", "github_read_diff"}
-QA_TOOL_NAMES = {"filesystem_read", "filesystem_list", "github_read_diff"}
-
-# Fix 4: Secret pattern regexes for sanitizing tool call previews
-_SECRET_PATTERNS = [
-    re.compile(r'(?:sk|pk|api|key|token|secret|password|bearer)[_-]?\w{10,}', re.IGNORECASE),
-    re.compile(r'(?:ghp|gho|ghu|ghs|ghr)_[A-Za-z0-9_]{30,}'),
-    re.compile(r'(?:eyJ)[A-Za-z0-9_-]{20,}'),
-    re.compile(r'(?:AKIA|ASIA)[A-Z0-9]{16}'),
-]
-
-
-def _sanitize_preview(text: str, max_len: int = 200) -> str:
-    """Truncate and redact known secret patterns from tool call previews."""
-    if not text:
-        return ""
-    sanitized = text
-    for pattern in _SECRET_PATTERNS:
-        sanitized = pattern.sub("[REDACTED]", sanitized)
-    if len(sanitized) > max_len:
-        sanitized = sanitized[:max_len] + "..."
-    return sanitized
-
-
-def _get_agent_tools(config, allowed_names=None):
-    if not config:
-        return []
-    configurable = config.get("configurable", {})
-    tools = configurable.get("tools", [])
-    if not tools:
-        return []
-    if allowed_names is None:
-        return list(tools)
-    return [t for t in tools if t.name in allowed_names]
-
-
-async def _execute_tool_call(tool_call, tools):
-    """Execute a single tool call. Uses ainvoke/invoke public API (Fix 3)."""
-    tool_name = tool_call.get("name", "")
-    tool_args = tool_call.get("args", {})
-    tool_id = tool_call.get("id", "unknown")
-    tool_map = {t.name: t for t in tools}
-    tool = tool_map.get(tool_name)
-    if not tool:
-        return ToolMessage(content=f"Error: Tool '{tool_name}' not found. Available: {list(tool_map.keys())}", tool_call_id=tool_id)
-    try:
-        # Fix 3: Use public ainvoke/invoke API instead of tool.coroutine.
-        # ainvoke handles input validation, callbacks, and config propagation.
-        if hasattr(tool, "ainvoke"):
-            result = await tool.ainvoke(tool_args)
-        else:
-            result = tool.invoke(tool_args)
-        return ToolMessage(content=str(result), tool_call_id=tool_id)
-    except Exception as e:
-        logger.warning("[TOOLS] Tool %s failed: %s", tool_name, e)
-        return ToolMessage(content=f"Error executing {tool_name}: {type(e).__name__}: {e}", tool_call_id=tool_id)
-
-
-async def _run_tool_loop(llm_with_tools, messages, tools, max_turns=MAX_TOOL_TURNS, tokens_used=0, trace=None, agent_name="agent"):
-    if trace is None:
-        trace = []
-    tool_calls_log = []
-    current_messages = list(messages)
-    # Fix 5: Guard against max_turns <= 0 to prevent unbound response variable
-    if max_turns <= 0:
-        logger.warning("[%s] max_turns=%d, skipping tool loop", agent_name.upper(), max_turns)
-        trace.append(f"{agent_name}: tool loop skipped (max_turns={max_turns})")
-        last_msg = current_messages[-1] if current_messages else AIMessage(content="")
-        return last_msg, tokens_used, tool_calls_log
-    for turn in range(max_turns):
-        response = await llm_with_tools.ainvoke(current_messages)
-        tokens_used += _extract_token_count(response)
-        tool_calls = getattr(response, "tool_calls", None) or []
-        if not tool_calls:
-            trace.append(f"{agent_name}: tool loop done after {turn} tool turn(s)")
-            return response, tokens_used, tool_calls_log
-        trace.append(f"{agent_name}: turn {turn + 1} -- {len(tool_calls)} tool call(s): {', '.join(tc.get('name', '?') for tc in tool_calls)}")
-        logger.info("[%s] Tool turn %d: %d calls", agent_name.upper(), turn + 1, len(tool_calls))
-        current_messages.append(response)
-        for tc in tool_calls:
-            tool_msg = await _execute_tool_call(tc, tools)
-            current_messages.append(tool_msg)
-            # Fix 4: Sanitize previews before persisting to tool_calls_log
-            tool_calls_log.append({"agent": agent_name, "turn": turn + 1, "tool": tc.get("name", "unknown"), "args_preview": _sanitize_preview(str(tc.get("args", {}))), "result_preview": _sanitize_preview(str(tool_msg.content)), "success": not tool_msg.content.startswith("Error")})
-    trace.append(f"{agent_name}: tool loop hit max turns ({max_turns})")
-    logger.warning("[%s] Hit max tool turns (%d)", agent_name.upper(), max_turns)
-    return response, tokens_used, tool_calls_log
-
-
-def _run_async(coro):
-    """Run an async coroutine from sync context for the tool loop.
-
-    Design note (re: CodeRabbit #8): This is intentional. developer_node and
-    qa_node are sync functions that use _run_async() to bridge into the async
-    tool loop. run_task() uses workflow.invoke() (sync). Converting everything
-    to async would cascade changes to CLI, tests, and callers with no benefit
-    since LangGraph handles sync nodes with internal async bridges fine.
-    """
-    try:
-        loop = asyncio.get_running_loop()
-    except RuntimeError:
-        loop = None
-    if loop and loop.is_running():
-        import concurrent.futures
-        with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
-            return pool.submit(asyncio.run, coro).result()
-    else:
-        return asyncio.run(coro)
-
-
-# -- Node Functions --
-
-def architect_node(state: GraphState) -> dict:
-    trace = list(state.get("trace", []))
-    trace.append("architect: starting planning")
-    retry_count = state.get("retry_count", 0)
-    tokens_used = state.get("tokens_used", 0)
-    logger.info("[ARCH] retry_count=%d, tokens_used=%d, status=%s", retry_count, tokens_used, state.get("status", "unknown"))
-    memory_context = _fetch_memory_context(state.get("task_description", ""))
-    memory_block = ""
-    if memory_context:
-        memory_block = "\n\nProject context from memory:\n" + "\n".join(f"- {c}" for c in memory_context)
-    system_prompt = f"""You are the Architect agent. Your job is to create a structured Blueprint
-for a coding task. You NEVER write code yourself.
-
-Respond with ONLY a valid JSON object matching this schema:
-{{
-  "task_id": "string (short unique identifier)",
-  "target_files": ["list of file paths to create or modify"],
-  "instructions": "clear step-by-step instructions for the developer",
-  "constraints": ["list of constraints or requirements"],
-  "acceptance_criteria": ["list of testable criteria for QA"]
-}}
-
-Do not include any text before or after the JSON.{memory_block}"""
-    user_msg = state.get("task_description", "")
-    failure_report = state.get("failure_report")
-    if failure_report and failure_report.is_architectural:
-        user_msg += "\n\nPREVIOUS ATTEMPT FAILED (architectural issue):\n"
-        user_msg += f"Errors: {', '.join(failure_report.errors)}\n"
-        if failure_report.failed_files:
-            user_msg += f"Failed files: {', '.join(failure_report.failed_files)}\n"
-        user_msg += f"Recommendation: {failure_report.recommendation}\n"
-        user_msg += "\nGenerate a COMPLETELY NEW Blueprint. Do not patch the old one. The previous target_files or approach was wrong."
-    llm = _get_architect_llm()
-    response = llm.invoke([SystemMessage(content=system_prompt), HumanMessage(content=user_msg)])
-    try:
-        raw = _extract_text_content(response.content)
-        blueprint_data = _extract_json(raw)
-        blueprint = Blueprint(**blueprint_data)
-    except (json.JSONDecodeError, Exception) as e:
-        trace.append(f"architect: failed to parse blueprint: {e}")
-        logger.error("[ARCH] Blueprint parse failed: %s", e)
-        return {"status": WorkflowStatus.FAILED, "error_message": f"Architect failed to produce valid Blueprint: {e}", "trace": trace, "memory_context": memory_context}
-    trace.append(f"architect: blueprint created for {len(blueprint.target_files)} files")
-    tokens_used = tokens_used + _extract_token_count(response)
-    logger.info("[ARCH] done. tokens_used now=%d", tokens_used)
-    return {"blueprint": blueprint, "status": WorkflowStatus.BUILDING, "tokens_used": tokens_used, "trace": trace, "memory_context": memory_context}
-
-
-def developer_node(state: GraphState, config: RunnableConfig | None = None) -> dict:
-    """Lead Dev: executes the Blueprint and generates code. Issue #80: tool binding support."""
-    trace = list(state.get("trace", []))
-    trace.append("developer: starting build")
-    memory_writes = list(state.get("memory_writes", []))
-    tool_calls_log = list(state.get("tool_calls_log", []))
-    retry_count = state.get("retry_count", 0)
-    tokens_used = state.get("tokens_used", 0)
-    logger.info("[DEV] retry_count=%d, tokens_used=%d, status=%s", retry_count, tokens_used, state.get("status", "unknown"))
-    blueprint = state.get("blueprint")
-    if not blueprint:
-        trace.append("developer: no blueprint provided")
-        return {"status": WorkflowStatus.FAILED, "error_message": "Developer received no Blueprint", "trace": trace}
-    tools = _get_agent_tools(config, DEV_TOOL_NAMES)
-    has_tools = len(tools) > 0
-    if has_tools:
-        tool_names = [t.name for t in tools]
-        trace.append(f"developer: {len(tools)} tools available: {', '.join(tool_names)}")
-        system_prompt = """You are the Lead Dev agent. You receive a structured Blueprint and implement it using the tools available to you.
-
-WORKFLOW:
-1. Use filesystem_read to examine the existing files listed in target_files.
-2. Use filesystem_list to explore the project directory structure if needed.
-3. Implement the changes described in the Blueprint.
-4. Use filesystem_write to write each file.
-5. After writing all files, provide a summary of what you implemented.
-
-IMPORTANT:
-- Use filesystem_write for EACH file you need to create or modify.
-- Follow the Blueprint exactly. Respect all constraints.
-- Write clean, well-documented code.
-- After completing all file writes, respond with a text summary.
-- Also include the complete code in your final response using the format:
-  # --- FILE: path/to/file.py ---
-  (file contents)"""
-    else:
-        trace.append("developer: no tools available, using single-shot mode")
-        system_prompt = """You are the Lead Dev agent. You receive a structured Blueprint
-and write the code to implement it.
-
-Respond with the complete code implementation. Include file paths as comments
-at the top of each file section, like:
-# --- FILE: path/to/file.py ---
-
-Follow the Blueprint exactly. Respect all constraints.
-Write clean, well-documented code."""
-    user_msg = f"Blueprint:\n{blueprint.model_dump_json(indent=2)}"
-    failure_report = state.get("failure_report")
-    if failure_report and not failure_report.is_architectural:
-        user_msg += "\n\nPREVIOUS ATTEMPT FAILED:\n"
-        user_msg += f"Tests passed: {failure_report.tests_passed}\n"
-        user_msg += f"Tests failed: {failure_report.tests_failed}\n"
-        user_msg += f"Errors: {', '.join(failure_report.errors)}\n"
-        user_msg += f"Failed files: {', '.join(failure_report.failed_files)}\n"
-        user_msg += f"Recommendation: {failure_report.recommendation}\n"
-        user_msg += "\nFix the issues and regenerate the code."
-    messages = [SystemMessage(content=system_prompt), HumanMessage(content=user_msg)]
-    llm = _get_developer_llm()
-    if has_tools:
-        llm_with_tools = llm.bind_tools(tools)
-        response, tokens_used, new_tool_log = _run_async(_run_tool_loop(llm_with_tools, messages, tools, max_turns=MAX_TOOL_TURNS, tokens_used=tokens_used, trace=trace, agent_name="developer"))
-        tool_calls_log.extend(new_tool_log)
-    else:
-        response = llm.invoke(messages)
-        tokens_used += _extract_token_count(response)
-    content = _extract_text_content(response.content)
-    trace.append(f"developer: code generated ({len(content)} chars)")
-    logger.info("[DEV] done. tokens_used now=%d", tokens_used)
-    memory_writes.append({"content": f"Implemented blueprint {blueprint.task_id}: {blueprint.instructions[:200]}", "tier": "l1", "module": _infer_module(blueprint.target_files), "source_agent": "developer", "confidence": 1.0, "sandbox_origin": "locked-down", "related_files": ",".join(blueprint.target_files), "task_id": blueprint.task_id})
-    return {"generated_code": content, "status": WorkflowStatus.REVIEWING, "tokens_used": tokens_used, "trace": trace, "memory_writes": memory_writes, "tool_calls_log": tool_calls_log}
-
-
-def apply_code_node(state: GraphState) -> dict:
-    trace = list(state.get("trace", []))
-    trace.append("apply_code: starting")
-    generated_code = state.get("generated_code", "")
-    blueprint = state.get("blueprint")
-    if not generated_code:
-        trace.append("apply_code: no generated_code -- skipping")
-        return {"parsed_files": [], "trace": trace}
-    if not blueprint:
-        trace.append("apply_code: no blueprint -- skipping")
-        return {"parsed_files": [], "trace": trace}
-    try:
-        parsed = parse_generated_code(generated_code)
-    except CodeParserError as e:
-        logger.warning("[APPLY_CODE] Parse error: %s", e)
-        trace.append(f"apply_code: parse error -- {e}")
-        return {"parsed_files": [], "trace": trace}
-    if not parsed:
-        trace.append("apply_code: parser returned no files")
-        return {"parsed_files": [], "trace": trace}
-    workspace_root = _get_workspace_root()
-    safe_files = validate_paths_for_workspace(parsed, workspace_root)
-    if len(safe_files) < len(parsed):
-        skipped = len(parsed) - len(safe_files)
-        trace.append(f"apply_code: WARNING -- {skipped} file(s) skipped due to path validation")
-    total_chars = 0
-    written_count = 0
-    for pf in safe_files:
-        try:
-            target = workspace_root / pf.path
-            target.parent.mkdir(parents=True, exist_ok=True)
-            target.write_text(pf.content, encoding="utf-8")
-            written_count += 1
-            total_chars += len(pf.content)
-        except Exception as e:
-            logger.warning("[APPLY_CODE] Failed to write %s: %s", pf.path, e)
-            trace.append(f"apply_code: failed to write {pf.path} -- {e}")
-    trace.append(f"apply_code: wrote {written_count} files ({total_chars:,} chars total) to workspace")
-    logger.info("[APPLY_CODE] Wrote %d files (%d chars) to %s", written_count, total_chars, workspace_root)
-    parsed_files_data = [{"path": pf.path, "content": pf.content} for pf in safe_files]
-    return {"parsed_files": parsed_files_data, "trace": trace}
-
-
-def qa_node(state: GraphState, config: RunnableConfig | None = None) -> dict:
-    """QA: reviews the generated code. Issue #80: read-only tool access."""
-    trace = list(state.get("trace", []))
-    trace.append("qa: starting review")
-    memory_writes = list(state.get("memory_writes", []))
-    tool_calls_log = list(state.get("tool_calls_log", []))
-    retry_count = state.get("retry_count", 0)
-    tokens_used = state.get("tokens_used", 0)
-    logger.info("[QA] retry_count=%d, tokens_used=%d, status=%s", retry_count, tokens_used, state.get("status", "unknown"))
-    generated_code = state.get("generated_code", "")
-    blueprint = state.get("blueprint")
-    if not generated_code or not blueprint:
-        trace.append("qa: missing code or blueprint")
-        return {"status": WorkflowStatus.FAILED, "error_message": "QA received no code or blueprint to review", "trace": trace}
-    tools = _get_agent_tools(config, QA_TOOL_NAMES)
-    has_tools = len(tools) > 0
-    if has_tools:
-        tool_names = [t.name for t in tools]
-        trace.append(f"qa: {len(tools)} tools available: {', '.join(tool_names)}")
-    system_prompt = "You are the QA agent. You review code against a Blueprint's acceptance criteria.\n\n"
-    if has_tools:
-        system_prompt += "You have tools to read files from the workspace. Use filesystem_read to inspect the actual files that were written, and filesystem_list to check the project structure.\n\n"
-    system_prompt += 'Respond with ONLY a valid JSON object matching this schema:\n{\n  "task_id": "string (from the Blueprint)",\n  "status": "pass" or "fail" or "escalate",\n  "tests_passed": number,\n  "tests_failed": number,\n  "errors": ["list of specific error descriptions"],\n  "failed_files": ["list of files with issues"],\n  "is_architectural": true/false,\n  "failure_type": "code" or "architectural" or null (if pass),\n  "recommendation": "what to fix or why it should escalate"\n}\n\nFAILURE CLASSIFICATION (critical for correct routing):\n\nSet failure_type to "code" (status: "fail") when:\n- Implementation has bugs, syntax errors, or type errors\n- Tests fail due to logic errors in the code\n- Code does not follow the Blueprint\'s constraints\n- Missing error handling or edge cases\nAction: Lead Dev will retry with the same Blueprint.\n\nSet failure_type to "architectural" (status: "escalate") when:\n- Blueprint targets the WRONG files (code is in the wrong place)\n- A required dependency or import is missing from the Blueprint\n- The design approach is fundamentally flawed\n- Acceptance criteria are impossible to meet with current targets\n- The task requires files not listed in target_files\nAction: Architect will generate a completely NEW Blueprint.\n\nBe strict but fair. Only pass code that meets ALL acceptance criteria.\nDo not include any text before or after the JSON.'
-    bp_json = blueprint.model_dump_json(indent=2)
-    user_msg = f"Blueprint:\n{bp_json}\n\nGenerated Code:\n{generated_code}"
-    sandbox_result = state.get("sandbox_result")
-    if sandbox_result is not None:
-        user_msg += "\n\nSandbox Validation Results:\n"
-        user_msg += f"  Exit code: {sandbox_result.exit_code}\n"
-        if sandbox_result.tests_passed is not None:
-            user_msg += f"  Tests passed: {sandbox_result.tests_passed}\n"
-        if sandbox_result.tests_failed is not None:
-            user_msg += f"  Tests failed: {sandbox_result.tests_failed}\n"
-        if sandbox_result.errors:
-            user_msg += f"  Errors: {', '.join(sandbox_result.errors)}\n"
-        if sandbox_result.output_summary:
-            user_msg += f"  Output:\n{sandbox_result.output_summary}\n"
-        user_msg += "\nUse these real test results to inform your review. If sandbox tests passed, weigh that heavily in your verdict."
-    else:
-        user_msg += "\n\nNote: Sandbox validation was not available for this review. Evaluate the code based on the Blueprint criteria only."
-    messages = [SystemMessage(content=system_prompt), HumanMessage(content=user_msg)]
-    llm = _get_qa_llm()
-    if has_tools:
-        llm_with_tools = llm.bind_tools(tools)
-        response, tokens_used, new_tool_log = _run_async(_run_tool_loop(llm_with_tools, messages, tools, max_turns=5, tokens_used=tokens_used, trace=trace, agent_name="qa"))
-        tool_calls_log.extend(new_tool_log)
-    else:
-        response = llm.invoke(messages)
-        tokens_used += _extract_token_count(response)
-    try:
-        raw = _extract_text_content(response.content)
-        report_data = _extract_json(raw)
-        failure_report = FailureReport(**report_data)
-    except (json.JSONDecodeError, Exception) as e:
-        trace.append(f"qa: failed to parse report: {e}")
-        return {"status": WorkflowStatus.FAILED, "error_message": f"QA failed to produce valid report: {e}", "trace": trace}
-    trace.append(f"qa: verdict={failure_report.status}, passed={failure_report.tests_passed}, failed={failure_report.tests_failed}")
-    if failure_report.status == "pass":
-        status = WorkflowStatus.PASSED
-        memory_writes.append({"content": f"QA passed for {blueprint.task_id}: {failure_report.tests_passed} tests passed", "tier": "l2", "module": _infer_module(blueprint.target_files), "source_agent": "qa", "confidence": 1.0, "sandbox_origin": "locked-down", "related_files": ",".join(blueprint.target_files), "task_id": blueprint.task_id})
-    elif failure_report.is_architectural:
-        status = WorkflowStatus.ESCALATED
-        memory_writes.append({"content": f"Architectural issue in {blueprint.task_id}: {failure_report.recommendation}", "tier": "l0-discovered", "module": _infer_module(blueprint.target_files), "source_agent": "qa", "confidence": 0.85, "sandbox_origin": "locked-down", "related_files": ",".join(failure_report.failed_files), "task_id": blueprint.task_id})
-    else:
-        status = WorkflowStatus.REVIEWING
-    new_retry_count = retry_count + (1 if failure_report.status != "pass" else 0)
-    logger.info("[QA] verdict=%s, retry_count %d->%d, tokens_used=%d", failure_report.status, retry_count, new_retry_count, tokens_used)
-    return {"failure_report": failure_report, "status": status, "tokens_used": tokens_used, "retry_count": new_retry_count, "trace": trace, "memory_writes": memory_writes, "tool_calls_log": tool_calls_log}
-
-
-def _run_sandbox_validation(commands, template, generated_code, parsed_files=None, timeout=120):
-    api_key = os.getenv("E2B_API_KEY")
-    if not api_key:
-        return None
-    runner = E2BRunner(api_key=api_key, default_timeout=timeout)
-    project_files = None
-    if parsed_files:
-        project_files = {pf["path"]: pf["content"] for pf in parsed_files}
-    compound_cmd = " && ".join(commands)
-    return runner.run_tests(test_command=compound_cmd, project_files=project_files, timeout=timeout, template=template)
-
-
-def sandbox_validate_node(state: GraphState) -> dict:
-    trace = list(state.get("trace", []))
-    trace.append("sandbox_validate: starting")
-    blueprint = state.get("blueprint")
-    if not blueprint:
-        trace.append("sandbox_validate: no blueprint -- skipping")
-        return {"sandbox_result": None, "trace": trace}
-    plan = get_validation_plan(blueprint.target_files)
-    trace.append(f"sandbox_validate: {plan.description}")
-    if not plan.commands:
-        trace.append("sandbox_validate: no code validation needed -- skipping")
-        return {"sandbox_result": None, "trace": trace}
-    template_label = plan.template or "default"
-    trace.append(f"sandbox_validate: template={template_label}, commands={len(plan.commands)}")
-    generated_code = state.get("generated_code", "")
-    parsed_files = state.get("parsed_files", [])
-    if parsed_files:
-        trace.append(f"sandbox_validate: loading {len(parsed_files)} files into sandbox")
-    try:
-        result = _run_sandbox_validation(commands=plan.commands, template=plan.template, generated_code=generated_code, parsed_files=parsed_files if parsed_files else None)
-        if result is None:
-            logger.warning("[SANDBOX] E2B_API_KEY not configured -- sandbox validation SKIPPED.")
-            trace.append("sandbox_validate: WARNING -- E2B_API_KEY not configured, sandbox validation skipped")
-            return {"sandbox_result": None, "trace": trace}
-        trace.append(f"sandbox_validate: exit_code={result.exit_code}, passed={result.tests_passed}, failed={result.tests_failed}")
-        logger.info("[SANDBOX] Validation complete: exit=%d, passed=%s, failed=%s", result.exit_code, result.tests_passed, result.tests_failed)
-        return {"sandbox_result": result, "trace": trace}
-    except Exception as e:
-        logger.warning("[SANDBOX] Validation failed with error: %s", e)
-        trace.append(f"sandbox_validate: error -- {type(e).__name__}: {e}")
-        return {"sandbox_result": None, "trace": trace}
-
-
-def flush_memory_node(state: GraphState) -> dict:
-    trace = list(state.get("trace", []))
-    trace.append("flush_memory: starting")
-    memory_writes = state.get("memory_writes", [])
-    if not memory_writes:
-        trace.append("flush_memory: no writes to flush")
-        return {"trace": trace}
-    consolidated = summarize_writes_sync(memory_writes)
-    trace.append(f"flush_memory: summarizer {len(memory_writes)} -> {len(consolidated)} entries")
-    written_entries: list[dict] = []
-    try:
-        store = _get_memory_store()
-        for entry in consolidated:
-            tier = entry.get("tier", "l1")
-            content = entry.get("content", "")
-            module = entry.get("module", "global")
-            source_agent = entry.get("source_agent", "unknown")
-            confidence = entry.get("confidence", 1.0)
-            sandbox_origin = entry.get("sandbox_origin", "none")
-            related_files = entry.get("related_files", "")
-            task_id = entry.get("task_id", "")
-            if tier == "l0-discovered":
-                store.add_l0_discovered(content, module=module, source_agent=source_agent, confidence=confidence, sandbox_origin=sandbox_origin, related_files=related_files, task_id=task_id)
-            elif tier == "l2":
-                store.add_l2(content, module=module, source_agent=source_agent, related_files=related_files, task_id=task_id)
-            else:
-                store.add_l1(content, module=module, source_agent=source_agent, confidence=confidence, sandbox_origin=sandbox_origin, related_files=related_files, task_id=task_id)
-            written_entries.append(entry)
-        trace.append(f"flush_memory: wrote {len(written_entries)} entries to store")
-        logger.info("[FLUSH] Wrote %d memory entries", len(written_entries))
-    except Exception as e:
-        trace.append(f"flush_memory: store write failed: {e}")
-        logger.warning("[FLUSH] Memory store write failed: %s", e)
-    return {"trace": trace, "memory_writes_flushed": written_entries}
-
-
-def route_after_qa(state: GraphState) -> Literal["flush_memory", "developer", "architect", "__end__"]:
-    status = state.get("status", WorkflowStatus.FAILED)
-    retry_count = state.get("retry_count", 0)
-    tokens_used = state.get("tokens_used", 0)
-    logger.info("[ROUTER] status=%s, retry_count=%d, tokens_used=%d, max_retries=%d, token_budget=%d", status, retry_count, tokens_used, MAX_RETRIES, TOKEN_BUDGET)
-    if status == WorkflowStatus.PASSED:
-        return "flush_memory"
-    if status == WorkflowStatus.FAILED:
-        return END
-    if retry_count >= MAX_RETRIES:
-        return "flush_memory"
-    if tokens_used >= TOKEN_BUDGET:
-        return "flush_memory"
-    if status == WorkflowStatus.ESCALATED:
-        return "architect"
-    return "developer"
-
-
-def build_graph() -> StateGraph:
-    graph = StateGraph(GraphState)
-    graph.add_node("architect", architect_node)
-    graph.add_node("developer", developer_node)
-    graph.add_node("apply_code", apply_code_node)
-    graph.add_node("sandbox_validate", sandbox_validate_node)
-    graph.add_node("qa", qa_node)
-    graph.add_node("flush_memory", flush_memory_node)
-    graph.add_edge(START, "architect")
-    graph.add_edge("architect", "developer")
-    graph.add_edge("developer", "apply_code")
-    graph.add_edge("apply_code", "sandbox_validate")
-    graph.add_edge("sandbox_validate", "qa")
-    graph.add_conditional_edges("qa", route_after_qa)
-    graph.add_edge("flush_memory", END)
-    return graph
-
-
-def create_workflow():
-    return build_graph().compile()
-
-
-def init_tools_config(workspace_root=None):
-    if workspace_root is None:
-        workspace_root = _get_workspace_root()
-    try:
-        from .tools import create_provider, get_tools, load_mcp_config
-        config_path = Path(workspace_root) / "mcp-config.json"
-        if not config_path.is_file():
-            logger.info("[TOOLS] No mcp-config.json found at %s, tools disabled", config_path)
-            return {"configurable": {"tools": []}}
-        mcp_config = load_mcp_config(str(config_path))
-        provider = create_provider(mcp_config, workspace_root)
-        tools = get_tools(provider)
-        logger.info("[TOOLS] Loaded %d tools from provider", len(tools))
-        return {"configurable": {"tools": tools}}
-    except Exception as e:
-        logger.warning("[TOOLS] Failed to initialize tools: %s", e, exc_info=True)
-        return {"configurable": {"tools": []}}
-
-
-def run_task(task_description, enable_tracing=True, session_id=None, tags=None):
-    trace_config = create_trace_config(enabled=enable_tracing, task_description=task_description, session_id=session_id, tags=tags or ["orchestrator"], metadata={"max_retries": str(MAX_RETRIES), "token_budget": str(TOKEN_BUDGET)})
-    workflow = create_workflow()
-    tools_config = init_tools_config()
-    initial_state: GraphState = {"task_description": task_description, "blueprint": None, "generated_code": "", "failure_report": None, "status": WorkflowStatus.PLANNING, "retry_count": 0, "tokens_used": 0, "error_message": "", "memory_context": [], "memory_writes": [], "trace": [], "sandbox_result": None, "parsed_files": [], "tool_calls_log": []}
-    invoke_config = {"recursion_limit": 25, **tools_config}
-    if trace_config.callbacks:
-        invoke_config["callbacks"] = trace_config.callbacks
-    with trace_config.propagation_context():
-        add_trace_event(trace_config, "orchestrator_start", metadata={"task_preview": task_description[:200], "max_retries": MAX_RETRIES, "token_budget": TOKEN_BUDGET, "tools_available": len(tools_config.get("configurable", {}).get("tools", []))})
-        result = workflow.invoke(initial_state, config=invoke_config)
-        final_state = AgentState(**result)
-        add_trace_event(trace_config, "orchestrator_complete", metadata={"status": final_state.status.value, "tokens_used": final_state.tokens_used, "retry_count": final_state.retry_count, "memory_writes_count": len(final_state.memory_writes), "tool_calls_count": len(final_state.tool_calls_log)})
-    trace_config.flush()
-    return final_state
+replace_with_file_content
\ No newline at end of file

From 413745c0eeb9ce8a27819882c10c43a52b3ba14f Mon Sep 17 00:00:00 2001
From: Mike Abernathy <mabernathy87@gmail.com>
Date: Thu, 2 Apr 2026 12:40:47 -0600
Subject: [PATCH 05/12] feat(orchestrator): strategy-aware
 sandbox_validate_node (#95)

---
 dev-suite/src/orchestrator.py | 758 +++++++++++++++++++++++++++++++++-
 1 file changed, 757 insertions(+), 1 deletion(-)

diff --git a/dev-suite/src/orchestrator.py b/dev-suite/src/orchestrator.py
index 45fea74..65bd785 100644
--- a/dev-suite/src/orchestrator.py
+++ b/dev-suite/src/orchestrator.py
@@ -1 +1,757 @@
-replace_with_file_content
\ No newline at end of file
+"""LangGraph orchestrator -- Architect -> Lead Dev -> apply_code -> sandbox -> QA loop.
+
+This is the main entry point for the agent workflow.
+Implements the state machine with retry logic, token budgets,
+structured Blueprint passing, human escalation, code application,
+tool binding (issue #80), and memory write-back.
+
+Issue #80: Agent tool binding -- Dev and QA agents can now use
+workspace tools (filesystem_read, filesystem_write, etc.) via
+LangChain's bind_tools() + iterative tool execution loop.
+Tools are passed via RunnableConfig["configurable"]["tools"].
+"""
+
+import asyncio
+import json
+import logging
+import os
+import re
+from enum import Enum
+from pathlib import Path
+from typing import Any, Literal, TypedDict
+
+from dotenv import load_dotenv
+from langchain_anthropic import ChatAnthropic
+from langchain_core.messages import (
+    AIMessage,
+    HumanMessage,
+    SystemMessage,
+    ToolMessage,
+)
+from langchain_core.runnables.config import RunnableConfig
+from langchain_google_genai import ChatGoogleGenerativeAI
+from langgraph.graph import END, START, StateGraph
+from pydantic import BaseModel
+
+from .agents.architect import Blueprint
+from .agents.qa import FailureReport, FailureType
+from .memory.factory import create_memory_store
+from .memory.protocol import MemoryQueryResult, MemoryStore
+from .memory.summarizer import summarize_writes_sync
+from .sandbox.e2b_runner import E2BRunner, SandboxResult
+from .sandbox.validation_commands import (
+    ValidationPlan,
+    ValidationStrategy,
+    format_validation_summary,
+    get_validation_plan,
+)
+from .tools.code_parser import (
+    CodeParserError,
+    parse_generated_code,
+    validate_paths_for_workspace,
+)
+from .tracing import add_trace_event, create_trace_config
+
+load_dotenv()
+
+logger = logging.getLogger(__name__)
+
+
+def _safe_int(env_key: str, default: int) -> int:
+    raw = os.getenv(env_key, str(default))
+    try:
+        return int(raw)
+    except (ValueError, TypeError):
+        logger.warning("%s=%r is not a valid integer, using default %d", env_key, raw, default)
+        return default
+
+MAX_RETRIES = _safe_int("MAX_RETRIES", 3)
+TOKEN_BUDGET = _safe_int("TOKEN_BUDGET", 50000)
+MAX_TOOL_TURNS = _safe_int("MAX_TOOL_TURNS", 10)
+
+
+def _get_workspace_root() -> Path:
+    raw = os.getenv("WORKSPACE_ROOT", ".")
+    return Path(raw).resolve()
+
+
+class WorkflowStatus(str, Enum):
+    PLANNING = "planning"
+    BUILDING = "building"
+    REVIEWING = "reviewing"
+    PASSED = "passed"
+    FAILED = "failed"
+    ESCALATED = "escalated"
+
+
+class GraphState(TypedDict, total=False):
+    task_description: str
+    blueprint: Blueprint | None
+    generated_code: str
+    failure_report: FailureReport | None
+    status: WorkflowStatus
+    retry_count: int
+    tokens_used: int
+    error_message: str
+    memory_context: list[str]
+    memory_writes: list[dict]
+    trace: list[str]
+    sandbox_result: SandboxResult | None
+    parsed_files: list[dict]
+    tool_calls_log: list[dict]
+    memory_writes_flushed: list[dict]
+
+
+class AgentState(BaseModel):
+    task_description: str = ""
+    blueprint: Blueprint | None = None
+    generated_code: str = ""
+    failure_report: FailureReport | None = None
+    status: WorkflowStatus = WorkflowStatus.PLANNING
+    retry_count: int = 0
+    tokens_used: int = 0
+    error_message: str = ""
+    memory_context: list[str] = []
+    memory_writes: list[dict] = []
+    trace: list[str] = []
+    sandbox_result: SandboxResult | None = None
+    parsed_files: list[dict] = []
+    tool_calls_log: list[dict] = []
+
+
+def _get_architect_llm():
+    return ChatGoogleGenerativeAI(model=os.getenv("ARCHITECT_MODEL", "gemini-3-flash-preview"), google_api_key=os.getenv("GOOGLE_API_KEY"), temperature=0.2)
+
+
+def _get_developer_llm():
+    return ChatAnthropic(model=os.getenv("DEVELOPER_MODEL", "claude-sonnet-4-20250514"), api_key=os.getenv("ANTHROPIC_API_KEY"), temperature=0.1, max_tokens=8192)
+
+
+def _get_qa_llm():
+    return ChatAnthropic(model=os.getenv("QA_MODEL", "claude-sonnet-4-20250514"), api_key=os.getenv("ANTHROPIC_API_KEY"), temperature=0.0, max_tokens=4096)
+
+
+def _extract_text_content(content: Any) -> str:
+    if isinstance(content, str):
+        return content
+    if isinstance(content, list):
+        parts = []
+        for block in content:
+            if isinstance(block, str):
+                parts.append(block)
+            elif isinstance(block, dict) and "text" in block:
+                parts.append(block["text"])
+            elif hasattr(block, "text"):
+                parts.append(block.text)
+        return "\n".join(parts)
+    return str(content)
+
+
+def _extract_json(raw: str) -> dict:
+    text = raw.strip()
+    try:
+        return json.loads(text)
+    except json.JSONDecodeError:
+        pass
+    if "```" in text:
+        parts = text.split("```")
+        for part in parts[1::2]:
+            inner = part.strip()
+            if inner.startswith("json"):
+                inner = inner[4:].strip()
+            try:
+                return json.loads(inner)
+            except json.JSONDecodeError:
+                continue
+    first_brace = text.find("{")
+    last_brace = text.rfind("}")
+    if first_brace != -1 and last_brace > first_brace:
+        try:
+            return json.loads(text[first_brace:last_brace + 1])
+        except json.JSONDecodeError:
+            pass
+    raise json.JSONDecodeError(f"No valid JSON found in response ({len(text)} chars): {text[:200]}...", text, 0)
+
+
+def _extract_token_count(response: Any) -> int:
+    meta = getattr(response, "usage_metadata", None)
+    if not meta:
+        return 0
+    if isinstance(meta, dict):
+        for key in ("total_tokens", "totalTokenCount", "total_token_count"):
+            if key in meta:
+                return int(meta[key])
+        input_t = meta.get("input_tokens", meta.get("prompt_tokens", 0))
+        output_t = meta.get("output_tokens", meta.get("completion_tokens", 0))
+        if input_t or output_t:
+            return int(input_t) + int(output_t)
+    return 0
+
+
+def _get_memory_store() -> MemoryStore:
+    return create_memory_store()
+
+
+def _fetch_memory_context(task_description: str) -> list[str]:
+    try:
+        store = _get_memory_store()
+        results = store.query(task_description, n_results=10)
+        return [r.content for r in results]
+    except Exception:
+        return []
+
+
+def _infer_module(target_files: list[str]) -> str:
+    if not target_files:
+        return "global"
+    first_file = target_files[0]
+    parts = first_file.replace("\\", "/").split("/")
+    if len(parts) > 2 and parts[0] == "src":
+        return parts[1]
+    if len(parts) > 1:
+        return parts[0]
+    return "global"
+
+
+# -- Tool Binding (Issue #80) --
+
+# Fix 2: Removed github_create_pr -- non-idempotent write should not be in
+# an iterative tool loop. PR creation belongs in a post-QA-pass step.
+DEV_TOOL_NAMES = {"filesystem_read", "filesystem_write", "filesystem_list", "github_read_diff"}
+QA_TOOL_NAMES = {"filesystem_read", "filesystem_list", "github_read_diff"}
+
+# Fix 4: Secret pattern regexes for sanitizing tool call previews
+_SECRET_PATTERNS = [
+    re.compile(r'(?:sk|pk|api|key|token|secret|password|bearer)[_-]?\w{10,}', re.IGNORECASE),
+    re.compile(r'(?:ghp|gho|ghu|ghs|ghr)_[A-Za-z0-9_]{30,}'),
+    re.compile(r'(?:eyJ)[A-Za-z0-9_-]{20,}'),
+    re.compile(r'(?:AKIA|ASIA)[A-Z0-9]{16}'),
+]
+
+
+def _sanitize_preview(text: str, max_len: int = 200) -> str:
+    """Truncate and redact known secret patterns from tool call previews."""
+    if not text:
+        return ""
+    sanitized = text
+    for pattern in _SECRET_PATTERNS:
+        sanitized = pattern.sub("[REDACTED]", sanitized)
+    if len(sanitized) > max_len:
+        sanitized = sanitized[:max_len] + "..."
+    return sanitized
+
+
+def _get_agent_tools(config, allowed_names=None):
+    if not config:
+        return []
+    configurable = config.get("configurable", {})
+    tools = configurable.get("tools", [])
+    if not tools:
+        return []
+    if allowed_names is None:
+        return list(tools)
+    return [t for t in tools if t.name in allowed_names]
+
+
+async def _execute_tool_call(tool_call, tools):
+    """Execute a single tool call. Uses ainvoke/invoke public API (Fix 3)."""
+    tool_name = tool_call.get("name", "")
+    tool_args = tool_call.get("args", {})
+    tool_id = tool_call.get("id", "unknown")
+    tool_map = {t.name: t for t in tools}
+    tool = tool_map.get(tool_name)
+    if not tool:
+        return ToolMessage(content=f"Error: Tool '{tool_name}' not found. Available: {list(tool_map.keys())}", tool_call_id=tool_id)
+    try:
+        # Fix 3: Use public ainvoke/invoke API instead of tool.coroutine.
+        # ainvoke handles input validation, callbacks, and config propagation.
+        if hasattr(tool, "ainvoke"):
+            result = await tool.ainvoke(tool_args)
+        else:
+            result = tool.invoke(tool_args)
+        return ToolMessage(content=str(result), tool_call_id=tool_id)
+    except Exception as e:
+        logger.warning("[TOOLS] Tool %s failed: %s", tool_name, e)
+        return ToolMessage(content=f"Error executing {tool_name}: {type(e).__name__}: {e}", tool_call_id=tool_id)
+
+
+async def _run_tool_loop(llm_with_tools, messages, tools, max_turns=MAX_TOOL_TURNS, tokens_used=0, trace=None, agent_name="agent"):
+    if trace is None:
+        trace = []
+    tool_calls_log = []
+    current_messages = list(messages)
+    # Fix 5: Guard against max_turns <= 0 to prevent unbound response variable
+    if max_turns <= 0:
+        logger.warning("[%s] max_turns=%d, skipping tool loop", agent_name.upper(), max_turns)
+        trace.append(f"{agent_name}: tool loop skipped (max_turns={max_turns})")
+        last_msg = current_messages[-1] if current_messages else AIMessage(content="")
+        return last_msg, tokens_used, tool_calls_log
+    for turn in range(max_turns):
+        response = await llm_with_tools.ainvoke(current_messages)
+        tokens_used += _extract_token_count(response)
+        tool_calls = getattr(response, "tool_calls", None) or []
+        if not tool_calls:
+            trace.append(f"{agent_name}: tool loop done after {turn} tool turn(s)")
+            return response, tokens_used, tool_calls_log
+        trace.append(f"{agent_name}: turn {turn + 1} -- {len(tool_calls)} tool call(s): {', '.join(tc.get('name', '?') for tc in tool_calls)}")
+        logger.info("[%s] Tool turn %d: %d calls", agent_name.upper(), turn + 1, len(tool_calls))
+        current_messages.append(response)
+        for tc in tool_calls:
+            tool_msg = await _execute_tool_call(tc, tools)
+            current_messages.append(tool_msg)
+            # Fix 4: Sanitize previews before persisting to tool_calls_log
+            tool_calls_log.append({"agent": agent_name, "turn": turn + 1, "tool": tc.get("name", "unknown"), "args_preview": _sanitize_preview(str(tc.get("args", {}))), "result_preview": _sanitize_preview(str(tool_msg.content)), "success": not tool_msg.content.startswith("Error")})
+    trace.append(f"{agent_name}: tool loop hit max turns ({max_turns})")
+    logger.warning("[%s] Hit max tool turns (%d)", agent_name.upper(), max_turns)
+    return response, tokens_used, tool_calls_log
+
+
+def _run_async(coro):
+    """Run an async coroutine from sync context for the tool loop.
+
+    Design note (re: CodeRabbit #8): This is intentional. developer_node and
+    qa_node are sync functions that use _run_async() to bridge into the async
+    tool loop. run_task() uses workflow.invoke() (sync). Converting everything
+    to async would cascade changes to CLI, tests, and callers with no benefit
+    since LangGraph handles sync nodes with internal async bridges fine.
+    """
+    try:
+        loop = asyncio.get_running_loop()
+    except RuntimeError:
+        loop = None
+    if loop and loop.is_running():
+        import concurrent.futures
+        with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
+            return pool.submit(asyncio.run, coro).result()
+    else:
+        return asyncio.run(coro)
+
+
+# -- Node Functions --
+
+def architect_node(state: GraphState) -> dict:
+    trace = list(state.get("trace", []))
+    trace.append("architect: starting planning")
+    retry_count = state.get("retry_count", 0)
+    tokens_used = state.get("tokens_used", 0)
+    logger.info("[ARCH] retry_count=%d, tokens_used=%d, status=%s", retry_count, tokens_used, state.get("status", "unknown"))
+    memory_context = _fetch_memory_context(state.get("task_description", ""))
+    memory_block = ""
+    if memory_context:
+        memory_block = "\n\nProject context from memory:\n" + "\n".join(f"- {c}" for c in memory_context)
+    system_prompt = f"""You are the Architect agent. Your job is to create a structured Blueprint
+for a coding task. You NEVER write code yourself.
+
+Respond with ONLY a valid JSON object matching this schema:
+{{
+  "task_id": "string (short unique identifier)",
+  "target_files": ["list of file paths to create or modify"],
+  "instructions": "clear step-by-step instructions for the developer",
+  "constraints": ["list of constraints or requirements"],
+  "acceptance_criteria": ["list of testable criteria for QA"]
+}}
+
+Do not include any text before or after the JSON.{memory_block}"""
+    user_msg = state.get("task_description", "")
+    failure_report = state.get("failure_report")
+    if failure_report and failure_report.is_architectural:
+        user_msg += "\n\nPREVIOUS ATTEMPT FAILED (architectural issue):\n"
+        user_msg += f"Errors: {', '.join(failure_report.errors)}\n"
+        if failure_report.failed_files:
+            user_msg += f"Failed files: {', '.join(failure_report.failed_files)}\n"
+        user_msg += f"Recommendation: {failure_report.recommendation}\n"
+        user_msg += "\nGenerate a COMPLETELY NEW Blueprint. Do not patch the old one. The previous target_files or approach was wrong."
+    llm = _get_architect_llm()
+    response = llm.invoke([SystemMessage(content=system_prompt), HumanMessage(content=user_msg)])
+    try:
+        raw = _extract_text_content(response.content)
+        blueprint_data = _extract_json(raw)
+        blueprint = Blueprint(**blueprint_data)
+    except (json.JSONDecodeError, Exception) as e:
+        trace.append(f"architect: failed to parse blueprint: {e}")
+        logger.error("[ARCH] Blueprint parse failed: %s", e)
+        return {"status": WorkflowStatus.FAILED, "error_message": f"Architect failed to produce valid Blueprint: {e}", "trace": trace, "memory_context": memory_context}
+    trace.append(f"architect: blueprint created for {len(blueprint.target_files)} files")
+    tokens_used = tokens_used + _extract_token_count(response)
+    logger.info("[ARCH] done. tokens_used now=%d", tokens_used)
+    return {"blueprint": blueprint, "status": WorkflowStatus.BUILDING, "tokens_used": tokens_used, "trace": trace, "memory_context": memory_context}
+
+
+def developer_node(state: GraphState, config: RunnableConfig | None = None) -> dict:
+    """Lead Dev: executes the Blueprint and generates code. Issue #80: tool binding support."""
+    trace = list(state.get("trace", []))
+    trace.append("developer: starting build")
+    memory_writes = list(state.get("memory_writes", []))
+    tool_calls_log = list(state.get("tool_calls_log", []))
+    retry_count = state.get("retry_count", 0)
+    tokens_used = state.get("tokens_used", 0)
+    logger.info("[DEV] retry_count=%d, tokens_used=%d, status=%s", retry_count, tokens_used, state.get("status", "unknown"))
+    blueprint = state.get("blueprint")
+    if not blueprint:
+        trace.append("developer: no blueprint provided")
+        return {"status": WorkflowStatus.FAILED, "error_message": "Developer received no Blueprint", "trace": trace}
+    tools = _get_agent_tools(config, DEV_TOOL_NAMES)
+    has_tools = len(tools) > 0
+    if has_tools:
+        tool_names = [t.name for t in tools]
+        trace.append(f"developer: {len(tools)} tools available: {', '.join(tool_names)}")
+        system_prompt = """You are the Lead Dev agent. You receive a structured Blueprint and implement it using the tools available to you.
+
+WORKFLOW:
+1. Use filesystem_read to examine the existing files listed in target_files.
+2. Use filesystem_list to explore the project directory structure if needed.
+3. Implement the changes described in the Blueprint.
+4. Use filesystem_write to write each file.
+5. After writing all files, provide a summary of what you implemented.
+
+IMPORTANT:
+- Use filesystem_write for EACH file you need to create or modify.
+- Follow the Blueprint exactly. Respect all constraints.
+- Write clean, well-documented code.
+- After completing all file writes, respond with a text summary.
+- Also include the complete code in your final response using the format:
+  # --- FILE: path/to/file.py ---
+  (file contents)"""
+    else:
+        trace.append("developer: no tools available, using single-shot mode")
+        system_prompt = """You are the Lead Dev agent. You receive a structured Blueprint
+and write the code to implement it.
+
+Respond with the complete code implementation. Include file paths as comments
+at the top of each file section, like:
+# --- FILE: path/to/file.py ---
+
+Follow the Blueprint exactly. Respect all constraints.
+Write clean, well-documented code."""
+    user_msg = f"Blueprint:\n{blueprint.model_dump_json(indent=2)}"
+    failure_report = state.get("failure_report")
+    if failure_report and not failure_report.is_architectural:
+        user_msg += "\n\nPREVIOUS ATTEMPT FAILED:\n"
+        user_msg += f"Tests passed: {failure_report.tests_passed}\n"
+        user_msg += f"Tests failed: {failure_report.tests_failed}\n"
+        user_msg += f"Errors: {', '.join(failure_report.errors)}\n"
+        user_msg += f"Failed files: {', '.join(failure_report.failed_files)}\n"
+        user_msg += f"Recommendation: {failure_report.recommendation}\n"
+        user_msg += "\nFix the issues and regenerate the code."
+    messages = [SystemMessage(content=system_prompt), HumanMessage(content=user_msg)]
+    llm = _get_developer_llm()
+    if has_tools:
+        llm_with_tools = llm.bind_tools(tools)
+        response, tokens_used, new_tool_log = _run_async(_run_tool_loop(llm_with_tools, messages, tools, max_turns=MAX_TOOL_TURNS, tokens_used=tokens_used, trace=trace, agent_name="developer"))
+        tool_calls_log.extend(new_tool_log)
+    else:
+        response = llm.invoke(messages)
+        tokens_used += _extract_token_count(response)
+    content = _extract_text_content(response.content)
+    trace.append(f"developer: code generated ({len(content)} chars)")
+    logger.info("[DEV] done. tokens_used now=%d", tokens_used)
+    memory_writes.append({"content": f"Implemented blueprint {blueprint.task_id}: {blueprint.instructions[:200]}", "tier": "l1", "module": _infer_module(blueprint.target_files), "source_agent": "developer", "confidence": 1.0, "sandbox_origin": "locked-down", "related_files": ",".join(blueprint.target_files), "task_id": blueprint.task_id})
+    return {"generated_code": content, "status": WorkflowStatus.REVIEWING, "tokens_used": tokens_used, "trace": trace, "memory_writes": memory_writes, "tool_calls_log": tool_calls_log}
+
+
+def apply_code_node(state: GraphState) -> dict:
+    trace = list(state.get("trace", []))
+    trace.append("apply_code: starting")
+    generated_code = state.get("generated_code", "")
+    blueprint = state.get("blueprint")
+    if not generated_code:
+        trace.append("apply_code: no generated_code -- skipping")
+        return {"parsed_files": [], "trace": trace}
+    if not blueprint:
+        trace.append("apply_code: no blueprint -- skipping")
+        return {"parsed_files": [], "trace": trace}
+    try:
+        parsed = parse_generated_code(generated_code)
+    except CodeParserError as e:
+        logger.warning("[APPLY_CODE] Parse error: %s", e)
+        trace.append(f"apply_code: parse error -- {e}")
+        return {"parsed_files": [], "trace": trace}
+    if not parsed:
+        trace.append("apply_code: parser returned no files")
+        return {"parsed_files": [], "trace": trace}
+    workspace_root = _get_workspace_root()
+    safe_files = validate_paths_for_workspace(parsed, workspace_root)
+    if len(safe_files) < len(parsed):
+        skipped = len(parsed) - len(safe_files)
+        trace.append(f"apply_code: WARNING -- {skipped} file(s) skipped due to path validation")
+    total_chars = 0
+    written_count = 0
+    for pf in safe_files:
+        try:
+            target = workspace_root / pf.path
+            target.parent.mkdir(parents=True, exist_ok=True)
+            target.write_text(pf.content, encoding="utf-8")
+            written_count += 1
+            total_chars += len(pf.content)
+        except Exception as e:
+            logger.warning("[APPLY_CODE] Failed to write %s: %s", pf.path, e)
+            trace.append(f"apply_code: failed to write {pf.path} -- {e}")
+    trace.append(f"apply_code: wrote {written_count} files ({total_chars:,} chars total) to workspace")
+    logger.info("[APPLY_CODE] Wrote %d files (%d chars) to %s", written_count, total_chars, workspace_root)
+    parsed_files_data = [{"path": pf.path, "content": pf.content} for pf in safe_files]
+    return {"parsed_files": parsed_files_data, "trace": trace}
+
+
+def qa_node(state: GraphState, config: RunnableConfig | None = None) -> dict:
+    """QA: reviews the generated code. Issue #80: read-only tool access."""
+    trace = list(state.get("trace", []))
+    trace.append("qa: starting review")
+    memory_writes = list(state.get("memory_writes", []))
+    tool_calls_log = list(state.get("tool_calls_log", []))
+    retry_count = state.get("retry_count", 0)
+    tokens_used = state.get("tokens_used", 0)
+    logger.info("[QA] retry_count=%d, tokens_used=%d, status=%s", retry_count, tokens_used, state.get("status", "unknown"))
+    generated_code = state.get("generated_code", "")
+    blueprint = state.get("blueprint")
+    if not generated_code or not blueprint:
+        trace.append("qa: missing code or blueprint")
+        return {"status": WorkflowStatus.FAILED, "error_message": "QA received no code or blueprint to review", "trace": trace}
+    tools = _get_agent_tools(config, QA_TOOL_NAMES)
+    has_tools = len(tools) > 0
+    if has_tools:
+        tool_names = [t.name for t in tools]
+        trace.append(f"qa: {len(tools)} tools available: {', '.join(tool_names)}")
+    system_prompt = "You are the QA agent. You review code against a Blueprint's acceptance criteria.\n\n"
+    if has_tools:
+        system_prompt += "You have tools to read files from the workspace. Use filesystem_read to inspect the actual files that were written, and filesystem_list to check the project structure.\n\n"
+    system_prompt += 'Respond with ONLY a valid JSON object matching this schema:\n{\n  "task_id": "string (from the Blueprint)",\n  "status": "pass" or "fail" or "escalate",\n  "tests_passed": number,\n  "tests_failed": number,\n  "errors": ["list of specific error descriptions"],\n  "failed_files": ["list of files with issues"],\n  "is_architectural": true/false,\n  "failure_type": "code" or "architectural" or null (if pass),\n  "recommendation": "what to fix or why it should escalate"\n}\n\nFAILURE CLASSIFICATION (critical for correct routing):\n\nSet failure_type to "code" (status: "fail") when:\n- Implementation has bugs, syntax errors, or type errors\n- Tests fail due to logic errors in the code\n- Code does not follow the Blueprint\'s constraints\n- Missing error handling or edge cases\nAction: Lead Dev will retry with the same Blueprint.\n\nSet failure_type to "architectural" (status: "escalate") when:\n- Blueprint targets the WRONG files (code is in the wrong place)\n- A required dependency or import is missing from the Blueprint\n- The design approach is fundamentally flawed\n- Acceptance criteria are impossible to meet with current targets\n- The task requires files not listed in target_files\nAction: Architect will generate a completely NEW Blueprint.\n\nBe strict but fair. Only pass code that meets ALL acceptance criteria.\nDo not include any text before or after the JSON.'
+    bp_json = blueprint.model_dump_json(indent=2)
+    user_msg = f"Blueprint:\n{bp_json}\n\nGenerated Code:\n{generated_code}"
+    sandbox_result = state.get("sandbox_result")
+    if sandbox_result is not None:
+        user_msg += "\n\nSandbox Validation Results:\n"
+        user_msg += f"  Exit code: {sandbox_result.exit_code}\n"
+        if sandbox_result.tests_passed is not None:
+            user_msg += f"  Tests passed: {sandbox_result.tests_passed}\n"
+        if sandbox_result.tests_failed is not None:
+            user_msg += f"  Tests failed: {sandbox_result.tests_failed}\n"
+        if sandbox_result.errors:
+            user_msg += f"  Errors: {', '.join(sandbox_result.errors)}\n"
+        if sandbox_result.output_summary:
+            user_msg += f"  Output:\n{sandbox_result.output_summary}\n"
+        user_msg += "\nUse these real test results to inform your review. If sandbox tests passed, weigh that heavily in your verdict."
+    else:
+        user_msg += "\n\nNote: Sandbox validation was not available for this review. Evaluate the code based on the Blueprint criteria only."
+    messages = [SystemMessage(content=system_prompt), HumanMessage(content=user_msg)]
+    llm = _get_qa_llm()
+    if has_tools:
+        llm_with_tools = llm.bind_tools(tools)
+        response, tokens_used, new_tool_log = _run_async(_run_tool_loop(llm_with_tools, messages, tools, max_turns=5, tokens_used=tokens_used, trace=trace, agent_name="qa"))
+        tool_calls_log.extend(new_tool_log)
+    else:
+        response = llm.invoke(messages)
+        tokens_used += _extract_token_count(response)
+    try:
+        raw = _extract_text_content(response.content)
+        report_data = _extract_json(raw)
+        failure_report = FailureReport(**report_data)
+    except (json.JSONDecodeError, Exception) as e:
+        trace.append(f"qa: failed to parse report: {e}")
+        return {"status": WorkflowStatus.FAILED, "error_message": f"QA failed to produce valid report: {e}", "trace": trace}
+    trace.append(f"qa: verdict={failure_report.status}, passed={failure_report.tests_passed}, failed={failure_report.tests_failed}")
+    if failure_report.status == "pass":
+        status = WorkflowStatus.PASSED
+        memory_writes.append({"content": f"QA passed for {blueprint.task_id}: {failure_report.tests_passed} tests passed", "tier": "l2", "module": _infer_module(blueprint.target_files), "source_agent": "qa", "confidence": 1.0, "sandbox_origin": "locked-down", "related_files": ",".join(blueprint.target_files), "task_id": blueprint.task_id})
+    elif failure_report.is_architectural:
+        status = WorkflowStatus.ESCALATED
+        memory_writes.append({"content": f"Architectural issue in {blueprint.task_id}: {failure_report.recommendation}", "tier": "l0-discovered", "module": _infer_module(blueprint.target_files), "source_agent": "qa", "confidence": 0.85, "sandbox_origin": "locked-down", "related_files": ",".join(failure_report.failed_files), "task_id": blueprint.task_id})
+    else:
+        status = WorkflowStatus.REVIEWING
+    new_retry_count = retry_count + (1 if failure_report.status != "pass" else 0)
+    logger.info("[QA] verdict=%s, retry_count %d->%d, tokens_used=%d", failure_report.status, retry_count, new_retry_count, tokens_used)
+    return {"failure_report": failure_report, "status": status, "tokens_used": tokens_used, "retry_count": new_retry_count, "trace": trace, "memory_writes": memory_writes, "tool_calls_log": tool_calls_log}
+
+
+def _run_sandbox_tests(commands, template, parsed_files=None, timeout=120):
+    """Run TEST_SUITE validation: sequential commands in sandbox."""
+    api_key = os.getenv("E2B_API_KEY")
+    if not api_key:
+        return None
+    runner = E2BRunner(api_key=api_key, default_timeout=timeout)
+    project_files = None
+    if parsed_files:
+        project_files = {pf["path"]: pf["content"] for pf in parsed_files}
+    return runner.run_tests(commands=commands, project_files=project_files, timeout=timeout, template=template)
+
+
+def _run_sandbox_script(script_file, template, parsed_files=None, timeout=30):
+    """Run SCRIPT_EXEC validation: execute a single script."""
+    api_key = os.getenv("E2B_API_KEY")
+    if not api_key:
+        return None
+    runner = E2BRunner(api_key=api_key, default_timeout=timeout)
+    project_files = None
+    if parsed_files:
+        project_files = {pf["path"]: pf["content"] for pf in parsed_files}
+    return runner.run_script(script_file=script_file, project_files=project_files, timeout=timeout, template=template)
+
+
+def sandbox_validate_node(state: GraphState) -> dict:
+    trace = list(state.get("trace", []))
+    trace.append("sandbox_validate: starting")
+    blueprint = state.get("blueprint")
+    if not blueprint:
+        trace.append("sandbox_validate: no blueprint -- skipping")
+        return {"sandbox_result": None, "trace": trace}
+    plan = get_validation_plan(blueprint.target_files)
+    trace.append(f"sandbox_validate: {plan.description}")
+
+    if plan.strategy == ValidationStrategy.SKIP:
+        trace.append("sandbox_validate: no code validation needed -- skipping")
+        return {"sandbox_result": SandboxResult(exit_code=0, validation_skipped=True, output_summary="Validation skipped: non-code files"), "trace": trace}
+
+    parsed_files = state.get("parsed_files", [])
+    if parsed_files:
+        trace.append(f"sandbox_validate: loading {len(parsed_files)} files into sandbox")
+
+    template_label = plan.template or "default"
+    trace.append(f"sandbox_validate: strategy={plan.strategy.value}, template={template_label}")
+
+    try:
+        result = None
+        if plan.strategy == ValidationStrategy.SCRIPT_EXEC:
+            trace.append(f"sandbox_validate: executing script {plan.script_file}")
+            result = _run_sandbox_script(
+                script_file=plan.script_file,
+                template=plan.template,
+                parsed_files=parsed_files if parsed_files else None,
+            )
+        elif plan.strategy == ValidationStrategy.TEST_SUITE:
+            trace.append(f"sandbox_validate: running {len(plan.commands)} commands sequentially")
+            result = _run_sandbox_tests(
+                commands=plan.commands,
+                template=plan.template,
+                parsed_files=parsed_files if parsed_files else None,
+            )
+        else:
+            trace.append(f"sandbox_validate: unhandled strategy {plan.strategy.value} -- skipping")
+            return {"sandbox_result": SandboxResult(exit_code=0, validation_skipped=True), "trace": trace}
+
+        if result is None:
+            logger.warning("[SANDBOX] E2B_API_KEY not configured -- sandbox validation SKIPPED.")
+            trace.append("sandbox_validate: WARNING -- E2B_API_KEY not configured, sandbox validation skipped")
+            return {"sandbox_result": SandboxResult(exit_code=0, validation_skipped=True, output_summary="E2B_API_KEY not configured"), "trace": trace}
+
+        if result.warnings:
+            for w in result.warnings:
+                trace.append(f"sandbox_validate: WARNING -- {w}")
+
+        trace.append(f"sandbox_validate: exit_code={result.exit_code}, passed={result.tests_passed}, failed={result.tests_failed}")
+        logger.info("[SANDBOX] Validation complete: exit=%d, passed=%s, failed=%s", result.exit_code, result.tests_passed, result.tests_failed)
+        return {"sandbox_result": result, "trace": trace}
+    except Exception as e:
+        logger.warning("[SANDBOX] Validation failed with error: %s", e)
+        trace.append(f"sandbox_validate: error -- {type(e).__name__}: {e}")
+        return {"sandbox_result": None, "trace": trace}
+
+
+def flush_memory_node(state: GraphState) -> dict:
+    trace = list(state.get("trace", []))
+    trace.append("flush_memory: starting")
+    memory_writes = state.get("memory_writes", [])
+    if not memory_writes:
+        trace.append("flush_memory: no writes to flush")
+        return {"trace": trace}
+    consolidated = summarize_writes_sync(memory_writes)
+    trace.append(f"flush_memory: summarizer {len(memory_writes)} -> {len(consolidated)} entries")
+    written_entries: list[dict] = []
+    try:
+        store = _get_memory_store()
+        for entry in consolidated:
+            tier = entry.get("tier", "l1")
+            content = entry.get("content", "")
+            module = entry.get("module", "global")
+            source_agent = entry.get("source_agent", "unknown")
+            confidence = entry.get("confidence", 1.0)
+            sandbox_origin = entry.get("sandbox_origin", "none")
+            related_files = entry.get("related_files", "")
+            task_id = entry.get("task_id", "")
+            if tier == "l0-discovered":
+                store.add_l0_discovered(content, module=module, source_agent=source_agent, confidence=confidence, sandbox_origin=sandbox_origin, related_files=related_files, task_id=task_id)
+            elif tier == "l2":
+                store.add_l2(content, module=module, source_agent=source_agent, related_files=related_files, task_id=task_id)
+            else:
+                store.add_l1(content, module=module, source_agent=source_agent, confidence=confidence, sandbox_origin=sandbox_origin, related_files=related_files, task_id=task_id)
+            written_entries.append(entry)
+        trace.append(f"flush_memory: wrote {len(written_entries)} entries to store")
+        logger.info("[FLUSH] Wrote %d memory entries", len(written_entries))
+    except Exception as e:
+        trace.append(f"flush_memory: store write failed: {e}")
+        logger.warning("[FLUSH] Memory store write failed: %s", e)
+    return {"trace": trace, "memory_writes_flushed": written_entries}
+
+
+def route_after_qa(state: GraphState) -> Literal["flush_memory", "developer", "architect", "__end__"]:
+    status = state.get("status", WorkflowStatus.FAILED)
+    retry_count = state.get("retry_count", 0)
+    tokens_used = state.get("tokens_used", 0)
+    logger.info("[ROUTER] status=%s, retry_count=%d, tokens_used=%d, max_retries=%d, token_budget=%d", status, retry_count, tokens_used, MAX_RETRIES, TOKEN_BUDGET)
+    if status == WorkflowStatus.PASSED:
+        return "flush_memory"
+    if status == WorkflowStatus.FAILED:
+        return END
+    if retry_count >= MAX_RETRIES:
+        return "flush_memory"
+    if tokens_used >= TOKEN_BUDGET:
+        return "flush_memory"
+    if status == WorkflowStatus.ESCALATED:
+        return "architect"
+    return "developer"
+
+
+def build_graph() -> StateGraph:
+    graph = StateGraph(GraphState)
+    graph.add_node("architect", architect_node)
+    graph.add_node("developer", developer_node)
+    graph.add_node("apply_code", apply_code_node)
+    graph.add_node("sandbox_validate", sandbox_validate_node)
+    graph.add_node("qa", qa_node)
+    graph.add_node("flush_memory", flush_memory_node)
+    graph.add_edge(START, "architect")
+    graph.add_edge("architect", "developer")
+    graph.add_edge("developer", "apply_code")
+    graph.add_edge("apply_code", "sandbox_validate")
+    graph.add_edge("sandbox_validate", "qa")
+    graph.add_conditional_edges("qa", route_after_qa)
+    graph.add_edge("flush_memory", END)
+    return graph
+
+
+def create_workflow():
+    return build_graph().compile()
+
+
+def init_tools_config(workspace_root=None):
+    if workspace_root is None:
+        workspace_root = _get_workspace_root()
+    try:
+        from .tools import create_provider, get_tools, load_mcp_config
+        config_path = Path(workspace_root) / "mcp-config.json"
+        if not config_path.is_file():
+            logger.info("[TOOLS] No mcp-config.json found at %s, tools disabled", config_path)
+            return {"configurable": {"tools": []}}
+        mcp_config = load_mcp_config(str(config_path))
+        provider = create_provider(mcp_config, workspace_root)
+        tools = get_tools(provider)
+        logger.info("[TOOLS] Loaded %d tools from provider", len(tools))
+        return {"configurable": {"tools": tools}}
+    except Exception as e:
+        logger.warning("[TOOLS] Failed to initialize tools: %s", e, exc_info=True)
+        return {"configurable": {"tools": []}}
+
+
+def run_task(task_description, enable_tracing=True, session_id=None, tags=None):
+    trace_config = create_trace_config(enabled=enable_tracing, task_description=task_description, session_id=session_id, tags=tags or ["orchestrator"], metadata={"max_retries": str(MAX_RETRIES), "token_budget": str(TOKEN_BUDGET)})
+    workflow = create_workflow()
+    tools_config = init_tools_config()
+    initial_state: GraphState = {"task_description": task_description, "blueprint": None, "generated_code": "", "failure_report": None, "status": WorkflowStatus.PLANNING, "retry_count": 0, "tokens_used": 0, "error_message": "", "memory_context": [], "memory_writes": [], "trace": [], "sandbox_result": None, "parsed_files": [], "tool_calls_log": []}
+    invoke_config = {"recursion_limit": 25, **tools_config}
+    if trace_config.callbacks:
+        invoke_config["callbacks"] = trace_config.callbacks
+    with trace_config.propagation_context():
+        add_trace_event(trace_config, "orchestrator_start", metadata={"task_preview": task_description[:200], "max_retries": MAX_RETRIES, "token_budget": TOKEN_BUDGET, "tools_available": len(tools_config.get("configurable", {}).get("tools", []))})
+        result = workflow.invoke(initial_state, config=invoke_config)
+        final_state = AgentState(**result)
+        add_trace_event(trace_config, "orchestrator_complete", metadata={"status": final_state.status.value, "tokens_used": final_state.tokens_used, "retry_count": final_state.retry_count, "memory_writes_count": len(final_state.memory_writes), "tool_calls_count": len(final_state.tool_calls_log)})
+    trace_config.flush()
+    return final_state
\ No newline at end of file

From 673c1cfa65549601449bbd65f59ff94eb95e5bfa Mon Sep 17 00:00:00 2001
From: Mike Abernathy <mabernathy87@gmail.com>
Date: Thu, 2 Apr 2026 13:02:14 -0600
Subject: [PATCH 06/12] fix: address all 6 CodeRabbit review items (#95)

CR #1 (Critical): Aggregate child exit codes from __RESULTS__ trailer
  - run_tests() now computes aggregate_exit_code from per-command rc values
  - Non-zero child exits properly propagate instead of being masked by wrapper

CR #2 (Critical): Scale outer timeout for sequential commands
  - overall_timeout = max(len(commands), 1) * timeout + 30

CR #3 (Major): Will be fixed in orchestrator.py (pushed locally)
  - SKIP/missing-key paths revert to returning None

CR #4 (Major): Fix ruff guard to preserve exit code
  - Replaced A && B || C with if/then/else construct
  - command -v ruff instead of which ruff

CR #5 (Major): Drop hardcoded tests/ from pytest
  - python -m pytest -v --tb=short (auto-discover)

CR #6 (Major): Boundary-based test file detection
  - _is_test_file() matches on basename boundaries
  - Avoids false positives like latest_utils.py, contest_helper.py
  - Added 6 regression tests for boundary matching
---
 dev-suite/src/sandbox/validation_commands.py | 55 +++++++++++++++-----
 dev-suite/tests/test_validation_commands.py  | 32 ++++++++++++
 2 files changed, 73 insertions(+), 14 deletions(-)

diff --git a/dev-suite/src/sandbox/validation_commands.py b/dev-suite/src/sandbox/validation_commands.py
index 0a3f7f1..fc5ab9c 100644
--- a/dev-suite/src/sandbox/validation_commands.py
+++ b/dev-suite/src/sandbox/validation_commands.py
@@ -26,9 +26,6 @@
 FRONTEND_EXTENSIONS = {".svelte", ".ts", ".tsx", ".js", ".jsx", ".css", ".vue"}
 PYTHON_EXTENSIONS = {".py", ".pyi"}
 
-# Filenames/patterns that indicate a test suite exists
-TEST_INDICATORS = {"tests/", "test_", "_test.py", "tests.py", ".test.", ".spec."}
-
 
 class ValidationStrategy(str, Enum):
     """How to validate the generated code in the sandbox.
@@ -68,12 +65,14 @@ class ValidationPlan:
 
 # -- Command Sets --
 
+# CR fix: if/then/else preserves ruff exit code; A && B || C masks failures
 PYTHON_LINT_COMMANDS = [
-    "which ruff >/dev/null 2>&1 && ruff check . --select=E,F,W --no-fix || echo '[WARN] ruff not available -- lint skipped'",
+    "if command -v ruff >/dev/null 2>&1; then ruff check . --select=E,F,W --no-fix; else echo '[WARN] ruff not available -- lint skipped'; fi",
 ]
 
+# CR fix: drop hardcoded tests/ -- let pytest auto-discover
 PYTHON_TEST_COMMANDS = [
-    "python -m pytest tests/ -v --tb=short",
+    "python -m pytest -v --tb=short",
 ]
 
 PYTHON_COMMANDS = PYTHON_LINT_COMMANDS + PYTHON_TEST_COMMANDS
@@ -87,14 +86,44 @@ class ValidationPlan:
 FULLSTACK_COMMANDS = FRONTEND_COMMANDS + PYTHON_COMMANDS
 
 
+# CR fix: boundary-based test detection instead of substring matching
+def _is_test_file(filepath: str) -> bool:
+    """Check if a single file looks like a test file using boundary matching.
+
+    Matches on basename boundaries to avoid false positives like
+    'latest_utils.py' or 'contest_helper.py'.
+    """
+    lower = filepath.lower().replace("\\", "/")
+    parts = lower.split("/")
+    basename = parts[-1] if parts else lower
+
+    # Directory component is "tests" (exact match)
+    if "tests" in parts[:-1]:
+        return True
+
+    # Basename starts with "test_"
+    if basename.startswith("test_"):
+        return True
+
+    # Basename ends with "_test.py" or "_test.pyi"
+    name_no_ext = os.path.splitext(basename)[0]
+    if name_no_ext.endswith("_test"):
+        return True
+
+    # Exact matches
+    if basename in ("tests.py", "conftest.py"):
+        return True
+
+    # JS/TS test patterns: .test.js, .test.ts, .spec.js, .spec.ts
+    if ".test." in basename or ".spec." in basename:
+        return True
+
+    return False
+
+
 def _has_test_files(target_files: list[str]) -> bool:
     """Check if any target files look like test files or test directories."""
-    for filepath in target_files:
-        lower = filepath.lower()
-        for indicator in TEST_INDICATORS:
-            if indicator in lower:
-                return True
-    return False
+    return any(_is_test_file(f) for f in target_files)
 
 
 def _get_primary_script(target_files: list[str]) -> str | None:
@@ -112,9 +141,7 @@ def _get_primary_script(target_files: list[str]) -> str | None:
 
     # Prefer non-test files
     for f in py_files:
-        lower = f.lower()
-        is_test = any(ind in lower for ind in TEST_INDICATORS)
-        if not is_test:
+        if not _is_test_file(f):
             return f
 
     # Fallback to first .py file
diff --git a/dev-suite/tests/test_validation_commands.py b/dev-suite/tests/test_validation_commands.py
index 0e19384..1a1183b 100644
--- a/dev-suite/tests/test_validation_commands.py
+++ b/dev-suite/tests/test_validation_commands.py
@@ -191,6 +191,38 @@ def test_script_file_single_file(self):
         plan = get_validation_plan(["utils.py"])
         assert plan.script_file == "utils.py"
 
+    # -- Boundary matching: false positive avoidance (CR fix) --
+
+    def test_latest_utils_not_detected_as_test(self):
+        """'latest_utils.py' contains 'test_' as substring but is not a test file."""
+        plan = get_validation_plan(["src/latest_utils.py"])
+        assert plan.strategy == ValidationStrategy.SCRIPT_EXEC
+
+    def test_contest_helper_not_detected_as_test(self):
+        """'contest_helper.py' contains 'test' as substring but is not a test file."""
+        plan = get_validation_plan(["src/contest_helper.py"])
+        assert plan.strategy == ValidationStrategy.SCRIPT_EXEC
+
+    def test_attestation_not_detected_as_test(self):
+        """'attestation.py' contains 'test' but is not a test file."""
+        plan = get_validation_plan(["src/attestation.py"])
+        assert plan.strategy == ValidationStrategy.SCRIPT_EXEC
+
+    def test_real_test_prefix_still_detected(self):
+        """'test_utils.py' should still be detected as a test file."""
+        plan = get_validation_plan(["src/main.py", "test_utils.py"])
+        assert plan.strategy == ValidationStrategy.TEST_SUITE
+
+    def test_real_test_suffix_still_detected(self):
+        """'auth_test.py' should still be detected as a test file."""
+        plan = get_validation_plan(["src/auth.py", "auth_test.py"])
+        assert plan.strategy == ValidationStrategy.TEST_SUITE
+
+    def test_tests_directory_still_detected(self):
+        """Files under tests/ directory should still be detected."""
+        plan = get_validation_plan(["src/main.py", "tests/test_main.py"])
+        assert plan.strategy == ValidationStrategy.TEST_SUITE
+
 
 class TestValidationPlan:
     """Tests for the ValidationPlan dataclass."""

From b06fb92e438eab02b69e72c3d71c672c5bbbed43 Mon Sep 17 00:00:00 2001
From: Mike Abernathy <mabernathy87@gmail.com>
Date: Thu, 2 Apr 2026 13:09:31 -0600
Subject: [PATCH 07/12] fix: revert SKIP/missing-key paths to return None for
 caller compat (CR #3)

---
 dev-suite/src/orchestrator.py            | 6 +++---
 dev-suite/tests/test_sandbox_validate.py | 6 ++----
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/dev-suite/src/orchestrator.py b/dev-suite/src/orchestrator.py
index 65bd785..c2b7ec9 100644
--- a/dev-suite/src/orchestrator.py
+++ b/dev-suite/src/orchestrator.py
@@ -598,7 +598,7 @@ def sandbox_validate_node(state: GraphState) -> dict:
 
     if plan.strategy == ValidationStrategy.SKIP:
         trace.append("sandbox_validate: no code validation needed -- skipping")
-        return {"sandbox_result": SandboxResult(exit_code=0, validation_skipped=True, output_summary="Validation skipped: non-code files"), "trace": trace}
+        return {"sandbox_result": None, "trace": trace}
 
     parsed_files = state.get("parsed_files", [])
     if parsed_files:
@@ -625,12 +625,12 @@ def sandbox_validate_node(state: GraphState) -> dict:
             )
         else:
             trace.append(f"sandbox_validate: unhandled strategy {plan.strategy.value} -- skipping")
-            return {"sandbox_result": SandboxResult(exit_code=0, validation_skipped=True), "trace": trace}
+            return {"sandbox_result": None, "trace": trace}
 
         if result is None:
             logger.warning("[SANDBOX] E2B_API_KEY not configured -- sandbox validation SKIPPED.")
             trace.append("sandbox_validate: WARNING -- E2B_API_KEY not configured, sandbox validation skipped")
-            return {"sandbox_result": SandboxResult(exit_code=0, validation_skipped=True, output_summary="E2B_API_KEY not configured"), "trace": trace}
+            return {"sandbox_result": None, "trace": trace}
 
         if result.warnings:
             for w in result.warnings:
diff --git a/dev-suite/tests/test_sandbox_validate.py b/dev-suite/tests/test_sandbox_validate.py
index 71bd600..a3f8be0 100644
--- a/dev-suite/tests/test_sandbox_validate.py
+++ b/dev-suite/tests/test_sandbox_validate.py
@@ -118,8 +118,7 @@ def test_skip_for_non_code_files(self):
 
         result = sandbox_validate_node(state)
 
-        assert result["sandbox_result"] is not None
-        assert result["sandbox_result"].validation_skipped is True
+        assert result["sandbox_result"] is None
         assert result["sandbox_result"].exit_code == 0
 
 
@@ -186,8 +185,7 @@ def test_graceful_skip_no_api_key(self):
             mock_run.return_value = None
             result = sandbox_validate_node(state)
 
-        assert result["sandbox_result"] is not None
-        assert result["sandbox_result"].validation_skipped is True
+        assert result["sandbox_result"] is None
 
     def test_no_blueprint_skips(self):
         """Should skip validation when there's no blueprint."""

From adefdc0e7e10ba69f7fe798f9eb1ec82344e8994 Mon Sep 17 00:00:00 2001
From: Mike Abernathy <mabernathy87@gmail.com>
Date: Thu, 2 Apr 2026 13:15:34 -0600
Subject: [PATCH 08/12] fix: remove stale exit_code assertion on None
 sandbox_result in skip test

---
 dev-suite/tests/test_sandbox_validate.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/dev-suite/tests/test_sandbox_validate.py b/dev-suite/tests/test_sandbox_validate.py
index a3f8be0..34086c4 100644
--- a/dev-suite/tests/test_sandbox_validate.py
+++ b/dev-suite/tests/test_sandbox_validate.py
@@ -5,7 +5,6 @@
   - Strategy-based dispatch (TEST_SUITE, SCRIPT_EXEC, SKIP)
   - Template selection from blueprint target_files
   - Graceful skip when E2B_API_KEY is missing
-  - validation_skipped flag propagation
   - SandboxResult stored in graph state
   - Warnings surfaced in trace
 
@@ -113,13 +112,13 @@ def test_test_suite_for_frontend_files(self):
         assert call_kwargs["template"] == "fullstack"
 
     def test_skip_for_non_code_files(self):
-        """Non-code files should get SKIP strategy with validation_skipped=True."""
+        """Non-code files should return None (no sandbox run)."""
         state = _make_state(["schema.sql", "config.yaml"])
 
         result = sandbox_validate_node(state)
 
         assert result["sandbox_result"] is None
-        assert result["sandbox_result"].exit_code == 0
+        assert any("skip" in t.lower() for t in result["trace"])
 
 
 class TestSandboxValidateNode:
@@ -178,7 +177,7 @@ def test_mixed_task_uses_fullstack_template(self):
         assert call_kwargs["template"] == "fullstack"
 
     def test_graceful_skip_no_api_key(self):
-        """Should return validation_skipped when E2B_API_KEY is missing."""
+        """Should return None when E2B_API_KEY is missing."""
         state = _make_state(["src/main.py", "tests/test_main.py"])
 
         with patch("src.orchestrator._run_sandbox_tests") as mock_run:

From a322ef10a2c3b9ca66ad8fe203c977f7a7a20603 Mon Sep 17 00:00:00 2001
From: Mike Abernathy <mabernathy87@gmail.com>
Date: Thu, 2 Apr 2026 13:43:03 -0600
Subject: [PATCH 09/12] =?UTF-8?q?fix:=20address=20CR=20review=20round=203?=
 =?UTF-8?q?=20=E2=80=94=20aggregate=20exit=20codes,=20scaled=20timeout,=20?=
 =?UTF-8?q?.pyi=20exclusion?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CR fixes addressed:
- e2b_runner.py: aggregate exit_code from __RESULTS__ trailer instead of
  wrapper process exit code (Critical — child cmd failures were silent)
- e2b_runner.py: scale outer sandbox timeout to num_commands * timeout + 30
  (Critical — sequential commands could be killed prematurely)
- validation_commands.py: exclude .pyi stubs from SCRIPT_EXEC selection via
  EXECUTABLE_PYTHON_EXTENSIONS; .pyi-only files route to LINT_ONLY
- test_validation_commands.py: update .pyi test for LINT_ONLY, rename
  misleading test, add script_file selection coverage, add .pyi+.py coexist test
---
 dev-suite/src/sandbox/e2b_runner.py          | 20 +++++++++--
 dev-suite/src/sandbox/validation_commands.py | 36 ++++++++++++++++----
 dev-suite/tests/test_validation_commands.py  | 27 ++++++++++++---
 3 files changed, 69 insertions(+), 14 deletions(-)

diff --git a/dev-suite/src/sandbox/e2b_runner.py b/dev-suite/src/sandbox/e2b_runner.py
index 24601e6..8f6f85a 100644
--- a/dev-suite/src/sandbox/e2b_runner.py
+++ b/dev-suite/src/sandbox/e2b_runner.py
@@ -367,16 +367,26 @@ def run_tests(
         setup_code = "\n".join(setup_lines) + "\n" if setup_lines else ""
         full_code = setup_code + run_code
 
+        # CR fix: scale outer timeout to account for sequential commands
+        num_commands = max(len(commands), 1)
+        overall_timeout = timeout * num_commands + 30
+
         result = self.run(
             full_code,
             profile=SandboxProfile.LOCKED,
             env_vars=env_vars,
-            timeout=timeout + 30,
+            timeout=overall_timeout,
             template=template,
         )
 
         warnings = list(result.warnings)
         summary = result.output_summary
+
+        # CR fix: aggregate exit_code from __RESULTS__ trailer instead of
+        # using the wrapper process exit code. A child command failure (e.g.,
+        # ruff finding lint errors) must propagate even if the wrapper exits 0.
+        aggregate_exit_code = result.exit_code
+
         if "[WARN]" in summary:
             for line in summary.split("\n"):
                 if "[WARN]" in line:
@@ -386,15 +396,19 @@ def run_tests(
             try:
                 json_part = summary.split("__RESULTS__")[-1].strip()
                 cmd_results = _json.loads(json_part)
+                # Aggregate: first non-zero rc, or 0 if all passed
                 for cr in cmd_results:
-                    if cr.get("rc") == 127:
+                    rc = cr.get("rc", 0)
+                    if rc == 127:
                         warnings.append(f"Command not found: {cr['cmd'][:60]}")
+                    elif rc != 0 and aggregate_exit_code == 0:
+                        aggregate_exit_code = rc
                 summary = summary.split("__RESULTS__")[0].strip()
             except (ValueError, IndexError):
                 pass
 
         return SandboxResult(
-            exit_code=result.exit_code,
+            exit_code=aggregate_exit_code,
             tests_passed=result.tests_passed,
             tests_failed=result.tests_failed,
             errors=result.errors,
diff --git a/dev-suite/src/sandbox/validation_commands.py b/dev-suite/src/sandbox/validation_commands.py
index fc5ab9c..013b502 100644
--- a/dev-suite/src/sandbox/validation_commands.py
+++ b/dev-suite/src/sandbox/validation_commands.py
@@ -26,6 +26,11 @@
 FRONTEND_EXTENSIONS = {".svelte", ".ts", ".tsx", ".js", ".jsx", ".css", ".vue"}
 PYTHON_EXTENSIONS = {".py", ".pyi"}
 
+# CR fix: .pyi stubs are type annotations only -- not executable.
+# Use this for _get_primary_script() selection so stubs are never
+# chosen as the script to run via SCRIPT_EXEC.
+EXECUTABLE_PYTHON_EXTENSIONS = {".py"}
+
 
 class ValidationStrategy(str, Enum):
     """How to validate the generated code in the sandbox.
@@ -34,8 +39,8 @@ class ValidationStrategy(str, Enum):
                  files are present in target_files.
     SCRIPT_EXEC: Run the primary script and check exit code + stdout.
                  Used for single-file scripts with no test suite.
-    LINT_ONLY:   Syntax/lint check only, no execution. Reserved for
-                 future use (config generators, migrations, etc.).
+    LINT_ONLY:   Syntax/lint check only, no execution. Used for
+                 non-executable code (e.g., .pyi stubs, config generators).
     SKIP:        No validation needed (non-code files).
     """
     TEST_SUITE = "test_suite"
@@ -51,7 +56,7 @@ class ValidationPlan:
     Attributes:
         strategy: The validation approach to use.
         template: Sandbox template name ("fullstack" or None for default).
-        commands: Shell commands to run in the sandbox (for TEST_SUITE).
+        commands: Shell commands to run in the sandbox (for TEST_SUITE/LINT_ONLY).
         script_file: Primary file to execute (for SCRIPT_EXEC).
         description: Human-readable summary for logging/QA context.
     """
@@ -129,12 +134,15 @@ def _has_test_files(target_files: list[str]) -> bool:
 def _get_primary_script(target_files: list[str]) -> str | None:
     """Find the primary executable script from target files.
 
-    Returns the first .py file that doesn't look like a test file,
-    or the first .py file if all look like tests, or None.
+    Returns the first .py file (not .pyi stubs) that doesn't look like
+    a test file, or the first .py file if all look like tests, or None.
+
+    CR fix: .pyi stubs are excluded -- they are type annotations only
+    and cannot be meaningfully executed.
     """
     py_files = [
         f for f in target_files
-        if os.path.splitext(f.lower())[1] in PYTHON_EXTENSIONS
+        if os.path.splitext(f.lower())[1] in EXECUTABLE_PYTHON_EXTENSIONS
     ]
     if not py_files:
         return None
@@ -156,7 +164,9 @@ def get_validation_plan(target_files: list[str]) -> ValidationPlan:
       - Non-code files only -> SKIP
       - Frontend files -> TEST_SUITE (pnpm check + tsc)
       - Python files with test files -> TEST_SUITE (ruff + pytest)
-      - Python files without test files -> SCRIPT_EXEC (run the script)
+      - Python files without test files:
+          - Has executable .py files -> SCRIPT_EXEC (run the script)
+          - Only .pyi stubs -> LINT_ONLY (ruff check only)
       - Mixed frontend + Python -> TEST_SUITE (fullstack)
 
     Args:
@@ -224,6 +234,18 @@ def get_validation_plan(target_files: list[str]) -> ValidationPlan:
             )
         else:
             primary = _get_primary_script(target_files)
+            # CR fix: if no executable .py files found (e.g., .pyi stubs
+            # only), fall back to lint-only instead of trying to execute
+            if primary is None:
+                return ValidationPlan(
+                    strategy=ValidationStrategy.LINT_ONLY,
+                    template=template,
+                    commands=list(PYTHON_LINT_COMMANDS),
+                    description=(
+                        f"Lint-only validation: {len(target_files)} files "
+                        f"using default template (no executable .py files)"
+                    ),
+                )
             return ValidationPlan(
                 strategy=ValidationStrategy.SCRIPT_EXEC,
                 template=template,
diff --git a/dev-suite/tests/test_validation_commands.py b/dev-suite/tests/test_validation_commands.py
index 1a1183b..61bf3a0 100644
--- a/dev-suite/tests/test_validation_commands.py
+++ b/dev-suite/tests/test_validation_commands.py
@@ -11,6 +11,7 @@
     FRONTEND_COMMANDS,
     FULLSTACK_COMMANDS,
     PYTHON_COMMANDS,
+    PYTHON_LINT_COMMANDS,
     ValidationPlan,
     ValidationStrategy,
     format_validation_summary,
@@ -63,10 +64,21 @@ def test_multiple_python_no_tests(self):
         assert plan.strategy == ValidationStrategy.SCRIPT_EXEC
         assert plan.script_file == "src/main.py"
 
-    def test_pyi_only_no_tests(self):
+    # -- .pyi stub files (LINT_ONLY -- not executable) --
+
+    def test_pyi_only_gets_lint_only(self):
+        """CR fix: .pyi stubs are not executable -- route to LINT_ONLY."""
         plan = get_validation_plan(["src/types.pyi"])
+        assert plan.strategy == ValidationStrategy.LINT_ONLY
+        assert plan.script_file is None
+        assert plan.commands == PYTHON_LINT_COMMANDS
+        assert "Lint-only" in plan.description
+
+    def test_pyi_with_py_gets_script_exec(self):
+        """When .pyi and .py coexist without tests, .py is the script."""
+        plan = get_validation_plan(["src/types.pyi", "src/main.py"])
         assert plan.strategy == ValidationStrategy.SCRIPT_EXEC
-        assert plan.script_file == "src/types.pyi"
+        assert plan.script_file == "src/main.py"
 
     # -- Frontend files (TEST_SUITE) --
 
@@ -181,12 +193,19 @@ def test_mixed_frontend_and_non_code(self):
 
     # -- Script file selection --
 
-    def test_script_file_prefers_non_test(self):
-        """_get_primary_script should prefer non-test files."""
+    def test_test_file_presence_triggers_test_suite(self):
+        """CR fix: renamed -- presence of test file triggers TEST_SUITE, not SCRIPT_EXEC."""
         files = ["src/main.py", "test_main.py"]
         plan = get_validation_plan(files)
         assert plan.strategy == ValidationStrategy.TEST_SUITE
 
+    def test_script_file_selects_first_py_file(self):
+        """CR fix: when multiple non-test .py files exist, script_file is the first one."""
+        files = ["src/secondary.py", "src/main.py"]
+        plan = get_validation_plan(files)
+        assert plan.strategy == ValidationStrategy.SCRIPT_EXEC
+        assert plan.script_file == "src/secondary.py"
+
     def test_script_file_single_file(self):
         plan = get_validation_plan(["utils.py"])
         assert plan.script_file == "utils.py"

From b8a300acdc7da3e81826fbbac4d64f1b11b8b017 Mon Sep 17 00:00:00 2001
From: Mike Abernathy <mabernathy87@gmail.com>
Date: Thu, 2 Apr 2026 13:46:56 -0600
Subject: [PATCH 10/12] =?UTF-8?q?fix:=20sandbox=5Fvalidate=5Fnode=20?=
 =?UTF-8?q?=E2=80=94=20empty=20parsed=5Ffiles=20guard=20+=20LINT=5FONLY=20?=
 =?UTF-8?q?dispatch?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CR fixes addressed:
- orchestrator.py: guard against empty parsed_files before dispatching
  SCRIPT_EXEC/TEST_SUITE/LINT_ONLY to sandbox (Major — spurious errors)
- orchestrator.py: fold LINT_ONLY into TEST_SUITE branch so lint-only
  plans execute their commands instead of falling through to skip (Minor)
---
 dev-suite/src/orchestrator.py | 27 ++++++++++++++++++++++-----
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/dev-suite/src/orchestrator.py b/dev-suite/src/orchestrator.py
index c2b7ec9..201f7f4 100644
--- a/dev-suite/src/orchestrator.py
+++ b/dev-suite/src/orchestrator.py
@@ -601,8 +601,20 @@ def sandbox_validate_node(state: GraphState) -> dict:
         return {"sandbox_result": None, "trace": trace}
 
     parsed_files = state.get("parsed_files", [])
-    if parsed_files:
-        trace.append(f"sandbox_validate: loading {len(parsed_files)} files into sandbox")
+
+    # CR fix: guard against empty parsed_files before dispatching to sandbox.
+    # apply_code_node can leave parsed_files=[] on parse/path-validation failures.
+    # Running SCRIPT_EXEC/TEST_SUITE/LINT_ONLY with no files causes spurious errors.
+    if plan.strategy in {
+        ValidationStrategy.SCRIPT_EXEC,
+        ValidationStrategy.TEST_SUITE,
+        ValidationStrategy.LINT_ONLY,
+    }:
+        if not parsed_files:
+            trace.append("sandbox_validate: no parsed files available -- skipping")
+            return {"sandbox_result": None, "trace": trace}
+
+    trace.append(f"sandbox_validate: loading {len(parsed_files)} files into sandbox")
 
     template_label = plan.template or "default"
     trace.append(f"sandbox_validate: strategy={plan.strategy.value}, template={template_label}")
@@ -616,8 +628,13 @@ def sandbox_validate_node(state: GraphState) -> dict:
                 template=plan.template,
                 parsed_files=parsed_files if parsed_files else None,
             )
-        elif plan.strategy == ValidationStrategy.TEST_SUITE:
-            trace.append(f"sandbox_validate: running {len(plan.commands)} commands sequentially")
+        # CR fix: fold LINT_ONLY into TEST_SUITE -- both run commands
+        # sequentially via _run_sandbox_tests(). LINT_ONLY was falling
+        # through to the skip branch instead of executing plan.commands.
+        elif plan.strategy in {ValidationStrategy.TEST_SUITE, ValidationStrategy.LINT_ONLY}:
+            trace.append(
+                f"sandbox_validate: running {len(plan.commands)} command(s) sequentially"
+            )
             result = _run_sandbox_tests(
                 commands=plan.commands,
                 template=plan.template,
@@ -754,4 +771,4 @@ def run_task(task_description, enable_tracing=True, session_id=None, tags=None):
         final_state = AgentState(**result)
         add_trace_event(trace_config, "orchestrator_complete", metadata={"status": final_state.status.value, "tokens_used": final_state.tokens_used, "retry_count": final_state.retry_count, "memory_writes_count": len(final_state.memory_writes), "tool_calls_count": len(final_state.tool_calls_log)})
     trace_config.flush()
-    return final_state
\ No newline at end of file
+    return final_state

From b56164b01de1ad27a6d8fdb57930f1a2e16918b2 Mon Sep 17 00:00:00 2001
From: Mike Abernathy <mabernathy87@gmail.com>
Date: Thu, 2 Apr 2026 13:54:12 -0600
Subject: [PATCH 11/12] fix: update test_sandbox_validate for empty
 parsed_files guard

- _make_state now populates parsed_files from target_files by default
  so sandbox dispatch reaches mocked runners
- Tests needing empty parsed_files pass it explicitly
- Added test_empty_parsed_files_skips_sandbox (CR fix #4 coverage)
- Added test_lint_only_for_pyi_stubs (CR fix #5 coverage)
---
 dev-suite/tests/test_sandbox_validate.py | 48 ++++++++++++++++++++++--
 1 file changed, 45 insertions(+), 3 deletions(-)

diff --git a/dev-suite/tests/test_sandbox_validate.py b/dev-suite/tests/test_sandbox_validate.py
index 34086c4..e5241cd 100644
--- a/dev-suite/tests/test_sandbox_validate.py
+++ b/dev-suite/tests/test_sandbox_validate.py
@@ -2,11 +2,12 @@
 
 These tests validate the sandbox_validate node behavior with mocked
 E2B runner -- no real sandbox needed. Tests cover:
-  - Strategy-based dispatch (TEST_SUITE, SCRIPT_EXEC, SKIP)
+  - Strategy-based dispatch (TEST_SUITE, SCRIPT_EXEC, SKIP, LINT_ONLY)
   - Template selection from blueprint target_files
   - Graceful skip when E2B_API_KEY is missing
   - SandboxResult stored in graph state
   - Warnings surfaced in trace
+  - Empty parsed_files guard
 
 Issue #95: Updated for ValidationStrategy dispatch.
 """
@@ -25,7 +26,12 @@
 
 
 def _make_state(target_files: list[str], **overrides) -> GraphState:
-    """Helper to create a GraphState with a blueprint."""
+    """Helper to create a GraphState with a blueprint.
+
+    By default, populates parsed_files from target_files so the
+    empty-files guard doesn't skip sandbox dispatch. Tests that
+    need empty parsed_files should pass parsed_files=[] explicitly.
+    """
     bp = Blueprint(
         task_id="test-task",
         target_files=target_files,
@@ -33,6 +39,11 @@ def _make_state(target_files: list[str], **overrides) -> GraphState:
         constraints=["constraint1"],
         acceptance_criteria=["criterion1"],
     )
+    # Default: populate parsed_files with dummy content for each target file
+    default_parsed = [
+        {"path": f, "content": f"# stub content for {f}"}
+        for f in target_files
+    ]
     state: GraphState = {
         "task_description": "test task",
         "blueprint": bp,
@@ -45,7 +56,7 @@ def _make_state(target_files: list[str], **overrides) -> GraphState:
         "memory_context": [],
         "memory_writes": [],
         "trace": [],
-        "parsed_files": [],
+        "parsed_files": default_parsed,
         "tool_calls_log": [],
     }
     state.update(overrides)
@@ -120,6 +131,37 @@ def test_skip_for_non_code_files(self):
         assert result["sandbox_result"] is None
         assert any("skip" in t.lower() for t in result["trace"])
 
+    def test_empty_parsed_files_skips_sandbox(self):
+        """CR fix: empty parsed_files should skip sandbox dispatch."""
+        state = _make_state(
+            ["src/main.py", "tests/test_main.py"],
+            parsed_files=[],
+        )
+
+        result = sandbox_validate_node(state)
+
+        assert result["sandbox_result"] is None
+        assert any("no parsed files" in t.lower() for t in result["trace"])
+
+    def test_lint_only_for_pyi_stubs(self):
+        """CR fix: .pyi-only files should dispatch via LINT_ONLY -> _run_sandbox_tests."""
+        state = _make_state(["src/types.pyi"])
+
+        with patch("src.orchestrator._run_sandbox_tests") as mock_tests:
+            mock_tests.return_value = SandboxResult(
+                exit_code=0,
+                tests_passed=None,
+                tests_failed=None,
+                output_summary="ruff check passed",
+            )
+            result = sandbox_validate_node(state)
+
+        assert result["sandbox_result"] is not None
+        assert result["sandbox_result"].exit_code == 0
+        mock_tests.assert_called_once()
+        trace_text = " ".join(result.get("trace", []))
+        assert "lint_only" in trace_text.lower() or "command(s) sequentially" in trace_text.lower()
+
 
 class TestSandboxValidateNode:
     """Core sandbox_validate_node tests."""

From 101f4ca4354099d37d28ab422894b634a2c069f7 Mon Sep 17 00:00:00 2001
From: Mike Abernathy <mabernathy87@gmail.com>
Date: Thu, 2 Apr 2026 14:07:22 -0600
Subject: [PATCH 12/12] =?UTF-8?q?fix:=20CR=20round=204=20=E2=80=94=20rc=3D?=
 =?UTF-8?q?127=20must=20fail=20aggregate,=20fullstack=20without=20tests=20?=
 =?UTF-8?q?skips=20pytest?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CR fixes addressed:
- e2b_runner.py: change `elif` to `if` so rc=127 (missing tool) both
  warns AND sets aggregate_exit_code (Critical — missing pnpm/tsc passed)
- validation_commands.py: fullstack branch now checks _has_test_files()
  and uses PYTHON_LINT_COMMANDS when no tests present (Major — "no tests
  collected" failure for mixed frontend+Python without test files)
- test_validation_commands.py: existing mixed tests now include test files,
  added test_mixed_frontend_and_python_no_tests for lint-only path
---
 dev-suite/src/sandbox/e2b_runner.py          |  5 ++++-
 dev-suite/src/sandbox/validation_commands.py |  7 +++++--
 dev-suite/tests/test_validation_commands.py  | 12 +++++++++++-
 3 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/dev-suite/src/sandbox/e2b_runner.py b/dev-suite/src/sandbox/e2b_runner.py
index 8f6f85a..7ae409f 100644
--- a/dev-suite/src/sandbox/e2b_runner.py
+++ b/dev-suite/src/sandbox/e2b_runner.py
@@ -401,7 +401,10 @@ def run_tests(
                     rc = cr.get("rc", 0)
                     if rc == 127:
                         warnings.append(f"Command not found: {cr['cmd'][:60]}")
-                    elif rc != 0 and aggregate_exit_code == 0:
+                    # CR fix: use `if` not `elif` so rc=127 both warns
+                    # AND fails the aggregate. A missing tool (pnpm, tsc)
+                    # should not silently pass validation.
+                    if rc != 0 and aggregate_exit_code == 0:
                         aggregate_exit_code = rc
                 summary = summary.split("__RESULTS__")[0].strip()
             except (ValueError, IndexError):
diff --git a/dev-suite/src/sandbox/validation_commands.py b/dev-suite/src/sandbox/validation_commands.py
index 013b502..99feb63 100644
--- a/dev-suite/src/sandbox/validation_commands.py
+++ b/dev-suite/src/sandbox/validation_commands.py
@@ -198,11 +198,14 @@ def get_validation_plan(target_files: list[str]) -> ValidationPlan:
 
     # Pick strategy and commands
     if has_frontend and has_python:
-        # Full-stack: always TEST_SUITE
+        # Full-stack: frontend checks + Python (lint always, pytest only if tests exist)
+        # CR fix: unconditionally using FULLSTACK_COMMANDS reintroduces pytest
+        # even when no test files are present, causing "no tests collected" failures.
+        python_cmds = PYTHON_COMMANDS if _has_test_files(target_files) else PYTHON_LINT_COMMANDS
         return ValidationPlan(
             strategy=ValidationStrategy.TEST_SUITE,
             template=template,
-            commands=list(FULLSTACK_COMMANDS),
+            commands=list(FRONTEND_COMMANDS + python_cmds),
             description=(
                 f"Full-stack validation: {len(target_files)} files "
                 f"(frontend + Python) using fullstack template"
diff --git a/dev-suite/tests/test_validation_commands.py b/dev-suite/tests/test_validation_commands.py
index 61bf3a0..7204760 100644
--- a/dev-suite/tests/test_validation_commands.py
+++ b/dev-suite/tests/test_validation_commands.py
@@ -126,6 +126,7 @@ def test_mixed_python_and_frontend(self):
         files = [
             "dev-suite/src/api/main.py",
             "dashboard/src/lib/Widget.svelte",
+            "tests/test_api.py",
         ]
         plan = get_validation_plan(files)
         assert plan.strategy == ValidationStrategy.TEST_SUITE
@@ -134,12 +135,21 @@ def test_mixed_python_and_frontend(self):
         assert "Full-stack" in plan.description
 
     def test_mixed_python_and_typescript(self):
-        files = ["src/server.py", "dashboard/src/routes/+page.ts"]
+        files = ["src/server.py", "dashboard/src/routes/+page.ts", "tests/test_server.py"]
         plan = get_validation_plan(files)
         assert plan.strategy == ValidationStrategy.TEST_SUITE
         assert plan.template == "fullstack"
         assert plan.commands == FULLSTACK_COMMANDS
 
+    def test_mixed_frontend_and_python_no_tests(self):
+        """CR fix: mixed without tests should use frontend + lint only (no pytest)."""
+        files = ["src/api.py", "dashboard/src/App.svelte"]
+        plan = get_validation_plan(files)
+        assert plan.strategy == ValidationStrategy.TEST_SUITE
+        assert plan.template == "fullstack"
+        assert plan.commands == FRONTEND_COMMANDS + PYTHON_LINT_COMMANDS
+        assert "Full-stack" in plan.description
+
     # -- Non-code files (SKIP) --
 
     def test_non_code_files(self):