Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
44ee17b
feat: add CLI shell and exec commands for deployment pod terminal access
V2arK Mar 12, 2026
c62f890
fix: use urlparse for scheme replacement to satisfy CodeQL
V2arK Mar 12, 2026
095bd1e
fix: apply black formatting and fix CodeQL url.startswith alert
V2arK Mar 12, 2026
d03f493
style: condense multiline expressions for readability
V2arK Mar 12, 2026
3dadb75
fix: resolve pylint warnings in shell.py and test_shell.py
V2arK Mar 12, 2026
8a93733
fix: skip PyTorch-dependent tests in sanity mode
V2arK Mar 12, 2026
0a4c1bb
fix: break out of exec loop after end marker to prevent hanging
V2arK Mar 12, 2026
7259bd4
fix: re-enable OPOST after setraw to fix terminal rendering
V2arK Mar 12, 2026
ab51beb
fix: replace pytest-asyncio with asyncio.run in tests for CI compat
V2arK Mar 12, 2026
8770660
fix: match Web UI protocol - remove rows/cols from stdin messages, re…
V2arK Mar 12, 2026
b79a30a
fix: send delayed resize to fix prompt rendering after shell startup
V2arK Mar 12, 2026
0b03a53
fix: await cancelled tasks for cleanup, reduce WS close_timeout to 2s
V2arK Mar 12, 2026
54445b8
fix: toggle PTY width to force SIGWINCH and prompt redraw on connect
V2arK Mar 12, 2026
0a0636c
fix: include rows/cols in stdin messages and send Ctrl+L after resize…
V2arK Mar 12, 2026
31d41ae
fix: use stty to set PTY dimensions from inside shell instead of resi…
V2arK Mar 12, 2026
ec8286b
fix: re-enable OPOST after setraw to convert bare \n to \r\n like xte…
V2arK Mar 12, 2026
5d94f14
fix: convert \n to \r\n in output and use stty to fix PTY dimensions …
V2arK Mar 12, 2026
559460f
feat: use pyte terminal emulator for interactive shell rendering
V2arK Mar 12, 2026
ff0e893
fix: swap rows/cols unpacking from shutil.get_terminal_size
V2arK Mar 12, 2026
c6d42ed
fix: use alternate screen buffer to prevent scrollback in Warp terminal
V2arK Mar 12, 2026
ba0e9d5
fix: handle WebSocket ConnectionClosed to prevent hang on shell exit
V2arK Mar 12, 2026
f29df94
refactor: use pyte for exec ANSI stripping and add ConnectionClosed h…
V2arK Mar 12, 2026
db40469
fix: treat ArgoCD Code message as reconnect signal, not shell exit code
V2arK Mar 12, 2026
f0b37b8
fix: stop reconnecting when shell has genuinely exited
V2arK Mar 12, 2026
4265839
chore: add debug file logging to shell and exec for exit hang diagnosis
V2arK Mar 13, 2026
27977af
fix: detect shell exit via idle timeout instead of Code message
V2arK Mar 13, 2026
1857f89
fix: exit immediately on exit echo, ignore echo exit with trailing pr…
V2arK Mar 13, 2026
76fd598
fix: skip websocket close handshake wait after session ends
V2arK Mar 13, 2026
a531f00
refactor: extract shell logic from CLI to SDK layer
V2arK Mar 13, 2026
e9e3829
refactor: extract shell logic to SDK layer, rely on server close frame
V2arK Mar 13, 2026
423cdc7
ruff format
V2arK Mar 13, 2026
9ef67f5
refactor: remove debug logging, fix unused imports and SDK/CLI bounda…
V2arK Mar 13, 2026
59d9d08
update redundancy
michaelshin Mar 17, 2026
c49e5c5
clean up pyte
michaelshin Mar 17, 2026
dd00cbb
clean up implementation
michaelshin Mar 17, 2026
4e01d82
address comments
michaelshin Mar 17, 2026
3ff11ee
fix lint
michaelshin Mar 17, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions centml/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from centml.cli.login import login, logout
from centml.cli.cluster import ls, get, delete, pause, resume
from centml.cli.shell import shell, exec_cmd


@click.group()
Expand Down Expand Up @@ -47,6 +48,8 @@ def ccluster():
ccluster.add_command(delete)
ccluster.add_command(pause)
ccluster.add_command(resume)
ccluster.add_command(shell)
ccluster.add_command(exec_cmd, name="exec")


cli.add_command(ccluster, name="cluster")
92 changes: 92 additions & 0 deletions centml/cli/shell.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
"""CLI commands for interactive shell and command execution in deployment pods."""

import asyncio
import sys

import click

from centml.cli.cluster import handle_exception
from centml.sdk import auth
from centml.sdk.api import get_centml_client
from centml.sdk.config import settings
from centml.sdk.shell import (
PodNotFoundError,
ShellError,
build_ws_url,
exec_session,
get_running_pods,
interactive_session,
)


def _resolve_pod(running_pods: list[str], pod_name: str) -> str:
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe this check is also handled by the API, so we should be able to remove it from the client.

"""Validate that *pod_name* exists in *running_pods*."""
if pod_name not in running_pods:
pods_list = ", ".join(running_pods)
raise PodNotFoundError(f"Pod '{pod_name}' not found. Available running pods: {pods_list}")
return pod_name


def _select_pod(running_pods, deployment_id):
    """Ask the user which of several running pods to connect to.

    Prints a numbered menu of *running_pods* for *deployment_id*, prompts for
    a number restricted to the range of menu entries, and returns the name of
    the pod at the chosen position.
    """
    click.echo(f"Multiple running pods found for deployment {deployment_id}:")
    for position, pod in enumerate(running_pods, start=1):
        click.echo(f" [{position}] {pod}")

    pod_count = len(running_pods)
    selected = click.prompt(
        "Select a pod",
        type=click.IntRange(1, pod_count),
        prompt_suffix=f" [1-{pod_count}]: ",
    )
    # The menu is 1-based; shift back to a 0-based list index.
    return running_pods[selected - 1]


def _connect_args(deployment_id, pod, shell_type, first_pod=False):
"""Resolve pod, build WebSocket URL, and obtain auth token."""
with get_centml_client() as cclient:
running_pods = get_running_pods(cclient, deployment_id)
if not running_pods:
raise click.ClickException(f"No running pods found for deployment {deployment_id}")

if pod is not None:
try:
pod_name = _resolve_pod(running_pods, pod)
except ShellError as exc:
raise click.ClickException(str(exc)) from exc
elif len(running_pods) == 1 or first_pod:
pod_name = running_pods[0]
else:
pod_name = _select_pod(running_pods, deployment_id)
Comment on lines +41 to +56
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# First, let's understand the structure of the file and locate the key functions
cd /repo
find . -name "shell.py" -path "*/cli/*" 2>/dev/null | head -5

Repository: CentML/centml-python-client

Length of output: 165


🏁 Script executed:

#!/bin/bash
# Read the entire shell.py file to understand the context
wc -l centml/cli/shell.py

Repository: CentML/centml-python-client

Length of output: 93


🏁 Script executed:

#!/bin/bash
# Get the content of lines 1-100 to see the function definitions and calls
cat -n centml/cli/shell.py | head -100

Repository: CentML/centml-python-client

Length of output: 4356


Avoid prompting from exec_cmd when stdin isn't interactive.

The exec_cmd() function calls _connect_args() without checking TTY, but _connect_args() can trigger _select_pod() which calls click.prompt() and will block when multiple pods exist and --pod is not specified. This causes unexpected hangs or failures in non-interactive contexts like CI and scripts. Only shell() has the TTY check (line 72–73). Gate the prompting logic in _connect_args() on sys.stdin.isatty() and require --pod or --first-pod for non-interactive callers.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@centml/cli/shell.py` around lines 41 - 56, The _connect_args function can
block by calling _select_pod (which uses click.prompt) when multiple pods exist;
modify _connect_args to avoid prompting when stdin is not a TTY: import sys and
before calling _select_pod check sys.stdin.isatty(), and if it's False and
neither pod nor first_pod is provided raise click.ClickException instructing the
caller to pass --pod or --first-pod; keep the existing behavior (call
_select_pod) only when sys.stdin.isatty() is True. Ensure this logic is applied
inside the _connect_args function surrounding the branch that currently sets
pod_name = _select_pod(running_pods, deployment_id).


ws_url = build_ws_url(settings.CENTML_PLATFORM_API_URL, deployment_id, pod_name, shell_type)
token = auth.get_centml_token()
return ws_url, token


@click.command(help="Open an interactive shell to a deployment pod")
@click.argument("deployment_id", type=int)
@click.option("--pod", default=None, help="Specify a pod name")
@click.option("--shell", "shell_type", default=None, type=click.Choice(["bash", "sh", "zsh"]), help="Shell type")
@click.option(
"--first-pod", is_flag=True, default=False, help="Auto-select the first running pod (skip interactive selection)"
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think we need this, right? If --pod is not provided, we default to the first pod.

)
@handle_exception
def shell(deployment_id, pod, shell_type, first_pod):
if not sys.stdin.isatty():
raise click.ClickException("Interactive shell requires a terminal (TTY)")

ws_url, token = _connect_args(deployment_id, pod, shell_type, first_pod)
exit_code = asyncio.run(interactive_session(ws_url, token))
sys.exit(exit_code)


@click.command(help="Execute a command in a deployment pod", context_settings={"ignore_unknown_options": True})
@click.argument("deployment_id", type=int)
@click.argument("command", nargs=-1, required=True, type=click.UNPROCESSED)
@click.option("--pod", default=None, help="Specific pod name")
@click.option("--shell", "shell_type", default=None, type=click.Choice(["bash", "sh", "zsh"]), help="Shell type")
@click.option(
"--first-pod", is_flag=True, default=False, help="Auto-select the first running pod (skip interactive selection)"
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same here

)
@handle_exception
def exec_cmd(deployment_id, command, pod, shell_type, first_pod):
ws_url, token = _connect_args(deployment_id, pod, shell_type, first_pod)
exit_code = asyncio.run(exec_session(ws_url, token, " ".join(command)))
Comment on lines +89 to +91
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

# First, let's look at the file and the specific lines mentioned
cat -n centml/cli/shell.py | sed -n '80,100p'

Repository: CentML/centml-python-client

Length of output: 965


🏁 Script executed:

# Also check the imports at the top of the file
head -20 centml/cli/shell.py

Repository: CentML/centml-python-client

Length of output: 510


🏁 Script executed:

# Let's understand the broader context of the exec_cmd function
cat -n centml/cli/shell.py | sed -n '85,110p'

Repository: CentML/centml-python-client

Length of output: 522


🏁 Script executed:

# Check if shlex is already imported anywhere in the file
grep -n "import shlex" centml/cli/shell.py

Repository: CentML/centml-python-client

Length of output: 53


🏁 Script executed:

# Search for how command is passed to exec_cmd - look for Click decorators
cat -n centml/cli/shell.py | sed -n '1,100p'

Repository: CentML/centml-python-client

Length of output: 4356


🏁 Script executed:

# Check Python version requirements
fd -t f -e "toml" -e "cfg" -e "txt" | xargs grep -l "python" 2>/dev/null | head -5

Repository: CentML/centml-python-client

Length of output: 87


🏁 Script executed:

# Look for pyproject.toml or setup.py
ls -la centml/ | head -20
git ls-files | grep -E "(pyproject|setup)" | head -10

Repository: CentML/centml-python-client

Length of output: 409


🏁 Script executed:

# Check the project config
cat pyproject.toml 2>/dev/null || cat setup.py 2>/dev/null

Repository: CentML/centml-python-client

Length of output: 747


Use shlex.join() to preserve command argument boundaries.

The command argument is tokenized by Click (nargs=-1, type=click.UNPROCESSED). Using " ".join(command) loses quoting and escaping, so inputs like "hello world", $HOME, or * are misinterpreted in the pod shell. Use shlex.join(command) for the POSIX shells (bash, sh, zsh) specified here.

Suggested fix
+import shlex
...
-    exit_code = asyncio.run(exec_session(ws_url, token, " ".join(command)))
+    exit_code = asyncio.run(exec_session(ws_url, token, shlex.join(command)))
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@centml/cli/shell.py` around lines 89 - 91, The exec_cmd function currently
concatenates the Click-provided command tuple with " ".join(command), which
loses quoting/escaping; replace that with shlex.join(command) and ensure shlex
is imported so exec_cmd calls asyncio.run(exec_session(ws_url, token,
shlex.join(command))) to preserve argument boundaries for POSIX shells (refer to
exec_cmd and exec_session symbols).

sys.exit(exit_code)
3 changes: 3 additions & 0 deletions centml/sdk/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@ def get(self, depl_type):
def get_status(self, id):
    """Return the status of the deployment identified by *id*.

    Delegates directly to the underlying API client's deployment-status
    endpoint and returns its response unchanged.
    """
    return self._api.get_deployment_status_deployments_status_deployment_id_get(id)

def get_status_v3(self, deployment_id):
    """Return the V3 status of the deployment identified by *deployment_id*.

    Delegates directly to the underlying API client's V3 deployment-status
    endpoint and returns its response unchanged.
    """
    return self._api.get_deployment_status_v3_deployments_status_v3_deployment_id_get(deployment_id)

def get_inference(self, id):
"""Get Inference deployment details - automatically handles both V2 and V3 deployments"""
# Try V3 first (recommended), fallback to V2 if deployment is V2
Expand Down
12 changes: 12 additions & 0 deletions centml/sdk/shell/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from centml.sdk.shell.exceptions import NoPodAvailableError, PodNotFoundError, ShellError
from centml.sdk.shell.session import build_ws_url, exec_session, get_running_pods, interactive_session

# Names re-exported from the exceptions and session submodules; this list
# declares them as the package's public interface.
__all__ = [
    "ShellError",
    "NoPodAvailableError",
    "PodNotFoundError",
    "build_ws_url",
    "get_running_pods",
    "interactive_session",
    "exec_session",
]
10 changes: 10 additions & 0 deletions centml/sdk/shell/exceptions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
class ShellError(Exception):
    """Base exception for shell operations.

    More specific shell errors derive from this class, so catching
    ``ShellError`` handles any of them.
    """


class NoPodAvailableError(ShellError):
    """Raised when no running pods are found for the deployment."""


class PodNotFoundError(ShellError):
    """Raised when the specified pod is not found among the running pods."""
Loading
Loading