Enable real-time display of LLM responses as they’re generated, token by token. This guide demonstrates how to use
streaming callbacks to process and display tokens as they arrive from the language model.
Streaming allows you to display LLM responses progressively as the model generates them, rather than waiting for the
complete response. This creates a more responsive user experience, especially for long-form content generation.
Create a callback function that processes streaming chunks as they arrive:
```python
def on_token(chunk: ModelResponseStream) -> None:
    """Process each streaming chunk as it arrives."""
    choices = chunk.choices
    for choice in choices:
        delta = choice.delta
        if delta is not None:
            content = getattr(delta, "content", None)
            if isinstance(content, str):
                sys.stdout.write(content)
                sys.stdout.flush()
```
The callback receives a ModelResponseStream object containing:
choices: List of response choices from the model
delta: Incremental content changes for each choice
The token_callbacks parameter accepts a list of callbacks, allowing you to register multiple handlers
if needed (e.g., one for display, another for logging).
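For instance, here is a minimal sketch of registering two callbacks, one that prints content deltas and one that logs them. The `log_tokens` helper is illustrative rather than part of the SDK, and the `agent` is assumed to be constructed as in the full example below.

```python
import sys

from openhands.sdk import Conversation, get_logger
from openhands.sdk.llm.streaming import ModelResponseStream

logger = get_logger(__name__)


def print_tokens(chunk: ModelResponseStream) -> None:
    """Display callback: write content deltas straight to stdout."""
    for choice in chunk.choices:
        delta = choice.delta
        content = getattr(delta, "content", None) if delta is not None else None
        if isinstance(content, str):
            sys.stdout.write(content)
            sys.stdout.flush()


def log_tokens(chunk: ModelResponseStream) -> None:
    """Logging callback (illustrative): record each content delta."""
    for choice in chunk.choices:
        delta = choice.delta
        content = getattr(delta, "content", None) if delta is not None else None
        if isinstance(content, str) and content:
            logger.debug("stream chunk: %r", content)


# Every registered callback receives every streaming chunk.
conversation = Conversation(
    agent=agent,  # created via get_default_agent(...), as in the full example below
    workspace=".",
    token_callbacks=[print_tokens, log_tokens],
)
```

The complete, runnable example below ties everything together: it streams regular content, reasoning (thinking) blocks, and tool-call deltas, and labels the boundaries between them.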
```python
import os
import sys
from typing import Literal

from pydantic import SecretStr

from openhands.sdk import (
    Conversation,
    get_logger,
)
from openhands.sdk.llm import LLM
from openhands.sdk.llm.streaming import ModelResponseStream
from openhands.tools.preset.default import get_default_agent

logger = get_logger(__name__)

api_key = os.getenv("LLM_API_KEY") or os.getenv("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError("Set LLM_API_KEY or OPENAI_API_KEY in your environment.")

model = os.getenv("LLM_MODEL", "anthropic/claude-sonnet-4-5-20250929")
base_url = os.getenv("LLM_BASE_URL")

llm = LLM(
    model=model,
    api_key=SecretStr(api_key),
    base_url=base_url,
    usage_id="stream-demo",
    stream=True,
)

agent = get_default_agent(llm=llm, cli_mode=True)

# Define streaming states
StreamingState = Literal["thinking", "content", "tool_name", "tool_args"]

# Track state across on_token calls for boundary detection
_current_state: StreamingState | None = None


def on_token(chunk: ModelResponseStream) -> None:
    """
    Handle all types of streaming tokens including content, tool calls,
    and thinking blocks with dynamic boundary detection.
    """
    global _current_state

    choices = chunk.choices
    for choice in choices:
        delta = choice.delta
        if delta is not None:
            # Handle thinking blocks (reasoning content)
            reasoning_content = getattr(delta, "reasoning_content", None)
            if isinstance(reasoning_content, str) and reasoning_content:
                if _current_state != "thinking":
                    if _current_state is not None:
                        sys.stdout.write("\n")
                    sys.stdout.write("THINKING: ")
                    _current_state = "thinking"
                sys.stdout.write(reasoning_content)
                sys.stdout.flush()

            # Handle regular content
            content = getattr(delta, "content", None)
            if isinstance(content, str) and content:
                if _current_state != "content":
                    if _current_state is not None:
                        sys.stdout.write("\n")
                    sys.stdout.write("CONTENT: ")
                    _current_state = "content"
                sys.stdout.write(content)
                sys.stdout.flush()

            # Handle tool calls
            tool_calls = getattr(delta, "tool_calls", None)
            if tool_calls:
                for tool_call in tool_calls:
                    tool_name = (
                        tool_call.function.name if tool_call.function.name else ""
                    )
                    tool_args = (
                        tool_call.function.arguments
                        if tool_call.function.arguments
                        else ""
                    )
                    if tool_name:
                        if _current_state != "tool_name":
                            if _current_state is not None:
                                sys.stdout.write("\n")
                            sys.stdout.write("TOOL NAME: ")
                            _current_state = "tool_name"
                        sys.stdout.write(tool_name)
                        sys.stdout.flush()
                    if tool_args:
                        if _current_state != "tool_args":
                            if _current_state is not None:
                                sys.stdout.write("\n")
                            sys.stdout.write("TOOL ARGS: ")
                            _current_state = "tool_args"
                        sys.stdout.write(tool_args)
                        sys.stdout.flush()


conversation = Conversation(
    agent=agent,
    workspace=os.getcwd(),
    token_callbacks=[on_token],
)

story_prompt = (
    "Tell me a long story about LLM streaming, write it a file, "
    "make sure it has multiple paragraphs. "
)
conversation.send_message(story_prompt)
print("Token Streaming:")
print("-" * 100 + "\n")
conversation.run()

cleanup_prompt = (
    "Thank you. Please delete the streaming story file now that I've read it, "
    "then confirm the deletion."
)
conversation.send_message(cleanup_prompt)
print("Token Streaming:")
print("-" * 100 + "\n")
conversation.run()

# Report cost
cost = llm.metrics.accumulated_cost
print(f"EXAMPLE_COST: {cost}")
```
You can run the example code as-is.
The model name should follow the LiteLLM convention: provider/model_name (e.g., anthropic/claude-sonnet-4-5-20250929, openai/gpt-4o).
The LLM_API_KEY should be the API key for your chosen provider.
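For example, to point the example at OpenAI instead of Anthropic, construct the LLM with an `openai/`-prefixed model name. This is a minimal sketch; it assumes `LLM_API_KEY` holds your OpenAI key.

```python
import os

from pydantic import SecretStr

from openhands.sdk.llm import LLM

# Same construction as in the full example, but targeting an OpenAI model.
# The provider/model string follows LiteLLM's naming convention.
llm = LLM(
    model="openai/gpt-4o",
    api_key=SecretStr(os.environ["LLM_API_KEY"]),
    usage_id="stream-demo",
    stream=True,
)
```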
ChatGPT Plus/Pro subscribers: You can use LLM.subscription_login() to authenticate with your ChatGPT account and access Codex models without consuming API credits. See the LLM Subscriptions guide for details.