From 4c737672b98b3b5b320a1b97b4ae3302abb500c9 Mon Sep 17 00:00:00 2001 From: Paddy Byers Date: Wed, 8 Apr 2026 21:20:49 +0100 Subject: [PATCH 1/6] WIP: editorial updates --- src/pages/docs/ai-transport/index.mdx | 208 +++++++++++++----- .../docs/ai-transport/why-ai-transport.mdx | 51 ++--- 2 files changed, 174 insertions(+), 85 deletions(-) diff --git a/src/pages/docs/ai-transport/index.mdx b/src/pages/docs/ai-transport/index.mdx index ca991133f3..2d2e133950 100644 --- a/src/pages/docs/ai-transport/index.mdx +++ b/src/pages/docs/ai-transport/index.mdx @@ -1,91 +1,189 @@ --- -title: "AI Transport" -meta_description: "Ably AI Transport is a durable session layer for AI applications. Upgrade HTTP streams into resilient, multi-device, steerable AI experiences with a few lines of code." -meta_keywords: "AI Transport, durable sessions, AI streaming, realtime AI, multi-device AI, Ably" -intro: "A durable session layer that upgrades your AI streams into resilient, multi-device, steerable experiences." +title: "About AI Transport" +intro: "Ably AI Transport is a drop-in infrastructure layer that upgrades your AI streams into bi-directional, stateful experiences. It enables you to build multi-device, steerable AI applications that are agent agnostic, incredibly resilient and highly scalable." +meta_description: "AI Transport provides realtime infrastructure for AI agents, enabling token streaming, tool calls, and bidirectional communication between clients and AI backends." --- -Ably AI Transport decouples your AI agents from your clients through a durable session layer. Instead of coupling everything to a single HTTP request, agents and clients communicate through a shared, persistent session that survives disconnections, works across devices, and supports bidirectional control. +AI Transport enables you to add a realtime delivery layer to your application, providing the infrastructure required to deliver modern, stateful AI experiences to users. 
It works seamlessly with any AI model or framework, such as OpenAI, Anthropic, Vercel or LangChain. -AI Transport works with any AI model or framework. It drops into your existing stack with a few lines of code. +AI Transport runs on Ably's [fault-tolerant](/docs/platform/architecture/fault-tolerance) and highly-available platform. The platform supports streaming data between all internet-connected devices at [low latencies](/docs/platform/architecture/latency) across the globe. Its elastic global infrastructure delivers enterprise-scale messaging that [effortlessly scales](/docs/platform/architecture/platform-scalability) to meet demand. -## What AI Transport provides +Drop AI Transport into your applications to transform them into modern, bi-directional AI experiences that keep users engaged. AI Transport provides the building blocks to deliver reliable, resumable token streams with robust session management and state hydration to always keep your users and agents in sync. -AI Transport brings three foundational capabilities to your AI applications: +![Before and after adding AI Transport](../../../images/content/diagrams/ai-transport-before-and-after.png) -- Streams survive connection drops. When a client disconnects and reconnects, it picks up exactly where it left off. No lost tokens, no broken responses. -- Users move between tabs, phones, and laptops. The session follows them. Every device sees the same conversation, fully in sync. -- Clients can cancel, interrupt, and steer agents mid-stream. Users interact with agents while they work, not just when they finish. 
+## Get started -## Choose your path +Start learning the basics of AI Transport right away with a getting started guide using your agent and framework of choice: + +### OpenAI + +Use the following guides to get started with OpenAI: {[ { - title: 'Why AI Transport', - description: 'Understand the problem AI Transport solves and why durable sessions matter.', - link: '/docs/ai-transport/why-ai-transport', + title: 'Message-per-response', + description: 'Stream OpenAI responses using message appends', + image: 'icon-tech-javascript', + link: '/docs/ai-transport/guides/openai/openai-message-per-response', }, { - title: 'How it works', - description: 'Learn the mental model: sessions, turns, and the transport architecture.', - link: '/docs/ai-transport/how-it-works', + title: 'Human-in-the-loop', + description: 'Implement human-in-the-loop approval workflows with OpenAI', + image: 'icon-tech-javascript', + link: '/docs/ai-transport/guides/openai/openai-human-in-the-loop', }, +]} + + +### Anthropic + +Use the following guides to get started with Anthropic: + + +{[ + { + title: 'Message-per-response', + description: 'Stream Anthropic responses using message appends', + image: 'icon-tech-javascript', + link: '/docs/ai-transport/guides/anthropic/anthropic-message-per-response', + }, + { + title: 'Human-in-the-loop', + description: 'Implement human-in-the-loop approval workflows with Anthropic', + image: 'icon-tech-javascript', + link: '/docs/ai-transport/guides/anthropic/anthropic-human-in-the-loop', + }, +]} + + +### Vercel AI SDK + +Use the following guides to get started with the Vercel AI SDK: + + +{[ { - title: 'Get started', - description: 'Build a streaming AI app in 5 minutes with your framework of choice.', - link: '/docs/ai-transport/getting-started/vercel-ai-sdk', + title: 'Message-per-response', + description: 'Stream Vercel AI SDK responses using message appends', + image: 'icon-tech-javascript', + link: 
'/docs/ai-transport/guides/vercel-ai-sdk/vercel-message-per-response', }, { - title: 'Features', - description: 'Explore what you can build: cancellation, branching, multi-device, and more.', - link: '/docs/ai-transport/features/token-streaming', + title: 'Human-in-the-loop', + description: 'Implement HITL workflows with tool approval over Ably', + image: 'icon-tech-javascript', + link: '/docs/ai-transport/guides/vercel-ai-sdk/vercel-human-in-the-loop', }, ]} -## How it works at a glance +### LangGraph + +Use the following guides to get started with LangGraph: + + +{[ + { + title: 'Message-per-response', + description: 'Stream LangGraph responses using message appends', + image: 'icon-tech-javascript', + link: '/docs/ai-transport/guides/langgraph/langgraph-message-per-response', + }, + { + title: 'Human-in-the-loop', + description: 'Implement HITL workflows with tool approval over Ably', + image: 'icon-tech-javascript', + link: '/docs/ai-transport/guides/langgraph/langgraph-human-in-the-loop', + }, +]} + -Without AI Transport, a client establishes a direct HTTP connection to an agent. The agent streams tokens back over that connection. If the connection drops, the stream dies. Other devices can't see it. There's no way to signal the agent. +## Features -With AI Transport, the client sends an HTTP request to invoke the agent, but the response comes through an Ably channel. The channel is the durable session: +AI Transport provides a range of features built on Ably's highly-scalable realtime platform to enable you to deliver reliable, stateful AI experiences that provide the first-class UX your users expect from modern applications. 
-- The agent publishes response tokens to the channel -- Any number of clients subscribe to the channel and receive tokens in realtime -- Clients can publish control signals (cancel, interrupt) back through the channel -- If a connection drops, the client reconnects and resumes from where it left off +### Token streaming - -```javascript -// Before: default HTTP streaming -const { messages } = useChat() +Token streaming is the core of how LLMs deliver their responses to users. Tokens are progressively streamed to users from your LLM so that users don't need to wait for a complete response before seeing any output. -// After: Ably AI Transport (everything else stays the same) -const channel = ably.channels.get(chatId) -const transport = useChatTransport({ channel }) -const { messages } = useChat({ transport }) -``` - +Using AI Transport, your token streams are reliable and persistent. They survive modern environments where users change browser tabs, refresh the page or switch devices, and common interruptions such as temporary network loss. Your users can always reconnect and continue where they left off without having to start over. -The initial HTTP request still happens for sending user prompts to the agent. But the response stream is decoupled from that request and delivered through the durable session. +[Read more about token streaming](/docs/ai-transport/token-streaming). -## Framework support +### Bi-directional communication -AI Transport is framework-agnostic. The SDK provides a core transport layer that works with any AI provider or framework: +AI Transport supports rich, bi-directional communication patterns between users and agents. 
-| Framework | Integration | Entry point | -| --- | --- | --- | -| Vercel AI SDK | First-class adapter for `useChat` | `@ably/ai-transport/vercel/react` | -| OpenAI | Core transport with direct SDK usage | `@ably/ai-transport` | -| Anthropic | Core transport with direct SDK usage | `@ably/ai-transport` | -| LangGraph | Core transport with direct SDK usage | `@ably/ai-transport` | +Build sophisticated AI experiences with features such as accepting user input for interactive conversations, streaming chain-of-thought reasoning for transparency, attaching citations to responses for verifiability, implementing human-in-the-loop workflows for sensitive operations, and exposing tool calls for generative UI and visibility. + +These messaging features work seamlessly with [token streaming](/docs/ai-transport/token-streaming) to create complete, interactive AI experiences. + +[Read more about messaging features](/docs/ai-transport/messaging/accepting-user-input). + +### Durable sessions + +AI Transport enables durable sessions that persist beyond the lifetime of individual connections, allowing users and agents to connect and disconnect independently. + +Communication shouldn't be tied to the connection state of either party. If a user goes offline or their connection drops, they should be able to continue their session without losing context. AI Transport provides robust session management by enabling users and agents to connect independently of one another. + +Your users can start a conversation on their mobile and seamlessly continue it on their desktop. Similarly, multiple users can participate in the same conversation with a single agent and they will all remain in sync, in realtime. + +[Read more about sessions and identity](/docs/ai-transport/sessions-identity). 
+ +### Automatic catch-up + +AI Transport enables clients to hydrate conversation and session state from the [channel](/docs/channels), including [message history](/docs/storage-history/history) and in-progress responses. + +Whether a user is briefly disconnected when they drive through a tunnel, or they're rejoining a conversation the following day of work, AI Transport allows clients to resynchronise the full conversation state, including both historical messages and in-progress responses. Your users are always up to date with the full conversation, in order, anywhere. + +[Read more about client hydration](/docs/ai-transport/token-streaming/message-per-response#hydration). + +### Background processing + +AI Transport allows agents to process jobs in the background while users go offline, with full awareness of their online status through realtime presence tracking. + +Users can work asynchronously by prompting an agent to perform a task without having to monitor its progress. They can go offline and receive a push notification when the agent has completed the task, or reconnect at any time to seamlessly resume and see all progress made while they were away using [state hydration](#catch-up). + +It also puts you in control of how you manage your application when there aren't any users online. For example, you can choose whether to pause a conversation when a user exits their browser tab, or allow the agent to complete its response for the user to view when they return. + +[Read more about status-aware cost controls](/docs/ai-transport/sessions-identity/online-status). + +### Enterprise controls + +Ably's platform provides [integrations](/docs/platform/integrations) and functionality to ensure that your applications always exceed the requirements of enterprise environments. Whether that's [message auditing](/docs/platform/integrations/streaming), [client identification](/docs/auth/identified-clients) or [fine-grained authorization](/docs/auth/capabilities). 
+ +## Examples + +Take a look at some example code running in-browser of the sorts of features you can build with AI Transport underpinning your applications: + + +{[ + { + title: 'Message per response streaming', + description: 'Stream individual tokens from AI models into a single message.', + image: 'icon-tech-javascript', + link: '/examples/ai-transport-message-per-response?lang=javascript', + }, + { + title: 'Message per response streaming', + description: 'Stream individual tokens from AI models into a single message.', + image: 'icon-tech-react', + link: '/examples/ai-transport-message-per-response?lang=react', + }, +]} + -Vercel AI SDK has the deepest integration with a dedicated codec and React hooks. Other frameworks use the core transport directly, which accepts a standard `ReadableStream` from any AI provider. +## Pricing -## What to read next +Most AI frameworks support simple client-driven interactions, with streamed responses from the agent via server-sent events (SSE) or similar HTTP streaming. The client's request is handled by an agent instance; the agent pipes tokens in response to the client request. This approach is simple, surprisingly effective for simple interactions, and every framework supports it. However, the simplicity of the pattern is also the source of its limitations. -Most AI frameworks use server-sent events (SSE) or similar HTTP streaming. The client makes a request, establishes a persistent point-to-point connection to the agent, and the agent pipes LLM tokens back over that connection. +## Problems with HTTP streaming -This works well for getting started. It's simple and every framework supports it. But the pattern is oriented around a single client establishing a single connection to a single agent. Everything is coupled to that one request. +### Streams fail on disconnection -## Where it falls short +The operation of a response stream is tied to the health of the underlying connection. 
When the connection drops, the response stream fails. -### Streams die on disconnect +This happens all the time in practice: a phone switches from Wi-Fi to cellular, a user refreshes the page, a laptop lid closes mid-response. The LLM continues to generate tokens, but there's no way to deliver them to the client, so there's nowhere for them to go. -The health of a response stream is tied to the health of the connection. When the connection drops, the stream dies with it. - -This happens constantly in practice: a phone switches from Wi-Fi to cellular, a user refreshes the page, a laptop lid closes mid-response. The LLM keeps generating tokens, but there's nowhere for them to go. - -To support resumable streams with direct HTTP, you need to buffer events in an external store, assign sequence numbers for ordering, build an explicit resume handler, and work out what the client missed. You build all this plumbing from scratch because the stream is explicitly managed by the agent. +The SSE protocol does support a mechanism, at the protocol level, for a reconnecting client to specify a position in the stream to resume from. However, this is usually not supported in practice because it would require a significant increase in complexity in the backend; to support resumable streams with SSE, you would need to assign sequence numbers to token events for ordering, buffer those events in an external store, and build a resume handler. This is a big departure from a simple, stateless request handler. Even having done that, you have only addressed a part of the problem; the solution would not support continuity of streams after a page refresh because that's not supported by SSE. ### Sessions don't span devices -With direct HTTP streaming, the connection is a private pipe between the requesting client and the agent. A second tab or a phone can't access the stream. It only exists for the client that established the connection.
+With HTTP streaming, the connection is a pipe exclusively between the requesting client and the agent and a second tab or a phone can't access the stream. It only exists for the client that initiated that request. -Users move between surfaces constantly: a second browser tab, a phone, a tablet. Without a shared session layer, each surface is isolated. There's no way for a new device to see the in-progress stream, the conversation history, or the current state. +Users move between surfaces constantly, whether that's a second browser tab, or an app on their phone. Without some shared access to sessions, each surface is isolated. There's no way for a new client to see the in-progress stream, the conversation history, or the current state. ### Clients can't reach the agent -SSE is one-way: server to client. The client has no way to send a signal to the agent through the same connection. - -This creates a fundamental conflict. Take a "stop" button that cancels an in-progress generation. The only mechanism available is closing the connection. But if you close the connection to cancel, you lose the ability to resume. Cancel and resume become mutually exclusive. +An SSE request is initiated by the client, but from that point it is one-way: server to client. The client has no way to send a signal to the agent through the same connection once the initial request has been made. The only thing the client can do is to read the stream to completion, or cancel it by closing the connection. - +Having cancellation as the sole way to signal to the agent creates a fundamental conflict. Take a "stop" button for example, that cancels an in-progress generation; this could use request cancellation. But if a closed connection is interpreted as a cancellation (resulting in the LLM response being suspended), you lose the ability to resume from a closed connection; so cancel and resume are mutually exclusive.
-Even with a bidirectional transport like WebSockets, the connection is still a private pipe. Other devices have no upstream channel to the agent. You can't interrupt or steer from a second device. +Even with a bidirectional transport between client and agent, such as WebSockets, the connection would still be an exclusive pipe. Other devices have no upstream channel to the agent, so you can't interrupt or steer from a second device. ### Multi-agent architectures are complex -In multi-agent systems, an orchestrator handles the client's connection and delegates to specialized subagents. If you want users to see granular progress from subagents, every update must flow through the orchestrator, adding complexity and coupling. +In multi-agent systems, an orchestrator handles the client's connection and delegates to specialized subagents. When there is an exclusive, point-to-point connection between the client and the orchestrator agent, all interactions with subagents must be proxied by the orchestrator. If you want users to see intermediate events (progress or responses) from subagents, every update must be mediated by the orchestrator, adding complexity and coupling. ## Durable sessions -The pattern engineering teams are adopting to solve these problems is to decouple the agent layer from the client layer through a durable session - a shared, persistent medium through which they interact. +These problems all stem from the coupling between the client-to-agent interaction - ie the prompt and responses - and the transport layer used to mediate that interaction. The transport (ie the connection, request and streamed response) is ephemeral, so it only exists for the lifetime of that single interaction, and exclusive, so no other agent or client instance can interact with it.
-Instead of a private pipe between one client and one agent: +The pattern that engineering teams are adopting to solve these problems is to break that coupling, through the idea of a durable session - a shared, persistent medium through which they interact. Instead of an exclusive pipe between one client and one agent: - The agent writes events to the session - Clients independently connect to the session @@ -71,18 +62,18 @@ Ably AI Transport implements durable sessions on top of [Ably channels](/docs/ch - Any client or agent connects to the session by specifying the channel name. - Messages on the channel outlive any single connection, device, or agent. -- Events arrive in the order they were published, regardless of reconnection. +- Events are received by subscribers in the order that they were published, even if there are disconnections. - A client that drops its connection automatically reconnects and picks up where it left off. -- Any participant can publish to the channel. Cancel, steer, interrupt - all through the same session. -- Multiple participants subscribe to the same channel. Every participant sees every event. +- Any participant can publish to the channel. Cancel, steer, interrupt can all happen through the same session. +- Multiple participants subscribe to the same channel, and every participant sees every event.
-On top of these channel properties, the AI Transport SDK adds: +In addition to these channel properties, the AI Transport SDK adds: - Turns that structure prompt-response cycles with clear boundaries, concurrent lifecycles, and scoped cancellation - A codec layer that maps between your AI framework's event types and Ably messages - A conversation tree that supports branching, edit, regenerate, and history navigation - React hooks for building UIs with streaming, pagination, and branch navigation -- Framework adapters that drop into Vercel AI SDK's `useChat` with one line of code +- Adapters that drop into various frameworks; for example AI Transport can be used with Vercel AI SDK's `useChat` with one line of code ## What to read next +## Sessions -A session is an Ably channel shared between one or more agents and one or more clients. It represents a single conversation and persists beyond any individual connection. When a client disconnects and reconnects, the session is still there. When a second device joins, it sees the same session. +The central concept is of a session. This is a durable, addressable, communication channel, that carries client-to-agent, and agent-to-client events. A session may be shared between one or more agents and one or more clients. A single session would usually represent a single conversation or chat, but that can be long-lived, and can survive individual client connections, or episodes of client interactions. A single ChatGPT chat, for example, could be a session; you can revisit and continue that session multiple times, on different devices. A second device interacting with that chat would do so by joining the same session. -Sessions provide four guarantees: +In AI Transport, a session maps to an Ably Channel. 
Channels support realtime pub/sub messaging, so multiple (agent or client) participants in a channel can each publish and subscribe to messages; channels support presence, so each connected participant can advertise its online status; and channels support structured state, so arbitrary data, in addition to messages, can be durably added, modified and observed by any participant. -Events arrive in the order they were published. Token streams render correctly without reordering logic. -Messages survive disconnections. A client that drops and reconnects picks up exactly where it left off. -All participants see all events. Every subscribed device receives every token, every control signal, every lifecycle event. -Clients signal agents through the same channel. Cancel requests, interrupts, and metadata flow back to the agent without a separate control plane. +Messages in a session are ordered, so subscribers see events in the order in which they were published. This ordering ensures that it is possible to distribute streamed token events at a high rate, knowing that they are received in order by clients. -When a client disconnects, the session continues to exist. The agent keeps publishing tokens to the channel. On reconnect, Ably's connection protocol automatically resumes from the last received message with no gaps. If the client has been offline longer, it loads the full conversation from channel history using `view.loadOlder()`. +AI Transport sessions persist messages and structured state, so interaction history survives the disconnection of any client or agent. Clients can therefore retrieve historical messages, as well as receive them in real time. Sessions are a unifying primitive that expose the same message stream to be consumed either in real time or after the fact. + +Since sessions are based on a pub/sub primitive, multiple clients can receive messages, either serially (ie different devices at different times) or simultaneously.
+ +Sessions don't just carry client prompts and agent responses; they are bidirectional and capable of carrying multiple signals, whether that's for cancellation, interruption, steering, and whether it's chat text, artifacts, tool parameters, or other metadata. + +When a client disconnects, the session continues to exist and the agent keeps publishing tokens to the channel. On reconnect, Ably's connection protocol automatically resumes from the last received message with no gaps. If the client has been offline longer, it loads the full conversation from channel history. See [Sessions and turns](/docs/ai-transport/how-it-works/sessions-and-turns) for a detailed explanation of how sessions and recovery work. -## Understand turns +## Turns -A turn is one prompt-response cycle within a session. The user sends a prompt, the agent streams back a response. That exchange, from start to finish, is a single turn. +A turn groups a related set of interactions in a session; the simplest and most typical example of a turn is a single prompt-response cycle within a session: the client sends a prompt, and the agent streams back a response. In general, however, a turn can include more complex interactions, including those where the turn is initiated by some agent activity, such as an autonomous agent responding to an external event. -Each turn has a lifecycle: it starts when the agent begins generating, streams tokens as they are produced, and ends when the agent completes its response. Turns have clear boundaries. Cancellation is scoped to a turn, not the whole session. Cancelling one turn does not affect other turns or the session itself. +Turns are the principal way of structuring interactions within a single session. Each turn has a lifecycle; cancellation is scoped to a turn, so cancelling a turn does not affect other turns or the session itself.
Each turn carries its own stream, its own cancel handle, and its own lifecycle events. Turns can also exist concurrently, such as if a user submits a follow-up prompt before the previous response finishes, or where multiple subagents are each interacting with the client at the same time. In general, you can think of turns as a way of multiplexing multiple independent threads in a shared session. -Multiple turns can be active simultaneously. A user can send a follow-up prompt before the previous response finishes, and both turns stream independently. This enables concurrent interactions without waiting for each turn to complete. +In many agent deployments, the client prompt that initiates a turn is made as a request to the server endpoint that makes the agent invocation, or initiates the agent workflow if using a durable execution framework. The agent workflow will usually end with the completion of that turn; therefore, turn lifecycle is often correlated with agent invocation lifecycle. See [Sessions and turns](/docs/ai-transport/how-it-works/sessions-and-turns) for details on turn lifecycle, cancellation, and concurrent turns. -## Understand the transport architecture +## Transport architecture + +Sessions are powered by Ably pub/sub channels. Channels provide: + +- pub/sub message delivery, including message persistence and history; +- presence so that the state of participants in a channel can be observed in real time; +- structured durable and collaborative state via the LiveObjects API. + +The AI Transport library implements two principal protocol layers on top of the channel primitive: + +- a "transport" layer that implements the Turns abstraction. This supports the multiplexing of turns onto channels, so that turns can provide independent streams; and a conversation tree that exposes a branching conversation structure from the linear stream of channel messages.
Together, these provide the rich message structure required to model AI conversations, both for real time and historical messages. -The transport has two layers: a core transport and a codec. The core transport manages the turn lifecycle, cancellation, conversation tree, and history. The codec translates between your AI framework's event types and Ably messages. +- a "codec" that handles format conversion of messages between the domain event and message types associated with the agent or client framework - for example `UIMessage` in the case of the Vercel AI SDK - and messages suitable for exchange over the Ably channel. -On the server, the transport takes the output stream from your AI framework and publishes it to the session channel. On the client, the transport subscribes to the channel and reconstructs the conversation from the incoming events. The HTTP POST that sends the user's prompt to the agent is fire-and-forget. Response tokens come back through the channel, not through the HTTP response. +On the server, the transport takes the output stream from the AI framework and publishes it to the channel via the turn's stream. On the client, the transport subscribes to the channel and reconstructs the conversation from the incoming events. Clients that initiate a turn via an HTTP request to the backend (containing the user's prompt) do not receive the agent response on that request; the response to that request simply confirms successful invocation of the agent. Response tokens come back through the turn stream. The SDK ships with a Vercel AI SDK codec (`UIMessageCodec`) that maps Vercel AI SDK message types to Ably messages. You can write custom codecs for other frameworks by implementing the codec interface. This keeps the core transport framework-agnostic while giving each framework a native integration surface.
From 7aceb6a45497df81c8072b39fad941b9afce6563 Mon Sep 17 00:00:00 2001 From: Paddy Byers Date: Thu, 9 Apr 2026 09:07:54 +0100 Subject: [PATCH 3/6] Update token streaming feature page --- .../ai-transport/features/token-streaming.mdx | 39 ++++++++++++++++--- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/src/pages/docs/ai-transport/features/token-streaming.mdx b/src/pages/docs/ai-transport/features/token-streaming.mdx index 9a942455b1..a92593a6f8 100644 --- a/src/pages/docs/ai-transport/features/token-streaming.mdx +++ b/src/pages/docs/ai-transport/features/token-streaming.mdx @@ -1,14 +1,43 @@ --- title: "Token streaming" -meta_description: "Stream LLM tokens through Ably AI Transport with durable delivery. Tokens survive disconnections and sync across devices automatically." -meta_keywords: "token streaming, LLM streaming, AI Transport, Ably, real-time AI, durable streaming" +meta_description: "Stream AI-generated tokens to clients in realtime using AI Transport, with support for message-per-response and message-per-token patterns." 
+redirect_from: + - /docs/ai-transport/token-streaming + - /docs/ai-transport/token-streaming/message-per-response + - /docs/ai-transport/token-streaming/message-per-token + - /docs/ai-transport/guides/anthropic/anthropic-message-per-response + - /docs/ai-transport/guides/anthropic/anthropic-message-per-token + - /docs/ai-transport/guides/openai/openai-message-per-response + - /docs/ai-transport/guides/openai/openai-message-per-token + - /docs/ai-transport/guides/langgraph/langgraph-message-per-response + - /docs/ai-transport/guides/langgraph/langgraph-message-per-token + - /docs/ai-transport/guides/vercel-ai-sdk/vercel-message-per-response + - /docs/ai-transport/guides/vercel-ai-sdk/vercel-message-per-token + - /docs/guides/ai-transport/anthropic-message-per-response + - /docs/guides/ai-transport/anthropic/anthropic-message-per-response + - /docs/guides/ai-transport/anthropic-message-per-token + - /docs/guides/ai-transport/anthropic/anthropic-message-per-token + - /docs/guides/ai-transport/openai-message-per-response + - /docs/guides/ai-transport/openai/openai-message-per-response + - /docs/guides/ai-transport/openai-message-per-token + - /docs/guides/ai-transport/openai/openai-message-per-token + - /docs/guides/ai-transport/langgraph-message-per-response + - /docs/guides/ai-transport/langgraph/langgraph-message-per-response + - /docs/guides/ai-transport/langgraph-message-per-token + - /docs/guides/ai-transport/langgraph/langgraph-message-per-token + - /docs/guides/ai-transport/vercel-message-per-response + - /docs/guides/ai-transport/vercel-ai-sdk/vercel-message-per-response + - /docs/guides/ai-transport/vercel-message-per-token + - /docs/guides/ai-transport/vercel-ai-sdk/vercel-message-per-token --- -Token streaming in AI Transport delivers LLM responses progressively through an Ably channel. Tokens are published as they're generated and persist on the channel - clients that disconnect and reconnect receive the complete response without gaps. 
+LLMs generate responses progressively - token by token - and the best user experience is achieved by also delivering those tokens to clients progressively, minimising perceived response latency. This token-by-token delivery, as a response is still being generated, is referred to as token streaming.
 
-## How it works
+## How it works
 
-The server pipes an LLM response stream through the server transport. The transport's codec encodes each token as an Ably message append operation. Subscribing clients receive tokens in real time through their channel subscription. The channel accumulates tokens, so reconnecting clients get the assembled response, not individual deltas to replay.
+Although tokens need to be delivered individually when being consumed in real time, the reality is still that these are fragments of a response, not just discrete, independent messages. It must be possible to consume them as coherent responses when not in real time - for example when looking at history, refreshing a client, or returning to a conversation.
+
+A key feature of AI Transport's transport layer is that it understands this relationship between responses and their constituent tokens. By doing this, the service can support clients that resume an interrupted connection, or those that refresh, during a streamed response. AI Transport supports token streaming by enabling agents to form responses incrementally, making a stream of appends to the content. Each appended token can be received immediately by a subscriber if they are consuming in real time; but the durable session layer structures the conversation as responses, including completed responses and still-in-progress responses. 
On the server, a single call streams the entire response:

From e4f77d398206ff45b9476d1e9aa9a25f0f5ec9617 Mon Sep 17 00:00:00 2001
From: Paddy Byers
Date: Thu, 9 Apr 2026 09:13:49 +0100
Subject: [PATCH 4/6] Minor updates to agent presence concepts

---
 src/pages/docs/ai-transport/features/agent-presence.mdx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pages/docs/ai-transport/features/agent-presence.mdx b/src/pages/docs/ai-transport/features/agent-presence.mdx
index 5c06cbdea4..2a66edae50 100644
--- a/src/pages/docs/ai-transport/features/agent-presence.mdx
+++ b/src/pages/docs/ai-transport/features/agent-presence.mdx
@@ -4,7 +4,7 @@ meta_description: "Show agent status in your AI application with Ably Presence. 
 meta_keywords: "agent presence, AI status, presence API, agent state, AI Transport, Ably"
 ---
 
-Agent presence uses Ably's native [Presence](/docs/presence) API to show real-time agent status in your application. Display whether the agent is streaming, thinking, idle, or offline - across all connected clients.
+Agent presence provides a realtime view to other session participants so they can know which agents are active in a session. Agent presence uses Ably's native [Presence](/docs/presence) API to show real-time agent status in your application, and this could include a sole or orchestrator agent, or multiple sub-agents. Presence can convey whether the agent is streaming, thinking, idle, or offline - across all connected clients.