Skip to content

Commit 64a30d3

Browse files
authored
feat: use os assigned free port for tcp rpc (#4891)
1 parent 3324994 commit 64a30d3

File tree

5 files changed

+240
-89
lines changed

5 files changed

+240
-89
lines changed

docs/guides/request_plane.md

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -108,13 +108,15 @@ export DYN_REQUEST_PLANE=tcp
108108

109109
# Optional: Configure TCP server host and port
110110
export DYN_TCP_RPC_HOST=0.0.0.0 # Default host
111-
export DYN_TCP_RPC_PORT=9999 # Default port
111+
# export DYN_TCP_RPC_PORT=9999 # Optional: specify a fixed port
112112

113113
# Run your Dynamo service
114114
DYN_REQUEST_PLANE=tcp python -m dynamo.frontend --http-port=8000 &
115115
DYN_REQUEST_PLANE=tcp python -m dynamo.vllm --model Qwen/Qwen3-0.6B
116116
```
117117

118+
**Note:** By default, the TCP server binds to port 0, which tells the OS to assign a free port. This avoids port conflicts, for example when multiple services run on the same machine. If you need a specific port (e.g., for firewall rules), set `DYN_TCP_RPC_PORT` explicitly.
119+
118120
**When to use TCP:**
119121
- Simple deployments with direct service-to-service communication (e.g. frontend to backend)
120122
- Minimal infrastructure requirements (no NATS needed)
@@ -124,7 +126,7 @@ DYN_REQUEST_PLANE=tcp python -m dynamo.vllm --model Qwen/Qwen3-0.6B
124126

125127
Additional TCP-specific environment variables:
126128
- `DYN_TCP_RPC_HOST`: Server host address (default: auto-detected)
127-
- `DYN_TCP_RPC_PORT`: Server port (default: 9999)
129+
- `DYN_TCP_RPC_PORT`: Server port. If not set, the OS assigns a free port automatically (recommended for most deployments). Set explicitly only if you need a specific port for firewall rules.
128130
- `DYN_TCP_MAX_MESSAGE_SIZE`: Maximum message size for TCP client (default: 32MB)
129131
- `DYN_TCP_REQUEST_TIMEOUT`: Request timeout for TCP client (default: 10 seconds)
130132
- `DYN_TCP_POOL_SIZE`: Connection pool size for TCP client (default: 50)
@@ -228,7 +230,7 @@ Request plane configuration is loaded from environment variables at startup and
228230

229231
1. Stop your Dynamo services
230232
2. Set environment variable `DYN_REQUEST_PLANE=tcp`
231-
3. Optionally configure TCP-specific settings (`DYN_TCP_RPC_PORT`, etc.)
233+
3. Optionally configure TCP-specific settings (e.g., `DYN_TCP_RPC_HOST`). Note: `DYN_TCP_RPC_PORT` is optional; if not set, an OS-assigned free port is used automatically.
232234
4. Restart your services
233235

234236

@@ -279,7 +281,7 @@ curl http://localhost:8000/v1/chat/completions \
279281
**Symptoms:** Server fails to start due to "address already in use"
280282

281283
**Solutions:**
282-
- TCP default port: 9999 (adjust environment variable `DYN_TCP_RPC_PORT`)
284+
- TCP: By default, TCP uses an OS-assigned free port, so port conflicts should be rare. If you explicitly set `DYN_TCP_RPC_PORT` to a specific port and get conflicts, either change the port or remove the setting to use automatic port assignment.
283285
- HTTP default port: 8888 (adjust environment variable `DYN_HTTP_RPC_PORT`)
284286

285287
## Performance Considerations

lib/llm/src/discovery/model_manager.rs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -330,12 +330,11 @@ impl ModelManager {
330330
// Register router via discovery mechanism
331331
let discovery = endpoint.component().drt().discovery();
332332
let instance_id = discovery.instance_id();
333-
let request_plane_mode = endpoint.drt().request_plane();
334333

335334
// Build transport for router endpoint based on request plane mode
336335
// Use KV_ROUTER_COMPONENT as the component name to distinguish from the generate endpoint's component
337336
let router_endpoint_id = router_endpoint_id(endpoint.id().namespace);
338-
let transport = build_transport_type(request_plane_mode, &router_endpoint_id, instance_id);
337+
let transport = build_transport_type(endpoint, &router_endpoint_id, instance_id).await?;
339338

340339
let discovery_spec = DiscoverySpec::Endpoint {
341340
namespace: router_endpoint_id.namespace.clone(),

lib/runtime/src/component/endpoint.rs

Lines changed: 67 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -89,30 +89,6 @@ impl EndpointConfigBuilder {
8989
let request_plane_mode = endpoint.drt().request_plane();
9090
tracing::info!("Endpoint starting with request plane mode: {request_plane_mode}",);
9191

92-
// Register health check target in SystemHealth if provided
93-
if let Some(health_check_payload) = &health_check_payload {
94-
// Build transport based on request plane mode
95-
let transport = build_transport_type(request_plane_mode, &endpoint_id, connection_id);
96-
97-
let instance = Instance {
98-
component: endpoint_id.component.clone(),
99-
endpoint: endpoint_id.name.clone(),
100-
namespace: endpoint_id.namespace.clone(),
101-
instance_id: connection_id,
102-
transport,
103-
};
104-
tracing::debug!(endpoint_name = %endpoint.name, "Registering endpoint health check target");
105-
let guard = system_health.lock();
106-
guard.register_health_check_target(
107-
&endpoint.name,
108-
instance,
109-
health_check_payload.clone(),
110-
);
111-
if let Some(notifier) = guard.get_endpoint_health_check_notifier(&endpoint.name) {
112-
handler.set_endpoint_health_check_notifier(notifier)?;
113-
}
114-
}
115-
11692
// Register with graceful shutdown tracker if needed
11793
if graceful_shutdown {
11894
tracing::debug!(
@@ -137,9 +113,33 @@ impl EndpointConfigBuilder {
137113
let component_name_for_task = endpoint_id.component.clone();
138114
let endpoint_name_for_task = endpoint_id.name.clone();
139115

140-
// Get the unified request plane server (works for all transport types)
116+
// Get the unified request plane server
141117
let server = endpoint.drt().request_plane_server().await?;
142118

119+
// Register health check target in SystemHealth if provided
120+
if let Some(health_check_payload) = &health_check_payload {
121+
// Build transport based on request plane mode
122+
let transport = build_transport_type(&endpoint, &endpoint_id, connection_id).await?;
123+
124+
let instance = Instance {
125+
component: endpoint_id.component.clone(),
126+
endpoint: endpoint_id.name.clone(),
127+
namespace: endpoint_id.namespace.clone(),
128+
instance_id: connection_id,
129+
transport,
130+
};
131+
tracing::debug!(endpoint_name = %endpoint.name, "Registering endpoint health check target");
132+
let guard = system_health.lock();
133+
guard.register_health_check_target(
134+
&endpoint.name,
135+
instance,
136+
health_check_payload.clone(),
137+
);
138+
if let Some(notifier) = guard.get_endpoint_health_check_notifier(&endpoint.name) {
139+
handler.set_endpoint_health_check_notifier(notifier)?;
140+
}
141+
}
142+
143143
tracing::info!(
144144
endpoint = %endpoint_name_for_task,
145145
transport = server.transport_name(),
@@ -198,7 +198,7 @@ impl EndpointConfigBuilder {
198198
let discovery = endpoint.drt().discovery();
199199

200200
// Build transport for discovery service based on request plane mode
201-
let transport = build_transport_type(request_plane_mode, &endpoint_id, connection_id);
201+
let transport = build_transport_type(&endpoint, &endpoint_id, connection_id).await?;
202202

203203
let discovery_spec = crate::discovery::DiscoverySpec::Endpoint {
204204
namespace: endpoint_id.namespace.clone(),
@@ -232,11 +232,14 @@ impl EndpointConfigBuilder {
232232
/// - HTTP: Uses full URL path including endpoint name (e.g., http://host:port/v1/rpc/endpoint_name)
233233
/// - TCP: Includes endpoint name for routing (e.g., host:port/endpoint_name)
234234
/// - NATS: Uses subject-based addressing (unique per endpoint)
235-
pub fn build_transport_type(
235+
///
236+
/// # Errors
237+
/// Returns an error if TCP mode is used but the TCP server hasn't been started yet.
238+
fn build_transport_type_inner(
236239
mode: RequestPlaneMode,
237240
endpoint_id: &EndpointId,
238241
connection_id: u64,
239-
) -> TransportType {
242+
) -> Result<TransportType> {
240243
match mode {
241244
RequestPlaneMode::Http => {
242245
let http_host = crate::utils::get_http_rpc_host_from_env();
@@ -252,23 +255,54 @@ pub fn build_transport_type(
252255
endpoint_id.name
253256
);
254257

255-
TransportType::Http(http_endpoint)
258+
Ok(TransportType::Http(http_endpoint))
256259
}
257260
RequestPlaneMode::Tcp => {
258261
let tcp_host = crate::utils::get_tcp_rpc_host_from_env();
262+
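// If a fixed port is explicitly configured, it takes precedence. NOTE(review): `unwrap_or` evaluates its argument eagerly, so the bound-port lookup below (and its `?`) still runs when a fixed port is set; use `unwrap_or_else`/`match` to truly remove the init ordering dependency.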
263+
// Otherwise, use the actual bound port (set by TCP server after binding when port 0 is used).
259264
let tcp_port = std::env::var("DYN_TCP_RPC_PORT")
260265
.ok()
261266
.and_then(|p| p.parse::<u16>().ok())
262-
.unwrap_or(9999);
267+
.unwrap_or(crate::pipeline::network::manager::get_actual_tcp_rpc_port()?);
263268

264269
// Include endpoint name for proper TCP routing
265270
// TCP client parses this format and adds x-endpoint-path header for server-side routing
266271
let tcp_endpoint = format!("{}:{}/{}", tcp_host, tcp_port, endpoint_id.name);
267272

268-
TransportType::Tcp(tcp_endpoint)
273+
Ok(TransportType::Tcp(tcp_endpoint))
269274
}
270-
RequestPlaneMode::Nats => {
271-
TransportType::Nats(nats::instance_subject(endpoint_id, connection_id))
275+
RequestPlaneMode::Nats => Ok(TransportType::Nats(nats::instance_subject(
276+
endpoint_id,
277+
connection_id,
278+
))),
279+
}
280+
}
281+
282+
/// Build transport type, ensuring TCP server is initialized when needed.
283+
///
284+
/// In TCP mode with an OS-assigned port (`DYN_TCP_RPC_PORT` unset or invalid), the server must bind
285+
/// before we can construct a correct transport address. This helper ensures that initialization
286+
/// occurs, then delegates to the internal builder.
287+
pub async fn build_transport_type(
288+
endpoint: &Endpoint,
289+
endpoint_id: &EndpointId,
290+
connection_id: u64,
291+
) -> Result<TransportType> {
292+
let mode = endpoint.drt().request_plane();
293+
294+
if mode == RequestPlaneMode::Tcp {
295+
// Only force server init when we *don't* have a valid explicit port.
296+
let has_fixed_port = std::env::var("DYN_TCP_RPC_PORT")
297+
.ok()
298+
.and_then(|p| p.parse::<u16>().ok())
299+
.is_some();
300+
301+
if !has_fixed_port {
302+
// Ensure request plane server is initialized before building transport.
303+
let _ = endpoint.drt().request_plane_server().await?;
272304
}
273305
}
306+
307+
build_transport_type_inner(mode, endpoint_id, connection_id)
274308
}

lib/runtime/src/pipeline/network/ingress/shared_tcp_endpoint.rs

Lines changed: 90 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ use crate::pipeline::network::PushWorkHandler;
1111
use anyhow::Result;
1212
use bytes::Bytes;
1313
use dashmap::DashMap;
14-
use parking_lot::Mutex;
14+
use parking_lot::{Mutex, RwLock};
1515
use std::net::SocketAddr;
1616
use std::sync::Arc;
1717
use std::sync::atomic::{AtomicU64, Ordering};
@@ -36,7 +36,10 @@ fn get_max_message_size() -> usize {
3636
/// Shared TCP server that handles multiple endpoints on a single port
3737
pub struct SharedTcpServer {
3838
handlers: Arc<DashMap<String, Arc<EndpointHandler>>>,
39+
/// The address to bind to (may have port 0 for OS-assigned port)
3940
bind_addr: SocketAddr,
41+
/// The actual bound address (populated after bind_and_start, contains actual port)
42+
actual_addr: RwLock<Option<SocketAddr>>,
4043
cancellation_token: CancellationToken,
4144
}
4245

@@ -55,11 +58,83 @@ impl SharedTcpServer {
5558
pub fn new(bind_addr: SocketAddr, cancellation_token: CancellationToken) -> Arc<Self> {
5659
Arc::new(Self {
5760
handlers: Arc::new(DashMap::new()),
61+
// address we requested to bind to.
5862
bind_addr,
63+
// actual address after free port assignment (if DYN_TCP_RPC_PORT is not specified)
64+
actual_addr: RwLock::new(None),
5965
cancellation_token,
6066
})
6167
}
6268

69+
/// Bind the server and start accepting connections.
70+
///
71+
/// This method binds to the configured address first, then starts the accept loop.
72+
/// If the configured port is 0, the OS will assign a free port.
73+
/// The actual bound address is stored and can be retrieved via `actual_address()`.
74+
///
75+
/// Returns the actual bound address (useful when port 0 was specified).
76+
pub async fn bind_and_start(self: Arc<Self>) -> Result<SocketAddr> {
77+
tracing::info!("Binding TCP server to {}", self.bind_addr);
78+
79+
let listener = TcpListener::bind(&self.bind_addr).await?;
80+
let actual_addr = listener.local_addr()?;
81+
82+
tracing::info!(
83+
requested = %self.bind_addr,
84+
actual = %actual_addr,
85+
"TCP server bound successfully"
86+
);
87+
88+
// Store the actual bound address
89+
*self.actual_addr.write() = Some(actual_addr);
90+
91+
// Start accepting connections in a background task
92+
let server = self.clone();
93+
tokio::spawn(async move {
94+
server.accept_loop(listener).await;
95+
});
96+
97+
Ok(actual_addr)
98+
}
99+
100+
/// Get the actual bound address (after bind_and_start has been called).
101+
///
102+
/// Returns None if the server hasn't been started yet.
103+
pub fn actual_address(&self) -> Option<SocketAddr> {
104+
*self.actual_addr.read()
105+
}
106+
107+
/// Internal accept loop - runs after binding
108+
async fn accept_loop(self: Arc<Self>, listener: TcpListener) {
109+
let cancellation_token = self.cancellation_token.clone();
110+
111+
loop {
112+
tokio::select! {
113+
accept_result = listener.accept() => {
114+
match accept_result {
115+
Ok((stream, peer_addr)) => {
116+
tracing::trace!("Accepted TCP connection from {}", peer_addr);
117+
118+
let handlers = self.handlers.clone();
119+
tokio::spawn(async move {
120+
if let Err(e) = Self::handle_connection(stream, handlers).await {
121+
tracing::error!("TCP connection error: {}", e);
122+
}
123+
});
124+
}
125+
Err(e) => {
126+
tracing::error!("Failed to accept TCP connection: {}", e);
127+
}
128+
}
129+
}
130+
_ = cancellation_token.cancelled() => {
131+
tracing::info!("SharedTcpServer received cancellation signal, shutting down");
132+
return;
133+
}
134+
}
135+
}
136+
}
137+
63138
#[allow(clippy::too_many_arguments)]
64139
pub async fn register_endpoint(
65140
&self,
@@ -93,7 +168,7 @@ impl SharedTcpServer {
93168
tracing::info!(
94169
"Registered endpoint '{}' with shared TCP server on {}",
95170
endpoint_name,
96-
self.bind_addr
171+
self.actual_address().unwrap_or(self.bind_addr)
97172
);
98173

99174
Ok(())
@@ -129,37 +204,16 @@ impl SharedTcpServer {
129204
}
130205
}
131206

207+
/// Start the server (legacy method - prefer bind_and_start for new code).
208+
///
209+
/// This method is kept for backwards compatibility. It binds and starts
210+
/// the server but doesn't return the actual bound address.
132211
pub async fn start(self: Arc<Self>) -> Result<()> {
133-
tracing::info!("Starting shared TCP server on {}", self.bind_addr);
134-
135-
let listener = TcpListener::bind(&self.bind_addr).await?;
136-
let cancellation_token = self.cancellation_token.clone();
137-
138-
loop {
139-
tokio::select! {
140-
accept_result = listener.accept() => {
141-
match accept_result {
142-
Ok((stream, peer_addr)) => {
143-
tracing::trace!("Accepted TCP connection from {}", peer_addr);
144-
145-
let handlers = self.handlers.clone();
146-
tokio::spawn(async move {
147-
if let Err(e) = Self::handle_connection(stream, handlers).await {
148-
tracing::debug!("TCP connection error: {}", e);
149-
}
150-
});
151-
}
152-
Err(e) => {
153-
tracing::error!("Failed to accept TCP connection: {}", e);
154-
}
155-
}
156-
}
157-
_ = cancellation_token.cancelled() => {
158-
tracing::info!("SharedTcpServer received cancellation signal, shutting down");
159-
return Ok(());
160-
}
161-
}
162-
}
212+
let cancel_token = self.cancellation_token.clone();
213+
self.bind_and_start().await?;
214+
// Wait for cancellation (the accept loop runs in background)
215+
cancel_token.cancelled().await;
216+
Ok(())
163217
}
164218

165219
async fn handle_connection(
@@ -378,7 +432,10 @@ impl super::unified_server::RequestPlaneServer for SharedTcpServer {
378432
}
379433

380434
fn address(&self) -> String {
381-
format!("tcp://{}:{}", self.bind_addr.ip(), self.bind_addr.port())
435+
// Return actual bound address if available (after bind_and_start),
436+
// otherwise fall back to configured bind address
437+
let addr = self.actual_address().unwrap_or(self.bind_addr);
438+
format!("tcp://{}:{}", addr.ip(), addr.port())
382439
}
383440

384441
fn transport_name(&self) -> &'static str {

0 commit comments

Comments
 (0)