diff --git a/crates/pulsing-actor/src/cluster/member.rs b/crates/pulsing-actor/src/cluster/member.rs index cb41a39e2..ecdb9d0ab 100644 --- a/crates/pulsing-actor/src/cluster/member.rs +++ b/crates/pulsing-actor/src/cluster/member.rs @@ -383,467 +383,3 @@ impl NamedActorInfo { self.instance_nodes.iter().choose(&mut rng).cloned() } } - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_member_supersedes() { - let node_id = NodeId::generate(); - let addr: SocketAddr = "127.0.0.1:8000".parse().unwrap(); - - let mut m1 = MemberInfo::new(node_id, addr, addr); - let mut m2 = MemberInfo::new(node_id, addr, addr); - - // Same incarnation, same status - neither supersedes - assert!(!m1.supersedes(&m2)); - assert!(!m2.supersedes(&m1)); - - // Suspect supersedes Alive at same incarnation - m1.suspect(); - assert!(m1.supersedes(&m2)); - assert!(!m2.supersedes(&m1)); - - // Higher incarnation always wins - m2.incarnation = 1; - assert!(!m1.supersedes(&m2)); - assert!(m2.supersedes(&m1)); - } - - #[test] - fn test_member_refute() { - let node_id = NodeId::generate(); - let addr: SocketAddr = "127.0.0.1:8000".parse().unwrap(); - - let mut member = MemberInfo::new(node_id, addr, addr); - member.suspect(); - assert_eq!(member.status, MemberStatus::Suspect); - - member.refute(); - assert_eq!(member.status, MemberStatus::Alive); - assert_eq!(member.incarnation, 1); - } - - #[test] - fn test_node_status() { - assert!(NodeStatus::Online.is_online()); - assert!(!NodeStatus::PFail.is_online()); - assert!(!NodeStatus::Fail.is_online()); - assert!(!NodeStatus::Handshake.is_online()); - - assert!(!NodeStatus::Online.is_failed()); - assert!(NodeStatus::PFail.is_failed()); - assert!(NodeStatus::Fail.is_failed()); - assert!(!NodeStatus::Handshake.is_failed()); - } - - #[test] - fn test_member_status() { - assert!(MemberStatus::Alive.is_alive()); - assert!(!MemberStatus::Suspect.is_alive()); - assert!(!MemberStatus::Dead.is_alive()); - assert!(!MemberStatus::Leaving.is_alive()); - 
- assert!(MemberStatus::Alive.is_reachable()); - assert!(MemberStatus::Suspect.is_reachable()); - assert!(!MemberStatus::Dead.is_reachable()); - assert!(!MemberStatus::Leaving.is_reachable()); - } - - #[test] - fn test_cluster_node_new() { - let node_id = NodeId::generate(); - let addr: SocketAddr = "127.0.0.1:8000".parse().unwrap(); - - let node = ClusterNode::new(node_id, addr, 1); - assert_eq!(node.node_id, node_id); - assert_eq!(node.addr, addr); - assert_eq!(node.status, NodeStatus::Online); - assert_eq!(node.epoch, 1); - assert!(node.last_seen > 0); - } - - #[test] - fn test_cluster_node_supersedes() { - let node_id = NodeId::generate(); - let addr: SocketAddr = "127.0.0.1:8000".parse().unwrap(); - - let mut n1 = ClusterNode::new(node_id, addr, 1); - let mut n2 = ClusterNode::new(node_id, addr, 1); - - // Same epoch, same status - neither supersedes - assert!(!n1.supersedes(&n2)); - assert!(!n2.supersedes(&n1)); - - // Higher epoch wins - n2.epoch = 2; - assert!(n2.supersedes(&n1)); - assert!(!n1.supersedes(&n2)); - - // Same epoch, higher status wins - n1.epoch = 2; - n1.status = NodeStatus::Fail; - assert!(n1.supersedes(&n2)); - } - - #[test] - fn test_member_info_mark_dead() { - let node_id = NodeId::generate(); - let addr: SocketAddr = "127.0.0.1:8000".parse().unwrap(); - - let mut member = MemberInfo::new(node_id, addr, addr); - assert_eq!(member.status, MemberStatus::Alive); - - member.mark_dead(); - assert_eq!(member.status, MemberStatus::Dead); - } - - #[test] - fn test_member_info_suspect_from_alive() { - let node_id = NodeId::generate(); - let addr: SocketAddr = "127.0.0.1:8000".parse().unwrap(); - - let mut member = MemberInfo::new(node_id, addr, addr); - assert_eq!(member.status, MemberStatus::Alive); - - member.suspect(); - assert_eq!(member.status, MemberStatus::Suspect); - } - - #[test] - fn test_member_info_suspect_already_suspect() { - let node_id = NodeId::generate(); - let addr: SocketAddr = "127.0.0.1:8000".parse().unwrap(); - - let mut 
member = MemberInfo::new(node_id, addr, addr); - member.suspect(); - member.suspect(); // Should not change - assert_eq!(member.status, MemberStatus::Suspect); - } - - #[test] - fn test_member_info_equality() { - let node_id = NodeId::generate(); - let addr1: SocketAddr = "127.0.0.1:8000".parse().unwrap(); - let addr2: SocketAddr = "127.0.0.1:9000".parse().unwrap(); - - let m1 = MemberInfo::new(node_id, addr1, addr1); - let m2 = MemberInfo::new(node_id, addr2, addr2); - - // Equality is based on node_id only - assert_eq!(m1, m2); - } - - #[test] - fn test_member_info_hash() { - use std::collections::hash_map::DefaultHasher; - use std::hash::{Hash, Hasher}; - - let node_id = NodeId::generate(); - let addr1: SocketAddr = "127.0.0.1:8000".parse().unwrap(); - let addr2: SocketAddr = "127.0.0.1:9000".parse().unwrap(); - - let m1 = MemberInfo::new(node_id, addr1, addr1); - let m2 = MemberInfo::new(node_id, addr2, addr2); - - let mut hasher1 = DefaultHasher::new(); - let mut hasher2 = DefaultHasher::new(); - m1.hash(&mut hasher1); - m2.hash(&mut hasher2); - - // Same node_id should have same hash - assert_eq!(hasher1.finish(), hasher2.finish()); - } - - #[test] - fn test_actor_location() { - let actor_id = ActorId::local(1); - let node_id = NodeId::generate(); - - let location = ActorLocation::new(actor_id, node_id); - assert_eq!(location.actor_id, actor_id); - assert_eq!(location.node_id, node_id); - assert_eq!(location.version, 0); - } - - #[test] - fn test_named_actor_info_new() { - let path = ActorPath::new("services/llm").unwrap(); - let info = NamedActorInfo::new(path.clone()); - - assert_eq!(info.path, path); - assert!(info.instances.is_empty()); - assert!(info.is_empty()); - assert_eq!(info.version, 0); - } - - #[test] - fn test_named_actor_info_with_instance() { - let path = ActorPath::new("services/llm").unwrap(); - let node_id = NodeId::generate(); - - let info = NamedActorInfo::with_instance(path.clone(), node_id); - - assert_eq!(info.path, path); - 
assert_eq!(info.instance_count(), 1); - assert!(!info.is_empty()); - assert_eq!(info.version, 1); - assert!(info.instance_nodes.contains(&node_id)); - } - - #[test] - fn test_named_actor_info_add_instance() { - let path = ActorPath::new("services/llm").unwrap(); - let node1 = NodeId::generate(); - let node2 = NodeId::generate(); - - let mut info = NamedActorInfo::new(path); - info.add_instance(node1); - info.add_instance(node2); - - assert_eq!(info.instance_count(), 2); - assert_eq!(info.version, 2); - } - - #[test] - fn test_named_actor_info_add_duplicate_instance() { - let path = ActorPath::new("services/llm").unwrap(); - let node_id = NodeId::generate(); - - let mut info = NamedActorInfo::new(path); - info.add_instance(node_id); - info.add_instance(node_id); // Duplicate - - assert_eq!(info.instance_count(), 1); - assert_eq!(info.version, 1); // Version not incremented for duplicate - } - - #[test] - fn test_named_actor_info_remove_instance() { - let path = ActorPath::new("services/llm").unwrap(); - let node1 = NodeId::generate(); - let node2 = NodeId::generate(); - - let mut info = NamedActorInfo::new(path); - info.add_instance(node1); - info.add_instance(node2); - - assert!(info.remove_instance(&node1)); - assert_eq!(info.instance_count(), 1); - - assert!(!info.remove_instance(&node1)); // Already removed - } - - #[test] - fn test_named_actor_info_merge() { - let path = ActorPath::new("services/llm").unwrap(); - let node1 = NodeId::generate(); - let node2 = NodeId::generate(); - let node3 = NodeId::generate(); - - let mut info1 = NamedActorInfo::with_instance(path.clone(), node1); - info1.add_instance(node2); - - let mut info2 = NamedActorInfo::with_instance(path.clone(), node2); - info2.add_instance(node3); - - info1.merge(&info2); - - assert_eq!(info1.instance_count(), 3); - assert!(info1.instance_nodes.contains(&node1)); - assert!(info1.instance_nodes.contains(&node2)); - assert!(info1.instance_nodes.contains(&node3)); - } - - #[test] - fn 
test_named_actor_info_select_instance() { - let path = ActorPath::new("services/llm").unwrap(); - let node_id = NodeId::generate(); - - let info = NamedActorInfo::with_instance(path, node_id); - - // Should return the only instance - let selected = info.select_instance(); - assert_eq!(selected, Some(node_id)); - } - - #[test] - fn test_named_actor_info_select_instance_empty() { - let path = ActorPath::new("services/llm").unwrap(); - let info = NamedActorInfo::new(path); - - assert!(info.select_instance().is_none()); - } - - #[test] - fn test_failure_info() { - let node_id = NodeId::generate(); - let reporter_id = NodeId::generate(); - - let failure = FailureInfo { - node_id, - status: NodeStatus::PFail, - epoch: 5, - reported_by: reporter_id, - }; - - assert_eq!(failure.node_id, node_id); - assert_eq!(failure.status, NodeStatus::PFail); - assert_eq!(failure.epoch, 5); - assert_eq!(failure.reported_by, reporter_id); - } - - #[test] - fn test_member_supersedes_dead() { - let node_id = NodeId::generate(); - let addr: SocketAddr = "127.0.0.1:8000".parse().unwrap(); - - let alive = MemberInfo::new(node_id, addr, addr); - let mut dead = MemberInfo::new(node_id, addr, addr); - dead.mark_dead(); - - // Dead supersedes Alive at same incarnation - assert!(dead.supersedes(&alive)); - assert!(!alive.supersedes(&dead)); - } - - // ======================================================================== - // NamedActorInstance Tests - // ======================================================================== - - #[test] - fn test_named_actor_instance_new() { - let node_id = NodeId::generate(); - let actor_id = ActorId::local(42); - - let instance = NamedActorInstance::new(node_id, actor_id); - - assert_eq!(instance.node_id, node_id); - assert_eq!(instance.actor_id, actor_id); - assert!(instance.metadata.is_empty()); - } - - #[test] - fn test_named_actor_instance_with_metadata() { - let node_id = NodeId::generate(); - let actor_id = ActorId::local(42); - let mut metadata = 
HashMap::new(); - metadata.insert("class".to_string(), "Counter".to_string()); - metadata.insert("module".to_string(), "__main__".to_string()); - metadata.insert("file".to_string(), "/app/main.py".to_string()); - - let instance = NamedActorInstance::with_metadata(node_id, actor_id, metadata.clone()); - - assert_eq!(instance.node_id, node_id); - assert_eq!(instance.actor_id, actor_id); - assert_eq!(instance.metadata.get("class"), Some(&"Counter".to_string())); - assert_eq!( - instance.metadata.get("module"), - Some(&"__main__".to_string()) - ); - assert_eq!( - instance.metadata.get("file"), - Some(&"/app/main.py".to_string()) - ); - } - - #[test] - fn test_named_actor_info_with_full_instance() { - let path = ActorPath::new("actors/counter").unwrap(); - let node_id = NodeId::generate(); - let actor_id = ActorId::local(42); - let mut metadata = HashMap::new(); - metadata.insert("class".to_string(), "Counter".to_string()); - - let instance = NamedActorInstance::with_metadata(node_id, actor_id, metadata); - let info = NamedActorInfo::with_full_instance(path.clone(), instance); - - assert_eq!(info.path, path); - assert_eq!(info.instance_count(), 1); - assert!(info.instance_nodes.contains(&node_id)); - assert!(info.instances.contains_key(&node_id)); - - let retrieved = info.get_instance(&node_id).unwrap(); - assert_eq!(retrieved.actor_id, actor_id); - assert_eq!( - retrieved.metadata.get("class"), - Some(&"Counter".to_string()) - ); - } - - #[test] - fn test_named_actor_info_add_full_instance() { - let path = ActorPath::new("actors/counter").unwrap(); - let node1 = NodeId::generate(); - let node2 = NodeId::generate(); - let actor_id1 = ActorId::local(1); - let actor_id2 = ActorId::local(2); - - let mut info = NamedActorInfo::new(path); - - let instance1 = NamedActorInstance::new(node1, actor_id1); - info.add_full_instance(instance1); - assert_eq!(info.instance_count(), 1); - - let instance2 = NamedActorInstance::new(node2, actor_id2); - info.add_full_instance(instance2); 
- assert_eq!(info.instance_count(), 2); - - assert!(info.get_instance(&node1).is_some()); - assert!(info.get_instance(&node2).is_some()); - assert_eq!(info.get_instance(&node1).unwrap().actor_id, actor_id1); - assert_eq!(info.get_instance(&node2).unwrap().actor_id, actor_id2); - } - - #[test] - fn test_named_actor_info_get_instance_not_found() { - let path = ActorPath::new("actors/counter").unwrap(); - let node_id = NodeId::generate(); - - let info = NamedActorInfo::new(path); - - assert!(info.get_instance(&node_id).is_none()); - } - - #[test] - fn test_named_actor_info_merge_with_full_instances() { - let path = ActorPath::new("actors/counter").unwrap(); - let node1 = NodeId::generate(); - let node2 = NodeId::generate(); - let actor_id1 = ActorId::local(1); - let actor_id2 = ActorId::local(2); - - let mut metadata1 = HashMap::new(); - metadata1.insert("class".to_string(), "Counter".to_string()); - let instance1 = NamedActorInstance::with_metadata(node1, actor_id1, metadata1); - let mut info1 = NamedActorInfo::with_full_instance(path.clone(), instance1); - - let mut metadata2 = HashMap::new(); - metadata2.insert("class".to_string(), "Counter".to_string()); - let instance2 = NamedActorInstance::with_metadata(node2, actor_id2, metadata2); - let info2 = NamedActorInfo::with_full_instance(path.clone(), instance2); - - info1.merge(&info2); - - assert_eq!(info1.instance_count(), 2); - assert!(info1.get_instance(&node1).is_some()); - assert!(info1.get_instance(&node2).is_some()); - } - - #[test] - fn test_named_actor_info_node_ids_iterator() { - let path = ActorPath::new("actors/counter").unwrap(); - let node1 = NodeId::generate(); - let node2 = NodeId::generate(); - - let mut info = NamedActorInfo::new(path); - info.add_instance(node1); - info.add_instance(node2); - - let node_ids: Vec<_> = info.node_ids().collect(); - assert_eq!(node_ids.len(), 2); - assert!(node_ids.contains(&&node1)); - assert!(node_ids.contains(&&node2)); - } -} diff --git 
a/crates/pulsing-actor/src/cluster/mod.rs b/crates/pulsing-actor/src/cluster/mod.rs index df84b0810..367b9af61 100644 --- a/crates/pulsing-actor/src/cluster/mod.rs +++ b/crates/pulsing-actor/src/cluster/mod.rs @@ -10,5 +10,8 @@ mod member; pub mod swim; pub use gossip::{GossipCluster, GossipConfig, GossipMessage}; -pub use member::{ActorLocation, MemberInfo, MemberStatus, NamedActorInfo, NamedActorInstance}; +pub use member::{ + ActorLocation, ClusterNode, FailureInfo, MemberInfo, MemberStatus, NamedActorInfo, + NamedActorInstance, NodeStatus, +}; pub use swim::{SwimConfig, SwimDetector, SwimMessage}; diff --git a/crates/pulsing-actor/src/lib.rs b/crates/pulsing-actor/src/lib.rs index 37e07a810..35bd05067 100644 --- a/crates/pulsing-actor/src/lib.rs +++ b/crates/pulsing-actor/src/lib.rs @@ -132,9 +132,7 @@ pub mod watch; pub mod prelude { pub use crate::actor::{Actor, ActorContext, ActorRef, Message}; pub use crate::supervision::{BackoffStrategy, RestartPolicy, SupervisionSpec}; - pub use crate::system::{ - ActorSystem, LoadBalanceStrategy, ResolveOptions, SpawnOptions, SystemConfig, - }; + pub use crate::system::{ActorSystem, ResolveOptions, SpawnOptions, SystemConfig}; pub use async_trait::async_trait; pub use serde::{Deserialize, Serialize}; } diff --git a/crates/pulsing-actor/src/system.rs b/crates/pulsing-actor/src/system.rs deleted file mode 100644 index a65c687e0..000000000 --- a/crates/pulsing-actor/src/system.rs +++ /dev/null @@ -1,1516 +0,0 @@ -//! 
Actor System - the main entry point for creating and managing actors - -use crate::actor::{ - Actor, ActorAddress, ActorContext, ActorId, ActorPath, ActorRef, ActorSystemRef, Envelope, - Mailbox, Message, NodeId, StopReason, DEFAULT_MAILBOX_SIZE, -}; -use crate::cluster::{ - GossipCluster, GossipConfig, GossipMessage, MemberInfo, MemberStatus, NamedActorInfo, - NamedActorInstance, -}; -use crate::metrics::{metrics, SystemMetrics as PrometheusMetrics}; -use crate::supervision::SupervisionSpec; -use crate::system_actor::{ - BoxedActorFactory, SystemActor, SystemRef, SYSTEM_ACTOR_LOCAL_NAME, SYSTEM_ACTOR_PATH, -}; -use crate::transport::{Http2Config, Http2RemoteTransport, Http2ServerHandler, Http2Transport}; -use crate::watch::ActorLifecycle; -use dashmap::DashMap; -use std::collections::HashMap; -use std::net::SocketAddr; -use std::sync::atomic::{AtomicU64, Ordering}; -use std::sync::Arc; -use tokio::sync::{mpsc, RwLock}; -use tokio::task::JoinHandle; -use tokio_util::sync::CancellationToken; - -/// Actor runtime statistics -#[derive(Debug, Default)] -pub struct ActorStats { - /// Number of times the actor started - pub start_count: AtomicU64, - /// Number of times the actor stopped - pub stop_count: AtomicU64, - /// Number of messages processed - pub message_count: AtomicU64, -} - -impl ActorStats { - fn inc_stop(&self) { - self.stop_count.fetch_add(1, Ordering::Relaxed); - } - - fn inc_message(&self) { - self.message_count.fetch_add(1, Ordering::Relaxed); - } - - fn to_json(&self) -> serde_json::Value { - serde_json::json!({ - "start_count": self.start_count.load(Ordering::Relaxed), - "stop_count": self.stop_count.load(Ordering::Relaxed), - "message_count": self.message_count.load(Ordering::Relaxed), - }) - } -} - -/// Actor System configuration -#[derive(Clone, Debug)] -pub struct SystemConfig { - /// HTTP/2 address for all communication (actors + gossip) - pub addr: SocketAddr, - - /// Seed nodes to join (HTTP/2 addresses) - pub seed_nodes: Vec, - - /// Gossip 
configuration - pub gossip_config: GossipConfig, - - /// HTTP/2 transport configuration - pub http2_config: Http2Config, - - /// Default mailbox capacity for all actors - pub default_mailbox_capacity: usize, -} - -impl Default for SystemConfig { - fn default() -> Self { - Self { - addr: "0.0.0.0:0".parse().unwrap(), - seed_nodes: Vec::new(), - gossip_config: GossipConfig::default(), - http2_config: Http2Config::default(), - default_mailbox_capacity: DEFAULT_MAILBOX_SIZE, - } - } -} - -impl SystemConfig { - /// Create config for a standalone node (no cluster) - pub fn standalone() -> Self { - Self::default() - } - - /// Create config with specific address - pub fn with_addr(addr: SocketAddr) -> Self { - Self { - addr, - ..Default::default() - } - } - - /// Add seed nodes for cluster joining - pub fn with_seeds(mut self, seeds: Vec) -> Self { - self.seed_nodes = seeds; - self - } - - /// Set default mailbox capacity - pub fn with_mailbox_capacity(mut self, capacity: usize) -> Self { - self.default_mailbox_capacity = capacity; - self - } - - /// Enable TLS with passphrase-derived certificates - /// - /// All nodes using the same passphrase will be able to communicate securely. - /// The passphrase is used to derive a shared CA certificate, enabling - /// automatic mutual TLS authentication. 
- #[cfg(feature = "tls")] - pub fn with_tls(mut self, passphrase: &str) -> anyhow::Result { - self.http2_config = self.http2_config.with_tls(passphrase)?; - Ok(self) - } - - /// Check if TLS is enabled - pub fn is_tls_enabled(&self) -> bool { - self.http2_config.is_tls_enabled() - } -} - -/// Options for spawning an actor -#[derive(Default, Clone, Debug)] -pub struct SpawnOptions { - /// Override mailbox capacity (None = use system default) - pub mailbox_capacity: Option, - /// Whether this actor is public (can be resolved by name across cluster) - pub public: bool, - /// Supervision specification (restart policy) - pub supervision: SupervisionSpec, - /// Actor metadata (e.g., Python class, module, file path) - pub metadata: HashMap, -} - -impl SpawnOptions { - /// Create new spawn options with defaults - pub fn new() -> Self { - Self::default() - } - - /// Set mailbox capacity override - pub fn mailbox_capacity(mut self, capacity: usize) -> Self { - self.mailbox_capacity = Some(capacity); - self - } - - /// Set whether actor is public - pub fn public(mut self, public: bool) -> Self { - self.public = public; - self - } - - /// Set supervision specification - pub fn supervision(mut self, supervision: SupervisionSpec) -> Self { - self.supervision = supervision; - self - } - - /// Set actor metadata - pub fn metadata(mut self, metadata: HashMap) -> Self { - self.metadata = metadata; - self - } -} - -/// Load balance strategy for resolving named actors with multiple instances -#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)] -pub enum LoadBalanceStrategy { - /// Pick the first available instance (original behavior) - First, - /// Round-robin across instances - #[default] - RoundRobin, - /// Random selection - Random, - /// Prefer local instance if available, fallback to round-robin - PreferLocal, -} - -/// Options for resolving named actors -#[derive(Clone, Debug, Default)] -pub struct ResolveOptions { - /// Target node ID (if specified, skip load balancing) - pub 
node_id: Option, - /// Load balance strategy (default: RoundRobin) - pub strategy: LoadBalanceStrategy, - /// Only select Alive nodes (default: true) - pub filter_alive: bool, -} - -impl ResolveOptions { - /// Create new resolve options with defaults - pub fn new() -> Self { - Self { - filter_alive: true, - ..Default::default() - } - } - - /// Set target node ID (bypasses load balancing) - pub fn node_id(mut self, node_id: NodeId) -> Self { - self.node_id = Some(node_id); - self - } - - /// Set load balance strategy - pub fn strategy(mut self, strategy: LoadBalanceStrategy) -> Self { - self.strategy = strategy; - self - } - - /// Set whether to filter only alive nodes - pub fn filter_alive(mut self, filter: bool) -> Self { - self.filter_alive = filter; - self - } -} - -/// Local actor handle -struct LocalActorHandle { - /// Sender to the actor's mailbox - sender: mpsc::Sender, - - /// Actor task handle - join_handle: JoinHandle<()>, - - /// Runtime statistics - stats: Arc, - - /// Static metadata provided by the actor - metadata: HashMap, - - /// Named actor path (if this is a named actor) - named_path: Option, - - /// Full actor ID - actor_id: ActorId, -} - -/// The Actor System - manages actors and cluster membership -pub struct ActorSystem { - /// Local node ID - node_id: NodeId, - - /// HTTP/2 address - addr: SocketAddr, - - /// Default mailbox capacity for actors - default_mailbox_capacity: usize, - - /// Local actors (actor_name -> handle) - local_actors: Arc>, - - /// Named actor path to local actor name mapping (path_string -> actor_name) - named_actor_paths: Arc>, - - /// Gossip cluster (for discovery) - cluster: Arc>>>, - - /// HTTP/2 transport - transport: Arc, - - /// Cancellation token - cancel_token: CancellationToken, - - /// Actor lifecycle manager (watch, termination handling) - lifecycle: Arc, - - /// Actor ID counter (for generating unique local IDs) - actor_id_counter: AtomicU64, - - /// Round-robin counter for load balancing (per actor path) - 
lb_counters: DashMap, -} - -impl ActorSystem { - /// Create a new actor system - pub async fn new(config: SystemConfig) -> anyhow::Result> { - let cancel_token = CancellationToken::new(); - let node_id = NodeId::generate(); - let local_actors: Arc> = Arc::new(DashMap::new()); - let named_actor_paths: Arc> = Arc::new(DashMap::new()); - let cluster_holder: Arc>>> = Arc::new(RwLock::new(None)); - let lifecycle = Arc::new(ActorLifecycle::new()); - - // Create message handler (needs cluster reference for gossip) - let handler = SystemMessageHandler { - node_id, - local_actors: local_actors.clone(), - named_actor_paths: named_actor_paths.clone(), - cluster: cluster_holder.clone(), - }; - - // Create HTTP/2 transport - let (transport, actual_addr) = Http2Transport::new( - config.addr, - Arc::new(handler), - config.http2_config, - cancel_token.clone(), - ) - .await?; - - // Create gossip cluster - let cluster = GossipCluster::new( - node_id, - actual_addr, - transport.clone(), - config.gossip_config, - ); - - let cluster = Arc::new(cluster); - { - let mut holder = cluster_holder.write().await; - *holder = Some(cluster.clone()); - } - - // Start cluster gossip - cluster.start(cancel_token.clone()); - - // Join cluster if seed nodes provided - if !config.seed_nodes.is_empty() { - cluster.join(config.seed_nodes).await?; - } - - let system = Arc::new(Self { - node_id, - addr: actual_addr, - default_mailbox_capacity: config.default_mailbox_capacity, - local_actors: local_actors.clone(), - named_actor_paths: named_actor_paths.clone(), - cluster: cluster_holder, - transport, - cancel_token: cancel_token.clone(), - lifecycle, - actor_id_counter: AtomicU64::new(1), - lb_counters: DashMap::new(), - }); - - // Start the builtin SystemActor with path "system" - system - .start_system_actor(local_actors, named_actor_paths) - .await?; - - tracing::info!( - node_id = %node_id, - addr = %actual_addr, - "Actor system started" - ); - - Ok(system) - } - - /// Start the builtin SystemActor - 
async fn start_system_actor( - self: &Arc, - local_actors: Arc>, - named_actor_paths: Arc>, - ) -> anyhow::Result<()> { - // Create SystemRef for SystemActor - let system_ref = Arc::new(SystemRef { - node_id: self.node_id, - addr: self.addr, - local_actors: local_actors - .iter() - .map(|e| (e.key().clone(), e.sender.clone())) - .collect::>() - .into(), - named_actor_paths, - }); - - // Create SystemActor with default factory - let system_actor = SystemActor::with_default_factory(system_ref); - - // Spawn as named actor with path "system" - let path = ActorPath::new(SYSTEM_ACTOR_PATH)?; - self.spawn_named(path, SYSTEM_ACTOR_LOCAL_NAME, system_actor) - .await?; - - tracing::debug!(path = SYSTEM_ACTOR_PATH, "SystemActor started"); - Ok(()) - } - - /// Start SystemActor with custom factory (for Python extension) - pub async fn start_system_actor_with_factory( - self: &Arc, - factory: BoxedActorFactory, - ) -> anyhow::Result<()> { - // Check if already started - if self.local_actors.contains_key(SYSTEM_ACTOR_LOCAL_NAME) { - return Err(anyhow::anyhow!("SystemActor already started")); - } - - // Create SystemRef - let system_ref = Arc::new(SystemRef { - node_id: self.node_id, - addr: self.addr, - local_actors: Arc::new(DashMap::new()), // Will be updated - named_actor_paths: self.named_actor_paths.clone(), - }); - - // Create SystemActor with custom factory - let system_actor = SystemActor::new(system_ref, factory); - - // Spawn as named actor - let path = ActorPath::new(SYSTEM_ACTOR_PATH)?; - self.spawn_named(path, SYSTEM_ACTOR_LOCAL_NAME, system_actor) - .await?; - - tracing::debug!( - path = SYSTEM_ACTOR_PATH, - "SystemActor started with custom factory" - ); - Ok(()) - } - - /// Get SystemActor reference - pub async fn system(&self) -> anyhow::Result { - self.resolve_named(&ActorPath::new(SYSTEM_ACTOR_PATH)?, None) - .await - } - - /// Get node ID - pub fn node_id(&self) -> &NodeId { - &self.node_id - } - - /// Get local address - pub fn addr(&self) -> SocketAddr { - 
self.addr - } - - /// Get list of local actor names - pub fn local_actor_names(&self) -> Vec { - self.local_actors.iter().map(|e| e.key().clone()).collect() - } - - /// Generate a new unique local actor ID - fn next_actor_id(&self) -> ActorId { - let local_id = self.actor_id_counter.fetch_add(1, Ordering::Relaxed); - ActorId::new(self.node_id, local_id) - } - - /// Spawn an actor with a local name (uses system default mailbox capacity) - pub async fn spawn(&self, name: impl AsRef, actor: A) -> anyhow::Result - where - A: Actor, - { - self.spawn_with_options(name, actor, SpawnOptions::default()) - .await - } - - /// Spawn an actor with custom options - pub async fn spawn_with_options( - &self, - name: impl AsRef, - actor: A, - options: SpawnOptions, - ) -> anyhow::Result - where - A: Actor, - { - // Wrap actor in a factory that only works once - let mut actor_opt = Some(actor); - let factory = move || { - actor_opt - .take() - .ok_or_else(|| anyhow::anyhow!("Actor cannot be restarted (spawned as instance)")) - }; - - self.spawn_factory(name, factory, options).await - } - - /// Spawn an actor using a factory function (enables supervision restarts) - pub async fn spawn_factory( - &self, - name: impl AsRef, - factory: F, - options: SpawnOptions, - ) -> anyhow::Result - where - F: FnMut() -> anyhow::Result + Send + 'static, - A: Actor, - { - let name = name.as_ref(); - - // Check for duplicate - if self.local_actors.contains_key(name) { - return Err(anyhow::anyhow!("Actor already exists: {}", name)); - } - - let actor_id = self.next_actor_id(); - - // Use configured mailbox capacity - let capacity = options - .mailbox_capacity - .unwrap_or(self.default_mailbox_capacity); - let mailbox = Mailbox::with_capacity(capacity); - let (sender, receiver) = mailbox.split(); - - let stats = Arc::new(ActorStats::default()); - // We can't get metadata from factory without creating an instance, - // so we start with empty metadata. It could be updated later if we wanted. 
- let metadata = HashMap::new(); - - // Create context - let ctx = ActorContext::new(actor_id); - - // Spawn actor loop - let stats_clone = stats.clone(); - let cancel = self.cancel_token.clone(); - let actor_id_for_log = actor_id; - let supervision = options.supervision.clone(); - - let join_handle = tokio::spawn(async move { - let reason = - run_supervision_loop(factory, receiver, ctx, cancel, stats_clone, supervision) - .await; - tracing::debug!(actor_id = ?actor_id_for_log, reason = ?reason, "Actor stopped"); - }); - - // Register actor - let handle = LocalActorHandle { - sender: sender.clone(), - join_handle, - stats: stats.clone(), - metadata, - named_path: None, - actor_id, - }; - - self.local_actors.insert(name.to_string(), handle); - - // Create ActorRef - Ok(ActorRef::local(actor_id, sender)) - } - - /// Spawn a named actor (publicly accessible via named path) - pub async fn spawn_named( - &self, - path: ActorPath, - local_name: impl AsRef, - actor: A, - ) -> anyhow::Result - where - A: Actor, - { - self.spawn_named_with_options(path, local_name, actor, SpawnOptions::default()) - .await - } - - /// Spawn a named actor with custom options - pub async fn spawn_named_with_options( - &self, - path: ActorPath, - local_name: impl AsRef, - actor: A, - options: SpawnOptions, - ) -> anyhow::Result - where - A: Actor, - { - // Wrap actor in a factory that only works once - let mut actor_opt = Some(actor); - let factory = move || { - actor_opt - .take() - .ok_or_else(|| anyhow::anyhow!("Actor cannot be restarted (spawned as instance)")) - }; - - self.spawn_named_factory(path, local_name, factory, options) - .await - } - - /// Spawn a named actor using a factory function - pub async fn spawn_named_factory( - &self, - path: ActorPath, - local_name: impl AsRef, - factory: F, - options: SpawnOptions, - ) -> anyhow::Result - where - F: FnMut() -> anyhow::Result + Send + 'static, - A: Actor, - { - let local_name = local_name.as_ref(); - - // Check for duplicate local name 
- if self.local_actors.contains_key(local_name) { - return Err(anyhow::anyhow!("Actor already exists: {}", local_name)); - } - - // Check for duplicate named path - if self - .named_actor_paths - .contains_key(&path.as_str().to_string()) - { - return Err(anyhow::anyhow!( - "Named path already registered: {}", - path.as_str() - )); - } - - let actor_id = self.next_actor_id(); - - // Use configured mailbox capacity - let capacity = options - .mailbox_capacity - .unwrap_or(self.default_mailbox_capacity); - let mailbox = Mailbox::with_capacity(capacity); - let (sender, receiver) = mailbox.split(); - - let stats = Arc::new(ActorStats::default()); - let metadata = options.metadata.clone(); - - // Create context - let ctx = ActorContext::new(actor_id); - - // Spawn actor loop - let stats_clone = stats.clone(); - let cancel = self.cancel_token.clone(); - let actor_id_for_log = actor_id; - let supervision = options.supervision.clone(); - - let join_handle = tokio::spawn(async move { - let reason = - run_supervision_loop(factory, receiver, ctx, cancel, stats_clone, supervision) - .await; - tracing::debug!(actor_id = ?actor_id_for_log, reason = ?reason, "Actor stopped"); - }); - - // Register actor - let handle = LocalActorHandle { - sender: sender.clone(), - join_handle, - stats: stats.clone(), - metadata: metadata.clone(), - named_path: Some(path.clone()), - actor_id, - }; - - self.local_actors.insert(local_name.to_string(), handle); - self.named_actor_paths - .insert(path.as_str().to_string(), local_name.to_string()); - - // Register in cluster with full details - if let Some(cluster) = self.cluster.read().await.as_ref() { - if metadata.is_empty() { - cluster.register_named_actor(path.clone()).await; - } else { - cluster - .register_named_actor_full(path.clone(), actor_id, metadata) - .await; - } - } - - // Create ActorRef - Ok(ActorRef::local(actor_id, sender)) - } - - /// Get ActorRef for a local or remote actor by ID - pub async fn actor_ref(&self, id: &ActorId) -> 
anyhow::Result { - // Check if local - if id.node() == self.node_id || id.node().is_local() { - // Find local actor by iterating (since we don't have name in ActorId anymore) - // When id.node().is_local(), compare only local_id since node_id=0 means "current node" - let target_local_id = id.local_id(); - for entry in self.local_actors.iter() { - let entry_local_id = entry.value().actor_id.local_id(); - if entry_local_id == target_local_id { - return Ok(ActorRef::local( - entry.value().actor_id, - entry.value().sender.clone(), - )); - } - } - return Err(anyhow::anyhow!("Local actor not found: {}", id)); - } - - // Remote actor - get address from cluster - let cluster_guard = self.cluster.read().await; - let cluster = cluster_guard - .as_ref() - .ok_or_else(|| anyhow::anyhow!("Cluster not initialized"))?; - - let member = cluster - .get_member(&id.node()) - .await - .ok_or_else(|| anyhow::anyhow!("Node not found in cluster: {}", id.node()))?; - - // Create remote transport using actor id - let transport = Http2RemoteTransport::new_by_id(self.transport.client(), member.addr, *id); - - Ok(ActorRef::remote(*id, member.addr, Arc::new(transport))) - } - - /// Resolve a named actor and get an ActorRef (uses default load balancing: RoundRobin) - pub async fn resolve_named( - &self, - path: &ActorPath, - node_id: Option<&NodeId>, - ) -> anyhow::Result { - let options = if let Some(nid) = node_id { - ResolveOptions::new().node_id(*nid) - } else { - ResolveOptions::new() - }; - self.resolve_named_with_options(path, options).await - } - - /// Resolve a named actor with custom options (load balancing, health filtering) - pub async fn resolve_named_with_options( - &self, - path: &ActorPath, - options: ResolveOptions, - ) -> anyhow::Result { - let cluster_guard = self.cluster.read().await; - let cluster = cluster_guard - .as_ref() - .ok_or_else(|| anyhow::anyhow!("Cluster not initialized"))?; - - let instances = cluster.get_named_actor_instances(path).await; - - if 
instances.is_empty() { - return Err(anyhow::anyhow!("Named actor not found: {}", path.as_str())); - } - - // Health filtering: only select Alive nodes - let healthy_instances: Vec<_> = if options.filter_alive { - instances - .into_iter() - .filter(|i| i.status == MemberStatus::Alive) - .collect() - } else { - instances - }; - - if healthy_instances.is_empty() { - return Err(anyhow::anyhow!( - "No healthy instances for named actor: {}", - path.as_str() - )); - } - - // Select target instance - let target = if let Some(nid) = options.node_id { - // If node_id specified, find that specific instance - healthy_instances - .iter() - .find(|i| i.node_id == nid) - .ok_or_else(|| anyhow::anyhow!("Actor instance not found on node: {}", nid))? - } else { - // Use load balancing strategy - self.select_instance(path, &healthy_instances, options.strategy) - }; - - // If local, get local ref - if target.node_id == self.node_id { - let actor_name = self - .named_actor_paths - .get(&path.as_str()) - .ok_or_else(|| anyhow::anyhow!("Named actor not found locally"))? 
- .clone(); - - let handle = self - .local_actors - .get(&actor_name) - .ok_or_else(|| anyhow::anyhow!("Actor not found: {}", actor_name))?; - - return Ok(ActorRef::local(handle.actor_id, handle.sender.clone())); - } - - // Remote actor - let transport = - Http2RemoteTransport::new_named(self.transport.client(), target.addr, path.clone()); - - // For remote named actors, we use a placeholder ActorId since we don't know the actual ID - // The transport will use the path for routing - let actor_id = ActorId::new(target.node_id, 0); - Ok(ActorRef::remote(actor_id, target.addr, Arc::new(transport))) - } - - /// Select an instance based on load balancing strategy - fn select_instance<'a>( - &self, - path: &ActorPath, - instances: &'a [MemberInfo], - strategy: LoadBalanceStrategy, - ) -> &'a MemberInfo { - match strategy { - LoadBalanceStrategy::First => &instances[0], - - LoadBalanceStrategy::RoundRobin => { - let path_key = path.as_str().to_string(); - let counter = self - .lb_counters - .entry(path_key) - .or_insert_with(|| AtomicU64::new(0)); - let idx = counter.fetch_add(1, Ordering::Relaxed) as usize % instances.len(); - &instances[idx] - } - - LoadBalanceStrategy::Random => { - use std::collections::hash_map::DefaultHasher; - use std::hash::{Hash, Hasher}; - use std::time::SystemTime; - - // Simple random using time-based seed - let mut hasher = DefaultHasher::new(); - SystemTime::now() - .duration_since(SystemTime::UNIX_EPOCH) - .unwrap_or_default() - .as_nanos() - .hash(&mut hasher); - std::thread::current().id().hash(&mut hasher); - let idx = hasher.finish() as usize % instances.len(); - &instances[idx] - } - - LoadBalanceStrategy::PreferLocal => { - // Try to find local instance first - if let Some(local) = instances.iter().find(|i| i.node_id == self.node_id) { - local - } else { - // Fallback to round-robin - self.select_instance(path, instances, LoadBalanceStrategy::RoundRobin) - } - } - } - } - - /// Resolve an actor address and get an ActorRef - /// - /// 
This is a general resolution method that handles both Named and Global addresses. - pub async fn resolve(&self, address: &ActorAddress) -> anyhow::Result { - match address { - ActorAddress::Named { path, instance } => { - self.resolve_named(path, instance.as_ref()).await - } - ActorAddress::Global { node_id, actor_id } => { - let id = ActorId::new(*node_id, *actor_id); - self.actor_ref(&id).await - } - } - } - - /// Get all instances of a named actor across the cluster - pub async fn get_named_instances(&self, path: &ActorPath) -> Vec { - let cluster_guard = self.cluster.read().await; - if let Some(cluster) = cluster_guard.as_ref() { - cluster.get_named_actor_instances(path).await - } else { - Vec::new() - } - } - - /// Get detailed instances with actor_id and metadata - pub async fn get_named_instances_detailed( - &self, - path: &ActorPath, - ) -> Vec<(MemberInfo, Option)> { - let cluster_guard = self.cluster.read().await; - if let Some(cluster) = cluster_guard.as_ref() { - cluster.get_named_actor_instances_detailed(path).await - } else { - Vec::new() - } - } - - /// Lookup named actor information - pub async fn lookup_named(&self, path: &ActorPath) -> Option { - let cluster_guard = self.cluster.read().await; - if let Some(cluster) = cluster_guard.as_ref() { - cluster.lookup_named_actor(path).await - } else { - None - } - } - - /// Get cluster member information - pub async fn members(&self) -> Vec { - let cluster_guard = self.cluster.read().await; - if let Some(cluster) = cluster_guard.as_ref() { - cluster.all_members().await - } else { - Vec::new() - } - } - - /// Get all named actors in the cluster - pub async fn all_named_actors(&self) -> Vec { - let cluster_guard = self.cluster.read().await; - if let Some(cluster) = cluster_guard.as_ref() { - cluster.all_named_actors().await - } else { - Vec::new() - } - } - - /// Stop an actor - pub async fn stop(&self, name: impl AsRef) -> anyhow::Result<()> { - self.stop_with_reason(name, StopReason::Killed).await - } - - 
/// Stop an actor with a specific reason - pub async fn stop_with_reason( - &self, - name: impl AsRef, - reason: StopReason, - ) -> anyhow::Result<()> { - let name = name.as_ref(); - - if let Some((_, handle)) = self.local_actors.remove(name) { - handle.join_handle.abort(); - - let local_actors = self.local_actors.clone(); - self.lifecycle - .handle_termination( - &handle.actor_id, - name, - handle.named_path.clone(), - reason, - &self.named_actor_paths, - &self.cluster, - |n| local_actors.get(n).map(|h| h.sender.clone()), - ) - .await; - } - - Ok(()) - } - - /// Stop a named actor by path - pub async fn stop_named(&self, path: &ActorPath) -> anyhow::Result<()> { - self.stop_named_with_reason(path, StopReason::Killed).await - } - - /// Stop a named actor by path with a specific reason - pub async fn stop_named_with_reason( - &self, - path: &ActorPath, - reason: StopReason, - ) -> anyhow::Result<()> { - let path_key = path.as_str(); - - // Find the local actor name for this path - if let Some(actor_name_ref) = self.named_actor_paths.get(&path_key) { - let actor_name = actor_name_ref.clone(); - drop(actor_name_ref); - - if let Some((_, handle)) = self.local_actors.remove(&actor_name) { - handle.join_handle.abort(); - - let local_actors = self.local_actors.clone(); - self.lifecycle - .handle_termination( - &handle.actor_id, - &actor_name, - Some(path.clone()), - reason, - &self.named_actor_paths, - &self.cluster, - |name| local_actors.get(name).map(|h| h.sender.clone()), - ) - .await; - } - } - - Ok(()) - } - - /// Shutdown the entire actor system - pub async fn shutdown(&self) -> anyhow::Result<()> { - tracing::info!("Shutting down actor system"); - - // Signal cancellation - self.cancel_token.cancel(); - - // Leave cluster gracefully - { - let cluster_guard = self.cluster.read().await; - if let Some(cluster) = cluster_guard.as_ref() { - cluster.leave().await?; - } - } - - // Stop all actors - for entry in self.local_actors.iter() { - entry.join_handle.abort(); - } - 
self.local_actors.clear(); - - Ok(()) - } - - /// Get cancellation token - pub fn cancel_token(&self) -> CancellationToken { - self.cancel_token.clone() - } -} - -#[async_trait::async_trait] -impl ActorSystemRef for ActorSystem { - async fn actor_ref(&self, id: &ActorId) -> anyhow::Result { - ActorSystem::actor_ref(self, id).await - } - - fn node_id(&self) -> NodeId { - self.node_id - } - - async fn watch(&self, watcher: &ActorId, target: &ActorId) -> anyhow::Result<()> { - // Only support local watching for now - if target.node() != self.node_id { - return Err(anyhow::anyhow!( - "Cannot watch remote actor: {} (watching remote actors not yet supported)", - target - )); - } - - // Use string representation of ActorId for watching - let watcher_key = watcher.to_string(); - let target_key = target.to_string(); - self.lifecycle.watch(&watcher_key, &target_key).await; - Ok(()) - } - - async fn unwatch(&self, watcher: &ActorId, target: &ActorId) -> anyhow::Result<()> { - let watcher_key = watcher.to_string(); - let target_key = target.to_string(); - self.lifecycle.unwatch(&watcher_key, &target_key).await; - Ok(()) - } -} - -/// Actor instance loop - runs a single instance of an actor -async fn run_actor_instance( - mut actor: A, - receiver: &mut mpsc::Receiver, - ctx: &mut ActorContext, - cancel: CancellationToken, - stats: Arc, -) -> StopReason { - // Call on_start - if let Err(e) = actor.on_start(ctx).await { - tracing::error!(actor_id = ?ctx.id(), error = %e, "Actor start error"); - stats.inc_stop(); - return StopReason::Failed(e.to_string()); - } - - let stop_reason = loop { - tokio::select! 
{ - msg = receiver.recv() => { - match msg { - Some(envelope) => { - stats.inc_message(); - let (message, responder) = envelope.into_parts(); - - match actor.receive(message, ctx).await { - Ok(response) => { - responder.send(Ok(response)); - } - Err(e) => { - tracing::error!(actor_id = ?ctx.id(), error = %e, "Actor error"); - responder.send(Err(anyhow::anyhow!("Handler error: {}", e))); - // Actor crashes on error - supervision will decide whether to restart - return StopReason::Failed(e.to_string()); - } - } - } - None => { - // Mailbox closed (all senders dropped) - break StopReason::Normal; - } - } - } - _ = cancel.cancelled() => { - break StopReason::SystemShutdown; - } - } - }; - - // Cleanup - stats.inc_stop(); - if let Err(e) = actor.on_stop(ctx).await { - tracing::warn!(actor_id = ?ctx.id(), error = %e, "Actor stop error"); - // If on_stop fails, mark as failed - if matches!(stop_reason, StopReason::Normal) { - return StopReason::Failed(e.to_string()); - } - } - - stop_reason -} - -/// Supervision loop - manages actor restarts -async fn run_supervision_loop( - mut factory: F, - mut receiver: mpsc::Receiver, - mut ctx: ActorContext, - cancel: CancellationToken, - stats: Arc, - spec: SupervisionSpec, -) -> StopReason -where - F: FnMut() -> anyhow::Result + Send + 'static, - A: Actor, -{ - let mut restarts = 0; - // Track restarts for windowing if needed (timestamp of restart) - let mut restart_timestamps: Vec = Vec::new(); - - loop { - // Create actor instance - let actor = match factory() { - Ok(a) => a, - Err(e) => { - tracing::error!(actor_id = ?ctx.id(), error = %e, "Failed to create actor instance"); - return StopReason::Failed(format!("Factory error: {}", e)); - } - }; - - // Run actor instance - let reason = run_actor_instance( - actor, - &mut receiver, - &mut ctx, - cancel.clone(), - stats.clone(), - ) - .await; - - // Check if we should restart - let is_failure = matches!(reason, StopReason::Failed(_)); - if !spec.policy.should_restart(is_failure) { 
- return reason; - } - - if matches!(reason, StopReason::SystemShutdown | StopReason::Killed) { - return reason; - } - - // Check max restarts - restarts += 1; - - // Prune old timestamps if window is set - if let Some(window) = spec.restart_window { - let now = std::time::Instant::now(); - restart_timestamps.push(now); - restart_timestamps.retain(|&t| now.duration_since(t) <= window); - - if restart_timestamps.len() as u32 > spec.max_restarts { - tracing::error!(actor_id = ?ctx.id(), "Max restarts ({}) exceeded within window {:?}", spec.max_restarts, window); - return reason; - } - } else { - // Absolute count - if restarts > spec.max_restarts { - tracing::error!(actor_id = ?ctx.id(), "Max restarts ({}) exceeded", spec.max_restarts); - return reason; - } - } - - tracing::info!( - actor_id = ?ctx.id(), - reason = ?reason, - restarts = restarts, - "Restarting actor..." - ); - - // Backoff - let backoff = spec.backoff.duration(restarts - 1); - if !backoff.is_zero() { - tokio::time::sleep(backoff).await; - } - } -} - -/// Unified message handler for HTTP/2 transport -struct SystemMessageHandler { - node_id: NodeId, - local_actors: Arc>, - named_actor_paths: Arc>, - cluster: Arc>>>, -} - -impl SystemMessageHandler { - /// Dispatch a message to an actor (ask pattern) - async fn dispatch_message(&self, path: &str, msg: Message) -> anyhow::Result { - // Check if path is /actors/{name} or /named/{path} - if let Some(actor_name) = path.strip_prefix("/actors/") { - self.send_to_local_actor(actor_name, msg).await - } else if let Some(named_path) = path.strip_prefix("/named/") { - self.send_to_named_actor(named_path, msg).await - } else { - Err(anyhow::anyhow!("Invalid path: {}", path)) - } - } - - /// Dispatch a fire-and-forget message - async fn dispatch_tell(&self, path: &str, msg: Message) -> anyhow::Result<()> { - // Check if path is /actors/{name} or /named/{path} - if let Some(actor_name) = path.strip_prefix("/actors/") { - self.tell_local_actor(actor_name, msg).await - 
} else if let Some(named_path) = path.strip_prefix("/named/") { - self.tell_named_actor(named_path, msg).await - } else { - Err(anyhow::anyhow!("Invalid path: {}", path)) - } - } - - async fn send_to_local_actor(&self, actor_name: &str, msg: Message) -> anyhow::Result { - // Find actor sender - first try by name, then by local_id - let sender = if let Some(handle) = self.local_actors.get(actor_name) { - handle.sender.clone() - } else if let Ok(local_id) = actor_name.parse::() { - // Try to find by local_id - self.local_actors - .iter() - .find(|entry| entry.value().actor_id.local_id() == local_id) - .map(|entry| entry.value().sender.clone()) - .ok_or_else(|| anyhow::anyhow!("Actor not found: {}", actor_name))? - } else { - return Err(anyhow::anyhow!("Actor not found: {}", actor_name)); - }; - - let (tx, rx) = tokio::sync::oneshot::channel(); - let envelope = Envelope::ask(msg, tx); - - sender - .send(envelope) - .await - .map_err(|_| anyhow::anyhow!("Actor mailbox closed"))?; - - rx.await.map_err(|_| anyhow::anyhow!("Actor dropped"))? - } - - async fn tell_local_actor(&self, actor_name: &str, msg: Message) -> anyhow::Result<()> { - // Find actor sender - first try by name, then by local_id - let sender = if let Some(handle) = self.local_actors.get(actor_name) { - handle.sender.clone() - } else if let Ok(local_id) = actor_name.parse::() { - // Try to find by local_id - self.local_actors - .iter() - .find(|entry| entry.value().actor_id.local_id() == local_id) - .map(|entry| entry.value().sender.clone()) - .ok_or_else(|| anyhow::anyhow!("Actor not found: {}", actor_name))? 
- } else { - return Err(anyhow::anyhow!("Actor not found: {}", actor_name)); - }; - - let envelope = Envelope::tell(msg); - - sender - .send(envelope) - .await - .map_err(|_| anyhow::anyhow!("Actor mailbox closed"))?; - - Ok(()) - } - - async fn send_to_named_actor(&self, path: &str, msg: Message) -> anyhow::Result { - let actor_name = self - .named_actor_paths - .get(path) - .ok_or_else(|| anyhow::anyhow!("Named actor not found: {}", path))? - .clone(); - - self.send_to_local_actor(&actor_name, msg).await - } - - async fn tell_named_actor(&self, path: &str, msg: Message) -> anyhow::Result<()> { - let actor_name = self - .named_actor_paths - .get(path) - .ok_or_else(|| anyhow::anyhow!("Named actor not found: {}", path))? - .clone(); - - self.tell_local_actor(&actor_name, msg).await - } -} - -#[async_trait::async_trait] -impl Http2ServerHandler for SystemMessageHandler { - /// Unified message handler - accepts Message (Single or Stream), returns Message - /// - /// This handler supports both single and streaming requests: - /// - Single requests are dispatched to local actors - /// - Streaming requests are passed through to actors that support streaming - async fn handle_message_full(&self, path: &str, msg: Message) -> anyhow::Result { - self.dispatch_message(path, msg).await - } - - /// Simple message handler for backward compatibility - async fn handle_message_simple( - &self, - path: &str, - msg_type: &str, - payload: Vec, - ) -> anyhow::Result { - let msg = Message::single(msg_type, payload); - self.dispatch_message(path, msg).await - } - - async fn handle_tell( - &self, - path: &str, - msg_type: &str, - payload: Vec, - ) -> anyhow::Result<()> { - let msg = Message::single(msg_type, payload); - self.dispatch_tell(path, msg).await - } - - async fn handle_gossip( - &self, - payload: Vec, - peer_addr: SocketAddr, - ) -> anyhow::Result>> { - let cluster_guard = self.cluster.read().await; - if let Some(cluster) = cluster_guard.as_ref() { - let msg: GossipMessage = 
bincode::deserialize(&payload)?; - let response = cluster.handle_gossip(msg, peer_addr).await?; - if let Some(resp) = response { - Ok(Some(bincode::serialize(&resp)?)) - } else { - Ok(None) - } - } else { - Ok(None) - } - } - - async fn health_check(&self) -> serde_json::Value { - // Collect local actors info - let mut actors = Vec::new(); - for entry in self.local_actors.iter() { - let name = entry.key().clone(); - let handle = entry.value(); - - let mut actor_info = serde_json::json!({ - "name": name, - "stats": handle.stats.to_json(), - "metadata": handle.metadata, - }); - - if let Some(path) = &handle.named_path { - actor_info["named_path"] = serde_json::json!(path.as_str()); - } - - actors.push(actor_info); - } - - // Collect named actors info - let named_actors: Vec<_> = self - .named_actor_paths - .iter() - .map(|e| { - serde_json::json!({ - "path": e.key().clone(), - "actor_name": e.value().clone(), - }) - }) - .collect(); - - // Collect cluster info - let mut cluster_info = serde_json::json!(null); - let cluster_guard = self.cluster.read().await; - if let Some(cluster) = cluster_guard.as_ref() { - let members = cluster.alive_members().await; - let all_named = cluster.all_named_actors().await; - - cluster_info = serde_json::json!({ - "members_count": members.len(), - "members": members, - "named_actors_count": all_named.len(), - "named_actors": all_named.iter().map(|info| { - serde_json::json!({ - "path": info.path.as_str(), - "instance_count": info.instance_count(), - }) - }).collect::>(), - }); - } - - serde_json::json!({ - "node_id": self.node_id.to_string(), - "actors_count": actors.len(), - "actors": actors, - "named_actors": named_actors, - "cluster": cluster_info, - }) - } - - async fn prometheus_metrics(&self) -> String { - // Collect cluster member counts by status - let mut cluster_members = std::collections::HashMap::new(); - let cluster_guard = self.cluster.read().await; - if let Some(cluster) = cluster_guard.as_ref() { - let all_members = 
cluster.all_members().await; - for member in all_members { - let status = format!("{:?}", member.status); - *cluster_members.entry(status).or_insert(0usize) += 1; - } - } - drop(cluster_guard); - - // Count messages from local actors - let mut total_messages: u64 = 0; - for entry in self.local_actors.iter() { - total_messages += entry.value().stats.message_count.load(Ordering::Relaxed); - } - - // Build system metrics - let system_metrics = PrometheusMetrics { - node_id: self.node_id.0, - actors_count: self.local_actors.len(), - messages_total: total_messages, - actors_created: self.local_actors.len() as u64, // Approximation - actors_stopped: 0, // Would need separate tracking - cluster_members, - }; - - // Export using global metrics registry - metrics().export_prometheus(&system_metrics) - } - - async fn cluster_members(&self) -> serde_json::Value { - let cluster_guard = self.cluster.read().await; - if let Some(cluster) = cluster_guard.as_ref() { - let members = cluster.all_members().await; - let result: Vec<_> = members - .iter() - .map(|m| { - serde_json::json!({ - "node_id": m.node_id.to_string(), - "addr": m.addr.to_string(), - "status": format!("{:?}", m.status), - }) - }) - .collect(); - serde_json::json!(result) - } else { - // Single node mode - return empty (no cluster) - serde_json::json!([{ - "node_id": self.node_id.to_string(), - "status": "Alive", - }]) - } - } - - async fn actors_list(&self, include_internal: bool) -> serde_json::Value { - let cluster_guard = self.cluster.read().await; - let all_named = if let Some(cluster) = cluster_guard.as_ref() { - cluster.all_named_actors().await - } else { - Vec::new() - }; - drop(cluster_guard); - - // Build actors list with detailed info - let mut actors = Vec::new(); - for info in all_named { - let path_str = info.path.as_str(); - - // Skip system/core - if path_str == "system/core" { - continue; - } - - // Check if this actor is on this node - if !info.instance_nodes.contains(&self.node_id) { - continue; 
- } - - let name = path_str.strip_prefix("actors/").unwrap_or(&path_str); - - // Skip internal actors unless requested - if !include_internal && name.starts_with('_') { - continue; - } - - let actor_type = if name.starts_with('_') { - "system" - } else { - "user" - }; - - // Get detailed instance info if available - let mut actor_json = serde_json::json!({ - "name": name, - "type": actor_type, - }); - - if let Some(instance) = info.get_instance(&self.node_id) { - actor_json["actor_id"] = serde_json::json!(instance.actor_id.to_string()); - for (k, v) in &instance.metadata { - actor_json[k] = serde_json::json!(v); - } - } - - actors.push(actor_json); - } - - serde_json::json!(actors) - } -} diff --git a/crates/pulsing-actor/src/system/config.rs b/crates/pulsing-actor/src/system/config.rs new file mode 100644 index 000000000..19161d074 --- /dev/null +++ b/crates/pulsing-actor/src/system/config.rs @@ -0,0 +1,177 @@ +//! Configuration types for the Actor System + +use crate::actor::{NodeId, DEFAULT_MAILBOX_SIZE}; +use crate::cluster::GossipConfig; +use crate::policies::LoadBalancingPolicy; +use crate::supervision::SupervisionSpec; +use crate::transport::Http2Config; +use std::collections::HashMap; +use std::net::SocketAddr; +use std::sync::Arc; + +/// Actor System configuration +#[derive(Clone, Debug)] +pub struct SystemConfig { + /// HTTP/2 address for all communication (actors + gossip) + pub addr: SocketAddr, + + /// Seed nodes to join (HTTP/2 addresses) + pub seed_nodes: Vec, + + /// Gossip configuration + pub gossip_config: GossipConfig, + + /// HTTP/2 transport configuration + pub http2_config: Http2Config, + + /// Default mailbox capacity for all actors + pub default_mailbox_capacity: usize, +} + +impl Default for SystemConfig { + fn default() -> Self { + Self { + addr: "0.0.0.0:0".parse().unwrap(), + seed_nodes: Vec::new(), + gossip_config: GossipConfig::default(), + http2_config: Http2Config::default(), + default_mailbox_capacity: DEFAULT_MAILBOX_SIZE, + } + } 
+} + +impl SystemConfig { + /// Create config for a standalone node (no cluster) + pub fn standalone() -> Self { + Self::default() + } + + /// Create config with specific address + pub fn with_addr(addr: SocketAddr) -> Self { + Self { + addr, + ..Default::default() + } + } + + /// Add seed nodes for cluster joining + pub fn with_seeds(mut self, seeds: Vec) -> Self { + self.seed_nodes = seeds; + self + } + + /// Set default mailbox capacity + pub fn with_mailbox_capacity(mut self, capacity: usize) -> Self { + self.default_mailbox_capacity = capacity; + self + } + + /// Enable TLS with passphrase-derived certificates + /// + /// All nodes using the same passphrase will be able to communicate securely. + /// The passphrase is used to derive a shared CA certificate, enabling + /// automatic mutual TLS authentication. + #[cfg(feature = "tls")] + pub fn with_tls(mut self, passphrase: &str) -> anyhow::Result { + self.http2_config = self.http2_config.with_tls(passphrase)?; + Ok(self) + } + + /// Check if TLS is enabled + pub fn is_tls_enabled(&self) -> bool { + self.http2_config.is_tls_enabled() + } +} + +/// Options for spawning an actor +#[derive(Default, Clone, Debug)] +pub struct SpawnOptions { + /// Override mailbox capacity (None = use system default) + pub mailbox_capacity: Option, + /// Whether this actor is public (can be resolved by name across cluster) + pub public: bool, + /// Supervision specification (restart policy) + pub supervision: SupervisionSpec, + /// Actor metadata (e.g., Python class, module, file path) + pub metadata: HashMap, +} + +impl SpawnOptions { + /// Create new spawn options with defaults + pub fn new() -> Self { + Self::default() + } + + /// Set mailbox capacity override + pub fn mailbox_capacity(mut self, capacity: usize) -> Self { + self.mailbox_capacity = Some(capacity); + self + } + + /// Set whether actor is public + pub fn public(mut self, public: bool) -> Self { + self.public = public; + self + } + + /// Set supervision specification 
+ pub fn supervision(mut self, supervision: SupervisionSpec) -> Self { + self.supervision = supervision; + self + } + + /// Set actor metadata + pub fn metadata(mut self, metadata: HashMap) -> Self { + self.metadata = metadata; + self + } +} + +/// Options for resolving named actors +#[derive(Clone, Default)] +pub struct ResolveOptions { + /// Target node ID (if specified, skip load balancing) + pub node_id: Option, + /// Load balancing policy (None = use system default) + pub policy: Option>, + /// Only select Alive nodes (default: true) + pub filter_alive: bool, +} + +impl std::fmt::Debug for ResolveOptions { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("ResolveOptions") + .field("node_id", &self.node_id) + .field("policy", &self.policy.as_ref().map(|p| p.name())) + .field("filter_alive", &self.filter_alive) + .finish() + } +} + +impl ResolveOptions { + /// Create new resolve options with defaults + pub fn new() -> Self { + Self { + filter_alive: true, + ..Default::default() + } + } + + /// Set target node ID (bypasses load balancing) + pub fn node_id(mut self, node_id: NodeId) -> Self { + self.node_id = Some(node_id); + self + } + + /// Set load balance policy + pub fn policy(mut self, policy: Arc) -> Self { + self.policy = Some(policy); + self + } + + /// Set whether to filter only alive nodes + pub fn filter_alive(mut self, filter: bool) -> Self { + self.filter_alive = filter; + self + } +} diff --git a/crates/pulsing-actor/src/system/handle.rs b/crates/pulsing-actor/src/system/handle.rs new file mode 100644 index 000000000..55593c757 --- /dev/null +++ b/crates/pulsing-actor/src/system/handle.rs @@ -0,0 +1,61 @@ +//! 
Actor handle and statistics types + +use crate::actor::{ActorId, ActorPath, Envelope}; +use std::collections::HashMap; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::Arc; +use tokio::sync::mpsc; +use tokio::task::JoinHandle; + +/// Actor runtime statistics +#[derive(Debug, Default)] +pub struct ActorStats { + /// Number of times the actor started + pub start_count: AtomicU64, + /// Number of times the actor stopped + pub stop_count: AtomicU64, + /// Number of messages processed + pub message_count: AtomicU64, +} + +impl ActorStats { + /// Increment stop count + pub fn inc_stop(&self) { + self.stop_count.fetch_add(1, Ordering::Relaxed); + } + + /// Increment message count + pub fn inc_message(&self) { + self.message_count.fetch_add(1, Ordering::Relaxed); + } + + /// Convert to JSON representation + pub fn to_json(&self) -> serde_json::Value { + serde_json::json!({ + "start_count": self.start_count.load(Ordering::Relaxed), + "stop_count": self.stop_count.load(Ordering::Relaxed), + "message_count": self.message_count.load(Ordering::Relaxed), + }) + } +} + +/// Local actor handle - internal representation of a running actor +pub(crate) struct LocalActorHandle { + /// Sender to the actor's mailbox + pub sender: mpsc::Sender, + + /// Actor task handle + pub join_handle: JoinHandle<()>, + + /// Runtime statistics + pub stats: Arc, + + /// Static metadata provided by the actor + pub metadata: HashMap, + + /// Named actor path (if this is a named actor) + pub named_path: Option, + + /// Full actor ID + pub actor_id: ActorId, +} diff --git a/crates/pulsing-actor/src/system/handler.rs b/crates/pulsing-actor/src/system/handler.rs new file mode 100644 index 000000000..71039b52a --- /dev/null +++ b/crates/pulsing-actor/src/system/handler.rs @@ -0,0 +1,347 @@ +//! 
HTTP/2 message handler for the actor system + +use super::handle::LocalActorHandle; +use crate::actor::{Envelope, Message, NodeId}; +use crate::cluster::{GossipCluster, GossipMessage}; +use crate::metrics::{metrics, SystemMetrics as PrometheusMetrics}; +use crate::transport::Http2ServerHandler; +use dashmap::DashMap; +use std::net::SocketAddr; +use std::sync::atomic::Ordering; +use std::sync::Arc; +use tokio::sync::{mpsc, RwLock}; + +/// Unified message handler for HTTP/2 transport +pub(crate) struct SystemMessageHandler { + node_id: NodeId, + local_actors: Arc>, + named_actor_paths: Arc>, + cluster: Arc>>>, +} + +impl SystemMessageHandler { + pub fn new( + node_id: NodeId, + local_actors: Arc>, + named_actor_paths: Arc>, + cluster: Arc>>>, + ) -> Self { + Self { + node_id, + local_actors, + named_actor_paths, + cluster, + } + } + + /// Find actor sender by name or local_id + fn find_actor_sender(&self, actor_name: &str) -> anyhow::Result> { + // First try by name + if let Some(handle) = self.local_actors.get(actor_name) { + return Ok(handle.sender.clone()); + } + + // Then try by local_id + if let Ok(local_id) = actor_name.parse::() { + if let Some(sender) = self + .local_actors + .iter() + .find(|entry| entry.value().actor_id.local_id() == local_id) + .map(|entry| entry.value().sender.clone()) + { + return Ok(sender); + } + } + + Err(anyhow::anyhow!("Actor not found: {}", actor_name)) + } + + /// Dispatch a message to an actor (ask pattern) + async fn dispatch_message(&self, path: &str, msg: Message) -> anyhow::Result { + if let Some(actor_name) = path.strip_prefix("/actors/") { + self.send_to_local_actor(actor_name, msg).await + } else if let Some(named_path) = path.strip_prefix("/named/") { + self.send_to_named_actor(named_path, msg).await + } else { + Err(anyhow::anyhow!("Invalid path: {}", path)) + } + } + + /// Dispatch a fire-and-forget message + async fn dispatch_tell(&self, path: &str, msg: Message) -> anyhow::Result<()> { + if let Some(actor_name) = 
path.strip_prefix("/actors/") { + self.tell_local_actor(actor_name, msg).await + } else if let Some(named_path) = path.strip_prefix("/named/") { + self.tell_named_actor(named_path, msg).await + } else { + Err(anyhow::anyhow!("Invalid path: {}", path)) + } + } + + async fn send_to_local_actor(&self, actor_name: &str, msg: Message) -> anyhow::Result { + let sender = self.find_actor_sender(actor_name)?; + + let (tx, rx) = tokio::sync::oneshot::channel(); + let envelope = Envelope::ask(msg, tx); + + sender + .send(envelope) + .await + .map_err(|_| anyhow::anyhow!("Actor mailbox closed"))?; + + rx.await.map_err(|_| anyhow::anyhow!("Actor dropped"))? + } + + async fn tell_local_actor(&self, actor_name: &str, msg: Message) -> anyhow::Result<()> { + let sender = self.find_actor_sender(actor_name)?; + let envelope = Envelope::tell(msg); + + sender + .send(envelope) + .await + .map_err(|_| anyhow::anyhow!("Actor mailbox closed"))?; + + Ok(()) + } + + async fn send_to_named_actor(&self, path: &str, msg: Message) -> anyhow::Result { + let actor_name = self + .named_actor_paths + .get(path) + .ok_or_else(|| anyhow::anyhow!("Named actor not found: {}", path))? + .clone(); + + self.send_to_local_actor(&actor_name, msg).await + } + + async fn tell_named_actor(&self, path: &str, msg: Message) -> anyhow::Result<()> { + let actor_name = self + .named_actor_paths + .get(path) + .ok_or_else(|| anyhow::anyhow!("Named actor not found: {}", path))? 
+ .clone(); + + self.tell_local_actor(&actor_name, msg).await + } +} + +#[async_trait::async_trait] +impl Http2ServerHandler for SystemMessageHandler { + /// Unified message handler - accepts Message (Single or Stream), returns Message + async fn handle_message_full(&self, path: &str, msg: Message) -> anyhow::Result { + self.dispatch_message(path, msg).await + } + + /// Simple message handler for backward compatibility + async fn handle_message_simple( + &self, + path: &str, + msg_type: &str, + payload: Vec, + ) -> anyhow::Result { + let msg = Message::single(msg_type, payload); + self.dispatch_message(path, msg).await + } + + async fn handle_tell( + &self, + path: &str, + msg_type: &str, + payload: Vec, + ) -> anyhow::Result<()> { + let msg = Message::single(msg_type, payload); + self.dispatch_tell(path, msg).await + } + + async fn handle_gossip( + &self, + payload: Vec, + peer_addr: SocketAddr, + ) -> anyhow::Result>> { + let cluster_guard = self.cluster.read().await; + if let Some(cluster) = cluster_guard.as_ref() { + let msg: GossipMessage = bincode::deserialize(&payload)?; + let response = cluster.handle_gossip(msg, peer_addr).await?; + if let Some(resp) = response { + Ok(Some(bincode::serialize(&resp)?)) + } else { + Ok(None) + } + } else { + Ok(None) + } + } + + async fn health_check(&self) -> serde_json::Value { + // Collect local actors info + let mut actors = Vec::new(); + for entry in self.local_actors.iter() { + let name = entry.key().clone(); + let handle = entry.value(); + + let mut actor_info = serde_json::json!({ + "name": name, + "stats": handle.stats.to_json(), + "metadata": handle.metadata, + }); + + if let Some(path) = &handle.named_path { + actor_info["named_path"] = serde_json::json!(path.as_str()); + } + + actors.push(actor_info); + } + + // Collect named actors info + let named_actors: Vec<_> = self + .named_actor_paths + .iter() + .map(|e| { + serde_json::json!({ + "path": e.key().clone(), + "actor_name": e.value().clone(), + }) + }) + 
.collect(); + + // Collect cluster info + let mut cluster_info = serde_json::json!(null); + let cluster_guard = self.cluster.read().await; + if let Some(cluster) = cluster_guard.as_ref() { + let members = cluster.alive_members().await; + let all_named = cluster.all_named_actors().await; + + cluster_info = serde_json::json!({ + "members_count": members.len(), + "members": members, + "named_actors_count": all_named.len(), + "named_actors": all_named.iter().map(|info| { + serde_json::json!({ + "path": info.path.as_str(), + "instance_count": info.instance_count(), + }) + }).collect::>(), + }); + } + + serde_json::json!({ + "node_id": self.node_id.to_string(), + "actors_count": actors.len(), + "actors": actors, + "named_actors": named_actors, + "cluster": cluster_info, + }) + } + + async fn prometheus_metrics(&self) -> String { + // Collect cluster member counts by status + let mut cluster_members = std::collections::HashMap::new(); + let cluster_guard = self.cluster.read().await; + if let Some(cluster) = cluster_guard.as_ref() { + let all_members = cluster.all_members().await; + for member in all_members { + let status = format!("{:?}", member.status); + *cluster_members.entry(status).or_insert(0usize) += 1; + } + } + drop(cluster_guard); + + // Count messages from local actors + let mut total_messages: u64 = 0; + for entry in self.local_actors.iter() { + total_messages += entry.value().stats.message_count.load(Ordering::Relaxed); + } + + // Build system metrics + let system_metrics = PrometheusMetrics { + node_id: self.node_id.0, + actors_count: self.local_actors.len(), + messages_total: total_messages, + actors_created: self.local_actors.len() as u64, + actors_stopped: 0, + cluster_members, + }; + + // Export using global metrics registry + metrics().export_prometheus(&system_metrics) + } + + async fn cluster_members(&self) -> serde_json::Value { + let cluster_guard = self.cluster.read().await; + if let Some(cluster) = cluster_guard.as_ref() { + let members = 
cluster.all_members().await; + let result: Vec<_> = members + .iter() + .map(|m| { + serde_json::json!({ + "node_id": m.node_id.to_string(), + "addr": m.addr.to_string(), + "status": format!("{:?}", m.status), + }) + }) + .collect(); + serde_json::json!(result) + } else { + serde_json::json!([{ + "node_id": self.node_id.to_string(), + "status": "Alive", + }]) + } + } + + async fn actors_list(&self, include_internal: bool) -> serde_json::Value { + let cluster_guard = self.cluster.read().await; + let all_named = if let Some(cluster) = cluster_guard.as_ref() { + cluster.all_named_actors().await + } else { + Vec::new() + }; + drop(cluster_guard); + + // Build actors list with detailed info + let mut actors = Vec::new(); + for info in all_named { + let path_str = info.path.as_str(); + + // Skip system/core + if path_str == "system/core" { + continue; + } + + // Check if this actor is on this node + if !info.instance_nodes.contains(&self.node_id) { + continue; + } + + let name = path_str.strip_prefix("actors/").unwrap_or(&path_str); + + // Skip internal actors unless requested + if !include_internal && name.starts_with('_') { + continue; + } + + let actor_type = if name.starts_with('_') { + "system" + } else { + "user" + }; + + // Get detailed instance info if available + let mut actor_json = serde_json::json!({ + "name": name, + "type": actor_type, + }); + + if let Some(instance) = info.get_instance(&self.node_id) { + actor_json["actor_id"] = serde_json::json!(instance.actor_id.to_string()); + for (k, v) in &instance.metadata { + actor_json[k] = serde_json::json!(v); + } + } + + actors.push(actor_json); + } + + serde_json::json!(actors) + } +} diff --git a/crates/pulsing-actor/src/system/mod.rs b/crates/pulsing-actor/src/system/mod.rs new file mode 100644 index 000000000..15a8a47fc --- /dev/null +++ b/crates/pulsing-actor/src/system/mod.rs @@ -0,0 +1,794 @@ +//! Actor System - the main entry point for creating and managing actors +//! +//! This module provides: +//! 
- [`ActorSystem`] - The main system for managing actors +//! - [`SystemConfig`] - Configuration for the actor system +//! - [`SpawnOptions`] - Options for spawning actors +//! - [`ResolveOptions`] - Options for resolving named actors + +mod config; +mod handle; +mod handler; +mod runtime; + +pub use config::{ResolveOptions, SpawnOptions, SystemConfig}; +pub use handle::ActorStats; + +use crate::actor::{ + Actor, ActorAddress, ActorContext, ActorId, ActorPath, ActorRef, ActorSystemRef, Mailbox, + NodeId, StopReason, +}; +use crate::cluster::{GossipCluster, MemberInfo, MemberStatus, NamedActorInfo}; +use crate::policies::{LoadBalancingPolicy, RoundRobinPolicy}; +use crate::system_actor::{ + BoxedActorFactory, SystemActor, SystemRef, SYSTEM_ACTOR_LOCAL_NAME, SYSTEM_ACTOR_PATH, +}; +use crate::transport::{Http2RemoteTransport, Http2Transport}; +use crate::watch::ActorLifecycle; +use dashmap::DashMap; +use handle::LocalActorHandle; +use handler::SystemMessageHandler; +use runtime::run_supervision_loop; +use std::collections::HashMap; +use std::net::SocketAddr; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::Arc; +use tokio::sync::RwLock; +use tokio_util::sync::CancellationToken; + +/// The Actor System - manages actors and cluster membership +pub struct ActorSystem { + /// Local node ID + node_id: NodeId, + + /// HTTP/2 address + addr: SocketAddr, + + /// Default mailbox capacity for actors + default_mailbox_capacity: usize, + + /// Local actors (actor_name -> handle) + local_actors: Arc>, + + /// Named actor path to local actor name mapping (path_string -> actor_name) + named_actor_paths: Arc>, + + /// Gossip cluster (for discovery) + cluster: Arc>>>, + + /// HTTP/2 transport + transport: Arc, + + /// Cancellation token + cancel_token: CancellationToken, + + /// Actor lifecycle manager (watch, termination handling) + lifecycle: Arc, + + /// Actor ID counter (for generating unique local IDs) + actor_id_counter: AtomicU64, + + /// Default load balancing 
policy + default_lb_policy: Arc, +} + +impl ActorSystem { + /// Create a new actor system + pub async fn new(config: SystemConfig) -> anyhow::Result> { + let cancel_token = CancellationToken::new(); + let node_id = NodeId::generate(); + let local_actors: Arc> = Arc::new(DashMap::new()); + let named_actor_paths: Arc> = Arc::new(DashMap::new()); + let cluster_holder: Arc>>> = Arc::new(RwLock::new(None)); + let lifecycle = Arc::new(ActorLifecycle::new()); + + // Create message handler (needs cluster reference for gossip) + let handler = SystemMessageHandler::new( + node_id, + local_actors.clone(), + named_actor_paths.clone(), + cluster_holder.clone(), + ); + + // Create HTTP/2 transport + let (transport, actual_addr) = Http2Transport::new( + config.addr, + Arc::new(handler), + config.http2_config, + cancel_token.clone(), + ) + .await?; + + // Create gossip cluster + let cluster = GossipCluster::new( + node_id, + actual_addr, + transport.clone(), + config.gossip_config, + ); + + let cluster = Arc::new(cluster); + { + let mut holder = cluster_holder.write().await; + *holder = Some(cluster.clone()); + } + + // Start cluster gossip + cluster.start(cancel_token.clone()); + + // Join cluster if seed nodes provided + if !config.seed_nodes.is_empty() { + cluster.join(config.seed_nodes).await?; + } + + let system = Arc::new(Self { + node_id, + addr: actual_addr, + default_mailbox_capacity: config.default_mailbox_capacity, + local_actors: local_actors.clone(), + named_actor_paths: named_actor_paths.clone(), + cluster: cluster_holder, + transport, + cancel_token: cancel_token.clone(), + lifecycle, + actor_id_counter: AtomicU64::new(1), + default_lb_policy: Arc::new(RoundRobinPolicy::new()), + }); + + // Start the builtin SystemActor with path "system" + system + .start_system_actor(local_actors, named_actor_paths) + .await?; + + tracing::info!( + node_id = %node_id, + addr = %actual_addr, + "Actor system started" + ); + + Ok(system) + } + + /// Start the builtin SystemActor + 
async fn start_system_actor( + self: &Arc, + local_actors: Arc>, + named_actor_paths: Arc>, + ) -> anyhow::Result<()> { + // Create SystemRef for SystemActor + let system_ref = Arc::new(SystemRef { + node_id: self.node_id, + addr: self.addr, + local_actors: local_actors + .iter() + .map(|e| (e.key().clone(), e.sender.clone())) + .collect::>() + .into(), + named_actor_paths, + }); + + // Create SystemActor with default factory + let system_actor = SystemActor::with_default_factory(system_ref); + + // Spawn as named actor with path "system" + let path = ActorPath::new(SYSTEM_ACTOR_PATH)?; + self.spawn_named(path, SYSTEM_ACTOR_LOCAL_NAME, system_actor) + .await?; + + tracing::debug!(path = SYSTEM_ACTOR_PATH, "SystemActor started"); + Ok(()) + } + + /// Start SystemActor with custom factory (for Python extension) + pub async fn start_system_actor_with_factory( + self: &Arc, + factory: BoxedActorFactory, + ) -> anyhow::Result<()> { + // Check if already started + if self.local_actors.contains_key(SYSTEM_ACTOR_LOCAL_NAME) { + return Err(anyhow::anyhow!("SystemActor already started")); + } + + // Create SystemRef + let system_ref = Arc::new(SystemRef { + node_id: self.node_id, + addr: self.addr, + local_actors: Arc::new(DashMap::new()), // Will be updated + named_actor_paths: self.named_actor_paths.clone(), + }); + + // Create SystemActor with custom factory + let system_actor = SystemActor::new(system_ref, factory); + + // Spawn as named actor + let path = ActorPath::new(SYSTEM_ACTOR_PATH)?; + self.spawn_named(path, SYSTEM_ACTOR_LOCAL_NAME, system_actor) + .await?; + + tracing::debug!( + path = SYSTEM_ACTOR_PATH, + "SystemActor started with custom factory" + ); + Ok(()) + } + + /// Get SystemActor reference + pub async fn system(&self) -> anyhow::Result { + self.resolve_named(&ActorPath::new(SYSTEM_ACTOR_PATH)?, None) + .await + } + + /// Get node ID + pub fn node_id(&self) -> &NodeId { + &self.node_id + } + + /// Get local address + pub fn addr(&self) -> SocketAddr { + 
self.addr + } + + /// Get list of local actor names + pub fn local_actor_names(&self) -> Vec { + self.local_actors.iter().map(|e| e.key().clone()).collect() + } + + /// Generate a new unique local actor ID + fn next_actor_id(&self) -> ActorId { + let local_id = self.actor_id_counter.fetch_add(1, Ordering::Relaxed); + ActorId::new(self.node_id, local_id) + } + + // ========== Spawn Methods ========== + + /// Create a once-use factory from an actor instance + fn once_factory(actor: A) -> impl FnMut() -> anyhow::Result { + let mut actor_opt = Some(actor); + move || { + actor_opt + .take() + .ok_or_else(|| anyhow::anyhow!("Actor cannot be restarted (spawned as instance)")) + } + } + + /// Spawn an actor with a local name (uses system default mailbox capacity) + pub async fn spawn(&self, name: impl AsRef, actor: A) -> anyhow::Result + where + A: Actor, + { + self.spawn_factory(name, Self::once_factory(actor), SpawnOptions::default()) + .await + } + + /// Spawn an actor with custom options + pub async fn spawn_with_options( + &self, + name: impl AsRef, + actor: A, + options: SpawnOptions, + ) -> anyhow::Result + where + A: Actor, + { + self.spawn_factory(name, Self::once_factory(actor), options) + .await + } + + /// Spawn an actor using a factory function (enables supervision restarts) + pub async fn spawn_factory( + &self, + name: impl AsRef, + factory: F, + options: SpawnOptions, + ) -> anyhow::Result + where + F: FnMut() -> anyhow::Result + Send + 'static, + A: Actor, + { + let name = name.as_ref(); + + // Check for duplicate + if self.local_actors.contains_key(name) { + return Err(anyhow::anyhow!("Actor already exists: {}", name)); + } + + let actor_id = self.next_actor_id(); + + // Use configured mailbox capacity + let capacity = options + .mailbox_capacity + .unwrap_or(self.default_mailbox_capacity); + let mailbox = Mailbox::with_capacity(capacity); + let (sender, receiver) = mailbox.split(); + + let stats = Arc::new(ActorStats::default()); + let metadata = 
HashMap::new(); + + // Create context + let ctx = ActorContext::new(actor_id); + + // Spawn actor loop + let stats_clone = stats.clone(); + let cancel = self.cancel_token.clone(); + let actor_id_for_log = actor_id; + let supervision = options.supervision.clone(); + + let join_handle = tokio::spawn(async move { + let reason = + run_supervision_loop(factory, receiver, ctx, cancel, stats_clone, supervision) + .await; + tracing::debug!(actor_id = ?actor_id_for_log, reason = ?reason, "Actor stopped"); + }); + + // Register actor + let handle = LocalActorHandle { + sender: sender.clone(), + join_handle, + stats: stats.clone(), + metadata, + named_path: None, + actor_id, + }; + + self.local_actors.insert(name.to_string(), handle); + + // Create ActorRef + Ok(ActorRef::local(actor_id, sender)) + } + + /// Spawn a named actor (publicly accessible via named path) + pub async fn spawn_named( + &self, + path: ActorPath, + local_name: impl AsRef, + actor: A, + ) -> anyhow::Result + where + A: Actor, + { + self.spawn_named_factory( + path, + local_name, + Self::once_factory(actor), + SpawnOptions::default(), + ) + .await + } + + /// Spawn a named actor with custom options + pub async fn spawn_named_with_options( + &self, + path: ActorPath, + local_name: impl AsRef, + actor: A, + options: SpawnOptions, + ) -> anyhow::Result + where + A: Actor, + { + self.spawn_named_factory(path, local_name, Self::once_factory(actor), options) + .await + } + + /// Spawn a named actor using a factory function + pub async fn spawn_named_factory( + &self, + path: ActorPath, + local_name: impl AsRef, + factory: F, + options: SpawnOptions, + ) -> anyhow::Result + where + F: FnMut() -> anyhow::Result + Send + 'static, + A: Actor, + { + let local_name = local_name.as_ref(); + + // Check for duplicate local name + if self.local_actors.contains_key(local_name) { + return Err(anyhow::anyhow!("Actor already exists: {}", local_name)); + } + + // Check for duplicate named path + if self + .named_actor_paths + 
.contains_key(&path.as_str().to_string()) + { + return Err(anyhow::anyhow!( + "Named path already registered: {}", + path.as_str() + )); + } + + let actor_id = self.next_actor_id(); + + // Use configured mailbox capacity + let capacity = options + .mailbox_capacity + .unwrap_or(self.default_mailbox_capacity); + let mailbox = Mailbox::with_capacity(capacity); + let (sender, receiver) = mailbox.split(); + + let stats = Arc::new(ActorStats::default()); + let metadata = options.metadata.clone(); + + // Create context + let ctx = ActorContext::new(actor_id); + + // Spawn actor loop + let stats_clone = stats.clone(); + let cancel = self.cancel_token.clone(); + let actor_id_for_log = actor_id; + let supervision = options.supervision.clone(); + + let join_handle = tokio::spawn(async move { + let reason = + run_supervision_loop(factory, receiver, ctx, cancel, stats_clone, supervision) + .await; + tracing::debug!(actor_id = ?actor_id_for_log, reason = ?reason, "Actor stopped"); + }); + + // Register actor + let handle = LocalActorHandle { + sender: sender.clone(), + join_handle, + stats: stats.clone(), + metadata: metadata.clone(), + named_path: Some(path.clone()), + actor_id, + }; + + self.local_actors.insert(local_name.to_string(), handle); + self.named_actor_paths + .insert(path.as_str().to_string(), local_name.to_string()); + + // Register in cluster with full details + if let Some(cluster) = self.cluster.read().await.as_ref() { + if metadata.is_empty() { + cluster.register_named_actor(path.clone()).await; + } else { + cluster + .register_named_actor_full(path.clone(), actor_id, metadata) + .await; + } + } + + // Create ActorRef + Ok(ActorRef::local(actor_id, sender)) + } + + // ========== Resolve Methods ========== + + /// Get ActorRef for a local or remote actor by ID + pub async fn actor_ref(&self, id: &ActorId) -> anyhow::Result { + // Check if local + if id.node() == self.node_id || id.node().is_local() { + let target_local_id = id.local_id(); + for entry in 
self.local_actors.iter() { + let entry_local_id = entry.value().actor_id.local_id(); + if entry_local_id == target_local_id { + return Ok(ActorRef::local( + entry.value().actor_id, + entry.value().sender.clone(), + )); + } + } + return Err(anyhow::anyhow!("Local actor not found: {}", id)); + } + + // Remote actor - get address from cluster + let cluster_guard = self.cluster.read().await; + let cluster = cluster_guard + .as_ref() + .ok_or_else(|| anyhow::anyhow!("Cluster not initialized"))?; + + let member = cluster + .get_member(&id.node()) + .await + .ok_or_else(|| anyhow::anyhow!("Node not found in cluster: {}", id.node()))?; + + // Create remote transport using actor id + let transport = Http2RemoteTransport::new_by_id(self.transport.client(), member.addr, *id); + + Ok(ActorRef::remote(*id, member.addr, Arc::new(transport))) + } + + /// Resolve a named actor and get an ActorRef (uses default load balancing: RoundRobin) + pub async fn resolve_named( + &self, + path: &ActorPath, + node_id: Option<&NodeId>, + ) -> anyhow::Result { + let options = if let Some(nid) = node_id { + ResolveOptions::new().node_id(*nid) + } else { + ResolveOptions::new() + }; + self.resolve_named_with_options(path, options).await + } + + /// Resolve a named actor with custom options (load balancing, health filtering) + pub async fn resolve_named_with_options( + &self, + path: &ActorPath, + options: ResolveOptions, + ) -> anyhow::Result { + let cluster_guard = self.cluster.read().await; + let cluster = cluster_guard + .as_ref() + .ok_or_else(|| anyhow::anyhow!("Cluster not initialized"))?; + + let instances = cluster.get_named_actor_instances(path).await; + + if instances.is_empty() { + return Err(anyhow::anyhow!("Named actor not found: {}", path.as_str())); + } + + // Health filtering: only select Alive nodes + let healthy_instances: Vec<_> = if options.filter_alive { + instances + .into_iter() + .filter(|i| i.status == MemberStatus::Alive) + .collect() + } else { + instances + }; + + if 
healthy_instances.is_empty() { + return Err(anyhow::anyhow!( + "No healthy instances for named actor: {}", + path.as_str() + )); + } + + // Select target instance + let target = if let Some(nid) = options.node_id { + // If node_id specified, find that specific instance + healthy_instances + .iter() + .find(|i| i.node_id == nid) + .ok_or_else(|| anyhow::anyhow!("Actor instance not found on node: {}", nid))? + } else { + // Use load balancing policy + let policy = options.policy.as_ref().unwrap_or(&self.default_lb_policy); + self.select_instance(&healthy_instances, policy.as_ref()) + }; + + // If local, get local ref + if target.node_id == self.node_id { + let actor_name = self + .named_actor_paths + .get(&path.as_str()) + .ok_or_else(|| anyhow::anyhow!("Named actor not found locally"))? + .clone(); + + let handle = self + .local_actors + .get(&actor_name) + .ok_or_else(|| anyhow::anyhow!("Actor not found: {}", actor_name))?; + + return Ok(ActorRef::local(handle.actor_id, handle.sender.clone())); + } + + // Remote actor + let transport = + Http2RemoteTransport::new_named(self.transport.client(), target.addr, path.clone()); + + let actor_id = ActorId::new(target.node_id, 0); + Ok(ActorRef::remote(actor_id, target.addr, Arc::new(transport))) + } + + /// Select an instance using load balancing policy + fn select_instance<'a>( + &self, + instances: &'a [MemberInfo], + policy: &dyn LoadBalancingPolicy, + ) -> &'a MemberInfo { + // Convert MemberInfo to a format compatible with LoadBalancingPolicy + // For now, use simple index-based selection + let idx = policy.select_worker(&[], None).unwrap_or(0) % instances.len(); + &instances[idx] + } + + /// Resolve an actor address and get an ActorRef + pub async fn resolve(&self, address: &ActorAddress) -> anyhow::Result { + match address { + ActorAddress::Named { path, instance } => { + self.resolve_named(path, instance.as_ref()).await + } + ActorAddress::Global { node_id, actor_id } => { + let id = ActorId::new(*node_id, 
*actor_id); + self.actor_ref(&id).await + } + } + } + + /// Get all instances of a named actor across the cluster + pub async fn get_named_instances(&self, path: &ActorPath) -> Vec { + let cluster_guard = self.cluster.read().await; + if let Some(cluster) = cluster_guard.as_ref() { + cluster.get_named_actor_instances(path).await + } else { + Vec::new() + } + } + + /// Get detailed instances with actor_id and metadata + pub async fn get_named_instances_detailed( + &self, + path: &ActorPath, + ) -> Vec<(MemberInfo, Option)> { + let cluster_guard = self.cluster.read().await; + if let Some(cluster) = cluster_guard.as_ref() { + cluster.get_named_actor_instances_detailed(path).await + } else { + Vec::new() + } + } + + /// Lookup named actor information + pub async fn lookup_named(&self, path: &ActorPath) -> Option { + let cluster_guard = self.cluster.read().await; + if let Some(cluster) = cluster_guard.as_ref() { + cluster.lookup_named_actor(path).await + } else { + None + } + } + + /// Get cluster member information + pub async fn members(&self) -> Vec { + let cluster_guard = self.cluster.read().await; + if let Some(cluster) = cluster_guard.as_ref() { + cluster.all_members().await + } else { + Vec::new() + } + } + + /// Get all named actors in the cluster + pub async fn all_named_actors(&self) -> Vec { + let cluster_guard = self.cluster.read().await; + if let Some(cluster) = cluster_guard.as_ref() { + cluster.all_named_actors().await + } else { + Vec::new() + } + } + + // ========== Stop Methods ========== + + /// Stop an actor + pub async fn stop(&self, name: impl AsRef) -> anyhow::Result<()> { + self.stop_with_reason(name, StopReason::Killed).await + } + + /// Stop an actor with a specific reason + pub async fn stop_with_reason( + &self, + name: impl AsRef, + reason: StopReason, + ) -> anyhow::Result<()> { + let name = name.as_ref(); + + if let Some((_, handle)) = self.local_actors.remove(name) { + handle.join_handle.abort(); + + let local_actors = 
self.local_actors.clone(); + self.lifecycle + .handle_termination( + &handle.actor_id, + name, + handle.named_path.clone(), + reason, + &self.named_actor_paths, + &self.cluster, + |n| local_actors.get(n).map(|h| h.sender.clone()), + ) + .await; + } + + Ok(()) + } + + /// Stop a named actor by path + pub async fn stop_named(&self, path: &ActorPath) -> anyhow::Result<()> { + self.stop_named_with_reason(path, StopReason::Killed).await + } + + /// Stop a named actor by path with a specific reason + pub async fn stop_named_with_reason( + &self, + path: &ActorPath, + reason: StopReason, + ) -> anyhow::Result<()> { + let path_key = path.as_str(); + + // Find the local actor name for this path + if let Some(actor_name_ref) = self.named_actor_paths.get(&path_key) { + let actor_name = actor_name_ref.clone(); + drop(actor_name_ref); + + if let Some((_, handle)) = self.local_actors.remove(&actor_name) { + handle.join_handle.abort(); + + let local_actors = self.local_actors.clone(); + self.lifecycle + .handle_termination( + &handle.actor_id, + &actor_name, + Some(path.clone()), + reason, + &self.named_actor_paths, + &self.cluster, + |name| local_actors.get(name).map(|h| h.sender.clone()), + ) + .await; + } + } + + Ok(()) + } + + /// Shutdown the entire actor system + pub async fn shutdown(&self) -> anyhow::Result<()> { + tracing::info!("Shutting down actor system"); + + // Signal cancellation + self.cancel_token.cancel(); + + // Leave cluster gracefully + { + let cluster_guard = self.cluster.read().await; + if let Some(cluster) = cluster_guard.as_ref() { + cluster.leave().await?; + } + } + + // Stop all actors + for entry in self.local_actors.iter() { + entry.join_handle.abort(); + } + self.local_actors.clear(); + + Ok(()) + } + + /// Get cancellation token + pub fn cancel_token(&self) -> CancellationToken { + self.cancel_token.clone() + } +} + +#[async_trait::async_trait] +impl ActorSystemRef for ActorSystem { + async fn actor_ref(&self, id: &ActorId) -> anyhow::Result { + 
ActorSystem::actor_ref(self, id).await + } + + fn node_id(&self) -> NodeId { + self.node_id + } + + async fn watch(&self, watcher: &ActorId, target: &ActorId) -> anyhow::Result<()> { + // Only support local watching for now + if target.node() != self.node_id { + return Err(anyhow::anyhow!( + "Cannot watch remote actor: {} (watching remote actors not yet supported)", + target + )); + } + + let watcher_key = watcher.to_string(); + let target_key = target.to_string(); + self.lifecycle.watch(&watcher_key, &target_key).await; + Ok(()) + } + + async fn unwatch(&self, watcher: &ActorId, target: &ActorId) -> anyhow::Result<()> { + let watcher_key = watcher.to_string(); + let target_key = target.to_string(); + self.lifecycle.unwatch(&watcher_key, &target_key).await; + Ok(()) + } +} diff --git a/crates/pulsing-actor/src/system/runtime.rs b/crates/pulsing-actor/src/system/runtime.rs new file mode 100644 index 000000000..d75ac22fb --- /dev/null +++ b/crates/pulsing-actor/src/system/runtime.rs @@ -0,0 +1,151 @@ +//! Actor runtime loop and supervision + +use super::handle::ActorStats; +use crate::actor::{Actor, ActorContext, Envelope, StopReason}; +use crate::supervision::SupervisionSpec; +use std::sync::Arc; +use tokio::sync::mpsc; +use tokio_util::sync::CancellationToken; + +/// Actor instance loop - runs a single instance of an actor +pub(crate) async fn run_actor_instance( + mut actor: A, + receiver: &mut mpsc::Receiver, + ctx: &mut ActorContext, + cancel: CancellationToken, + stats: Arc, +) -> StopReason { + // Call on_start + if let Err(e) = actor.on_start(ctx).await { + tracing::error!(actor_id = ?ctx.id(), error = %e, "Actor start error"); + stats.inc_stop(); + return StopReason::Failed(e.to_string()); + } + + let stop_reason = loop { + tokio::select! 
{ + msg = receiver.recv() => { + match msg { + Some(envelope) => { + stats.inc_message(); + let (message, responder) = envelope.into_parts(); + + match actor.receive(message, ctx).await { + Ok(response) => { + responder.send(Ok(response)); + } + Err(e) => { + tracing::error!(actor_id = ?ctx.id(), error = %e, "Actor error"); + responder.send(Err(anyhow::anyhow!("Handler error: {}", e))); + // Actor crashes on error - supervision will decide whether to restart + return StopReason::Failed(e.to_string()); + } + } + } + None => { + // Mailbox closed (all senders dropped) + break StopReason::Normal; + } + } + } + _ = cancel.cancelled() => { + break StopReason::SystemShutdown; + } + } + }; + + // Cleanup + stats.inc_stop(); + if let Err(e) = actor.on_stop(ctx).await { + tracing::warn!(actor_id = ?ctx.id(), error = %e, "Actor stop error"); + // If on_stop fails, mark as failed + if matches!(stop_reason, StopReason::Normal) { + return StopReason::Failed(e.to_string()); + } + } + + stop_reason +} + +/// Supervision loop - manages actor restarts +pub(crate) async fn run_supervision_loop( + mut factory: F, + mut receiver: mpsc::Receiver, + mut ctx: ActorContext, + cancel: CancellationToken, + stats: Arc, + spec: SupervisionSpec, +) -> StopReason +where + F: FnMut() -> anyhow::Result + Send + 'static, + A: Actor, +{ + let mut restarts = 0; + // Track restarts for windowing if needed (timestamp of restart) + let mut restart_timestamps: Vec = Vec::new(); + + loop { + // Create actor instance + let actor = match factory() { + Ok(a) => a, + Err(e) => { + tracing::error!(actor_id = ?ctx.id(), error = %e, "Failed to create actor instance"); + return StopReason::Failed(format!("Factory error: {}", e)); + } + }; + + // Run actor instance + let reason = run_actor_instance( + actor, + &mut receiver, + &mut ctx, + cancel.clone(), + stats.clone(), + ) + .await; + + // Check if we should restart + let is_failure = matches!(reason, StopReason::Failed(_)); + if 
!spec.policy.should_restart(is_failure) { + return reason; + } + + if matches!(reason, StopReason::SystemShutdown | StopReason::Killed) { + return reason; + } + + // Check max restarts + restarts += 1; + + // Prune old timestamps if window is set + if let Some(window) = spec.restart_window { + let now = std::time::Instant::now(); + restart_timestamps.push(now); + restart_timestamps.retain(|&t| now.duration_since(t) <= window); + + if restart_timestamps.len() as u32 > spec.max_restarts { + tracing::error!(actor_id = ?ctx.id(), "Max restarts ({}) exceeded within window {:?}", spec.max_restarts, window); + return reason; + } + } else { + // Absolute count + if restarts > spec.max_restarts { + tracing::error!(actor_id = ?ctx.id(), "Max restarts ({}) exceeded", spec.max_restarts); + return reason; + } + } + + tracing::info!( + actor_id = ?ctx.id(), + reason = ?reason, + restarts = restarts, + "Restarting actor..." + ); + + // Backoff + let backoff = spec.backoff.duration(restarts - 1); + if !backoff.is_zero() { + tokio::time::sleep(backoff).await; + } + } +} diff --git a/crates/pulsing-actor/src/system_actor/messages.rs b/crates/pulsing-actor/src/system_actor/messages.rs index 6e76eecc4..5e75e5df0 100644 --- a/crates/pulsing-actor/src/system_actor/messages.rs +++ b/crates/pulsing-actor/src/system_actor/messages.rs @@ -169,73 +169,3 @@ pub struct ActorStatusInfo { /// Last active time pub last_active: Option, } - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_system_message_serialization() { - let msg = SystemMessage::ListActors; - let json = serde_json::to_string(&msg).unwrap(); - assert!(json.contains("ListActors")); - - let msg = SystemMessage::GetActor { - name: "test".to_string(), - }; - let json = serde_json::to_string(&msg).unwrap(); - assert!(json.contains("GetActor")); - assert!(json.contains("test")); - } - - #[test] - fn test_system_response_serialization() { - let resp = SystemResponse::Ok; - let json = serde_json::to_string(&resp).unwrap(); 
- assert!(json.contains("Ok")); - - let resp = SystemResponse::Error { - message: "test error".to_string(), - }; - let json = serde_json::to_string(&resp).unwrap(); - assert!(json.contains("Error")); - assert!(json.contains("test error")); - } - - #[test] - fn test_actor_info_serialization() { - let info = ActorInfo { - name: "test".to_string(), - actor_id: 123, - actor_type: "TestActor".to_string(), - uptime_secs: 60, - public: true, - metadata: std::collections::HashMap::new(), - }; - let json = serde_json::to_string(&info).unwrap(); - assert!(json.contains("test")); - assert!(json.contains("123")); - } - - #[test] - fn test_create_actor_deserialization() { - // Test JSON deserialization (Python compatibility) - let json = r#"{"type":"CreateActor","actor_type":"Counter","name":"c1","params":{"init_value":10},"public":true}"#; - let msg: SystemMessage = serde_json::from_str(json).unwrap(); - - match msg { - SystemMessage::CreateActor { - actor_type, - name, - params, - public, - } => { - assert_eq!(actor_type, "Counter"); - assert_eq!(name, "c1"); - assert_eq!(params["init_value"], 10); - assert!(public); - } - _ => panic!("Expected CreateActor"), - } - } -} diff --git a/crates/pulsing-actor/src/system_actor/mod.rs b/crates/pulsing-actor/src/system_actor/mod.rs index 9127ea5a8..9d91a268c 100644 --- a/crates/pulsing-actor/src/system_actor/mod.rs +++ b/crates/pulsing-actor/src/system_actor/mod.rs @@ -415,45 +415,3 @@ impl Actor for SystemActor { }) } } - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_system_actor_path() { - assert_eq!(SYSTEM_ACTOR_PATH, "system/core"); - } - - #[test] - fn test_actor_registry() { - let registry = ActorRegistry::new(); - let actor_id = ActorId::local(1); - - registry.register("test", actor_id, "TestActor", true); - assert!(registry.contains("test")); - assert_eq!(registry.count(), 1); - - let info = registry.get_info("test").unwrap(); - assert_eq!(info.name, "test"); - assert_eq!(info.actor_type, "TestActor"); - - 
registry.unregister("test"); - assert!(!registry.contains("test")); - } - - #[test] - fn test_system_metrics() { - let metrics = SystemMetrics::new(); - - metrics.inc_message(); - metrics.inc_message(); - assert_eq!(metrics.messages_total(), 2); - - metrics.inc_actor_created(); - assert_eq!(metrics.actors_created(), 1); - - metrics.inc_actor_stopped(); - assert_eq!(metrics.actors_stopped(), 1); - } -} diff --git a/crates/pulsing-actor/tests/cluster/member_tests.rs b/crates/pulsing-actor/tests/cluster/member_tests.rs new file mode 100644 index 000000000..c36f76dcc --- /dev/null +++ b/crates/pulsing-actor/tests/cluster/member_tests.rs @@ -0,0 +1,528 @@ +//! Member types tests +//! +//! Tests for cluster member types: NodeStatus, MemberStatus, MemberInfo, +//! ClusterNode, NamedActorInfo, NamedActorInstance, etc. + +use pulsing_actor::actor::{ActorId, ActorPath, NodeId}; +use pulsing_actor::cluster::{ + ActorLocation, ClusterNode, FailureInfo, MemberInfo, MemberStatus, NamedActorInfo, + NamedActorInstance, NodeStatus, +}; +use std::collections::HashMap; +use std::net::SocketAddr; + +// ============================================================================ +// NodeStatus Tests +// ============================================================================ + +#[test] +fn test_node_status_is_online() { + assert!(NodeStatus::Online.is_online()); + assert!(!NodeStatus::PFail.is_online()); + assert!(!NodeStatus::Fail.is_online()); + assert!(!NodeStatus::Handshake.is_online()); +} + +#[test] +fn test_node_status_is_failed() { + assert!(!NodeStatus::Online.is_failed()); + assert!(NodeStatus::PFail.is_failed()); + assert!(NodeStatus::Fail.is_failed()); + assert!(!NodeStatus::Handshake.is_failed()); +} + +// ============================================================================ +// MemberStatus Tests +// ============================================================================ + +#[test] +fn test_member_status_is_alive() { + 
assert!(MemberStatus::Alive.is_alive()); + assert!(!MemberStatus::Suspect.is_alive()); + assert!(!MemberStatus::Dead.is_alive()); + assert!(!MemberStatus::Leaving.is_alive()); +} + +#[test] +fn test_member_status_is_reachable() { + assert!(MemberStatus::Alive.is_reachable()); + assert!(MemberStatus::Suspect.is_reachable()); + assert!(!MemberStatus::Dead.is_reachable()); + assert!(!MemberStatus::Leaving.is_reachable()); +} + +// ============================================================================ +// ClusterNode Tests +// ============================================================================ + +#[test] +fn test_cluster_node_new() { + let node_id = NodeId::generate(); + let addr: SocketAddr = "127.0.0.1:8000".parse().unwrap(); + + let node = ClusterNode::new(node_id, addr, 1); + assert_eq!(node.node_id, node_id); + assert_eq!(node.addr, addr); + assert_eq!(node.status, NodeStatus::Online); + assert_eq!(node.epoch, 1); + assert!(node.last_seen > 0); +} + +#[test] +fn test_cluster_node_supersedes_by_epoch() { + let node_id = NodeId::generate(); + let addr: SocketAddr = "127.0.0.1:8000".parse().unwrap(); + + let n1 = ClusterNode::new(node_id, addr, 1); + let mut n2 = ClusterNode::new(node_id, addr, 1); + + // Same epoch, same status - neither supersedes + assert!(!n1.supersedes(&n2)); + assert!(!n2.supersedes(&n1)); + + // Higher epoch wins + n2.epoch = 2; + assert!(n2.supersedes(&n1)); + assert!(!n1.supersedes(&n2)); +} + +#[test] +fn test_cluster_node_supersedes_by_status() { + let node_id = NodeId::generate(); + let addr: SocketAddr = "127.0.0.1:8000".parse().unwrap(); + + let mut n1 = ClusterNode::new(node_id, addr, 2); + let n2 = ClusterNode::new(node_id, addr, 2); + + // Same epoch, higher status wins + n1.status = NodeStatus::Fail; + assert!(n1.supersedes(&n2)); +} + +// ============================================================================ +// MemberInfo Tests +// ============================================================================ + 
+#[test] +fn test_member_info_creation() { + let node_id = NodeId::generate(); + let addr: SocketAddr = "127.0.0.1:8000".parse().unwrap(); + let gossip_addr: SocketAddr = "127.0.0.1:7000".parse().unwrap(); + + let member = MemberInfo::new(node_id, addr, gossip_addr); + + assert_eq!(member.node_id, node_id); + assert_eq!(member.addr, addr); + assert_eq!(member.gossip_addr, gossip_addr); + assert_eq!(member.status, MemberStatus::Alive); + assert_eq!(member.incarnation, 0); +} + +#[test] +fn test_member_info_refute() { + let node_id = NodeId::generate(); + let addr: SocketAddr = "127.0.0.1:8000".parse().unwrap(); + + let mut member = MemberInfo::new(node_id, addr, addr); + member.suspect(); + assert_eq!(member.status, MemberStatus::Suspect); + + member.refute(); + assert_eq!(member.status, MemberStatus::Alive); + assert_eq!(member.incarnation, 1); +} + +#[test] +fn test_member_info_suspect_from_alive() { + let node_id = NodeId::generate(); + let addr: SocketAddr = "127.0.0.1:8000".parse().unwrap(); + + let mut member = MemberInfo::new(node_id, addr, addr); + assert_eq!(member.status, MemberStatus::Alive); + + member.suspect(); + assert_eq!(member.status, MemberStatus::Suspect); +} + +#[test] +fn test_member_info_suspect_already_suspect() { + let node_id = NodeId::generate(); + let addr: SocketAddr = "127.0.0.1:8000".parse().unwrap(); + + let mut member = MemberInfo::new(node_id, addr, addr); + member.suspect(); + member.suspect(); // Should not change + assert_eq!(member.status, MemberStatus::Suspect); +} + +#[test] +fn test_member_info_mark_dead() { + let node_id = NodeId::generate(); + let addr: SocketAddr = "127.0.0.1:8000".parse().unwrap(); + + let mut member = MemberInfo::new(node_id, addr, addr); + assert_eq!(member.status, MemberStatus::Alive); + + member.mark_dead(); + assert_eq!(member.status, MemberStatus::Dead); +} + +#[test] +fn test_member_info_supersedes_by_incarnation() { + let node_id = NodeId::generate(); + let addr: SocketAddr = 
"127.0.0.1:8000".parse().unwrap(); + + let mut m1 = MemberInfo::new(node_id, addr, addr); + let mut m2 = MemberInfo::new(node_id, addr, addr); + + // Same incarnation, same status - neither supersedes + assert!(!m1.supersedes(&m2)); + assert!(!m2.supersedes(&m1)); + + // Suspect supersedes Alive at same incarnation + m1.suspect(); + assert!(m1.supersedes(&m2)); + assert!(!m2.supersedes(&m1)); + + // Higher incarnation always wins + m2.incarnation = 1; + assert!(!m1.supersedes(&m2)); + assert!(m2.supersedes(&m1)); +} + +#[test] +fn test_member_info_supersedes_dead() { + let node_id = NodeId::generate(); + let addr: SocketAddr = "127.0.0.1:8000".parse().unwrap(); + + let alive = MemberInfo::new(node_id, addr, addr); + let mut dead = MemberInfo::new(node_id, addr, addr); + dead.mark_dead(); + + // Dead supersedes Alive at same incarnation + assert!(dead.supersedes(&alive)); + assert!(!alive.supersedes(&dead)); +} + +#[test] +fn test_member_info_equality() { + let node_id = NodeId::generate(); + let addr1: SocketAddr = "127.0.0.1:8000".parse().unwrap(); + let addr2: SocketAddr = "127.0.0.1:9000".parse().unwrap(); + + let m1 = MemberInfo::new(node_id, addr1, addr1); + let m2 = MemberInfo::new(node_id, addr2, addr2); + + // Equality is based on node_id only + assert_eq!(m1, m2); +} + +#[test] +fn test_member_info_hash() { + use std::collections::hash_map::DefaultHasher; + use std::hash::{Hash, Hasher}; + + let node_id = NodeId::generate(); + let addr1: SocketAddr = "127.0.0.1:8000".parse().unwrap(); + let addr2: SocketAddr = "127.0.0.1:9000".parse().unwrap(); + + let m1 = MemberInfo::new(node_id, addr1, addr1); + let m2 = MemberInfo::new(node_id, addr2, addr2); + + let mut hasher1 = DefaultHasher::new(); + let mut hasher2 = DefaultHasher::new(); + m1.hash(&mut hasher1); + m2.hash(&mut hasher2); + + // Same node_id should have same hash + assert_eq!(hasher1.finish(), hasher2.finish()); +} + +// ============================================================================ 
+// ActorLocation Tests +// ============================================================================ + +#[test] +fn test_actor_location() { + let actor_id = ActorId::local(1); + let node_id = NodeId::generate(); + + let location = ActorLocation::new(actor_id, node_id); + assert_eq!(location.actor_id, actor_id); + assert_eq!(location.node_id, node_id); + assert_eq!(location.version, 0); +} + +// ============================================================================ +// FailureInfo Tests +// ============================================================================ + +#[test] +fn test_failure_info() { + let node_id = NodeId::generate(); + let reporter_id = NodeId::generate(); + + let failure = FailureInfo { + node_id, + status: NodeStatus::PFail, + epoch: 5, + reported_by: reporter_id, + }; + + assert_eq!(failure.node_id, node_id); + assert_eq!(failure.status, NodeStatus::PFail); + assert_eq!(failure.epoch, 5); + assert_eq!(failure.reported_by, reporter_id); +} + +// ============================================================================ +// NamedActorInstance Tests +// ============================================================================ + +#[test] +fn test_named_actor_instance_new() { + let node_id = NodeId::generate(); + let actor_id = ActorId::local(42); + + let instance = NamedActorInstance::new(node_id, actor_id); + + assert_eq!(instance.node_id, node_id); + assert_eq!(instance.actor_id, actor_id); + assert!(instance.metadata.is_empty()); +} + +#[test] +fn test_named_actor_instance_with_metadata() { + let node_id = NodeId::generate(); + let actor_id = ActorId::local(42); + let mut metadata = HashMap::new(); + metadata.insert("class".to_string(), "Counter".to_string()); + metadata.insert("module".to_string(), "__main__".to_string()); + metadata.insert("file".to_string(), "/app/main.py".to_string()); + + let instance = NamedActorInstance::with_metadata(node_id, actor_id, metadata.clone()); + + assert_eq!(instance.node_id, node_id); + 
assert_eq!(instance.actor_id, actor_id); + assert_eq!(instance.metadata.get("class"), Some(&"Counter".to_string())); + assert_eq!( + instance.metadata.get("module"), + Some(&"__main__".to_string()) + ); + assert_eq!( + instance.metadata.get("file"), + Some(&"/app/main.py".to_string()) + ); +} + +// ============================================================================ +// NamedActorInfo Tests +// ============================================================================ + +#[test] +fn test_named_actor_info_new() { + let path = ActorPath::new("services/llm").unwrap(); + let info = NamedActorInfo::new(path.clone()); + + assert_eq!(info.path, path); + assert!(info.instances.is_empty()); + assert!(info.is_empty()); + assert_eq!(info.version, 0); +} + +#[test] +fn test_named_actor_info_with_instance() { + let path = ActorPath::new("services/llm").unwrap(); + let node_id = NodeId::generate(); + + let info = NamedActorInfo::with_instance(path.clone(), node_id); + + assert_eq!(info.path, path); + assert_eq!(info.instance_count(), 1); + assert!(!info.is_empty()); + assert_eq!(info.version, 1); + assert!(info.instance_nodes.contains(&node_id)); +} + +#[test] +fn test_named_actor_info_with_full_instance() { + let path = ActorPath::new("actors/counter").unwrap(); + let node_id = NodeId::generate(); + let actor_id = ActorId::local(42); + let mut metadata = HashMap::new(); + metadata.insert("class".to_string(), "Counter".to_string()); + + let instance = NamedActorInstance::with_metadata(node_id, actor_id, metadata); + let info = NamedActorInfo::with_full_instance(path.clone(), instance); + + assert_eq!(info.path, path); + assert_eq!(info.instance_count(), 1); + assert!(info.instance_nodes.contains(&node_id)); + assert!(info.instances.contains_key(&node_id)); + + let retrieved = info.get_instance(&node_id).unwrap(); + assert_eq!(retrieved.actor_id, actor_id); + assert_eq!( + retrieved.metadata.get("class"), + Some(&"Counter".to_string()) + ); +} + +#[test] +fn 
test_named_actor_info_add_instance() { + let path = ActorPath::new("services/llm").unwrap(); + let node1 = NodeId::generate(); + let node2 = NodeId::generate(); + + let mut info = NamedActorInfo::new(path); + info.add_instance(node1); + info.add_instance(node2); + + assert_eq!(info.instance_count(), 2); + assert_eq!(info.version, 2); +} + +#[test] +fn test_named_actor_info_add_duplicate_instance() { + let path = ActorPath::new("services/llm").unwrap(); + let node_id = NodeId::generate(); + + let mut info = NamedActorInfo::new(path); + info.add_instance(node_id); + info.add_instance(node_id); // Duplicate + + assert_eq!(info.instance_count(), 1); + assert_eq!(info.version, 1); // Version not incremented for duplicate +} + +#[test] +fn test_named_actor_info_add_full_instance() { + let path = ActorPath::new("actors/counter").unwrap(); + let node1 = NodeId::generate(); + let node2 = NodeId::generate(); + let actor_id1 = ActorId::local(1); + let actor_id2 = ActorId::local(2); + + let mut info = NamedActorInfo::new(path); + + let instance1 = NamedActorInstance::new(node1, actor_id1); + info.add_full_instance(instance1); + assert_eq!(info.instance_count(), 1); + + let instance2 = NamedActorInstance::new(node2, actor_id2); + info.add_full_instance(instance2); + assert_eq!(info.instance_count(), 2); + + assert!(info.get_instance(&node1).is_some()); + assert!(info.get_instance(&node2).is_some()); + assert_eq!(info.get_instance(&node1).unwrap().actor_id, actor_id1); + assert_eq!(info.get_instance(&node2).unwrap().actor_id, actor_id2); +} + +#[test] +fn test_named_actor_info_remove_instance() { + let path = ActorPath::new("services/llm").unwrap(); + let node1 = NodeId::generate(); + let node2 = NodeId::generate(); + + let mut info = NamedActorInfo::new(path); + info.add_instance(node1); + info.add_instance(node2); + + assert!(info.remove_instance(&node1)); + assert_eq!(info.instance_count(), 1); + + assert!(!info.remove_instance(&node1)); // Already removed +} + +#[test] +fn 
test_named_actor_info_get_instance_not_found() { + let path = ActorPath::new("actors/counter").unwrap(); + let node_id = NodeId::generate(); + + let info = NamedActorInfo::new(path); + + assert!(info.get_instance(&node_id).is_none()); +} + +#[test] +fn test_named_actor_info_merge() { + let path = ActorPath::new("services/llm").unwrap(); + let node1 = NodeId::generate(); + let node2 = NodeId::generate(); + let node3 = NodeId::generate(); + + let mut info1 = NamedActorInfo::with_instance(path.clone(), node1); + info1.add_instance(node2); + + let mut info2 = NamedActorInfo::with_instance(path.clone(), node2); + info2.add_instance(node3); + + info1.merge(&info2); + + assert_eq!(info1.instance_count(), 3); + assert!(info1.instance_nodes.contains(&node1)); + assert!(info1.instance_nodes.contains(&node2)); + assert!(info1.instance_nodes.contains(&node3)); +} + +#[test] +fn test_named_actor_info_merge_with_full_instances() { + let path = ActorPath::new("actors/counter").unwrap(); + let node1 = NodeId::generate(); + let node2 = NodeId::generate(); + let actor_id1 = ActorId::local(1); + let actor_id2 = ActorId::local(2); + + let mut metadata1 = HashMap::new(); + metadata1.insert("class".to_string(), "Counter".to_string()); + let instance1 = NamedActorInstance::with_metadata(node1, actor_id1, metadata1); + let mut info1 = NamedActorInfo::with_full_instance(path.clone(), instance1); + + let mut metadata2 = HashMap::new(); + metadata2.insert("class".to_string(), "Counter".to_string()); + let instance2 = NamedActorInstance::with_metadata(node2, actor_id2, metadata2); + let info2 = NamedActorInfo::with_full_instance(path.clone(), instance2); + + info1.merge(&info2); + + assert_eq!(info1.instance_count(), 2); + assert!(info1.get_instance(&node1).is_some()); + assert!(info1.get_instance(&node2).is_some()); +} + +#[test] +fn test_named_actor_info_select_instance() { + let path = ActorPath::new("services/llm").unwrap(); + let node_id = NodeId::generate(); + + let info = 
NamedActorInfo::with_instance(path, node_id); + + // Should return the only instance + let selected = info.select_instance(); + assert_eq!(selected, Some(node_id)); +} + +#[test] +fn test_named_actor_info_select_instance_empty() { + let path = ActorPath::new("services/llm").unwrap(); + let info = NamedActorInfo::new(path); + + assert!(info.select_instance().is_none()); +} + +#[test] +fn test_named_actor_info_node_ids_iterator() { + let path = ActorPath::new("actors/counter").unwrap(); + let node1 = NodeId::generate(); + let node2 = NodeId::generate(); + + let mut info = NamedActorInfo::new(path); + info.add_instance(node1); + info.add_instance(node2); + + let node_ids: Vec<_> = info.node_ids().collect(); + assert_eq!(node_ids.len(), 2); + assert!(node_ids.contains(&&node1)); + assert!(node_ids.contains(&&node2)); +} diff --git a/crates/pulsing-actor/tests/cluster/mod.rs b/crates/pulsing-actor/tests/cluster/mod.rs new file mode 100644 index 000000000..70c258719 --- /dev/null +++ b/crates/pulsing-actor/tests/cluster/mod.rs @@ -0,0 +1,8 @@ +//! Cluster module tests +//! +//! This module contains tests for cluster-related functionality: +//! - Member management (member_tests) +//! - Gossip protocol (gossip_tests) +//! - SWIM protocol (swim_tests) + +mod member_tests; diff --git a/crates/pulsing-actor/tests/cluster_tests.rs b/crates/pulsing-actor/tests/cluster_tests.rs index 0624c0469..0b7f98d35 100644 --- a/crates/pulsing-actor/tests/cluster_tests.rs +++ b/crates/pulsing-actor/tests/cluster_tests.rs @@ -1,7 +1,12 @@ //! Cluster and Gossip protocol tests +//! +//! This file contains integration tests for cluster functionality. +//! 
Unit tests for member types are in tests/cluster/member_tests.rs + +mod cluster; use pulsing_actor::actor::{ActorId, NodeId}; -use pulsing_actor::cluster::{GossipConfig, MemberInfo, MemberStatus}; +use pulsing_actor::cluster::GossipConfig; use pulsing_actor::prelude::*; use std::net::SocketAddr; use std::time::Duration; @@ -18,114 +23,6 @@ fn test_gossip_config_default() { assert_eq!(config.fanout, 3); } -// ============================================================================ -// Member Status Tests -// ============================================================================ - -#[test] -fn test_member_status_alive() { - let status = MemberStatus::Alive; - assert!(status.is_alive()); - assert!(status.is_reachable()); -} - -#[test] -fn test_member_status_suspect() { - let status = MemberStatus::Suspect; - assert!(!status.is_alive()); - assert!(status.is_reachable()); // Suspect is still reachable -} - -#[test] -fn test_member_status_dead() { - let status = MemberStatus::Dead; - assert!(!status.is_alive()); - assert!(!status.is_reachable()); -} - -#[test] -fn test_member_status_leaving() { - let status = MemberStatus::Leaving; - assert!(!status.is_alive()); - assert!(!status.is_reachable()); -} - -// ============================================================================ -// MemberInfo Tests -// ============================================================================ - -#[test] -fn test_member_info_creation() { - let node_id = NodeId::generate(); - let addr: SocketAddr = "127.0.0.1:8000".parse().unwrap(); - let gossip_addr: SocketAddr = "127.0.0.1:7000".parse().unwrap(); - - let member = MemberInfo::new(node_id, addr, gossip_addr); - - assert_eq!(member.node_id, node_id); - assert_eq!(member.addr, addr); - assert_eq!(member.gossip_addr, gossip_addr); - assert_eq!(member.status, MemberStatus::Alive); - assert_eq!(member.incarnation, 0); -} - -#[test] -fn test_member_info_refute() { - let node_id = NodeId::generate(); - let addr: SocketAddr = 
"127.0.0.1:8000".parse().unwrap(); - - let mut member = MemberInfo::new(node_id, addr, addr); - member.suspect(); - assert_eq!(member.status, MemberStatus::Suspect); - - member.refute(); - assert_eq!(member.status, MemberStatus::Alive); - assert_eq!(member.incarnation, 1); -} - -#[test] -fn test_member_info_supersedes_by_incarnation() { - let node_id = NodeId::generate(); - let addr: SocketAddr = "127.0.0.1:8000".parse().unwrap(); - - let mut m1 = MemberInfo::new(node_id, addr, addr); - let m2 = MemberInfo::new(node_id, addr, addr); - - // Same incarnation - neither supersedes - assert!(!m1.supersedes(&m2)); - assert!(!m2.supersedes(&m1)); - - // Higher incarnation supersedes - m1.incarnation = 1; - assert!(m1.supersedes(&m2)); - assert!(!m2.supersedes(&m1)); -} - -#[test] -fn test_member_info_supersedes_by_status() { - let node_id = NodeId::generate(); - let addr: SocketAddr = "127.0.0.1:8000".parse().unwrap(); - - let mut alive = MemberInfo::new(node_id, addr, addr); - let mut suspect = MemberInfo::new(node_id, addr, addr); - let mut dead = MemberInfo::new(node_id, addr, addr); - - alive.status = MemberStatus::Alive; - suspect.status = MemberStatus::Suspect; - dead.status = MemberStatus::Dead; - - // Dead supersedes all - assert!(dead.supersedes(&alive)); - assert!(dead.supersedes(&suspect)); - - // Suspect supersedes Alive - assert!(suspect.supersedes(&alive)); - - // Alive doesn't supersede others - assert!(!alive.supersedes(&suspect)); - assert!(!alive.supersedes(&dead)); -} - // ============================================================================ // ActorSystem Cluster Tests // ============================================================================ diff --git a/crates/pulsing-actor/tests/system_actor_tests.rs b/crates/pulsing-actor/tests/system_actor_tests.rs index 4d4390733..9aaf698ff 100644 --- a/crates/pulsing-actor/tests/system_actor_tests.rs +++ b/crates/pulsing-actor/tests/system_actor_tests.rs @@ -2,8 +2,11 @@ //! //! 
Tests for the built-in SystemActor functionality +use pulsing_actor::actor::ActorId; use pulsing_actor::prelude::*; -use pulsing_actor::system_actor::{ActorInfo, SystemMessage, SystemResponse, SYSTEM_ACTOR_PATH}; +use pulsing_actor::system_actor::{ + ActorInfo, ActorRegistry, SystemMessage, SystemMetrics, SystemResponse, SYSTEM_ACTOR_PATH, +}; use std::time::Duration; // ============================================================================ @@ -417,3 +420,85 @@ async fn test_system_actor_uptime_increases() { system.shutdown().await.unwrap(); } + +// ============================================================================ +// ActorRegistry Tests (moved from src/system_actor/mod.rs) +// ============================================================================ + +#[test] +fn test_actor_registry() { + let registry = ActorRegistry::new(); + let actor_id = ActorId::local(1); + + registry.register("test", actor_id, "TestActor", true); + assert!(registry.contains("test")); + assert_eq!(registry.count(), 1); + + let info = registry.get_info("test").unwrap(); + assert_eq!(info.name, "test"); + assert_eq!(info.actor_type, "TestActor"); + + registry.unregister("test"); + assert!(!registry.contains("test")); +} + +#[test] +fn test_actor_registry_list_all() { + let registry = ActorRegistry::new(); + + registry.register("actor1", ActorId::local(1), "TypeA", true); + registry.register("actor2", ActorId::local(2), "TypeB", false); + + let actors = registry.list_all(); + assert_eq!(actors.len(), 2); +} + +#[test] +fn test_actor_registry_get_not_found() { + let registry = ActorRegistry::new(); + assert!(registry.get("nonexistent").is_none()); + assert!(registry.get_info("nonexistent").is_none()); +} + +// ============================================================================ +// SystemMetrics Tests (moved from src/system_actor/mod.rs) +// ============================================================================ + +#[test] +fn test_system_metrics() { + let 
metrics = SystemMetrics::new(); + + metrics.inc_message(); + metrics.inc_message(); + assert_eq!(metrics.messages_total(), 2); + + metrics.inc_actor_created(); + assert_eq!(metrics.actors_created(), 1); + + metrics.inc_actor_stopped(); + assert_eq!(metrics.actors_stopped(), 1); +} + +#[test] +fn test_system_metrics_concurrent() { + use std::sync::Arc; + use std::thread; + + let metrics = Arc::new(SystemMetrics::new()); + let mut handles = vec![]; + + for _ in 0..10 { + let m = metrics.clone(); + handles.push(thread::spawn(move || { + for _ in 0..100 { + m.inc_message(); + } + })); + } + + for h in handles { + h.join().unwrap(); + } + + assert_eq!(metrics.messages_total(), 1000); +} diff --git a/docs/actor-list-guide.md b/docs/actor-list-guide.md index 7402674b5..9817b4a4d 100644 --- a/docs/actor-list-guide.md +++ b/docs/actor-list-guide.md @@ -1,5 +1,10 @@ # Actor List 命令使用指南 +!!! note "文档迁移" + 本页已迁移到文档站点的 **Guide** 中,并会以站点版本为准: + - `docs/src/guide/actor_list.zh.md` + - `docs/src/guide/actor_list.md` + `pulsing actor list` 命令用于列出当前 Actor 系统中的 actors。 ## 基本用法 diff --git a/docs/actor-list-implementation.md b/docs/actor-list-implementation.md index b9ea890ef..b25f8dd5e 100644 --- a/docs/actor-list-implementation.md +++ b/docs/actor-list-implementation.md @@ -1,5 +1,12 @@ # Pulsing Actor List 完整实现总结 +!!! note "文档迁移" + 本页偏实现细节,面向用户的最新版已迁移到文档站点的 **Guide**: + - `docs/src/guide/actor_list.zh.md` + - `docs/src/guide/actor_list.md` + - `docs/src/guide/operations.zh.md` + - `docs/src/guide/operations.md` + ## ✅ 已完成功能 ### 1. 本地查询模式 diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 40d5b7f53..cdc5ea33f 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -1,10 +1,11 @@ site_name: Pulsing site_description: A lightweight distributed Actor framework for building scalable systems -site_url: https://github.com/DeepLink-org/Pulsing +# Docs may be hosted elsewhere, but repo is the canonical entry point. 
+site_url: https://github.com/reiase/pulsing docs_dir: src -repo_name: DeepLink-org/Pulsing -repo_url: https://github.com/DeepLink-org/Pulsing +repo_name: reiase/pulsing +repo_url: https://github.com/reiase/pulsing theme: name: material @@ -90,6 +91,11 @@ plugins: Actor System: Actor 系统 Remote Actors: 远程 Actor Security: 安全 + Reliability: 可靠性 + Operations: 运维(CLI) + Actor List: Actor 列表 + Inspect: 巡检 + Bench: 压测 Agent: Agent 框架 Overview: 概述 AutoGen: AutoGen @@ -125,6 +131,11 @@ nav: - Actor System: guide/actor_system.md - Remote Actors: guide/remote_actors.md - Security: guide/security.md + - Reliability: guide/reliability.md + - Operations: guide/operations.md + - Actor List: guide/actor_list.md + - Inspect: guide/inspect.md + - Bench: guide/bench.md - Distributed Queue: guide/queue.md - Semantics: guide/semantics.md - Agent: @@ -154,4 +165,4 @@ extra: extra_css: - assets/stylesheets/home.css -copyright: © 2025 Pulsing. All rights reserved. +copyright: © 2026 Pulsing. All rights reserved. diff --git a/docs/src/api_reference.md b/docs/src/api_reference.md index 3792f875a..68c189427 100644 --- a/docs/src/api_reference.md +++ b/docs/src/api_reference.md @@ -220,6 +220,43 @@ async def main(): await shutdown() ``` +#### Supervision (actor-level restarts) + +`@remote` supports **actor-level restarts** via optional parameters: + +- `restart_policy`: `"never"` (default), `"always"`, `"on-failure"` +- `max_restarts`: maximum number of restarts (default: `3`) +- `min_backoff` / `max_backoff`: backoff bounds in seconds + +Example: + +```python +from pulsing.actor import remote + +@remote(restart_policy="on-failure", max_restarts=5, min_backoff=0.2, max_backoff=10.0) +class Worker: + def work(self, x: int) -> int: + return 100 // x +``` + +Notes: + +- This is **not** a supervision tree. +- Restarts do **not** imply exactly-once semantics; design idempotent handlers. 
+ +## Helpers + +### ask_with_timeout + +Convenience wrapper around `ActorRef.ask()` with timeout support: + +```python +from pulsing.actor import ask_with_timeout + +result = await ask_with_timeout(ref, {"op": "compute"}, timeout=10.0) +``` + + After decoration, the class provides: - `spawn(**kwargs) -> ActorRef`: Create actor (uses global system from `init()`) diff --git a/docs/src/api_reference.zh.md b/docs/src/api_reference.zh.md index c31767b9a..188b7c07f 100644 --- a/docs/src/api_reference.zh.md +++ b/docs/src/api_reference.zh.md @@ -220,6 +220,43 @@ async def main(): await shutdown() ``` +#### 监督(actor 级别重启) + +`@remote` 支持通过可选参数配置 **actor 级别重启**: + +- `restart_policy`:`"never"`(默认)、`"always"`、`"on-failure"` +- `max_restarts`:最大重启次数(默认 `3`) +- `min_backoff` / `max_backoff`:退避时间下限/上限(单位秒) + +示例: + +```python +from pulsing.actor import remote + +@remote(restart_policy="on-failure", max_restarts=5, min_backoff=0.2, max_backoff=10.0) +class Worker: + def work(self, x: int) -> int: + return 100 // x +``` + +说明: + +- 这**不是** supervision tree。 +- 重启也**不等于** exactly-once;业务逻辑需要幂等与去重。 + +## 辅助函数 + +### ask_with_timeout + +为 `ActorRef.ask()` 提供一个带超时的便捷封装: + +```python +from pulsing.actor import ask_with_timeout + +result = await ask_with_timeout(ref, {"op": "compute"}, timeout=10.0) +``` + + 装饰后,类提供: - `spawn(**kwargs) -> ActorRef`: 创建 actor(使用 `init()` 初始化的全局系统) diff --git a/docs/src/design/actor-system.md b/docs/src/design/actor-system.md index 2ea889e8c..6a1afc5ac 100644 --- a/docs/src/design/actor-system.md +++ b/docs/src/design/actor-system.md @@ -457,7 +457,7 @@ match actor_ref.ask::(msg).await { ## 未来规划 -- [ ] Actor 监督树 (Supervision) +- [ ] Actor 监督树 (Supervision)(不计划引入) - [ ] 持久化支持 - [ ] 更完善的 Leader Election - [ ] Metrics 和 Tracing 集成 diff --git a/docs/src/design/actor-system.zh.md b/docs/src/design/actor-system.zh.md index 2ea889e8c..6a1afc5ac 100644 --- a/docs/src/design/actor-system.zh.md +++ b/docs/src/design/actor-system.zh.md @@ -457,7 +457,7 @@ match 
actor_ref.ask::(msg).await { ## 未来规划 -- [ ] Actor 监督树 (Supervision) +- [ ] Actor 监督树 (Supervision)(不计划引入) - [ ] 持久化支持 - [ ] 更完善的 Leader Election - [ ] Metrics 和 Tracing 集成 diff --git a/docs/src/examples/llm_inference.md b/docs/src/examples/llm_inference.md index 672a96fa6..05bf47832 100644 --- a/docs/src/examples/llm_inference.md +++ b/docs/src/examples/llm_inference.md @@ -1,25 +1,84 @@ -# LLM Inference (overview) +# LLM Inference (runnable) -Pulsing is a **general-purpose distributed actor framework** and also a good fit for **LLM inference services**, especially when you need: +This guide shows how to run a **router + worker** LLM service with Pulsing, and expose an **OpenAI-compatible HTTP API**. -- a router + worker architecture -- distributed scheduling / load awareness -- streaming responses (`ask_stream`) +## Architecture -This page is currently an overview (Draft). See: +- **Router**: accepts HTTP requests, selects a worker, forwards `GenerateRequest` / `GenerateStreamRequest` +- **Workers**: host model replicas -- `docs/src/design/http2-transport.md` for the HTTP/2 streaming protocol design -- `docs/src/design/load_sync.md` for load sync concepts +## 0) Prerequisites -## Suggested architecture +- `pip install pulsing` +- Choose one backend: + - **Transformers**: install `torch` + `transformers` + - **vLLM**: install `vllm` -- **Router**: accepts client requests, chooses a worker, forwards request -- **Workers**: host model replicas, expose `generate` / `generate_stream` +## 1) Start the Router (Terminal A) -## Next step +The router needs an **actor system address** so workers can join the same cluster: -If you want this page to become a runnable example, tell me which backend you want: +```bash +pulsing actor router --addr 0.0.0.0:8000 --http_port 8080 --model_name my-llm +``` -- `transformers` + `torch` -- `vllm` -- `triton` / custom engine +## 2) Start workers + +You can run **one or more** workers. Each worker should join the router node via `--seeds`. 
+ +### Option A: Transformers worker (Terminal B) + +```bash +pulsing actor transformers --model gpt2 --device cpu --addr 0.0.0.0:8001 --seeds 127.0.0.1:8000 +``` + +### Option B: vLLM worker (Terminal C) + +```bash +pulsing actor vllm --model Qwen/Qwen2.5-0.5B --addr 0.0.0.0:8002 --seeds 127.0.0.1:8000 +``` + +## 3) Verify cluster + workers + +### List actors (observer mode) + +```bash +pulsing actor list --endpoint 127.0.0.1:8000 +``` + +### Inspect cluster + +```bash +pulsing inspect --seeds 127.0.0.1:8000 +``` + +## 4) Call the OpenAI-compatible API + +### Non-streaming + +```bash +curl -s http://localhost:8080/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{"model":"my-llm","messages":[{"role":"user","content":"Hello"}],"stream":false}' +``` + +### Streaming (SSE) + +```bash +curl -N http://localhost:8080/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{"model":"my-llm","messages":[{"role":"user","content":"Tell me a joke"}],"stream":true}' +``` + +## Troubleshooting + +- If you see `No available workers`, ensure: + - router is started with `--addr` + - workers join via `--seeds ` + - the worker actor name is `worker` (default) + +See also: + +- [Operations (CLI)](../guide/operations.md) +- [HTTP2 Transport (design)](../design/http2-transport.md) +- [Load Sync (design)](../design/load_sync.md) diff --git a/docs/src/examples/llm_inference.zh.md b/docs/src/examples/llm_inference.zh.md index d30a1f45f..143d7d50f 100644 --- a/docs/src/examples/llm_inference.zh.md +++ b/docs/src/examples/llm_inference.zh.md @@ -1,25 +1,84 @@ -# LLM 推理(概览) +# LLM 推理(可运行) -Pulsing 正在变成一个**通用的分布式 Actor 框架**,同时也很适合用于 **LLM 推理服务**,尤其是需要: +本指南展示如何用 Pulsing 跑通 **router + worker** 架构,并对外暴露 **OpenAI 兼容 HTTP API**。 -- router + worker 架构 -- 分布式调度 / 负载感知 -- 流式响应(`ask_stream`) +## 推荐架构 -本页目前是概览(Draft)。相关设计可先看: +- **Router**:接入 HTTP 请求,选择 worker,转发 `GenerateRequest` / `GenerateStreamRequest` +- **Worker**:承载模型副本 -- 
`docs/src/design/http2-transport.md`:HTTP/2 流式协议设计 -- `docs/src/design/load_sync.md`:负载同步机制 +## 0)前置条件 -## 推荐架构 +- `pip install pulsing` +- 选择一种或两种后端: + - **Transformers**:安装 `torch` + `transformers` + - **vLLM**:安装 `vllm` + +## 1)启动 Router(终端 A) + +Router 需要指定 **actor system 地址**,以便其它进程启动的 workers 加入同一集群: + +```bash +pulsing actor router --addr 0.0.0.0:8000 --http_port 8080 --model_name my-llm +``` + +## 2)启动 Worker + +你可以启动 **一个或多个** worker。每个 worker 通过 `--seeds` 加入 Router 节点。 + +### 方案 A:Transformers Worker(终端 B) + +```bash +pulsing actor transformers --model gpt2 --device cpu --addr 0.0.0.0:8001 --seeds 127.0.0.1:8000 +``` + +### 方案 B:vLLM Worker(终端 C) + +```bash +pulsing actor vllm --model Qwen/Qwen2.5-0.5B --addr 0.0.0.0:8002 --seeds 127.0.0.1:8000 +``` + +## 3)验证集群与 worker + +### 列出 actors(观察者模式) + +```bash +pulsing actor list --endpoint 127.0.0.1:8000 +``` + +### 巡检集群 + +```bash +pulsing inspect --seeds 127.0.0.1:8000 +``` + +## 4)调用 OpenAI 兼容 API + +### 非流式 + +```bash +curl -s http://localhost:8080/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{"model":"my-llm","messages":[{"role":"user","content":"Hello"}],"stream":false}' +``` + +### 流式(SSE) + +```bash +curl -N http://localhost:8080/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{"model":"my-llm","messages":[{"role":"user","content":"Tell me a joke"}],"stream":true}' +``` -- **Router**:接入请求,选择 worker,转发请求 -- **Worker**:承载模型副本,对外提供 `generate` / `generate_stream` +## 排障 -## 下一步 +- 如果出现 `No available workers`,请检查: + - router 是否带了 `--addr` + - worker 是否通过 `--seeds <router_addr>`(例如 `--seeds 127.0.0.1:8000`)加入 + - worker actor 名称是否为 `worker`(默认) -如果你希望把这里做成可运行示例,请告诉我你希望使用哪种后端: +更多: -- `transformers` + `torch` -- `vllm` -- `triton` / 自研引擎 +- [运维(CLI)](../guide/operations.zh.md) +- [HTTP2 传输(设计)](../design/http2-transport.zh.md) +- [负载同步(设计)](../design/load_sync.zh.md) diff --git a/docs/src/guide/actor_list.md b/docs/src/guide/actor_list.md new file mode 100644 index 000000000..0b5173852 --- /dev/null +++ 
b/docs/src/guide/actor_list.md @@ -0,0 +1,39 @@ +# Actor List (CLI) + +`pulsing actor list` is a lightweight **observer** tool that queries actor lists via HTTP endpoints. + +## When to use + +- Verify whether a node is reachable +- Check which actors exist on a node +- Get a quick view of a cluster without joining the gossip cluster + +## Single node + +```bash +pulsing actor list --endpoint 127.0.0.1:8000 +``` + +### Show internal/system actors + +```bash +pulsing actor list --endpoint 127.0.0.1:8000 --all_actors True +``` + +### JSON output + +```bash +pulsing actor list --endpoint 127.0.0.1:8000 --json True +``` + +## Cluster (via seeds) + +```bash +pulsing actor list --seeds 127.0.0.1:8000,127.0.0.1:8001 +``` + +## Notes + +- This command uses HTTP/2 (h2c) requests and does **not** require joining the cluster. +- If a node does not expose the HTTP endpoints, it will appear as unreachable. + diff --git a/docs/src/guide/actor_list.zh.md b/docs/src/guide/actor_list.zh.md new file mode 100644 index 000000000..75efafd48 --- /dev/null +++ b/docs/src/guide/actor_list.zh.md @@ -0,0 +1,39 @@ +# Actor 列表(CLI) + +`pulsing actor list` 是一个轻量的 **观察者(observer)** 工具:通过 HTTP 端点查询 actor 列表,**无需加入 gossip 集群**。 + +## 适用场景 + +- 验证节点是否可达 +- 查看某个节点上有哪些 actors +- 快速了解集群概况(从 seeds 发现并逐个查询) + +## 单节点查询 + +```bash +pulsing actor list --endpoint 127.0.0.1:8000 +``` + +### 显示系统/内部 actors + +```bash +pulsing actor list --endpoint 127.0.0.1:8000 --all_actors True +``` + +### JSON 输出 + +```bash +pulsing actor list --endpoint 127.0.0.1:8000 --json True +``` + +## 集群查询(通过 seeds) + +```bash +pulsing actor list --seeds 127.0.0.1:8000,127.0.0.1:8001 +``` + +## 说明 + +- 该命令通过 HTTP/2(h2c)请求实现,不需要加入集群。 +- 如果节点未暴露相关 HTTP 端点,会显示为不可连接。 + diff --git a/docs/src/guide/bench.md b/docs/src/guide/bench.md new file mode 100644 index 000000000..f01b68d46 --- /dev/null +++ b/docs/src/guide/bench.md @@ -0,0 +1,18 @@ +# Bench (CLI) + +`pulsing bench` runs load tests against an inference endpoint (typically an 
OpenAI-compatible router). + +## Basic usage + +```bash +pulsing bench gpt2 --url http://localhost:8080 +``` + +## Notes + +- The benchmark extension is optional. If you see `pulsing._bench module not found`, install it with: + +```bash +maturin develop --manifest-path crates/pulsing-bench-py/Cargo.toml +``` + diff --git a/docs/src/guide/bench.zh.md b/docs/src/guide/bench.zh.md new file mode 100644 index 000000000..9bb46598d --- /dev/null +++ b/docs/src/guide/bench.zh.md @@ -0,0 +1,18 @@ +# 压测(CLI) + +`pulsing bench` 用于对推理端点进行压测(通常是 OpenAI 兼容 Router)。 + +## 基本用法 + +```bash +pulsing bench gpt2 --url http://localhost:8080 +``` + +## 说明 + +- Benchmark 扩展是可选的。如果出现 `pulsing._bench module not found`,需要单独安装: + +```bash +maturin develop --manifest-path crates/pulsing-bench-py/Cargo.toml +``` + diff --git a/docs/src/guide/index.md b/docs/src/guide/index.md index 106eaa5d2..242aa4f6d 100644 --- a/docs/src/guide/index.md +++ b/docs/src/guide/index.md @@ -36,4 +36,6 @@ Welcome to the Pulsing User Guide. 
This guide covers the core concepts and usage - **New to Pulsing?** Start with the [Quick Start Guide](../quickstart/index.md) - **Want to understand the design?** Check out the [Design Documents](../design/actor-system.md) +- **Reliability rules**: see [Reliability](reliability.md) +- **Operating a cluster?** See [Operations (CLI)](operations.md) - **Need API details?** See the [API Reference](../api_reference.md) diff --git a/docs/src/guide/index.zh.md b/docs/src/guide/index.zh.md index 9be84d873..f97f2203b 100644 --- a/docs/src/guide/index.zh.md +++ b/docs/src/guide/index.zh.md @@ -36,4 +36,6 @@ - **刚接触 Pulsing?** 从[快速开始指南](../quickstart/index.md)开始 - **想了解设计?** 查看[设计文档](../design/actor-system.md) +- **可靠性实践口径**:查看[可靠性实践](reliability.zh.md) +- **运维/巡检集群?** 查看[运维(CLI)](operations.zh.md) - **需要 API 详情?** 查看 [API 参考](../api_reference.md) diff --git a/docs/src/guide/inspect.md b/docs/src/guide/inspect.md new file mode 100644 index 000000000..9da5d6999 --- /dev/null +++ b/docs/src/guide/inspect.md @@ -0,0 +1,18 @@ +# Inspect (CLI) + +`pulsing inspect` joins a cluster (via seeds) and prints a human-friendly snapshot: + +- cluster members (node id / addr / status) +- named actor distribution (best-effort) + +## Usage + +```bash +pulsing inspect --seeds 127.0.0.1:8000 +``` + +## Notes + +- `--seeds` is required. +- If your seeds are local (`127.0.0.1` / `localhost`), the CLI binds to `127.0.0.1:0` for connectivity. 
+ diff --git a/docs/src/guide/inspect.zh.md b/docs/src/guide/inspect.zh.md new file mode 100644 index 000000000..5eb525de4 --- /dev/null +++ b/docs/src/guide/inspect.zh.md @@ -0,0 +1,18 @@ +# 巡检(CLI) + +`pulsing inspect` 会通过 seeds 加入集群,并输出一个便于排障的快照: + +- 集群成员(node id / addr / status) +- 命名 actors 的分布(best-effort) + +## 用法 + +```bash +pulsing inspect --seeds 127.0.0.1:8000 +``` + +## 说明 + +- `--seeds` 为必填参数。 +- 如果 seeds 是本地地址(`127.0.0.1` / `localhost`),CLI 会绑定 `127.0.0.1:0` 以保证连通性。 + diff --git a/docs/src/guide/operations.md b/docs/src/guide/operations.md new file mode 100644 index 000000000..b9b9225c5 --- /dev/null +++ b/docs/src/guide/operations.md @@ -0,0 +1,65 @@ +# Operations (CLI) + +This page is a practical entry point for operating and inspecting Pulsing systems using the built-in CLI. + +## What you can do + +- **Run services**: start a router or inference workers +- **Inspect a cluster**: view nodes + named actors +- **List actors**: query actors via HTTP (observer mode) +- **Benchmark**: run load tests against an OpenAI-compatible endpoint + +## Commands + +## Quick links + +- [Actor List](actor_list.md) +- [Inspect](inspect.md) +- [Bench](bench.md) + +### Start services (router / workers) + +- Router (OpenAI-compatible HTTP API): + +```bash +pulsing actor router --addr 0.0.0.0:8000 --http_port 8080 --model_name my-llm +``` + +- Transformers worker: + +```bash +pulsing actor transformers --model gpt2 --addr 0.0.0.0:8001 --seeds 127.0.0.1:8000 +``` + +- vLLM worker: + +```bash +pulsing actor vllm --model Qwen/Qwen2.5-0.5B --addr 0.0.0.0:8002 --seeds 127.0.0.1:8000 +``` + +### Inspect cluster + +```bash +pulsing inspect --seeds 127.0.0.1:8000 +``` + +### List actors (observer mode) + +```bash +# single node +pulsing actor list --endpoint 127.0.0.1:8000 + +# cluster (via seeds) +pulsing actor list --seeds 127.0.0.1:8000,127.0.0.1:8001 +``` + +### Benchmark an endpoint + +```bash +pulsing bench gpt2 --url http://localhost:8080 +``` + +## Next + +- For a runnable 
end-to-end guide, see [LLM Inference](../examples/llm_inference.md). + diff --git a/docs/src/guide/operations.zh.md b/docs/src/guide/operations.zh.md new file mode 100644 index 000000000..7666d672a --- /dev/null +++ b/docs/src/guide/operations.zh.md @@ -0,0 +1,65 @@ +# 运维(CLI) + +本页提供使用 Pulsing CLI 进行运行、巡检与排障的最小入口。 + +## 你能做什么 + +- **启动服务**:Router / 推理 Worker +- **巡检集群**:查看节点 + 命名 actors 分布 +- **列出 actors**:通过 HTTP 观察者模式查询(无需加入集群) +- **压测**:对 OpenAI 兼容端点做基准测试 + +## 常用命令 + +## 快速入口 + +- [Actor 列表](actor_list.zh.md) +- [巡检](inspect.zh.md) +- [压测](bench.zh.md) + +### 启动服务(router / workers) + +- Router(OpenAI 兼容 HTTP API): + +```bash +pulsing actor router --addr 0.0.0.0:8000 --http_port 8080 --model_name my-llm +``` + +- Transformers Worker: + +```bash +pulsing actor transformers --model gpt2 --addr 0.0.0.0:8001 --seeds 127.0.0.1:8000 +``` + +- vLLM Worker: + +```bash +pulsing actor vllm --model Qwen/Qwen2.5-0.5B --addr 0.0.0.0:8002 --seeds 127.0.0.1:8000 +``` + +### 巡检集群 + +```bash +pulsing inspect --seeds 127.0.0.1:8000 +``` + +### 列出 actors(观察者模式) + +```bash +# 单节点 +pulsing actor list --endpoint 127.0.0.1:8000 + +# 集群(通过 seeds) +pulsing actor list --seeds 127.0.0.1:8000,127.0.0.1:8001 +``` + +### 压测端点 + +```bash +pulsing bench gpt2 --url http://localhost:8080 +``` + +## 下一步 + +- 想按步骤跑通完整链路:见 [LLM 推理](../examples/llm_inference.zh.md)。 + diff --git a/docs/src/guide/reliability.md b/docs/src/guide/reliability.md new file mode 100644 index 000000000..301ca19d0 --- /dev/null +++ b/docs/src/guide/reliability.md @@ -0,0 +1,54 @@ +# Reliability (timeouts, retries, restarts) + +This page collects **practical reliability rules** for building production systems with Pulsing. + +## TL;DR + +- Treat network + nodes as unreliable: **use timeouts** and design for failures. +- Pulsing does **not** provide end-to-end exactly-once semantics: **design idempotent handlers**. +- Pulsing supports **actor-level restart** (no supervision tree): use it for crash recovery, not for correctness. 
+ +## Timeouts + +Prefer explicit timeouts on `ask`: + +```python +from pulsing.actor import ask_with_timeout + +result = await ask_with_timeout(ref, {"op": "compute"}, timeout=10.0) +``` + +## Retries (application-level) + +Pulsing does not hide retries for you. If you retry, assume duplicates are possible. + +Recommended pattern: + +- **idempotency key** in every request +- **dedup** in the actor state (or an external store) + +## Actor-level restart (supervision) + +You can configure restart policy on Python actors created via `@remote`: + +```python +from pulsing.actor import remote + +@remote(restart_policy="on-failure", max_restarts=5, min_backoff=0.2, max_backoff=10.0) +class Worker: + def work(self, x: int) -> int: + return 100 // x +``` + +### What it is (and isn’t) + +- **Is**: a crash-recovery mechanism for actor instances (with backoff and restart limits) +- **Is not**: a supervision tree, and **not** an exactly-once guarantee + +## Streaming resilience + +For streaming responses, assume partial streams are possible. 
Make chunks independently meaningful: + +- include `seq` / offsets / ids per chunk +- allow resume or dedup on the client side + diff --git a/docs/src/guide/reliability.zh.md b/docs/src/guide/reliability.zh.md new file mode 100644 index 000000000..6ffb06a72 --- /dev/null +++ b/docs/src/guide/reliability.zh.md @@ -0,0 +1,54 @@ +# 可靠性实践(超时 / 重试 / 重启) + +本页汇总在 Pulsing 上做生产化时,最容易踩坑、也最值得统一口径的可靠性实践。 + +## TL;DR + +- 网络与节点都会失败:**显式加超时**,并按失败场景设计。 +- Pulsing 不提供端到端 exactly-once:**业务必须幂等**。 +- Pulsing 支持 **actor 级别重启**(不引入 supervision tree):用它做“崩溃恢复”,不要把它当“正确性保证”。 + +## 超时 + +对 `ask` 建议显式加超时: + +```python +from pulsing.actor import ask_with_timeout + +result = await ask_with_timeout(ref, {"op": "compute"}, timeout=10.0) +``` + +## 重试(放在业务层) + +Pulsing 不会替你“隐式重试”。一旦你做重试,就要默认可能出现重复处理。 + +推荐模式: + +- 每个请求都带 **幂等键(idempotency key)** +- actor 内部(或外部存储)做 **去重(dedup)** + +## actor 级别重启(supervision) + +你可以在 Python 的 `@remote` 上配置重启策略: + +```python +from pulsing.actor import remote + +@remote(restart_policy="on-failure", max_restarts=5, min_backoff=0.2, max_backoff=10.0) +class Worker: + def work(self, x: int) -> int: + return 100 // x +``` + +### 它是什么 / 不是什么 + +- **是**:actor 实例崩溃后的自动恢复(带退避与重启上限) +- **不是**:supervision tree,也**不是** exactly-once 保证 + +## 流式响应的韧性 + +对流式响应要默认可能“部分输出后中断”。建议每个 chunk 自包含: + +- 每个 chunk 带 `seq` / offset / id +- 客户端可恢复或去重 + diff --git a/docs/src/guide/security.md b/docs/src/guide/security.md index 9b8804b1d..b4b20f740 100644 --- a/docs/src/guide/security.md +++ b/docs/src/guide/security.md @@ -11,6 +11,16 @@ Pulsing supports **passphrase-based mTLS (Mutual TLS)** for secure cluster commu - **Cluster isolation**: Different passphrases create completely isolated clusters - **Mutual authentication**: Both server and client verify each other's certificates +!!! danger "Pickle serialization risk (RCE)" + Pulsing currently uses **Python Pickle** for Python-to-Python message payloads. 
+ **Never accept untrusted payloads** or expose Pulsing ports to an untrusted network. + + Production guidance: + + - **Enable mTLS** by setting a `passphrase` (required for any real deployment) + - **Network isolation** (private VPC/subnet + firewall) is still recommended + - Treat the cluster as a **trusted boundary** until non-pickle codecs are the default + ## Enabling TLS ### Development Mode (No TLS) diff --git a/docs/src/guide/security.zh.md b/docs/src/guide/security.zh.md index a3d5c27a5..8eb91ff71 100644 --- a/docs/src/guide/security.zh.md +++ b/docs/src/guide/security.zh.md @@ -11,6 +11,16 @@ Pulsing 支持**基于口令的 mTLS(双向 TLS)**实现安全的集群通 - **集群隔离**:不同口令创建完全隔离的集群 - **双向认证**:服务端和客户端互相验证证书 +!!! danger "Pickle 序列化风险(可能导致 RCE)" + Pulsing 目前在 Python-to-Python 的消息载荷上使用 **Pickle**。 + **不要接收不受信的载荷**,也不要把 Pulsing 端口暴露在不受信网络中。 + + 生产建议: + + - 通过设置 `passphrase` **启用 mTLS**(任何真实部署都应开启) + - 仍建议做 **网络隔离**(私有网段 + 防火墙) + - 在默认序列化格式不再是 pickle 之前,把集群当作 **可信边界** + ## 启用 TLS ### 开发模式(无 TLS) diff --git a/docs/src/guide/semantics.md b/docs/src/guide/semantics.md index 2516a8c49..8ad889f37 100644 --- a/docs/src/guide/semantics.md +++ b/docs/src/guide/semantics.md @@ -38,7 +38,8 @@ Practical implication: ### Failure & exceptions - If `receive` raises an exception (or returns an error message), the caller will observe a failure (typically surfaced as an exception in the client wrapper). -- Pulsing does **not** automatically restart an actor (no supervision tree guarantees unless you implement one at the application layer). +- Pulsing supports **actor-level restarts** (configurable restart policy + backoff) but does **not** provide a supervision tree. 
+ - See: [Reliability](reliability.md) ## Remote messaging semantics (`ask` / `tell`) diff --git a/docs/src/guide/semantics.zh.md b/docs/src/guide/semantics.zh.md index 311cf3f3c..4cca9cb3f 100644 --- a/docs/src/guide/semantics.zh.md +++ b/docs/src/guide/semantics.zh.md @@ -36,7 +36,8 @@ ### 异常与失败 - `receive` 内抛异常或返回错误(如 `"Error"`)时,调用方会观察到失败(通常表现为异常或错误消息)。 -- Pulsing **不保证**自动重启/监督(supervision tree);需要应用层自行实现。 +- Pulsing 支持 **actor 级别重启**(可配置重启策略 + 退避),但 **不会引入 supervision tree**。 + - 参见:[可靠性实践](reliability.zh.md) ## 远程消息语义(`ask` / `tell`) diff --git a/docs/src/index.md b/docs/src/index.md index d263ea4e2..aab8e65dc 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -24,11 +24,12 @@ hide: toc ## Quick Start ```bash -# Install -pip install maturin -maturin develop +# Install (recommended) +pip install pulsing ``` +For development builds from source, see [Getting Started](quickstart/index.md#installation). + ```python from pulsing.actor import init, shutdown, remote @@ -54,6 +55,11 @@ async def main(): - **Distributed Computing** - Replace Ray for lightweight distributed workloads. - **Kubernetes Native** - Service discovery works seamlessly with K8s Service IPs. 
+## Next + +- **Run an LLM service**: see [LLM Inference](examples/llm_inference.md) +- **Operate/inspect a cluster**: see [Operations (CLI)](guide/operations.md) + ## Community - [GitHub Repository](https://github.com/reiase/pulsing) diff --git a/docs/src/index.zh.md b/docs/src/index.zh.md index 0d35ecebf..f932cfe98 100644 --- a/docs/src/index.zh.md +++ b/docs/src/index.zh.md @@ -24,11 +24,12 @@ hide: toc ## 快速开始 ```bash -# 安装 -pip install maturin -maturin develop +# 安装(推荐) +pip install pulsing ``` +如需从源码构建(开发用),请参考[快速开始](quickstart/index.zh.md#安装)。 + ```python from pulsing.actor import init, shutdown, remote @@ -54,6 +55,11 @@ async def main(): - **分布式计算** - 替代 Ray 用于轻量级分布式工作负载。 - **Kubernetes 原生** - 服务发现与 K8s Service IP 无缝配合。 +## 下一步 + +- **跑一个 LLM 服务**:[LLM 推理](examples/llm_inference.zh.md) +- **运维/巡检集群**:[运维(CLI)](guide/operations.zh.md) + ## 社区 - [GitHub 仓库](https://github.com/reiase/pulsing) diff --git a/docs/src/quickstart/index.md b/docs/src/quickstart/index.md index 2c1ec4533..86beaea5e 100644 --- a/docs/src/quickstart/index.md +++ b/docs/src/quickstart/index.md @@ -10,7 +10,13 @@ Get up and running with Pulsing quickly. - **Rust toolchain** (for building native extensions) - **Linux/macOS** -### From Source +### From PyPI (recommended) + +```bash +pip install pulsing +``` + +### From Source (development) ```bash git clone https://github.com/reiase/pulsing.git @@ -24,12 +30,6 @@ pip install maturin maturin develop ``` -### From PyPI - -```bash -pip install pulsing -``` - --- ## What is an Actor? @@ -188,4 +188,6 @@ result = await worker.ask("do_work") # Same API! 
- [Actor Guide](../guide/actors.md) - Advanced patterns - [Agent Frameworks](../agent/index.md) - AutoGen and LangGraph integration +- [Operations (CLI)](../guide/operations.md) - Inspect/list/benchmark +- [LLM Inference](../examples/llm_inference.md) - Router + worker architecture - [Examples](../examples/index.md) - Real-world use cases diff --git a/docs/src/quickstart/index.zh.md b/docs/src/quickstart/index.zh.md index 87ae5e933..15d61d704 100644 --- a/docs/src/quickstart/index.zh.md +++ b/docs/src/quickstart/index.zh.md @@ -10,7 +10,13 @@ - **Rust 工具链** (用于构建原生扩展) - **Linux/macOS** -### 从源码安装 +### 从 PyPI 安装(推荐) + +```bash +pip install pulsing +``` + +### 从源码安装(开发用) ```bash git clone https://github.com/reiase/pulsing.git @@ -24,12 +30,6 @@ pip install maturin maturin develop ``` -### 从 PyPI 安装 - -```bash -pip install pulsing -``` - --- ## 什么是 Actor? @@ -188,4 +188,6 @@ result = await worker.ask("do_work") # API 完全相同! - [Actor 指南](../guide/actors.zh.md) - 高级模式 - [Agent 框架](../agent/index.zh.md) - AutoGen 和 LangGraph 集成 +- [运维(CLI)](../guide/operations.zh.md) - 巡检/列表/压测 +- [LLM 推理](../examples/llm_inference.zh.md) - Router + Worker 架构 - [示例](../examples/index.zh.md) - 真实用例