Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -131,3 +131,4 @@ logs
log
outputs
.history
**/traces/
314 changes: 314 additions & 0 deletions opentinker/backend_patch/verl/trainer/ppo/ray_trainer.py

Large diffs are not rendered by default.

135 changes: 135 additions & 0 deletions opentinker/client/alfworld_inference.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
#!/usr/bin/env python3
"""ALFWorld Inference Script.

This script runs inference/evaluation on trained ALFWorld models.

Usage:
# Start ALFWorld environment server first (in another terminal):
python -m opentinker.environment.alfworld.alfworld_server --port 8091 --split eval_in_distribution

# Run inference with scheduler:
python alfworld_inference.py \
model_path=/path/to/checkpoint \
scheduler_url=http://localhost:8089 \
data_path=/path/to/eval_data.jsonl
"""

import hydra

from utils.http_training_client import InferenceSchedulerClient
from utils.scheduler_client_lifecycle import get_lifecycle_manager
from opentinker.environment.inference_pipeline import run_inference
from opentinker.environment.alfworld import ALFWorldGame
from opentinker.environment.game_stats_client import GameStatsClient


@hydra.main(
    config_path="client_config",
    config_name="alfworld_inference_config.yaml",
    version_base=None,
)
def main(args):
    """Run ALFWorld inference with scheduler-managed vLLM server."""
    # Lifecycle manager tears down any registered scheduler jobs on exit.
    lifecycle = get_lifecycle_manager()

    banner = "=" * 60
    print(banner)
    print("ALFWorld Inference with Scheduler")
    print(banner)

    # model_path is the only hard requirement; everything else has defaults.
    if not args.model_path:
        raise ValueError("model_path is required")

    # 1. Submit inference job to scheduler
    client = InferenceSchedulerClient(
        scheduler_url=args.get("scheduler_url", "http://localhost:8089"),
        api_key=args.get("scheduler_api_key"),
    )

    print(f"\nModel: {args.model_path}")
    print(f"Scheduler: {args.scheduler_url}")
    print(f"Environment: {args.env_endpoint}")
    print(f"Split: {args.split}")

    print("\nSubmitting inference job to scheduler...")
    submission = client.submit_inference_job(
        model_path=args.model_path,
        tokenizer_path=args.get("tokenizer_path"),
        tensor_parallel_size=args.get("tensor_parallel_size", 1),
        num_gpus=args.get("num_gpus"),
        gpu_memory_utilization=args.get("gpu_memory_utilization", 0.9),
        max_model_len=args.get("max_model_len"),
        trust_remote_code=args.get("trust_remote_code", True),
    )

    job_id = submission["job_id"]
    server_url = submission["vllm_server_url"]

    # Register so the remote vLLM server is cleaned up even on abnormal exit.
    lifecycle.register_job(client, job_id)

    print(f"✓ Inference job {job_id} started at {server_url}")

    # 2. Setup GameStatsClient for per-step metrics (with job_id isolation)
    stats_client = GameStatsClient(args.env_endpoint, job_id=job_id)
    if stats_client.health_check():
        print(f"✓ Connected to ALFWorld server at {args.env_endpoint}")
        stats_client.reset_all()  # Reset stats for this job before inference
    else:
        # Stats are best-effort: inference proceeds without the env server.
        print(
            f"⚠ ALFWorld server not available at {args.env_endpoint}, continuing without stats"
        )
        stats_client = None

    # 3. Run inference using the remote vLLM server
    data_path = args.get("data_path")
    if data_path:
        print(f"Running inference on {data_path}...")
    else:
        print(f"Running inference on ALFWorld {args.split} split...")

    results = run_inference(
        model_path=None,  # Not needed when using vllm_server_url
        vllm_server_url=server_url,
        tokenizer_path=args.get("tokenizer_path") or args.model_path,
        data_path=data_path,
        game_class=ALFWorldGame,
        env_endpoint=args.env_endpoint,
        job_id=job_id,  # Pass job_id for stats isolation
        output_path=args.get("output_path"),
        temperature=args.temperature,
        top_p=args.top_p,
        max_tokens=args.max_new_tokens,
        max_samples=args.get("max_samples"),
        max_user_turns=args.multi_turn.max_user_turns,
        max_assistant_turns=args.multi_turn.max_assistant_turns,
    )

    # 4. Log game stats after inference
    print("\n" + banner)
    print("Inference Results")
    print(banner)

    if stats_client:
        summary = stats_client.get_all_stats()
        print(f"\nALFWorld Evaluation Stats (job_id={job_id}):")
        print(f"  Total episodes: {summary.get('total_games', 0)}")
        print(f"  Successes: {summary.get('total_wins', 0)}")
        print(f"  Failures: {summary.get('total_losses', 0)}")
        success_rate = summary.get("cumulative_win_rate", 0)
        print(f"  Success rate: {success_rate:.1%}")
        print(f"  Mean reward: {summary.get('mean_final_reward', 0):.4f}")
        print(f"  Mean steps: {summary.get('mean_steps', 0):.2f}")

    if results:
        print(f"\nProcessed {len(results)} samples")

    if args.get("output_path"):
        print(f"Results saved to: {args.output_path}")

    print("\n" + banner)
    print("Inference completed! vLLM server will be automatically cleaned up.")
    print(banner)


if __name__ == "__main__":
    main()
37 changes: 37 additions & 0 deletions opentinker/client/client_config/alfworld_inference_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# ALFWorld Inference Configuration
# Use with: python alfworld_inference.py

# Model settings
model_path: null # Path to trained checkpoint (HuggingFace format) - REQUIRED
tokenizer_path: null # Tokenizer path (defaults to model_path if null)

# GPU settings
tensor_parallel_size: 1 # Number of GPUs for tensor parallelism
num_gpus: 1 # Number of GPUs to request from scheduler
gpu_memory_utilization: 0.9
max_model_len: null # Max model context length (null = auto)
trust_remote_code: true

# Generation parameters (greedy by default for inference)
temperature: 0.0 # 0.0 = greedy decoding for deterministic evaluation
top_p: 1.0
max_new_tokens: 4096 # Max tokens for full multi-turn trajectory

# Data settings
data_path: null # Input data file (parquet/jsonl), null = use ALFWorld split
output_path: null # Output results file (jsonl)
max_samples: null # Limit samples (null = all)

# Environment settings
env_endpoint: http://0.0.0.0:8091
split: eval_in_distribution # train, eval_in_distribution, eval_out_of_distribution

# Multi-turn settings for ALFWorld
multi_turn:
max_user_turns: 50 # Max environment interactions
max_assistant_turns: 50
max_tokens_per_turn: 256 # Per-turn response limit

# Scheduler settings
scheduler_url: http://0.0.0.0:8089
scheduler_api_key: null
32 changes: 28 additions & 4 deletions opentinker/client/client_config/alfworld_param.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ experiment_name: alfworld_training
# Logging
logger_backends: ["console", "wandb"]

# Tracing (optional)
enable_tracing: true
# Tracing (optional) - DISABLED to prevent disk space issues
enable_tracing: false
weave_project: null

# WandB (optional)
Expand All @@ -24,8 +24,8 @@ num_workers: 4
# Training duration - set ONE of these (num_steps takes precedence if both set)
num_epochs: null # Number of epochs (null = use num_steps)
num_steps: 1000 # Total training steps (null = use num_epochs)
save_freq: 20000
test_freq: 10 # Validation frequency (every N steps)
save_freq: 10000
test_freq: 100 # Validation frequency (every N steps)

# Validation parameters
val_batch_size: 50 # Total validation samples (null = 50)
Expand Down Expand Up @@ -81,3 +81,27 @@ scheduler_api_key: otk_98b8db24ccd64c92e1fdd9a232e209fa # SECURITY(review): API key committed to the repo — rotate it and load from an env var/secret store instead

# GPU settings
num_gpus: 4

# Actor settings (passed to server)
actor:
# World model loss: predict environment observations as auxiliary task
# Trains the model to predict environment observations, providing a WM uncertainty signal
use_world_model_loss: true
world_model_loss_coef: 0.01 # small coefficient so the auxiliary loss does not interfere with the policy


# Turn-wise Dynamic Entropy Coefficient (WM-guided)
# Adjusts the entropy bonus of each turn based on that turn's WM uncertainty:
# high-uncertainty turn -> β > per_turn_budget -> more exploration
# low-uncertainty turn  -> β < per_turn_budget -> more stable execution
wm_dynamic_entropy:
enabled: true
# Per-turn budget design:
# β_t = per_turn_budget * (1 + fluctuation * tanh(γ * (u - baseline) / baseline))
# - per_turn_budget: base entropy budget for each turn
# - baseline: computed dynamically as the mean uncertainty over all turns in the sample
# - fluctuation: allowed swing around the budget (0.5 = ±50%)
# - gamma: sensitivity (larger values react more strongly to uncertainty differences)
per_turn_budget: 0.002 # base budget per turn
fluctuation: 0.5 # ±50% swing (beta range: [0.001, 0.003])
gamma: 1.0 # sensitivity
57 changes: 57 additions & 0 deletions opentinker/client/client_config/llm_user_param.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# LLM User Simulator Training Configuration
# Train a conversational agent with LLM-based user simulation

# Project settings
project_name: opentinker
experiment_name: llm_user_training

# Logging
logger_backends: ["console", "wandb"]
enable_tracing: false
wandb_key: null

# Model and tokenizer
tokenizer_path: null

# Training parameters
batch_size: 8
num_workers: 4
num_steps: 1000
save_freq: 500
test_freq: 10
val_batch_size: 20

# Generation parameters
temperature: 0.8
top_p: 0.95
max_new_tokens: 4096
max_prompt_tokens: 2048

# Algorithm
algorithm: "agent_loop"
adv_estimator: "grpo"
rollout_n: 4

# Interaction configuration
interaction:
name: llm_user_simulator
class_path: opentinker.environment.gym_environment_interaction.GymEnvironmentInteraction
config:
env_host: 0.0.0.0
env_port: 8100
env_endpoint: http://${interaction.config.env_host}:${interaction.config.env_port}
env_shards: 8
max_steps: 10
observation_template: "{observation}"

multi_turn:
max_user_turns: 10
max_assistant_turns: 10
max_tokens_per_turn: 512

# Scheduler settings
scheduler_url: "http://0.0.0.0:8780"
scheduler_api_key: null

# GPU settings
num_gpus: 4
Loading
Loading