Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -131,3 +131,4 @@ logs
log
outputs
.history
**/traces/
314 changes: 314 additions & 0 deletions opentinker/backend_patch/verl/trainer/ppo/ray_trainer.py

Large diffs are not rendered by default.

135 changes: 135 additions & 0 deletions opentinker/client/alfworld_inference.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
#!/usr/bin/env python3
"""ALFWorld Inference Script.

This script runs inference/evaluation on trained ALFWorld models.

Usage:
# Start ALFWorld environment server first (in another terminal):
python -m opentinker.environment.alfworld.alfworld_server --port 8091 --split eval_in_distribution

# Run inference with scheduler:
python alfworld_inference.py \
model_path=/path/to/checkpoint \
scheduler_url=http://localhost:8089 \
data_path=/path/to/eval_data.jsonl
"""

import hydra

from utils.http_training_client import InferenceSchedulerClient
from utils.scheduler_client_lifecycle import get_lifecycle_manager
from opentinker.environment.inference_pipeline import run_inference
from opentinker.environment.alfworld import ALFWorldGame
from opentinker.environment.game_stats_client import GameStatsClient


@hydra.main(
    config_path="client_config",
    config_name="alfworld_inference_config.yaml",
    version_base=None,
)
def main(args):
    """Run ALFWorld inference with scheduler-managed vLLM server."""
    # Lifecycle manager tears down any registered scheduler jobs on exit.
    lifecycle = get_lifecycle_manager()

    banner = "=" * 60
    print(banner)
    print("ALFWorld Inference with Scheduler")
    print(banner)

    # model_path is the only hard requirement; everything else has defaults.
    if not args.model_path:
        raise ValueError("model_path is required")

    # 1. Submit inference job to scheduler
    client = InferenceSchedulerClient(
        scheduler_url=args.get("scheduler_url", "http://localhost:8089"),
        api_key=args.get("scheduler_api_key"),
    )

    print(f"\nModel: {args.model_path}")
    print(f"Scheduler: {args.scheduler_url}")
    print(f"Environment: {args.env_endpoint}")
    print(f"Split: {args.split}")

    print("\nSubmitting inference job to scheduler...")
    submission = client.submit_inference_job(
        model_path=args.model_path,
        tokenizer_path=args.get("tokenizer_path"),
        tensor_parallel_size=args.get("tensor_parallel_size", 1),
        num_gpus=args.get("num_gpus"),
        gpu_memory_utilization=args.get("gpu_memory_utilization", 0.9),
        max_model_len=args.get("max_model_len"),
        trust_remote_code=args.get("trust_remote_code", True),
    )

    job_id = submission["job_id"]
    server_url = submission["vllm_server_url"]

    # Register so the remote vLLM server is cleaned up even on abnormal exit.
    lifecycle.register_job(client, job_id)

    print(f"✓ Inference job {job_id} started at {server_url}")

    # 2. Setup GameStatsClient for per-step metrics (with job_id isolation)
    stats_client = GameStatsClient(args.env_endpoint, job_id=job_id)
    if stats_client.health_check():
        print(f"✓ Connected to ALFWorld server at {args.env_endpoint}")
        stats_client.reset_all()  # Reset stats for this job before inference
    else:
        # Stats are best-effort: inference proceeds without the env server.
        print(
            f"⚠ ALFWorld server not available at {args.env_endpoint}, continuing without stats"
        )
        stats_client = None

    # 3. Run inference using the remote vLLM server
    data_path = args.get("data_path")
    if data_path:
        print(f"Running inference on {data_path}...")
    else:
        print(f"Running inference on ALFWorld {args.split} split...")

    results = run_inference(
        model_path=None,  # Not needed when using vllm_server_url
        vllm_server_url=server_url,
        tokenizer_path=args.get("tokenizer_path") or args.model_path,
        data_path=data_path,
        game_class=ALFWorldGame,
        env_endpoint=args.env_endpoint,
        job_id=job_id,  # Pass job_id for stats isolation
        output_path=args.get("output_path"),
        temperature=args.temperature,
        top_p=args.top_p,
        max_tokens=args.max_new_tokens,
        max_samples=args.get("max_samples"),
        max_user_turns=args.multi_turn.max_user_turns,
        max_assistant_turns=args.multi_turn.max_assistant_turns,
    )

    # 4. Log game stats after inference
    print("\n" + banner)
    print("Inference Results")
    print(banner)

    if stats_client:
        summary = stats_client.get_all_stats()
        print(f"\nALFWorld Evaluation Stats (job_id={job_id}):")
        print(f"  Total episodes: {summary.get('total_games', 0)}")
        print(f"  Successes: {summary.get('total_wins', 0)}")
        print(f"  Failures: {summary.get('total_losses', 0)}")
        success_rate = summary.get("cumulative_win_rate", 0)
        print(f"  Success rate: {success_rate:.1%}")
        print(f"  Mean reward: {summary.get('mean_final_reward', 0):.4f}")
        print(f"  Mean steps: {summary.get('mean_steps', 0):.2f}")

    if results:
        print(f"\nProcessed {len(results)} samples")

    if args.get("output_path"):
        print(f"Results saved to: {args.output_path}")

    print("\n" + banner)
    print("Inference completed! vLLM server will be automatically cleaned up.")
    print(banner)


if __name__ == "__main__":
    main()
37 changes: 37 additions & 0 deletions opentinker/client/client_config/alfworld_inference_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# ALFWorld Inference Configuration
# Use with: python alfworld_inference.py

# Model settings
model_path: null # Path to trained checkpoint (HuggingFace format) - REQUIRED
tokenizer_path: null # Tokenizer path (defaults to model_path if null)

# GPU settings
tensor_parallel_size: 1 # Number of GPUs for tensor parallelism
num_gpus: 1 # Number of GPUs to request from scheduler
gpu_memory_utilization: 0.9
max_model_len: null # Max model context length (null = auto)
trust_remote_code: true

# Generation parameters (greedy by default for inference)
temperature: 0.0 # 0.0 = greedy decoding for deterministic evaluation
top_p: 1.0
max_new_tokens: 4096 # Max tokens for full multi-turn trajectory

# Data settings
data_path: null # Input data file (parquet/jsonl), null = use ALFWorld split
output_path: null # Output results file (jsonl)
max_samples: null # Limit samples (null = all)

# Environment settings
env_endpoint: http://0.0.0.0:8091
split: eval_in_distribution # train, eval_in_distribution, eval_out_of_distribution

# Multi-turn settings for ALFWorld
multi_turn:
max_user_turns: 50 # Max environment interactions
max_assistant_turns: 50
max_tokens_per_turn: 256 # Per-turn response limit

# Scheduler settings
scheduler_url: http://0.0.0.0:8089
scheduler_api_key: null
32 changes: 28 additions & 4 deletions opentinker/client/client_config/alfworld_param.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ experiment_name: alfworld_training
# Logging
logger_backends: ["console", "wandb"]

# Tracing (optional)
enable_tracing: true
# Tracing (optional) - DISABLED to prevent disk space issues
enable_tracing: false
weave_project: null

# WandB (optional)
Expand All @@ -24,8 +24,8 @@ num_workers: 4
# Training duration - set ONE of these (num_steps takes precedence if both set)
num_epochs: null # Number of epochs (null = use num_steps)
num_steps: 1000 # Total training steps (null = use num_epochs)
save_freq: 20000
test_freq: 10 # Validation frequency (every N steps)
save_freq: 10000
test_freq: 100 # Validation frequency (every N steps)

# Validation parameters
val_batch_size: 50 # Total validation samples (null = 50)
Expand Down Expand Up @@ -81,3 +81,27 @@ scheduler_api_key: otk_98b8db24ccd64c92e1fdd9a232e209fa # SECURITY(review): API key committed to the repo — rotate it and load from an env var/secret store instead

# GPU settings
num_gpus: 4

# Actor settings (passed to server)
actor:
# World model loss: predict environment observations as auxiliary task
# Trains the model to predict environment observations, providing a WM uncertainty signal
use_world_model_loss: true
world_model_loss_coef: 0.01 # small coefficient so the auxiliary loss does not interfere with the policy


# Turn-wise Dynamic Entropy Coefficient (WM-guided)
# Adjusts the entropy bonus of each turn based on that turn's WM uncertainty:
# high-uncertainty turn -> β > per_turn_budget -> more exploration
# low-uncertainty turn  -> β < per_turn_budget -> more stable execution
wm_dynamic_entropy:
enabled: true
# Per-turn budget design:
# β_t = per_turn_budget * (1 + fluctuation * tanh(γ * (u - baseline) / baseline))
# - per_turn_budget: base entropy budget for each turn
# - baseline: computed dynamically as the mean uncertainty over all turns in the sample
# - fluctuation: allowed swing around the budget (0.5 = ±50%)
# - gamma: sensitivity (larger values react more strongly to uncertainty differences)
per_turn_budget: 0.002 # base budget per turn
fluctuation: 0.5 # ±50% swing (beta range: [0.001, 0.003])
gamma: 1.0 # sensitivity
57 changes: 57 additions & 0 deletions opentinker/client/client_config/llm_user_param.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# LLM User Simulator Training Configuration
# Train a conversational agent with LLM-based user simulation

# Project settings
project_name: opentinker
experiment_name: llm_user_training

# Logging
logger_backends: ["console", "wandb"]
enable_tracing: false
wandb_key: null

# Model and tokenizer
tokenizer_path: null

# Training parameters
batch_size: 8
num_workers: 4
num_steps: 1000
save_freq: 500
test_freq: 10
val_batch_size: 20

# Generation parameters
temperature: 0.8
top_p: 0.95
max_new_tokens: 4096
max_prompt_tokens: 2048

# Algorithm
algorithm: "agent_loop"
adv_estimator: "grpo"
rollout_n: 4

# Interaction configuration
interaction:
name: llm_user_simulator
class_path: opentinker.environment.gym_environment_interaction.GymEnvironmentInteraction
config:
env_host: 0.0.0.0
env_port: 8100
env_endpoint: http://${interaction.config.env_host}:${interaction.config.env_port}
env_shards: 8
max_steps: 10
observation_template: "{observation}"

multi_turn:
max_user_turns: 10
max_assistant_turns: 10
max_tokens_per_turn: 512

# Scheduler settings
scheduler_url: "http://0.0.0.0:8780"
scheduler_api_key: null

# GPU settings
num_gpus: 4
Loading
Loading