|
1 | 1 | import asyncio |
2 | 2 | import logging |
| 3 | +import time |
| 4 | +from concurrent.futures import ThreadPoolExecutor |
| 5 | +from pathlib import Path |
| 6 | +from typing import Optional, cast |
3 | 7 |
|
4 | 8 | import av |
| 9 | +import av.filter |
| 10 | +import av.frame |
5 | 11 | from aiortc import VideoStreamTrack |
| 12 | +from av import VideoFrame |
6 | 13 | from PIL import Image |
7 | 14 | from vision_agents.core.utils.video_queue import VideoLatestNQueue |
8 | 15 |
|
@@ -88,3 +95,118 @@ def stop(self): |
    @property
    def stopped(self) -> bool:
        """True once the track has been stopped (the ``_stopped`` flag is set)."""
        return self._stopped
| 98 | + |
| 99 | + |
class VideoFileTrack(VideoStreamTrack):
    """
    A video track reading from a local MP4 file,
    filtered to a constant FPS using FFmpeg (30 FPS by default).

    Loops automatically when the file ends.
    Use it for testing and debugging.
    """

    def __init__(self, path: str | Path, fps: int = 30):
        """
        Args:
            path: path to the local video file to read.
            fps: target constant frame rate produced by the FFmpeg ``fps`` filter.

        Raises:
            ValueError: if the stream's time_base cannot be determined.
        """
        super().__init__()
        self.fps = fps
        self.path = Path(path)

        self._stopped = False
        self._container = av.open(self.path)
        self._stream = self._container.streams.video[0]
        if self._stream.time_base is None:
            raise ValueError("Cannot determine time_base for the video stream")

        self._time_base = self._stream.time_base

        # Decoder iterator to read the frames
        self._decoder = self._container.decode(self._stream)
        # Single worker: decoding is stateful and must never run concurrently.
        self._executor = ThreadPoolExecutor(1)
        self._set_filter_graph()

    def _set_filter_graph(self) -> None:
        """(Re)build the FFmpeg filter graph: buffer -> fps -> buffersink."""
        # Safe extraction of sample_aspect_ratio
        sar = self._stream.sample_aspect_ratio
        if sar is None:
            sar_num, sar_den = 1, 1
        else:
            sar_num, sar_den = sar.numerator, sar.denominator

        # Build ffmpeg filter graph to resample video to fixed fps.
        # Keep the reference to the graph to avoid GC.
        self._graph = av.filter.Graph()

        # Buffer source with all required parameters
        self._src = self._graph.add(
            "buffer",
            f"video_size={self._stream.width}x{self._stream.height}:"
            f"pix_fmt={self._stream.pix_fmt}:"
            f"time_base={self._time_base.numerator}/{self._time_base.denominator}:"
            f"pixel_aspect={sar_num}/{sar_den}",
        )

        # Add an FPS filter
        fps_filter = self._graph.add("fps", f"fps={self.fps}")

        # Add a buffer sink
        self._sink = self._graph.add("buffersink")

        # Connect graph: buffer -> fps filter -> sink
        self._src.link_to(fps_filter)
        fps_filter.link_to(self._sink)
        self._graph.configure()

    def _next_frame(self) -> av.VideoFrame:
        """Decode, filter and return the next RGB frame.

        Blocking; always runs on the single-worker executor.
        """
        filtered_frame: Optional[av.VideoFrame] = None
        while filtered_frame is None:
            # Get the next decoded frame
            try:
                frame = next(self._decoder)
            except StopIteration:
                # Loop the video when it ends
                self._container.seek(0)
                self._decoder = self._container.decode(self._stream)
                # Reset the filter graph too: it holds state from the
                # previous pass and its internal timestamps would go backwards.
                self._set_filter_graph()
                frame = next(self._decoder)

            # Ensure frame has a time_base (required by buffer source)
            frame.time_base = self._time_base

            # Push decoded frame into the filter graph
            self._src.push(frame)

            # Pull filtered frame from buffersink
            try:
                filtered_frame = cast(av.VideoFrame, self._sink.pull())
            except (av.ExitError, av.BlockingIOError):
                # The graph needs more input before it can emit a frame —
                # loop around and push the next decoded frame (sleeping here
                # would not help).
                continue
            except Exception:
                logger.exception("Failed to read a frame from video file")
                continue

        # Convert the filtered video frame to RGB for aiortc
        return filtered_frame.to_rgb()

    async def recv(self) -> VideoFrame:
        """
        Async method to produce the next filtered video frame.
        Loops automatically at the end of the file.

        Raises:
            VideoTrackClosedError: if the track has been stopped.
        """
        if self._stopped:
            raise VideoTrackClosedError("Track stopped")
        loop = asyncio.get_running_loop()
        frame = await loop.run_in_executor(self._executor, self._next_frame)
        # Pace output at the target FPS so other coroutines can run.
        # (Sleeping float(frame.time_base) only worked because the fps
        # filter's sink time_base happens to be 1/fps — be explicit instead.)
        await asyncio.sleep(1 / self.fps)
        return frame

    def stop(self) -> None:
        """Stop the track and release decoder resources. Idempotent."""
        if self._stopped:
            return
        self._stopped = True
        # Close the container on the worker so it runs after any in-flight
        # decode finishes, then let the executor wind down without blocking.
        self._executor.submit(self._container.close)
        self._executor.shutdown(wait=False)

    def __repr__(self):
        return f'<{self.__class__.__name__} path="{self.path}" fps={self.fps}>'
0 commit comments