apache
diff --git a/sdks/python/apache_beam/runners/interactive/interactive_beam.py b/sdks/python/apache_beam/runners/interactive/interactive_beam.py
Lines changed: 85 additions & 0 deletions
diff --git a/sdks/python/apache_beam/runners/interactive/interactive_beam_test.py b/sdks/python/apache_beam/runners/interactive/interactive_beam_test.py
Lines changed: 240 additions & 0 deletions
diff --git a/sdks/python/apache_beam/runners/interactive/interactive_environment.py b/sdks/python/apache_beam/runners/interactive/interactive_environment.py
Lines changed: 19 additions & 0 deletions
@@ -57,6 +57,7 @@
 from apache_beam.runners.interactive.display.pcoll_visualization import visualize
 from apache_beam.runners.interactive.display.pcoll_visualization import visualize_computed_pcoll
 from apache_beam.runners.interactive.options import interactive_options
+from apache_beam.runners.interactive.recording_manager import AsyncComputationResult
 from apache_beam.runners.interactive.utils import deferred_df_to_pcollection
 from apache_beam.runners.interactive.utils import elements_to_df
 from apache_beam.runners.interactive.utils import find_pcoll_name
@@ -1012,6 +1013,90 @@ def as_pcollection(pcoll_or_df):
     return result_tuple
 
 
+@progress_indicated
+def compute(
+    *pcolls: Union[Dict[Any, PCollection], Iterable[PCollection], PCollection],
+    wait_for_inputs: bool = True,
+    blocking: bool = False,
+    runner=None,
+    options=None,
+    force_compute=False,
+) -> Optional[AsyncComputationResult]:
+  """Computes the given PCollections, potentially asynchronously.
+
+  The positional arguments are flattened into a single set of PCollections.
+  Deferred DataFrames (``DeferredBase``) are accepted wherever a PCollection
+  is: they are converted with ``deferred_df_to_pcollection`` and the resulting
+  PCollection is ad hoc watched. If the pipeline of the PCollections is not
+  known to the interactive environment as a user pipeline, it is ad hoc
+  watched too.
+
+  Args:
+    *pcolls: PCollections to compute. Can be a single PCollection, an iterable
+      of PCollections, or a dictionary with PCollections as values.
+    wait_for_inputs: Whether to wait until the asynchronous dependencies are
+      computed. Setting this to False allows to immediately schedule the
+      computation, but also potentially results in running the same pipeline
+      stages multiple times.
+    blocking: If False, the computation will run in non-blocking fashion. In
+      Colab/IPython environment this mode will also provide the controls for the
+      running pipeline. If True, the computation will block until the pipeline
+      is done.
+    runner: (optional) the runner with which to compute the results.
+    options: (optional) any additional pipeline options to use to compute the
+      results.
+    force_compute: (optional) if True, forces recomputation rather than using
+      cached PCollections.
+
+  Returns:
+    None if there is nothing to compute. Otherwise the value returned by the
+    recording manager's ``compute_async``: an AsyncComputationResult object if
+    blocking is False, otherwise None.
+
+  Raises:
+    ValueError: if an argument is not a dict, an iterable or a PCollection, if
+      a flattened element is not a PCollection, or if the PCollections do not
+      all belong to the same pipeline.
+  """
+  flatten_pcolls = []
+  for pcoll_container in pcolls:
+    if isinstance(pcoll_container, dict):
+      flatten_pcolls.extend(pcoll_container.values())
+    elif isinstance(pcoll_container, (beam.pvalue.PCollection, DeferredBase)):
+      flatten_pcolls.append(pcoll_container)
+    else:
+      # Only the iter() call is guarded so that a TypeError raised while
+      # iterating a valid container is not misreported as a bad argument.
+      try:
+        elements = iter(pcoll_container)
+      except TypeError as e:
+        raise ValueError(
+            f'The given pcoll {pcoll_container} is not a dict, an iterable or '
+            'a PCollection.'
+        ) from e
+      flatten_pcolls.extend(elements)
+
+  pcolls_set = set()
+  for pcoll in flatten_pcolls:
+    if isinstance(pcoll, DeferredBase):
+      pcoll, _ = deferred_df_to_pcollection(pcoll)
+      watch({f'anonymous_pcollection_{id(pcoll)}': pcoll})
+    # Raise explicitly instead of asserting: asserts are stripped under -O,
+    # which would let invalid user input through to the recording manager.
+    if not isinstance(pcoll, beam.pvalue.PCollection):
+      raise ValueError(f'{pcoll} is not an apache_beam.pvalue.PCollection.')
+    pcolls_set.add(pcoll)
+
+  if not pcolls_set:
+    _LOGGER.info('No PCollections to compute.')
+    return None
+
+  pcoll_pipeline = next(iter(pcolls_set)).pipeline
+  user_pipeline = ie.current_env().user_pipeline(pcoll_pipeline)
+  if not user_pipeline:
+    # The pipeline is not tracked yet (e.g. defined in an unwatched scope);
+    # watch it now so the environment can manage it.
+    watch({f'anonymous_pipeline_{id(pcoll_pipeline)}': pcoll_pipeline})
+    user_pipeline = pcoll_pipeline
+
+  for pcoll in pcolls_set:
+    if pcoll.pipeline is not user_pipeline:
+      raise ValueError('All PCollections must belong to the same pipeline.')
+
+  recording_manager = ie.current_env().get_recording_manager(
+      user_pipeline, create_if_absent=True
+  )
+
+  return recording_manager.compute_async(
+      pcolls_set,
+      wait_for_inputs=wait_for_inputs,
+      blocking=blocking,
+      runner=runner,
+      options=options,
+      force_compute=force_compute,
+  )
+
+
 @progress_indicated
 def show_graph(pipeline):
   """Shows the current pipeline shape of a given Beam pipeline as a DAG.
 
@@ -25,6 +25,7 @@
 import unittest
+from concurrent.futures import Future, TimeoutError
 from typing import NamedTuple
+from unittest.mock import ANY
+from unittest.mock import MagicMock
 from unittest.mock import patch
 
 import apache_beam as beam
 from apache_beam import dataframe as frames
@@ -36,6 +37,7 @@
 from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import DataprocClusterManager
 from apache_beam.runners.interactive.dataproc.types import ClusterMetadata
 from apache_beam.runners.interactive.options.capture_limiters import Limiter
+from apache_beam.runners.interactive.recording_manager import AsyncComputationResult
 from apache_beam.runners.interactive.testing.mock_env import isolated_env
 from apache_beam.runners.runner import PipelineState
 from apache_beam.testing.test_stream import TestStream
@@ -671,5 +673,243 @@ def test_default_value_for_invalid_worker_number(self):
     self.assertEqual(meta.num_workers, 2)
 
 
+@isolated_env
+class InteractiveBeamComputeTest(unittest.TestCase):
+  """Tests for ib.compute in blocking and non-blocking modes."""
+
+  def setUp(self):
+    self.env = ie.current_env()
+    self.env._is_in_ipython = False  # Default to non-IPython
+
+  def test_compute_blocking(self):
+    p = beam.Pipeline(ir.InteractiveRunner())
+    data = list(range(10))
+    pcoll = p | 'Create' >> beam.Create(data)
+    ib.watch(locals())
+    self.env.track_user_pipelines()
+
+    result = ib.compute(pcoll, blocking=True)
+    self.assertIsNone(result)  # Blocking returns None
+    self.assertIn(pcoll, self.env.computed_pcollections)
+    collected = ib.collect(pcoll, raw_records=True)
+    self.assertEqual(collected, data)
+
+  def test_compute_non_blocking(self):
+    p = beam.Pipeline(ir.InteractiveRunner())
+    data = list(range(5))
+    pcoll = p | 'Create' >> beam.Create(data)
+    ib.watch(locals())
+    self.env.track_user_pipelines()
+
+    async_result = ib.compute(pcoll, blocking=False)
+    self.assertIsInstance(async_result, AsyncComputationResult)
+
+    pipeline_result = async_result.result(timeout=60)
+    self.assertTrue(async_result.done())
+    self.assertIsNone(async_result.exception())
+    self.assertEqual(pipeline_result.state, PipelineState.DONE)
+    self.assertIn(pcoll, self.env.computed_pcollections)
+    collected = ib.collect(pcoll, raw_records=True)
+    self.assertEqual(collected, data)
+
+  def test_compute_with_list_input(self):
+    p = beam.Pipeline(ir.InteractiveRunner())
+    pcoll1 = p | 'Create1' >> beam.Create([1, 2, 3])
+    pcoll2 = p | 'Create2' >> beam.Create([4, 5, 6])
+    ib.watch(locals())
+    self.env.track_user_pipelines()
+
+    ib.compute([pcoll1, pcoll2], blocking=True)
+    self.assertIn(pcoll1, self.env.computed_pcollections)
+    self.assertIn(pcoll2, self.env.computed_pcollections)
+
+  def test_compute_with_dict_input(self):
+    p = beam.Pipeline(ir.InteractiveRunner())
+    pcoll1 = p | 'Create1' >> beam.Create([1, 2, 3])
+    pcoll2 = p | 'Create2' >> beam.Create([4, 5, 6])
+    ib.watch(locals())
+    self.env.track_user_pipelines()
+
+    ib.compute({'a': pcoll1, 'b': pcoll2}, blocking=True)
+    self.assertIn(pcoll1, self.env.computed_pcollections)
+    self.assertIn(pcoll2, self.env.computed_pcollections)
+
+  def test_compute_empty_input(self):
+    result = ib.compute([], blocking=True)
+    self.assertIsNone(result)
+    result_async = ib.compute([], blocking=False)
+    self.assertIsNone(result_async)
+
+  def test_compute_force_recompute(self):
+    p = beam.Pipeline(ir.InteractiveRunner())
+    pcoll = p | 'Create' >> beam.Create([1, 2, 3])
+    ib.watch(locals())
+    self.env.track_user_pipelines()
+
+    ib.compute(pcoll, blocking=True)
+    self.assertIn(pcoll, self.env.computed_pcollections)
+
+    # Mock evict_computed_pcollections to check if it's called
+    with patch.object(self.env, 'evict_computed_pcollections') as mock_evict:
+      ib.compute(pcoll, blocking=True, force_compute=True)
+      mock_evict.assert_called_once_with(p)
+    self.assertIn(pcoll, self.env.computed_pcollections)
+
+  def test_compute_non_blocking_exception(self):
+    p = beam.Pipeline(ir.InteractiveRunner())
+
+    def raise_error(elem):
+      raise ValueError('Test Error')
+
+    pcoll = p | 'Create' >> beam.Create([1]) | 'Error' >> beam.Map(raise_error)
+    ib.watch(locals())
+    self.env.track_user_pipelines()
+
+    async_result = ib.compute(pcoll, blocking=False)
+    self.assertIsInstance(async_result, AsyncComputationResult)
+
+    with self.assertRaises(ValueError):
+      async_result.result(timeout=60)
+
+    self.assertTrue(async_result.done())
+    self.assertIsInstance(async_result.exception(), ValueError)
+    self.assertNotIn(pcoll, self.env.computed_pcollections)
+
+  @patch('apache_beam.runners.interactive.recording_manager.IS_IPYTHON', True)
+  @patch('apache_beam.runners.interactive.recording_manager.display')
+  @patch('ipywidgets.Button')
+  @patch('ipywidgets.FloatProgress')
+  @patch('ipywidgets.Output')
+  @patch('ipywidgets.HBox')
+  @patch('ipywidgets.VBox')
+  def test_compute_non_blocking_ipython_widgets(
+      self,
+      mock_vbox,
+      mock_hbox,
+      mock_output,
+      mock_progress,
+      mock_button,
+      mock_display,
+  ):
+    self.env._is_in_ipython = True
+    p = beam.Pipeline(ir.InteractiveRunner())
+    pcoll = p | 'Create' >> beam.Create(range(3))
+    ib.watch(locals())
+    self.env.track_user_pipelines()
+
+    async_result = ib.compute(pcoll, blocking=False)
+    self.assertIsNotNone(async_result)
+    mock_button.assert_called_once_with(description='Cancel')
+    mock_progress.assert_called_once()
+    mock_output.assert_called_once()
+    mock_hbox.assert_called_once()
+    mock_vbox.assert_called_once()
+    mock_display.assert_called_once()
+    async_result.result(timeout=60)  # Let it finish
+
+  def test_compute_dependency_wait_true(self):
+    p = beam.Pipeline(ir.InteractiveRunner())
+    pcoll1 = p | 'Create1' >> beam.Create([1, 2, 3])
+    pcoll2 = pcoll1 | 'Map' >> beam.Map(lambda x: x * 2)
+    ib.watch(locals())
+    self.env.track_user_pipelines()
+
+    # No compute() has run yet, so ask for the manager to be created; this is
+    # the same instance ib.compute() will later look up for this pipeline.
+    rm = self.env.get_recording_manager(p, create_if_absent=True)
+
+    # Start pcoll1 computation
+    async_res1 = ib.compute(pcoll1, blocking=False)
+    self.assertTrue(self.env.is_pcollection_computing(pcoll1))
+
+    # Spy on _wait_for_dependencies
+    with patch.object(
+        rm, '_wait_for_dependencies', wraps=rm._wait_for_dependencies
+    ) as spy_wait:
+      async_res2 = ib.compute(pcoll2, blocking=False, wait_for_inputs=True)
+
+      # Check that wait_for_dependencies was called for pcoll2
+      spy_wait.assert_called_with({pcoll2}, async_res2)
+
+      # Let pcoll1 finish
+      async_res1.result(timeout=60)
+      self.assertIn(pcoll1, self.env.computed_pcollections)
+      self.assertFalse(self.env.is_pcollection_computing(pcoll1))
+
+      # pcoll2 should now run and complete
+      async_res2.result(timeout=60)
+      self.assertIn(pcoll2, self.env.computed_pcollections)
+
+  @patch.object(ie.InteractiveEnvironment, 'is_pcollection_computing')
+  def test_compute_dependency_wait_false(self, mock_is_computing):
+    p = beam.Pipeline(ir.InteractiveRunner())
+    pcoll1 = p | 'Create1' >> beam.Create([1, 2, 3])
+    pcoll2 = pcoll1 | 'Map' >> beam.Map(lambda x: x * 2)
+    ib.watch(locals())
+    self.env.track_user_pipelines()
+
+    # No compute() has run yet, so ask for the manager to be created; this is
+    # the same instance ib.compute() will later look up for this pipeline.
+    rm = self.env.get_recording_manager(p, create_if_absent=True)
+
+    # Pretend pcoll1 is computing
+    mock_is_computing.side_effect = lambda pcoll: pcoll is pcoll1
+
+    with patch.object(
+        rm, '_execute_pipeline_fragment', wraps=rm._execute_pipeline_fragment
+    ) as spy_execute:
+      async_res2 = ib.compute(pcoll2, blocking=False, wait_for_inputs=False)
+      async_res2.result(timeout=60)
+
+      # Assert that execute was called for pcoll2 without waiting
+      spy_execute.assert_called_with({pcoll2}, async_res2, ANY, ANY)
+      self.assertIn(pcoll2, self.env.computed_pcollections)
+
+  def test_async_computation_result_cancel(self):
+    p = beam.Pipeline(ir.InteractiveRunner())
+    # A bounded pipeline whose single element sleeps for a long time, so the
+    # computation is still in flight when cancellation is requested.
+    pcoll = p | beam.Create([1]) | beam.Map(lambda x: time.sleep(100))
+    ib.watch(locals())
+    self.env.track_user_pipelines()
+
+    async_result = ib.compute(pcoll, blocking=False)
+    self.assertIsInstance(async_result, AsyncComputationResult)
+
+    # Give it a moment to start
+    time.sleep(0.1)
+
+    # Mock the pipeline result's cancel method
+    mock_pipeline_result = MagicMock()
+    mock_pipeline_result.state = PipelineState.RUNNING
+    async_result.set_pipeline_result(mock_pipeline_result)
+
+    self.assertTrue(async_result.cancel())
+    mock_pipeline_result.cancel.assert_called_once()
+
+    # The future should be cancelled eventually by the runner
+    # This part is hard to test without deeper runner integration
+    with self.assertRaises(TimeoutError):
+      async_result.result(timeout=1)  # It should not complete successfully
+
+  def test_compute_multiple_async(self):
+    p = beam.Pipeline(ir.InteractiveRunner())
+    pcoll1 = p | 'Create1' >> beam.Create([1, 2, 3])
+    pcoll2 = p | 'Create2' >> beam.Create([4, 5, 6])
+    pcoll3 = pcoll1 | 'Map1' >> beam.Map(lambda x: x * 2)
+    ib.watch(locals())
+    self.env.track_user_pipelines()
+
+    res1 = ib.compute(pcoll1, blocking=False)
+    res2 = ib.compute(pcoll2, blocking=False)
+    res3 = ib.compute(pcoll3, blocking=False)  # Depends on pcoll1
+
+    self.assertIsNotNone(res1)
+    self.assertIsNotNone(res2)
+    self.assertIsNotNone(res3)
+
+    res1.result(timeout=60)
+    res2.result(timeout=60)
+    res3.result(timeout=60)
+
+    self.assertIn(pcoll1, self.env.computed_pcollections)
+    self.assertIn(pcoll2, self.env.computed_pcollections)
+    self.assertIn(pcoll3, self.env.computed_pcollections)
+
+
 if __name__ == '__main__':
   unittest.main()
@@ -175,6 +175,9 @@ def __init__(self):
     # Tracks the computation completeness of PCollections. PCollections tracked
     # here don't need to be re-computed when data introspection is needed.
     self._computed_pcolls = set()
+
+    self._computing_pcolls = set()
+
     # Always watch __main__ module.
     self.watch('__main__')
     # Check if [interactive] dependencies are installed.
@@ -720,3 +723,19 @@ def _get_gcs_cache_dir(self, pipeline, cache_dir):
     bucket_name = cache_dir_path.parts[1]
     assert_bucket_exists(bucket_name)
     return 'gs://{}/{}'.format('/'.join(cache_dir_path.parts[1:]), id(pipeline))
+
+  @property
+  def computing_pcollections(self):
+    """The set of PCollections currently marked as being computed.
+
+    This is the environment's own set object, not a copy.
+    """
+    in_flight = self._computing_pcolls
+    return in_flight
+
+  def mark_pcollection_computing(self, pcolls):
+    """Adds every PCollection in `pcolls` to the currently-computing set."""
+    in_flight = self._computing_pcolls
+    for pcoll in pcolls:
+      in_flight.add(pcoll)
+
+  def unmark_pcollection_computing(self, pcolls):
+    """Drops every PCollection in `pcolls` from the currently-computing set.
+
+    PCollections that are not in the set are ignored.
+    """
+    in_flight = self._computing_pcolls
+    # Snapshot first so that passing the tracked set itself is safe.
+    for pcoll in tuple(pcolls):
+      in_flight.discard(pcoll)
+
+  def is_pcollection_computing(self, pcoll):
+    """Returns True if `pcoll` is in the currently-computing set."""
+    in_flight = self._computing_pcolls
+    return pcoll in in_flight