From d40ca1fb73856ee15ed0bc7868e975dd0f9b8ad0 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Tue, 14 Oct 2025 12:14:19 -0400 Subject: [PATCH 1/9] rename tests on the cluster, for memory tracking --- pyproject.toml | 1 + tests/conftest.py | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index e96b3bf4f..a91c8218a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -78,6 +78,7 @@ dev = [ "pytest-print", # - 16.0 is causing pytest-xdist to crash in case of failure or skipped tests "pytest-rerunfailures<16.0", + "setproctitle", # allows renaming the test processes on the cluster "syrupy", "huggingface_hub[hf_xet]", "wandb", diff --git a/tests/conftest.py b/tests/conftest.py index 24d1078f1..60d221841 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -8,6 +8,7 @@ from io import BytesIO from pathlib import Path +import setproctitle import psutil import pyglet import pytest @@ -63,6 +64,12 @@ def pytest_make_parametrize_id(config, val, argname): return f"{val}" +@pytest.hookimpl(tryfirst=True) +def pytest_runtest_setup(item): + # Include test name in process title + setproctitle.setproctitle(f"pytest: {item.nodeid}") + + @pytest.hookimpl(tryfirst=True) def pytest_cmdline_main(config: pytest.Config) -> None: # Force disabling forked for non-linux systems From 58797b15f4805600dde992d39a25290a972a6d9b Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 14 Nov 2025 10:52:39 -0500 Subject: [PATCH 2/9] ps name has no spaces --- tests/conftest.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index d1a4589de..3f3cf5de8 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -68,7 +68,8 @@ def pytest_make_parametrize_id(config, val, argname): @pytest.hookimpl(tryfirst=True) def pytest_runtest_setup(item): # Include test name in process title - setproctitle.setproctitle(f"pytest: {item.nodeid}") + test_name = item.nodeid.replace(" ", "") + 
setproctitle.setproctitle(f"pytest: {test_name}") @pytest.hookimpl(tryfirst=True) From effb0a057daa6e05f02f8cc3d2b6c3c90fcfab8a Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 14 Nov 2025 12:46:43 -0500 Subject: [PATCH 3/9] add monitor test mem --- tests/monitor_test_mem.py | 84 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 tests/monitor_test_mem.py diff --git a/tests/monitor_test_mem.py b/tests/monitor_test_mem.py new file mode 100644 index 000000000..422b48511 --- /dev/null +++ b/tests/monitor_test_mem.py @@ -0,0 +1,84 @@ +from collections import defaultdict +import csv +import subprocess +import time +import argparse + + +def grep(contents: list[str], target): + return [l for l in contents if target in l] + + +def get_cuda_usage() -> dict[int, int]: + output = subprocess.check_output(["nvidia-smi"]).decode("utf-8") + section = 0 + subsec = 0 + res = {} + for line in output.split("\n"): + if line.startswith("|============"): + section += 1 + subsec = 0 + continue + if line.startswith("+-------"): + subsec += 1 + continue + if section == 2 and subsec == 0: + if "No running processes" in line: + continue + split_line = line.split() + pid = int(split_line[4]) + mem = int(split_line[-2].split("MiB")[0]) + res[pid] = mem + return res + + +def get_test_name_by_pid() -> dict[int, str]: + ps_ef = subprocess.check_output(["ps", "-ef"]).decode("utf-8").split("\n") + test_lines = grep(ps_ef, "pytest-xdist") + tests = [line.partition("::")[2] for line in test_lines] + psids = [int(line.split()[1]) for line in test_lines] + test_by_psid = {psid: test for test, psid in zip(tests, psids) if test.strip() != ""} + return test_by_psid + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("--out-csv-filepath", type=str, required=True) + args = parser.parse_args() + + max_mem_by_test = defaultdict(int) + + f = open(args.out_csv_filepath, "w") + dict_writer = csv.DictWriter(f, fieldnames=["test", 
"max_mem_mb"]) + dict_writer.writeheader() + old_mem_by_test = {} + num_results_written = 0 + disp = False + while True: + mem_by_pid = get_cuda_usage() + test_by_psid = get_test_name_by_pid() + num_tests = len(test_by_psid) + _mem_by_test = {} + for psid, test in test_by_psid.items(): + if psid not in mem_by_pid: + continue + if test.strip() == "": + continue + _mem = mem_by_pid[psid] + _mem_by_test[test] = _mem + for test, _mem in _mem_by_test.items(): + max_mem_by_test[test] = max(_mem, max_mem_by_test[test]) + for _test, _mem in old_mem_by_test.items(): + if _test not in _mem_by_test: + dict_writer.writerow({"test": _test, "max_mem_mb": max_mem_by_test[_test]}) + f.flush() + num_results_written += 1 + spinny = "x" if disp else "+" + print(num_tests, "tests running, of which", len(_mem_by_test), "on gpu. Num results written: ", num_results_written, "[updating]", " ", end="\r", flush=True) + old_mem_by_test = _mem_by_test + disp = not disp + time.sleep(1.0) + + +if __name__ == '__main__': + main() From b0d5a601c5cc8b9bbd4bb7b19e29d69663b82ae9 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 14 Nov 2025 12:51:10 -0500 Subject: [PATCH 4/9] add memory monitoring --- .github/workflows/production.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/production.yml b/.github/workflows/production.yml index 8f740cfa6..09a8d8f91 100644 --- a/.github/workflows/production.yml +++ b/.github/workflows/production.yml @@ -139,9 +139,13 @@ jobs: source /root/.venv/bin/activate pip install --no-input ".[dev,render]" + python tests/monitor_test_mem.py --out-csv-filepath "/mnt/data/artifacts/mem_test_${SLURM_JOB_NAME}.csv" & + pytest --print -x -m "benchmarks" ./tests cat speed_test*.txt > "/mnt/data/artifacts/speed_test_${SLURM_JOB_NAME}.txt" + kill $(ps -ef | grep monitor_test_mem | grep -v grep | awk '{print $2}') + # tmate -S /tmp/tmate.sock wait tmate-exit EOF - name: Kill srun job systematically From fa44c093379979021e94c0f16e1037d6ee59c936 Mon Sep 
17 00:00:00 2001 From: Hugh Perkins Date: Fri, 14 Nov 2025 12:52:11 -0500 Subject: [PATCH 5/9] upload results --- .github/workflows/production.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/production.yml b/.github/workflows/production.yml index 09a8d8f91..8ec7b51c1 100644 --- a/.github/workflows/production.yml +++ b/.github/workflows/production.yml @@ -162,3 +162,8 @@ jobs: with: name: speed-test-${{ matrix.GS_ENABLE_NDARRAY }} path: "/mnt/data/artifacts/speed_test_${{ env.SLURM_JOB_NAME }}.txt" + - name: Upload benchmark mem stats as artifact + uses: actions/upload-artifact@v4 + with: + name: mem-test-${{ matrix.GS_ENABLE_NDARRAY }} + path: "/mnt/data/artifacts/mem_test_${{ env.SLURM_JOB_NAME }}.csv" \ No newline at end of file From 3cef25b145d6223f0be84616ac25b10bd574aade Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 14 Nov 2025 12:55:33 -0500 Subject: [PATCH 6/9] precommit --- tests/monitor_test_mem.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/tests/monitor_test_mem.py b/tests/monitor_test_mem.py index 422b48511..c60dd07ec 100644 --- a/tests/monitor_test_mem.py +++ b/tests/monitor_test_mem.py @@ -74,11 +74,21 @@ def main() -> None: f.flush() num_results_written += 1 spinny = "x" if disp else "+" - print(num_tests, "tests running, of which", len(_mem_by_test), "on gpu. Num results written: ", num_results_written, "[updating]", " ", end="\r", flush=True) + print( + num_tests, + "tests running, of which", + len(_mem_by_test), + "on gpu. 
Num results written: ", + num_results_written, + "[updating]", + " ", + end="\r", + flush=True, + ) old_mem_by_test = _mem_by_test disp = not disp time.sleep(1.0) -if __name__ == '__main__': +if __name__ == "__main__": main() From 889ab0ce05f7711ececdaabc91de5d89e96eab9e Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Mon, 17 Nov 2025 14:34:59 -0500 Subject: [PATCH 7/9] use subprocess to run memory monitoring --- .github/workflows/production.yml | 4 +--- tests/conftest.py | 27 +++++++++++++++++++++++++++ tests/monitor_test_mem.py | 5 ++++- 3 files changed, 32 insertions(+), 4 deletions(-) diff --git a/.github/workflows/production.yml b/.github/workflows/production.yml index 8ec7b51c1..b667b7844 100644 --- a/.github/workflows/production.yml +++ b/.github/workflows/production.yml @@ -139,9 +139,7 @@ jobs: source /root/.venv/bin/activate pip install --no-input ".[dev,render]" - python tests/monitor_test_mem.py --out-csv-filepath "/mnt/data/artifacts/mem_test_${SLURM_JOB_NAME}.csv" & - - pytest --print -x -m "benchmarks" ./tests + pytest --mem-monitoring-filepath "/mnt/data/artifacts/mem_test_${SLURM_JOB_NAME}.csv" --print -x -m "benchmarks" ./tests cat speed_test*.txt > "/mnt/data/artifacts/speed_test_${SLURM_JOB_NAME}.txt" kill $(ps -ef | grep monitor_test_mem | grep -v grep | awk '{print $2}') diff --git a/tests/conftest.py b/tests/conftest.py index 3f3cf5de8..37823bd4f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -72,12 +72,33 @@ def pytest_runtest_setup(item): setproctitle.setproctitle(f"pytest: {test_name}") +def validate_mem_option() -> None: + try: + assert sys.platform.startswith("linux") + nvidia_out = subprocess.check_output(["nvidia-smi"]).decode("utf-8") + except Exception as e: + print("--mem not supported on this platform", e) + raise e + + @pytest.hookimpl(tryfirst=True) def pytest_cmdline_main(config: pytest.Config) -> None: # Force disabling forked for non-linux systems if not sys.platform.startswith("linux"): config.option.forked = 
False + if config.getoption("--mem-monitoring-filepath"): + validate_mem_option() + subprocess.Popen( + [ + sys.executable, + "tests/monitor_test_mem.py", + "--die-with-parent", + "--out-csv-filepath", + config.getoption("--mem-monitoring-filepath"), + ] + ) + # Make sure that benchmarks are running on GPU and the number of workers if valid expr = Expression.compile(config.option.markexpr) is_benchmarks = expr.evaluate(MarkMatcher.from_markers((pytest.mark.benchmarks,))) @@ -316,6 +337,12 @@ def pytest_runtest_setup(item): def pytest_addoption(parser): + parser.addoption( + "--mem-monitoring-filepath", + action="store", + default=None, + help="Run memory monitoring, and store results to mem_monitoring_filepath. CUDA on linux ONLY.", + ) parser.addoption("--backend", action="store", default=None, help="Default simulation backend.") parser.addoption( "--logical", action="store_true", default=False, help="Consider logical cores in default number of workers." diff --git a/tests/monitor_test_mem.py b/tests/monitor_test_mem.py index c60dd07ec..6780ecb22 100644 --- a/tests/monitor_test_mem.py +++ b/tests/monitor_test_mem.py @@ -2,6 +2,7 @@ import csv import subprocess import time +import os import argparse @@ -44,6 +45,7 @@ def get_test_name_by_pid() -> dict[int, str]: def main() -> None: parser = argparse.ArgumentParser() parser.add_argument("--out-csv-filepath", type=str, required=True) + parser.add_argument("--die-with-parent", action="store_true") args = parser.parse_args() max_mem_by_test = defaultdict(int) @@ -54,7 +56,7 @@ def main() -> None: old_mem_by_test = {} num_results_written = 0 disp = False - while True: + while not args.die_with_parent or os.getppid() != 1: mem_by_pid = get_cuda_usage() test_by_psid = get_test_name_by_pid() num_tests = len(test_by_psid) @@ -88,6 +90,7 @@ def main() -> None: old_mem_by_test = _mem_by_test disp = not disp time.sleep(1.0) + print("Test monitor exiting") if __name__ == "__main__": From b68bf115730d38e36a2b6d060f5ca841eeaf7b12 
Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Mon, 17 Nov 2025 14:55:18 -0500 Subject: [PATCH 8/9] remove kill --- .github/workflows/production.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/production.yml b/.github/workflows/production.yml index 4960ef596..66e00b7bf 100644 --- a/.github/workflows/production.yml +++ b/.github/workflows/production.yml @@ -142,8 +142,6 @@ jobs: pytest --mem-monitoring-filepath "/mnt/data/artifacts/mem_test_${SLURM_JOB_NAME}.csv" --print -x -m "benchmarks" ./tests cat speed_test*.txt > "/mnt/data/artifacts/speed_test_${SLURM_JOB_NAME}.txt" - kill $(ps -ef | grep monitor_test_mem | grep -v grep | awk '{print $2}') - # tmate -S /tmp/tmate.sock wait tmate-exit EOF - name: Kill srun job systematically From f256dc7560e9f74d48d91f2dc52c72878ab30167 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Mon, 17 Nov 2025 17:58:34 -0500 Subject: [PATCH 9/9] retry --- .github/workflows/examples.yml | 138 +++++------ .github/workflows/generic.yml | 414 +++++++++++++++---------------- .github/workflows/production.yml | 118 ++++----- 3 files changed, 335 insertions(+), 335 deletions(-) diff --git a/.github/workflows/examples.yml b/.github/workflows/examples.yml index 14df54801..23774505c 100644 --- a/.github/workflows/examples.yml +++ b/.github/workflows/examples.yml @@ -1,81 +1,81 @@ -name: Examples (CPU) +# name: Examples (CPU) -on: - pull_request: - branches: - - main +# on: +# pull_request: +# branches: +# - main -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.ref }} - cancel-in-progress: true +# concurrency: +# group: ${{ github.workflow }}-${{ github.head_ref || github.ref }} +# cancel-in-progress: true -jobs: - run-examples: - runs-on: ubuntu-24.04 - name: ubuntu-24.04-3.12-examples +# jobs: +# run-examples: +# runs-on: ubuntu-24.04 +# name: ubuntu-24.04-3.12-examples - env: - HF_HUB_DOWNLOAD_TIMEOUT: 60 - FORCE_COLOR: 1 - PY_COLORS: 1 - GS_CACHE_FILE_PATH: ".cache/genesis" - 
TI_OFFLINE_CACHE: "1" - TI_OFFLINE_CACHE_CLEANING_POLICY: "never" - TI_OFFLINE_CACHE_FILE_PATH: ".cache/taichi" - TI_ENABLE_CUDA: "0" - TI_ENABLE_METAL: "0" - TI_ENABLE_OPENGL: "0" - TI_ENABLE_VULKAN: "0" - TI_DEBUG: "0" +# env: +# HF_HUB_DOWNLOAD_TIMEOUT: 60 +# FORCE_COLOR: 1 +# PY_COLORS: 1 +# GS_CACHE_FILE_PATH: ".cache/genesis" +# TI_OFFLINE_CACHE: "1" +# TI_OFFLINE_CACHE_CLEANING_POLICY: "never" +# TI_OFFLINE_CACHE_FILE_PATH: ".cache/taichi" +# TI_ENABLE_CUDA: "0" +# TI_ENABLE_METAL: "0" +# TI_ENABLE_OPENGL: "0" +# TI_ENABLE_VULKAN: "0" +# TI_DEBUG: "0" - steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - fetch-depth: 1 +# steps: +# - name: Checkout code +# uses: actions/checkout@v4 +# with: +# fetch-depth: 1 - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: "3.12" +# - name: Set up Python +# uses: actions/setup-python@v5 +# with: +# python-version: "3.12" - - name: Install Mesa OpenGL driver for headless rendering - run: | - sudo apt-get update - sudo apt install -y \ - libglu1-mesa \ - libegl-mesa0 \ - libgl1-mesa-dev +# - name: Install Mesa OpenGL driver for headless rendering +# run: | +# sudo apt-get update +# sudo apt install -y \ +# libglu1-mesa \ +# libegl-mesa0 \ +# libgl1-mesa-dev - - name: Install Python deps - run: | - pip install --upgrade pip setuptools wheel - pip install torch --index-url https://download.pytorch.org/whl/cpu - pip install -e '.[dev]' pynput +# - name: Install Python deps +# run: | +# pip install --upgrade pip setuptools wheel +# pip install torch --index-url https://download.pytorch.org/whl/cpu +# pip install -e '.[dev]' pynput - - name: Get gstaichi version - id: gstaichi_version - shell: bash - run: | - GSTAICHI_VERSION=$(python -c "import importlib.metadata ; print(importlib.metadata.version('gstaichi'))") - echo "GSTAICHI_VERSION=${GSTAICHI_VERSION}" - echo "GSTAICHI_VERSION=${GSTAICHI_VERSION}" >> $GITHUB_OUTPUT +# - name: Get gstaichi version +# id: gstaichi_version +# 
shell: bash +# run: | +# GSTAICHI_VERSION=$(python -c "import importlib.metadata ; print(importlib.metadata.version('gstaichi'))") +# echo "GSTAICHI_VERSION=${GSTAICHI_VERSION}" +# echo "GSTAICHI_VERSION=${GSTAICHI_VERSION}" >> $GITHUB_OUTPUT - - name: Restore cache - uses: actions/cache/restore@v4 - with: - path: .cache - key: ubuntu-24.04-3.12-examples-${{ steps.gstaichi_version.outputs.GSTAICHI_VERSION }} - restore-keys: | - ubuntu-24.04-3.12-examples-${{ steps.gstaichi_version.outputs.GSTAICHI_VERSION }}- +# - name: Restore cache +# uses: actions/cache/restore@v4 +# with: +# path: .cache +# key: ubuntu-24.04-3.12-examples-${{ steps.gstaichi_version.outputs.GSTAICHI_VERSION }} +# restore-keys: | +# ubuntu-24.04-3.12-examples-${{ steps.gstaichi_version.outputs.GSTAICHI_VERSION }}- - - name: Run examples suite - run: | - pytest -v -m examples tests/test_examples.py +# - name: Run examples suite +# run: | +# pytest -v -m examples tests/test_examples.py - - name: Save cache - if: always() - uses: actions/cache/save@v4 - with: - path: .cache - key: ubuntu-24.04-3.12-examples-${{ steps.gstaichi_version.outputs.GSTAICHI_VERSION }}-${{ github.run_id }}-${{ github.run_attempt }} +# - name: Save cache +# if: always() +# uses: actions/cache/save@v4 +# with: +# path: .cache +# key: ubuntu-24.04-3.12-examples-${{ steps.gstaichi_version.outputs.GSTAICHI_VERSION }}-${{ github.run_id }}-${{ github.run_attempt }} diff --git a/.github/workflows/generic.yml b/.github/workflows/generic.yml index f4ec80d1f..b44d31dac 100644 --- a/.github/workflows/generic.yml +++ b/.github/workflows/generic.yml @@ -1,207 +1,207 @@ -name: Generic - -on: - pull_request: - branches: - - main - release: - branches: - - main - types: [published] - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.ref }} - cancel-in-progress: ${{ github.event_name == 'pull_request' }} - -jobs: - generic-cpu: - name: ${{ matrix.OS }}-${{ matrix.PYTHON_VERSION }}-${{ matrix.GS_BACKEND }}-${{ 
matrix.GS_ENABLE_NDARRAY == '0' && 'field' || 'ndarray' }} - - strategy: - fail-fast: false - matrix: - # See official Github documentation for details: https://shorturl.at/NJgsj - OS: ["ubuntu-24.04", "macos-15"] - PYTHON_VERSION: ["3.10", "3.11", "3.12", "3.13"] - GS_BACKEND: ["cpu"] - GS_ENABLE_NDARRAY: ["1"] - include: - # CPU backend - dynamic array (other OSes) - - OS: "ubuntu-22.04" - PYTHON_VERSION: "3.12" - GS_BACKEND: "cpu" - GS_ENABLE_NDARRAY: "1" - - OS: "ubuntu-24.04-arm" - PYTHON_VERSION: "3.12" - GS_BACKEND: "cpu" - GS_ENABLE_NDARRAY: "1" - - OS: "windows-2025" - PYTHON_VERSION: "3.12" - GS_BACKEND: "cpu" - GS_ENABLE_NDARRAY: "1" - # CPU backend - field array - - OS: "ubuntu-24.04" - PYTHON_VERSION: "3.12" - GS_BACKEND: "cpu" - GS_ENABLE_NDARRAY: "0" - - OS: "ubuntu-24.04-arm" - PYTHON_VERSION: "3.12" - GS_BACKEND: "cpu" - GS_ENABLE_NDARRAY: "0" - - OS: "windows-2025" - PYTHON_VERSION: "3.12" - GS_BACKEND: "cpu" - GS_ENABLE_NDARRAY: "0" - - OS: "macos-15" - PYTHON_VERSION: "3.12" - GS_BACKEND: "cpu" - GS_ENABLE_NDARRAY: "0" - # GPU backend - field array - - OS: "macos-15" - PYTHON_VERSION: "3.12" - GS_BACKEND: "gpu" - GS_ENABLE_NDARRAY: "0" - - env: - HF_HUB_DOWNLOAD_TIMEOUT: "60" - FORCE_COLOR: "1" - PY_COLORS: "1" - GS_CACHE_FILE_PATH: ".cache/genesis" - GS_ENABLE_NDARRAY: ${{ matrix.GS_ENABLE_NDARRAY }} - GS_TORCH_FORCE_CPU_DEVICE: ${{ startsWith(matrix.OS, 'macos-') && '1' || '0' }} - TI_OFFLINE_CACHE: "1" - TI_OFFLINE_CACHE_CLEANING_POLICY: "never" - TI_OFFLINE_CACHE_FILE_PATH: ".cache/taichi" - TI_ENABLE_CUDA: "1" - TI_ENABLE_METAL: "1" - TI_ENABLE_OPENGL: "0" - TI_ENABLE_VULKAN: "0" - TI_DEBUG: "0" - - runs-on: ${{ matrix.OS }} - if: github.event_name != 'release' - - steps: - - name: Print system information (Windows) - if: startsWith(matrix.OS, 'windows-') - shell: pwsh - run: | - $cpu = Get-CimInstance -ClassName Win32_Processor - $ram = Get-CimInstance -ClassName Win32_ComputerSystem - [PSCustomObject]@{ - CPU_Name = $cpu.Name - 
Physical_Cores = ($cpu | Measure-Object -Property NumberOfCores -Sum).Sum - Logical_Processors = ($cpu | Measure-Object -Property NumberOfLogicalProcessors -Sum).Sum - Total_RAM_GB = [math]::Round($ram.TotalPhysicalMemory / 1GB, 2) - } - - - name: Checkout code - uses: actions/checkout@v4 - with: - fetch-depth: 1 - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.PYTHON_VERSION }} - - - name: Install system dependencies (Windows) - if: startsWith(matrix.OS, 'windows-') - shell: bash - run: | - curl -L -o mesa.7z https://github.com/pal1000/mesa-dist-win/releases/download/25.1.5/mesa3d-25.1.5-release-msvc.7z - 7z x mesa.7z -omesa - mv -v mesa/x64/* /C/Windows/System32/ - - - name: Install Mesa 25 OpenGL driver (Linux) - if: startsWith(matrix.OS, 'ubuntu-') - run: | - sudo add-apt-repository -y ppa:kisak/kisak-mesa - sudo apt-get update - sudo apt install -y \ - libglu1-mesa \ - libegl-mesa0 \ - libgl1-mesa-dev - - name: Install python dependencies - run: | - pip install --upgrade pip setuptools pkg-info wheel - pip3 install torch --index-url https://download.pytorch.org/whl/cpu - - - name: Black Format Check - if: matrix.OS == 'ubuntu-24.04' && matrix.PYTHON_VERSION == '3.12' - run: | - pip install black - black --line-length 120 --check . 
- - - name: Install Genesis - shell: bash - run: | - PYTHON_DEPS="dev" - if [[ "${{ matrix.OS }}" != 'ubuntu-24.04-arm' ]] ; then - PYTHON_DEPS="${PYTHON_DEPS},usd" - fi - pip install -e ".[${PYTHON_DEPS}]" - - - name: Get artifact prefix name - id: artifact_prefix - shell: bash - run: | - OS_FAMILY=$(python -c "import platform; print(platform.system())") - MACHINE_ARCH=$(python -c "import platform; print(platform.machine())") - GSTAICHI_VERSION=$(python -c "import importlib.metadata ; print(importlib.metadata.version('gstaichi'))") - echo "ARTIFACT_PREFIX=${OS_FAMILY}-${MACHINE_ARCH}-${GSTAICHI_VERSION}-${{ matrix.GS_BACKEND }}-${{ matrix.GS_ENABLE_NDARRAY }}" >> $GITHUB_OUTPUT - - - name: Restore Taichi Kernel Cache - if: always() - uses: actions/cache/restore@v4 - with: - path: .cache - key: ${{ steps.artifact_prefix.outputs.ARTIFACT_PREFIX }} - restore-keys: | - ${{ steps.artifact_prefix.outputs.ARTIFACT_PREFIX }}- - - - name: Run unit tests - run: | - pytest -v --logical --dev --backend ${{ matrix.GS_BACKEND }} -m required --forked ./tests - - - name: Save Updated Taichi Kernel Cache - # lets match this version to the black format check - if: >- - ${{ always() && - (matrix.OS == 'ubuntu-24.04' || matrix.OS == 'macos-15' || matrix.OS == 'windows-2025') && - matrix.PYTHON_VERSION == '3.12' }} - uses: actions/cache/save@v4 - with: - path: .cache - # Note that it is necessary to create a new archive systematically for now: - # See: https://github.com/actions/cache/issues/1594 - key: ${{ steps.artifact_prefix.outputs.ARTIFACT_PREFIX }}-${{ github.run_id }}-${{ github.run_attempt }} - - publish-pypi: - name: Publish on PyPI - runs-on: ubuntu-24.04 - permissions: - id-token: write - environment: - name: advance - - if: github.event_name == 'release' - - steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - fetch-depth: 1 - - - name: Build wheels - run: | - pip wheel --no-deps . 
-w wheelhouse - - - name: Publish the wheels on PyPI - uses: pypa/gh-action-pypi-publish@v1.12.4 - with: - packages-dir: wheelhouse - verify-metadata: true - attestations: true - print-hash: true - skip-existing: true +# name: Generic + +# on: +# pull_request: +# branches: +# - main +# release: +# branches: +# - main +# types: [published] + +# concurrency: +# group: ${{ github.workflow }}-${{ github.head_ref || github.ref }} +# cancel-in-progress: ${{ github.event_name == 'pull_request' }} + +# jobs: +# generic-cpu: +# name: ${{ matrix.OS }}-${{ matrix.PYTHON_VERSION }}-${{ matrix.GS_BACKEND }}-${{ matrix.GS_ENABLE_NDARRAY == '0' && 'field' || 'ndarray' }} + +# strategy: +# fail-fast: false +# matrix: +# # See official Github documentation for details: https://shorturl.at/NJgsj +# OS: ["ubuntu-24.04", "macos-15"] +# PYTHON_VERSION: ["3.10", "3.11", "3.12", "3.13"] +# GS_BACKEND: ["cpu"] +# GS_ENABLE_NDARRAY: ["1"] +# include: +# # CPU backend - dynamic array (other OSes) +# - OS: "ubuntu-22.04" +# PYTHON_VERSION: "3.12" +# GS_BACKEND: "cpu" +# GS_ENABLE_NDARRAY: "1" +# - OS: "ubuntu-24.04-arm" +# PYTHON_VERSION: "3.12" +# GS_BACKEND: "cpu" +# GS_ENABLE_NDARRAY: "1" +# - OS: "windows-2025" +# PYTHON_VERSION: "3.12" +# GS_BACKEND: "cpu" +# GS_ENABLE_NDARRAY: "1" +# # CPU backend - field array +# - OS: "ubuntu-24.04" +# PYTHON_VERSION: "3.12" +# GS_BACKEND: "cpu" +# GS_ENABLE_NDARRAY: "0" +# - OS: "ubuntu-24.04-arm" +# PYTHON_VERSION: "3.12" +# GS_BACKEND: "cpu" +# GS_ENABLE_NDARRAY: "0" +# - OS: "windows-2025" +# PYTHON_VERSION: "3.12" +# GS_BACKEND: "cpu" +# GS_ENABLE_NDARRAY: "0" +# - OS: "macos-15" +# PYTHON_VERSION: "3.12" +# GS_BACKEND: "cpu" +# GS_ENABLE_NDARRAY: "0" +# # GPU backend - field array +# - OS: "macos-15" +# PYTHON_VERSION: "3.12" +# GS_BACKEND: "gpu" +# GS_ENABLE_NDARRAY: "0" + +# env: +# HF_HUB_DOWNLOAD_TIMEOUT: "60" +# FORCE_COLOR: "1" +# PY_COLORS: "1" +# GS_CACHE_FILE_PATH: ".cache/genesis" +# GS_ENABLE_NDARRAY: ${{ matrix.GS_ENABLE_NDARRAY }} 
+# GS_TORCH_FORCE_CPU_DEVICE: ${{ startsWith(matrix.OS, 'macos-') && '1' || '0' }} +# TI_OFFLINE_CACHE: "1" +# TI_OFFLINE_CACHE_CLEANING_POLICY: "never" +# TI_OFFLINE_CACHE_FILE_PATH: ".cache/taichi" +# TI_ENABLE_CUDA: "1" +# TI_ENABLE_METAL: "1" +# TI_ENABLE_OPENGL: "0" +# TI_ENABLE_VULKAN: "0" +# TI_DEBUG: "0" + +# runs-on: ${{ matrix.OS }} +# if: github.event_name != 'release' + +# steps: +# - name: Print system information (Windows) +# if: startsWith(matrix.OS, 'windows-') +# shell: pwsh +# run: | +# $cpu = Get-CimInstance -ClassName Win32_Processor +# $ram = Get-CimInstance -ClassName Win32_ComputerSystem +# [PSCustomObject]@{ +# CPU_Name = $cpu.Name +# Physical_Cores = ($cpu | Measure-Object -Property NumberOfCores -Sum).Sum +# Logical_Processors = ($cpu | Measure-Object -Property NumberOfLogicalProcessors -Sum).Sum +# Total_RAM_GB = [math]::Round($ram.TotalPhysicalMemory / 1GB, 2) +# } + +# - name: Checkout code +# uses: actions/checkout@v4 +# with: +# fetch-depth: 1 + +# - name: Set up Python +# uses: actions/setup-python@v5 +# with: +# python-version: ${{ matrix.PYTHON_VERSION }} + +# - name: Install system dependencies (Windows) +# if: startsWith(matrix.OS, 'windows-') +# shell: bash +# run: | +# curl -L -o mesa.7z https://github.com/pal1000/mesa-dist-win/releases/download/25.1.5/mesa3d-25.1.5-release-msvc.7z +# 7z x mesa.7z -omesa +# mv -v mesa/x64/* /C/Windows/System32/ + +# - name: Install Mesa 25 OpenGL driver (Linux) +# if: startsWith(matrix.OS, 'ubuntu-') +# run: | +# sudo add-apt-repository -y ppa:kisak/kisak-mesa +# sudo apt-get update +# sudo apt install -y \ +# libglu1-mesa \ +# libegl-mesa0 \ +# libgl1-mesa-dev +# - name: Install python dependencies +# run: | +# pip install --upgrade pip setuptools pkg-info wheel +# pip3 install torch --index-url https://download.pytorch.org/whl/cpu + +# - name: Black Format Check +# if: matrix.OS == 'ubuntu-24.04' && matrix.PYTHON_VERSION == '3.12' +# run: | +# pip install black +# black --line-length 120 
--check . + +# - name: Install Genesis +# shell: bash +# run: | +# PYTHON_DEPS="dev" +# if [[ "${{ matrix.OS }}" != 'ubuntu-24.04-arm' ]] ; then +# PYTHON_DEPS="${PYTHON_DEPS},usd" +# fi +# pip install -e ".[${PYTHON_DEPS}]" + +# - name: Get artifact prefix name +# id: artifact_prefix +# shell: bash +# run: | +# OS_FAMILY=$(python -c "import platform; print(platform.system())") +# MACHINE_ARCH=$(python -c "import platform; print(platform.machine())") +# GSTAICHI_VERSION=$(python -c "import importlib.metadata ; print(importlib.metadata.version('gstaichi'))") +# echo "ARTIFACT_PREFIX=${OS_FAMILY}-${MACHINE_ARCH}-${GSTAICHI_VERSION}-${{ matrix.GS_BACKEND }}-${{ matrix.GS_ENABLE_NDARRAY }}" >> $GITHUB_OUTPUT + +# - name: Restore Taichi Kernel Cache +# if: always() +# uses: actions/cache/restore@v4 +# with: +# path: .cache +# key: ${{ steps.artifact_prefix.outputs.ARTIFACT_PREFIX }} +# restore-keys: | +# ${{ steps.artifact_prefix.outputs.ARTIFACT_PREFIX }}- + +# - name: Run unit tests +# run: | +# pytest -v --logical --dev --backend ${{ matrix.GS_BACKEND }} -m required --forked ./tests + +# - name: Save Updated Taichi Kernel Cache +# # lets match this version to the black format check +# if: >- +# ${{ always() && +# (matrix.OS == 'ubuntu-24.04' || matrix.OS == 'macos-15' || matrix.OS == 'windows-2025') && +# matrix.PYTHON_VERSION == '3.12' }} +# uses: actions/cache/save@v4 +# with: +# path: .cache +# # Note that it is necessary to create a new archive systematically for now: +# # See: https://github.com/actions/cache/issues/1594 +# key: ${{ steps.artifact_prefix.outputs.ARTIFACT_PREFIX }}-${{ github.run_id }}-${{ github.run_attempt }} + +# publish-pypi: +# name: Publish on PyPI +# runs-on: ubuntu-24.04 +# permissions: +# id-token: write +# environment: +# name: advance + +# if: github.event_name == 'release' + +# steps: +# - name: Checkout code +# uses: actions/checkout@v4 +# with: +# fetch-depth: 1 + +# - name: Build wheels +# run: | +# pip wheel --no-deps . 
-w wheelhouse + +# - name: Publish the wheels on PyPI +# uses: pypa/gh-action-pypi-publish@v1.12.4 +# with: +# packages-dir: wheelhouse +# verify-metadata: true +# attestations: true +# print-hash: true +# skip-existing: true diff --git a/.github/workflows/production.yml b/.github/workflows/production.yml index 66e00b7bf..90a47d577 100644 --- a/.github/workflows/production.yml +++ b/.github/workflows/production.yml @@ -28,69 +28,69 @@ env: OMNI_KIT_ALLOW_ROOT: "1" jobs: - unit-tests: - name: production-unit_tests-${{ matrix.GS_ENABLE_NDARRAY == '0' && 'field' || 'ndarray' }} - - runs-on: [self-hosted, coreweave, genesis-world] - - strategy: - fail-fast: true - max-parallel: 1 - matrix: - GS_ENABLE_NDARRAY: ["0", "1"] - - env: - GS_ENABLE_NDARRAY: ${{ matrix.GS_ENABLE_NDARRAY }} - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - name: Run unit tests - if: github.event_name == 'pull_request' - run: | - SLURM_JOB_NAME="$(uuidgen)_$(date +%Y%m%d_%H%M%S)" - echo "SLURM_JOB_NAME=${SLURM_JOB_NAME}" >> $GITHUB_ENV - - mkdir -p "${HOME}/.cache" "${HOME}/.venv" - - # TODO: USD baking does not currently support Python 3.11 since - # NVIDIA does not currently release `omniverse-kit==107.3` on PyPI. 
- # See: https://github.com/Genesis-Embodied-AI/Genesis/pull/1300 - srun \ - --container-image="/mnt/data/images/genesis-v${GENESIS_IMAGE_VER}.sqsh" \ - --container-mounts=\ - "${HOME}/.venv":/root/.venv,\ - "${HOME}/.cache":/root/.cache,\ - "${{ github.workspace }}":/root/workspace \ - --no-container-mount-home --container-workdir=/root/workspace \ - --export=NVIDIA_DRIVER_CAPABILITIES=all,BASH_ENV=/root/.bashrc,HF_TOKEN,GS_ENABLE_NDARRAY=${GS_ENABLE_NDARRAY} \ - --partition=hpc-mid --nodes=1 --gpus=8 --exclusive --time="${TIMEOUT_MINUTES}" \ - --job-name=${SLURM_JOB_NAME} \ - bash -e -s << 'EOF' - if test -n "$(find /root/.venv -maxdepth 0 -empty)"; then - python3 -m venv --system-site-packages /root/.venv - source /root/.venv/bin/activate - pip install --no-input --upgrade pip pkg-info wheel - pip install --no-input --ignore-installed --upgrade blinker pyparsing setuptools - fi - source /root/.venv/bin/activate - - pip install --no-input --extra-index-url https://pypi.nvidia.com/ omniverse-kit - pip install --no-input ".[dev,render,usd]" - - pytest -v -ra --backend gpu --dev --forked ./tests - EOF - - name: Kill srun job systematically - if: always() - run: | - if [ -n "${SLURM_JOB_NAME}" ] ; then - scancel --user=${USER} --name="${SLURM_JOB_NAME}" - fi + # unit-tests: + # name: production-unit_tests-${{ matrix.GS_ENABLE_NDARRAY == '0' && 'field' || 'ndarray' }} + + # runs-on: [self-hosted, coreweave, genesis-world] + + # strategy: + # fail-fast: true + # max-parallel: 1 + # matrix: + # GS_ENABLE_NDARRAY: ["0", "1"] + + # env: + # GS_ENABLE_NDARRAY: ${{ matrix.GS_ENABLE_NDARRAY }} + + # steps: + # - name: Checkout code + # uses: actions/checkout@v4 + # - name: Run unit tests + # if: github.event_name == 'pull_request' + # run: | + # SLURM_JOB_NAME="$(uuidgen)_$(date +%Y%m%d_%H%M%S)" + # echo "SLURM_JOB_NAME=${SLURM_JOB_NAME}" >> $GITHUB_ENV + + # mkdir -p "${HOME}/.cache" "${HOME}/.venv" + + # # TODO: USD baking does not currently support Python 3.11 since + # # 
NVIDIA does not currently release `omniverse-kit==107.3` on PyPI. + # # See: https://github.com/Genesis-Embodied-AI/Genesis/pull/1300 + # srun \ + # --container-image="/mnt/data/images/genesis-v${GENESIS_IMAGE_VER}.sqsh" \ + # --container-mounts=\ + # "${HOME}/.venv":/root/.venv,\ + # "${HOME}/.cache":/root/.cache,\ + # "${{ github.workspace }}":/root/workspace \ + # --no-container-mount-home --container-workdir=/root/workspace \ + # --export=NVIDIA_DRIVER_CAPABILITIES=all,BASH_ENV=/root/.bashrc,HF_TOKEN,GS_ENABLE_NDARRAY=${GS_ENABLE_NDARRAY} \ + # --partition=hpc-mid --nodes=1 --gpus=8 --exclusive --time="${TIMEOUT_MINUTES}" \ + # --job-name=${SLURM_JOB_NAME} \ + # bash -e -s << 'EOF' + # if test -n "$(find /root/.venv -maxdepth 0 -empty)"; then + # python3 -m venv --system-site-packages /root/.venv + # source /root/.venv/bin/activate + # pip install --no-input --upgrade pip pkg-info wheel + # pip install --no-input --ignore-installed --upgrade blinker pyparsing setuptools + # fi + # source /root/.venv/bin/activate + + # pip install --no-input --extra-index-url https://pypi.nvidia.com/ omniverse-kit + # pip install --no-input ".[dev,render,usd]" + + # pytest -v -ra --backend gpu --dev --forked ./tests + # EOF + # - name: Kill srun job systematically + # if: always() + # run: | + # if [ -n "${SLURM_JOB_NAME}" ] ; then + # scancel --user=${USER} --name="${SLURM_JOB_NAME}" + # fi benchmarks: name: production-benchmarks-${{ matrix.GS_ENABLE_NDARRAY == '0' && 'field' || 'ndarray' }} - needs: unit-tests + # needs: unit-tests runs-on: [self-hosted, coreweave, genesis-world] strategy: