MDAnalysis · jauy123 · Feb 25, 2025 · Mar 3, 2025 · Mar 3, 2025 · Mar 3, 2025
diff --git a/.github/actions/setup-deps/action.yaml b/.github/actions/setup-deps/action.yaml
@@ -72,6 +72,8 @@ inputs:
     default: 'networkx'
   openmm:
     default: 'openmm'
+  pooch:
+    default: 'pooch'
   pytng:
     default: 'pytng>=0.2.3'
   rdkit:
@@ -142,6 +144,7 @@ runs:
           ${{ inputs.netcdf4 }}
           ${{ inputs.networkx }}
           ${{ inputs.openmm }}
+          ${{ inputs.pooch }}
           ${{ inputs.pytng }}
           ${{ inputs.rdkit }}
           ${{ inputs.scikit-learn }}

diff --git a/package/CHANGELOG b/package/CHANGELOG
@@ -16,9 +16,9 @@ The rules for this file:
 -------------------------------------------------------------------------------
 ??/??/?? IAlibay, orbeckst, BHM-Bob, TRY-ER, Abdulrahman-PROG, pbuslaev,
          yuxuanzhuang, yuyuan871111, tanishy7777, tulga-rdn, Gareth-elliott,
-         hmacdope, tylerjereddy, cbouy, talagayev, DrDomenicoMarson
-
-
+         hmacdope, tylerjereddy, cbouy, talagayev, DrDomenicoMarson, jauy123,
+         BradyAJohnston
+
  * 2.10.0
 
 Fixes
@@ -42,6 +42,9 @@ Fixes
    directly passing them. (Issue #3520, PR #5006)
 
 Enhancements
+ * Added new function `MDAnalysis.topology.PDBParser.fetch_pdb` (also exposed
+   as `MDAnalysis.fetch_pdb`) to download structure files from wwPDB using
+   `pooch` as an optional dependency. This can be used to interactively
+   create `Universe` instances on the fly. (Issue #4907, PR #4943)
  * Added capability to calculate MSD from frames with irregular (non-linear)
    time spacing in analysis.msd.EinsteinMSD with keyword argument
    `non_linear=True` (Issue #5028, PR #5066)

diff --git a/package/MDAnalysis/__init__.py b/package/MDAnalysis/__init__.py
@@ -221,6 +221,8 @@
 
 from .due import due, Doi, BibTeX
 
+from .topology.PDBParser import fetch_pdb
+
 due.cite(
     Doi("10.25080/majora-629e541a-00e"),
     description="Molecular simulation analysis library",

diff --git a/package/MDAnalysis/topology/PDBParser.py b/package/MDAnalysis/topology/PDBParser.py
@@ -63,7 +63,7 @@
 .. autoclass:: PDBParser
    :members:
    :inherited-members:
-
+   
 """
 import numpy as np
 import warnings
@@ -95,6 +95,12 @@
 # Set up a logger for the PDBParser
 logger = logging.getLogger("MDAnalysis.topology.PDBParser")
 
+# pooch is an optional dependency: record whether it could be imported in
+# HAS_POOCH so that code needing it can check the flag before using it.
+try:
+    import pooch
+except ImportError:
+    HAS_POOCH = False
+else:
+    HAS_POOCH = True
 
 def float_or_default(val, default):
     try:
@@ -515,3 +521,103 @@ def _parse_conect(conect):
     bond_atoms = (int(conect[11 + i * 5: 16 + i * 5]) for i in
                   range(n_bond_atoms))
     return atom_id, bond_atoms
+
+def fetch_pdb(
+    PDB_IDS=None,
+    cache_path=None,
+    progressbar=False,
+    file_format="pdb.gz",
+):
+    """
+    Download one or more PDB files from the RCSB Protein Data Bank and cache them locally.
+
+    Given one or multiple PDB IDs, downloads the corresponding structure files in the specified
+    format and stores them in a local cache directory. If files are cached on disk, fetch_pdb() will skip the download and use
+    the cached version instead.
+
+    Returns the path(s) as a string to the downloaded files.
+
+    Parameters
+    ----------
+    PDB_IDS : str or sequence of str
+        A single PDB ID as a string, or a sequence of PDB IDs to fetch.
+    cache_path : str or pathlib.Path
+        Directory where downloaded file(s) will be cached.
+    file_format : str
+        The file extension/format to download (e.g., "cif", "pdb")
+    progressbar : bool, optional
+        If True, display a progress bar during file downloads. Default is False.
+
+    Returns
+    -------
+    str or list of str
+        The path(s) to the downloaded file(s). Returns a single string if one PDB ID is given,
+        or a list of strings if multiple PDB IDs are provided.
+
+    Raises
+    ------
+    requests.exceptions.HTTPError
+        If an invalid PDB code or file format is specified.
+
+    Notes
+    -----
+    This function downloads using the API established here at https://www.rcsb.org/docs/programmatic-access/file-download-services.
+
+    Examples
+    --------
+    Download a single PDB file:
+
+    >>> mda.fetch_pdb("1AKE", file_format="cif")
+    './pdb_cache/1AKE.cif'
+
+    Download multiple PDB files with a progress bar:
+
+    >>> mda.fetch_pdb(["1AKE", "4BWZ"], progressbar=True)
+    ['./pdb_cache/1AKE.pdb.gz', './pdb_cache/4BWZ.pdb.gz']
+
+    Download a single PDB file and converting it to a universe:
+
+    >>> mda.Universe(mda.fetch_pdb("1AKE"), file_format="pdb.gz")
+    <Universe with 3816 atoms>
+
+    Download multiple PDB files and converting them to a universe:
-    Download multiple PDB files and converting them to a universe:
+    Download multiple PDB files and convert each of them into a universe:
-    Download multiple PDB files and converting them to a universe:
+    Download multiple PDB files and convert each of them into a universe:
+
+    >>> [mda.Universe(mda.fetch_pdb(PDB_ID), file_format="pdb.gz") for PDB_ID in ("1AKE", "4BWZ")]
+    [<Universe with 3816 atoms>, <Universe with 2824 atoms>]
+
+    """
+
+    if not HAS_POOCH:
+        raise ModuleNotFoundError(
+            "pooch is needed as a dependency for fetch_pdb()"
+        )
+
+    if isinstance(PDB_IDS, str):
+        PDB_IDS = (PDB_IDS,)
+
+    if cache_path is None:
+        cache_path = pooch.os_cache("pdb_cache")
+
+    # Have to do this dictionary approach instead of using Pooch.retrieve in order to prevent the hardcoded known_hash warning from showing up.
+    registry_dictionary = {
+        f"{PDB_ID}.{file_format}": None for PDB_ID in PDB_IDS
+    }
+
+    downloader = pooch.create(
+        path=cache_path,
+        base_url="https://files.wwpdb.org/download/",
+        registry=registry_dictionary,
+    )
+
+    if len(PDB_IDS) == 1:
+        return str(
+            downloader.fetch(
+                fname=tuple(registry_dictionary.keys())[0],
+                progressbar=progressbar,
+            )
+        )
+    else:
+        return [
+            downloader.fetch(fname=file_name, progressbar=progressbar)
+            for file_name in registry_dictionary.keys()
+        ]
diff --git a/package/pyproject.toml b/package/pyproject.toml
@@ -72,6 +72,7 @@ extra_formats = [
     "h5py>=2.10",
     "chemfiles>=0.10",
     "parmed",
+    "pooch",
     "pyedr>=0.7.0",
     "pytng>=0.2.3",
     "gsd>3.0.0",

diff --git a/package/requirements.txt b/package/requirements.txt
@@ -13,6 +13,7 @@ networkx
 numpy>=1.23.2
 packaging
 parmed
+pooch
 pytest
 scikit-learn
 scipy

diff --git a/testsuite/MDAnalysisTests/topology/test_fetch_pdb.py b/testsuite/MDAnalysisTests/topology/test_fetch_pdb.py
@@ -0,0 +1,98 @@
+# -*- Mode: python; tab-width: 4; indent-tabs-mode:nil; coding:utf-8 -*-
+# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4 fileencoding=utf-8
+#
+# MDAnalysis --- https://www.mdanalysis.org
+# Copyright (c) 2006-2017 The MDAnalysis Development Team and contributors
+# (see the file AUTHORS for the full list of names)
+#
+# Released under the Lesser GNU Public Licence, v2.1 or any higher version
+#
+# Please cite your use of MDAnalysis in published work:
+#
+# R. J. Gowers, M. Linke, J. Barnoud, T. J. E. Reddy, M. N. Melo, S. L. Seyler,
+# D. L. Dotson, J. Domanski, S. Buchoux, I. M. Kenney, and O. Beckstein.
+# MDAnalysis: A Python package for the rapid analysis of molecular dynamics
+# simulations. In S. Benthall and S. Rostrup editors, Proceedings of the 15th
+# Python in Science Conference, pages 102-109, Austin, TX, 2016. SciPy.
+# doi: 10.25080/majora-629e541a-00e
+#
+# N. Michaud-Agrawal, E. J. Denning, T. B. Woolf, and O. Beckstein.
+# MDAnalysis: A Toolkit for the Analysis of Molecular Dynamics Simulations.
+# J. Comput. Chem. 32 (2011), 2319--2327, doi:10.1002/jcc.21787
+#
+
+import pytest
+
+import MDAnalysis as mda
+from MDAnalysis.topology.PDBParser import HAS_POOCH
+
+from urllib import request
+
+if HAS_POOCH:
+    from requests.exceptions import HTTPError
+
+try:
+    request.urlopen("https://files.wwpdb.org/", timeout=2)
+    HAS_ACCESS_TO_WWPDB = True
+except request.URLError:
+    HAS_ACCESS_TO_WWPDB = False
+
+
+@pytest.mark.skipif(
+    not (HAS_POOCH and HAS_ACCESS_TO_WWPDB),
+    reason="Pooch is not installed or can not connect to https://files.wwpdb.org/",
+)
+class TestDocstringExamples:
+    """This class tests all the examples found in fetch_pdb's docstring"""
+
+    @pytest.mark.parametrize("pdb_id", [("1AKE"), ("4BWZ")])
+    def test_one_file_download(self, tmp_path, pdb_id):
+        assert isinstance(
+            mda.fetch_pdb(pdb_id, cache_path=tmp_path, file_format="cif"), str
+        )
+
+    def test_multiple_files_download(self, tmp_path):
+        list_of_path_strings = mda.fetch_pdb(
+            ["1AKE", "4BWZ"], cache_path=tmp_path, progressbar=True
+        )
+        assert all(isinstance(PDB_ID, str) for PDB_ID in list_of_path_strings)
+
+    @pytest.mark.parametrize(
+        "pdb_id, n_atoms", [("1AKE", 3816), ("4BWZ", 2824)]
+    )
+    def test_files_to_universe(self, tmp_path, pdb_id, n_atoms):
+        u = mda.Universe(
+            mda.fetch_pdb(
+                pdb_id,
+                file_format="pdb.gz",
+                cache_path=tmp_path,
+                progressbar=True,
+            )
+        )
+        assert isinstance(u, mda.Universe) and (len(u.atoms) == n_atoms)
+
+
+@pytest.mark.skipif(
+    not (HAS_POOCH and HAS_ACCESS_TO_WWPDB),
+    reason="Pooch is not installed or can not connect to https://files.wwpdb.org/",
+)
+class TestExpectedErrors:
+
+    def test_invalid_pdb(self, tmp_path):
+        with pytest.raises(HTTPError):
+            mda.fetch_pdb(PDB_IDS="foobar", cache_path=tmp_path)
+
+    def test_invalid_file_format(self, tmp_path):
+        with pytest.raises(HTTPError):
+            mda.fetch_pdb(
+                PDB_IDS="1AKE", cache_path=tmp_path, file_format="barfoo"
+            )
+
+
+@pytest.mark.skipif(
+    HAS_POOCH,
+    reason="Pooch is installed.",
+)
+def test_pooch_installation(tmp_path):
+    with pytest.raises(ModuleNotFoundError):
+        mda.fetch_pdb("1AKE", cache_path=tmp_path, file_format="cif")
-Original file line number
+Diff line change
@@ Expand Up / @@ -13,6 +13,7 @@ networkx @@
     numpy>=1.23.2
     packaging
     parmed
+    pooch
     pytest
     scikit-learn
     scipy
@@ Expand Down @@