diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index d3831d5..6a8dd6d 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -35,17 +35,18 @@ jobs: any::pkgdown - name: Set up Python - uses: actions/setup-python@v2 + id: setup-python + uses: actions/setup-python@v5 with: - python-version: '3.8' + python-version: '3.11' - name: Install Python dependencies run: | python -m pip install --upgrade pip - pip install anndata - pip install numpy - env: - RETICULATE_PYTHON: /opt/hostedtoolcache/Python/3.8.10/x64/bin/python + pip install anndata numpy + # Pin reticulate to this interpreter for all subsequent steps so + # the AnnData <-> Seurat/SCE tests run instead of skipping. + echo "RETICULATE_PYTHON=${{ steps.setup-python.outputs.python-path }}" >> "$GITHUB_ENV" - name: Build pkgdown site run: | diff --git a/DESCRIPTION b/DESCRIPTION index a88d9dc..cccad68 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,20 +1,26 @@ Package: convert2anndata Type: Package -Title: Convert SingleCellExperiment and Seurat Objects to AnnData -Version: 0.2.0 +Title: Bidirectional Conversion Between AnnData and SingleCellExperiment / Seurat +Version: 0.3.0 Author: Dominik Otto [aut, cre] () Maintainer: Dominik Otto -Description: This package provides functions to convert SingleCellExperiment and Seurat objects to AnnData format. It handles split layers (Seurat), assays, dimensional reductions, metadata, and alternative experiments, ensuring comprehensive conversion. +Description: Bidirectional conversion between AnnData (.h5ad) and either + Seurat or SingleCellExperiment objects. Handles split layers (Seurat), + assays, dimensional reductions (obsm <-> reducedDims), metadata + (obs/var <-> colData/rowData), layers, and alternative experiments, + aiming for a faithful roundtrip. License: GPL-3 Encoding: UTF-8 LazyData: true -RoxygenNote: 7.3.1 -URL: https://research.fredhutch.org/setty/en.html +RoxygenNote: 7.3.3 +URL: https://github.com/settylab/convert2anndata, https://settylab.github.io/convert2anndata BugReports: https://github.com/settylab/convert2anndata/issues Imports: Seurat, + SeuratObject, SingleCellExperiment, anndata, + reticulate, optparse, Matrix, S4Vectors, @@ -23,5 +29,7 @@ Imports: Suggests: testthat (>= 3.0.0), covr, - BiocNeighbors + BiocNeighbors, + withr, + R6 biocViews: SingleCellData, DataImport, Transcriptomics, Sequencing, RNASeq diff --git a/NAMESPACE b/NAMESPACE index 8f3a716..607e56b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,14 +1,26 @@ # Generated by roxygen2: do not edit by hand export(align_metadata_seurat) +export(anndata_names) export(attach_alt_experiments_sce) +export(attach_reductions_seurat) +export(check_anndata_python) export(cli_convert) +export(convert_anndata_to_sce) +export(convert_anndata_to_seurat) export(convert_commands) export(convert_graph_to_colPair) export(convert_seurat_to_sce) export(convert_to_anndata) export(create_combined_assay) +export(default_reduction_map) +export(diagnose_anndata_python) export(ensure_csparse_matrix) +export(extract_anndata_X) +export(extract_anndata_layers) +export(extract_anndata_obsm) +export(extract_anndata_obsp) +export(extract_anndata_raw) export(extract_counts_matrix) export(extract_data) export(extract_pairs) @@ -20,6 +32,7 @@ export(process_layers) export(process_main_assay) export(process_metadata_and_pairwise) export(process_other_assays) +export(setup_anndata_python) export(timestamped_cat) export(update_seurat_object) import(Matrix) @@ -36,6 +49,8 @@ importFrom(S4Vectors,subjectHits) importFrom(Seurat,"DefaultAssay<-") importFrom(Seurat,Cells) importFrom(Seurat,CreateAssayObject) +importFrom(Seurat,CreateDimReducObject) +importFrom(Seurat,CreateSeuratObject) importFrom(Seurat,DefaultAssay) importFrom(Seurat,GetAssayData) importFrom(Seurat,SetAssayData) @@ -54,11 +69,14 @@ importFrom(SingleCellExperiment,reducedDim) importFrom(SingleCellExperiment,reducedDims) importFrom(SingleCellExperiment,removeAltExps) importFrom(SingleCellExperiment,rowPairs) +importFrom(SummarizedExperiment,"colData<-") +importFrom(SummarizedExperiment,"rowData<-") importFrom(SummarizedExperiment,assay) importFrom(SummarizedExperiment,assays) importFrom(SummarizedExperiment,colData) importFrom(SummarizedExperiment,rowData) importFrom(anndata,AnnData) +importFrom(anndata,read_h5ad) importFrom(anndata,write_h5ad) importFrom(methods,as) importFrom(methods,is) diff --git a/NEWS.md b/NEWS.md new file mode 100644 index 0000000..02698c1 --- /dev/null +++ b/NEWS.md @@ -0,0 +1,95 @@ +# convert2anndata 0.3.0 + +## Bidirectional conversion + +`convert2anndata` is now a bidirectional converter. In addition to the +existing SingleCellExperiment / Seurat → AnnData direction, it can +convert AnnData (`.h5ad`) objects into Seurat or SingleCellExperiment +objects: + +* `convert_anndata_to_seurat()` — AnnData → Seurat. +* `convert_anndata_to_sce()` — AnnData → SingleCellExperiment. + +Both accept either an in-memory AnnData object or a path to an `.h5ad` +file. AnnData components are mapped as follows: `X` / `layers` → +assays, `obsm` → dimensional reductions, `obs` / `var` → cell / feature +metadata, and `obsp` → Seurat graphs (for the Seurat target). + +## Command-line interface + +* `cli_convert()` now dispatches on the input file extension: + `.rds` → `.h5ad` and `.h5ad` → `.rds`. +* The `.h5ad` → `.rds` direction supports `-t seurat` (default) or + `-t sce` to choose the target object type. + +## Python / reticulate setup + +New helpers make the reticulate ↔ Python `anndata` wiring explicit and +easier to debug: + +* `setup_anndata_python()` — resolve and activate a Python environment + that has `anndata` installed. +* `check_anndata_python()` — fail fast with an actionable message when + Python, `anndata`, or `numpy` are not reachable. +* `diagnose_anndata_python()` — print a full diagnostic of the active + Python environment. + +The path entry points to the converters call `check_anndata_python()` +automatically. + +## Seurat 5 support and `use_raw` + +* Conversions are robust across Seurat 4 / Seurat 5 layer and slot + naming. +* `use_raw = "auto"` (the default) uses `adata.raw` as the counts + matrix only when no requested `counts_layer` is present; `"always"` + and `"never"` force the behaviour, and a logical value is still + accepted for backward compatibility. + +## New exported helpers + +The full `extract_anndata_*` family is now exported for building custom +conversions: `extract_anndata_X()`, `extract_anndata_layers()`, +`extract_anndata_obsm()`, `extract_anndata_obsp()`, and +`extract_anndata_raw()`. Also new: `attach_reductions_seurat()`. + +## Tests, evaluation harness, and validation + +* About ten new test files cover the AnnData → Seurat / SCE direction, + edge cases, sparse-matrix handling, `raw` / `obsp`, and realistic + scanpy-style datasets. Tests that need Python `anndata` skip + gracefully when it is unavailable. +* A standalone evaluation harness (`eval/`) exercises full roundtrips, + edge cases, a real pbmc3k dataset, and a quantitative + roundtrip-fidelity check. + +### Validation results (0.3.0) + +Validated with R 4.4.1, Seurat 5.4.0, SingleCellExperiment 1.28.1, +reticulate 1.45.0, and Python `anndata` 0.12.0: + +* `R CMD check`: 0 errors, 0 warnings, 3 (benign) notes. +* `testthat`: 387 passing, 0 failing, 1 skipped (a CLI subprocess test + that needs the package installed on the library path). +* Roundtrip fidelity: `X`, `obs`, `var`, `obsm`, and `obsp` roundtrip + within numeric tolerance through the SCE-mediated path; `obsp` also + roundtrips through the Seurat-mediated path. + +### Known limitations + +* The Seurat-mediated path (`AnnData → Seurat → ...`) cannot preserve + arbitrary extra `layers` or `var` (feature) metadata, because a + Seurat assay only has `counts` / `data` / `scale.data` slots and no + feature-metadata store. Use the SCE-mediated path when verbatim + layers or `var` columns matter. +* `convert_anndata_to_sce()` does not yet attach `obsp` matrices as + `colPairs` (the Seurat target does attach them as graphs). This is a + planned follow-up. + +## Documentation + +* `README` rewritten with a direction reference table, Python + environment notes, and a troubleshooting section. +* `_pkgdown.yml` updated for bidirectional conversion with a grouped + reference index. +* `DESCRIPTION` retitled and bumped to 0.3.0. diff --git a/R/anndata_keys.R b/R/anndata_keys.R new file mode 100644 index 0000000..dc6c5cb --- /dev/null +++ b/R/anndata_keys.R @@ -0,0 +1,38 @@ +#' Extract dict-like keys from an AnnData mapping (layers, obsm, uns, ...) +#' +#' AnnData's mapping attributes (`layers`, `obsm`, `uns`, ...) expose a +#' `.keys()` method. Different combinations of `reticulate` and `anndata` +#' return that result in different shapes: +#' +#' - newer reticulate auto-converts the Python `KeysView` to an R character +#' vector (so `as.character(builtins$list(x$keys()))` runs `list()` on a +#' char-vec and splits each name into individual characters); +#' - older versions hand back a Python object that needs `builtins$list()` +#' to materialise. +#' +#' This helper handles both cases, plus the edge case where the mapping has +#' no `.keys()` method but `names()` works. +#' +#' @param mapping The mapping attribute (e.g. `adata$layers`). +#' @return A character vector of keys, or `character(0)` on failure. +#' @keywords internal +anndata_mapping_keys <- function(mapping) { + raw <- tryCatch(mapping$keys(), error = function(e) NULL) + + if (is.character(raw)) { + return(raw) + } + + if (!is.null(raw)) { + builtins <- reticulate::import_builtins() + py_listed <- tryCatch(builtins$list(raw), error = function(e) NULL) + if (is.character(py_listed)) return(py_listed) + if (!is.null(py_listed)) { + return(tryCatch(as.character(py_listed), error = function(e) character(0))) + } + } + + nm <- tryCatch(names(mapping), error = function(e) NULL) + if (is.character(nm)) return(nm) + character(0) +} diff --git a/R/anndata_names.R b/R/anndata_names.R new file mode 100644 index 0000000..43e99dc --- /dev/null +++ b/R/anndata_names.R @@ -0,0 +1,23 @@ +#' Extract obs/var names from an AnnData object +#' +#' Cell and feature names on AnnData objects are Python pandas Indexes; +#' R's `rownames()`/`colnames()` do not return them reliably. This helper +#' coerces them to character vectors via Python's `list()` builtin, falling +#' back to `names()` if that path fails. +#' +#' @param adata An AnnData object (e.g. as returned by `anndata::read_h5ad`). +#' @return A list with elements `obs_names` (cells) and `var_names` (features), +#' each a character vector or `NULL` if extraction failed. +#' @export +anndata_names <- function(adata) { + builtins <- reticulate::import_builtins() + obs_names <- tryCatch( + as.character(builtins$list(adata$obs_names)), + error = function(e) NULL + ) + var_names <- tryCatch( + as.character(builtins$list(adata$var_names)), + error = function(e) NULL + ) + list(obs_names = obs_names, var_names = var_names) +} diff --git a/R/attach_alt_experiments_sce.R b/R/attach_alt_experiments_sce.R index 7c80a6b..27d2de5 100644 --- a/R/attach_alt_experiments_sce.R +++ b/R/attach_alt_experiments_sce.R @@ -19,7 +19,7 @@ attach_alt_experiments_sce <- function(data, sce, altExp_names) { if (inherits(alt_assay, "Assay5")) { alt_counts <- GetAssayData(alt_assay, layer = "counts") } else if (inherits(alt_assay, "Assay")) { - alt_counts <- GetAssayData(alt_assay, slot = "counts") + alt_counts <- GetAssayData(alt_assay, layer = "counts") } else { next # Unsupported assay type } diff --git a/R/attach_reductions_seurat.R b/R/attach_reductions_seurat.R new file mode 100644 index 0000000..98637e2 --- /dev/null +++ b/R/attach_reductions_seurat.R @@ -0,0 +1,91 @@ +#' Default mapping from AnnData obsm keys to Seurat reduction (name, key) pairs +#' +#' Provides the canonical Seurat naming for the obsm keys conventionally +#' produced by scanpy. Users who want extra mappings should pass a list with +#' the same shape into `attach_reductions_seurat()` (or its callers). +#' +#' @return A named list. Each name is an obsm key (e.g. `X_pca`); each +#' element is a list with `name` (Seurat reduction name, e.g. `"pca"`) +#' and `key` (column-name prefix, e.g. `"PC_"`). +#' @export +default_reduction_map <- function() { + list( + X_pca = list(name = "pca", key = "PC_"), + X_umap = list(name = "umap", key = "UMAP_"), + X_tsne = list(name = "tsne", key = "tSNE_"), + X_diffmap = list(name = "diffmap", key = "DM_"), + X_phate = list(name = "phate", key = "PHATE_"), + X_harmony = list(name = "harmony", key = "harmony_"), + X_scvi = list(name = "scvi", key = "scVI_"), + X_lsi = list(name = "lsi", key = "LSI_"), + X_spectral = list(name = "spectral", key = "spectral_") + ) +} + +#' Attach obsm embeddings to a Seurat object as dimensional reductions +#' +#' Uses an obsm-key -> (reduction-name, key-prefix) mapping. Keys not in the +#' mapping fall back to a derived name: leading `X_` stripped, lowercased, +#' with a sanitized key prefix. +#' +#' @param seurat_obj A Seurat object to attach reductions to. +#' @param obsm Named list of numeric matrices (cells x dim), as returned by +#' `extract_anndata_obsm()`. +#' @param assay Assay name to associate the reductions with. Defaults to "RNA". +#' @param reduction_map Named list overriding or extending +#' `default_reduction_map()`. Each entry maps an obsm key (e.g. `"X_pca"`) +#' to a list with `name` and `key`. The supplied map is merged on top of +#' the defaults; pass an empty list to keep defaults, or pass an entry +#' with `name = NA` to disable a default. +#' @return The Seurat object with reductions attached. +#' @importFrom Seurat CreateDimReducObject +#' @export +attach_reductions_seurat <- function(seurat_obj, obsm, assay = "RNA", + reduction_map = list()) { + if (length(obsm) == 0) return(seurat_obj) + + merged_map <- default_reduction_map() + for (k in names(reduction_map)) { + merged_map[[k]] <- reduction_map[[k]] + } + + for (k in names(obsm)) { + mat <- obsm[[k]] + if (nrow(mat) != ncol(seurat_obj)) { + warning(sprintf( + "obsm['%s'] has %d rows but the Seurat object has %d cells; skipping.", + k, nrow(mat), ncol(seurat_obj) + )) + next + } + if (k %in% names(merged_map)) { + entry <- merged_map[[k]] + if (is.null(entry) || (length(entry$name) == 1 && is.na(entry$name))) { + timestamped_cat(sprintf("Skipping obsm['%s'] (disabled in reduction_map).\n", k)) + next + } + red_name <- entry$name + red_key <- entry$key %||% paste0(red_name, "_") + } else { + red_name <- tolower(sub("^X_", "", k)) + red_key <- paste0(red_name, "_") + } + sanitized_key <- gsub("[^[:alnum:]]", "", red_key) + if (!grepl("_$", sanitized_key)) sanitized_key <- paste0(sanitized_key, "_") + colnames(mat) <- paste0(sanitized_key, seq_len(ncol(mat))) + # Seurat may have rewritten cell names during object construction; align + # the embedding rownames to the seurat object so CreateDimReducObject + # accepts them. + rownames(mat) <- colnames(seurat_obj) + + seurat_obj[[red_name]] <- Seurat::CreateDimReducObject( + embeddings = mat, + key = sanitized_key, + assay = assay + ) + timestamped_cat(sprintf("Added obsm['%s'] as reduction '%s'.\n", k, red_name)) + } + seurat_obj +} + +`%||%` <- function(a, b) if (is.null(a)) b else a diff --git a/R/check_anndata_python.R b/R/check_anndata_python.R new file mode 100644 index 0000000..859661d --- /dev/null +++ b/R/check_anndata_python.R @@ -0,0 +1,243 @@ +#' Check that reticulate has a working Python with the `anndata` module +#' +#' Reticulate-based packages typically fail late and cryptically when the +#' Python side isn't wired up correctly: the user gets a numpy import +#' error from deep inside Python, far from the call site, with no hint +#' as to what to fix. This helper runs a layered check at the start of +#' a conversion call and stops with a single message that says exactly +#' what is wrong and how to fix it. +#' +#' Checks performed, in order, with stop-at-first-failure semantics: +#' +#' 1. **A Python interpreter is reachable.** If `reticulate::py_available()` +#' is `FALSE` even after initialisation, no Python could be located. +#' 2. **`PYTHONPATH` does not point at a different Python's site-packages.** +#' A common HPC failure mode is that loading an R module exports +#' paths like `/app/.../Python-3.11/site-packages` while reticulate +#' has selected a 3.8 / 3.12 interpreter, producing the cryptic +#' "Error importing numpy: you should not try to import numpy from +#' its source directory" error. We detect this *before* it fires. +#' 3. **The Python `anndata` module is importable.** +#' 4. **`numpy` imports without complaint** (catches any remaining ABI +#' mismatches). +#' 5. **An end-to-end smoke probe**: round-trip a 2x2 AnnData object +#' through reticulate. This is the only way to be sure the env is +#' actually usable for conversion work. +#' +#' Each failure produces a tailored error message including the Python +#' interpreter that was selected and a one-line suggested fix. +#' +#' @param action What to do on failure: `"stop"` (default) raises an +#' error; `"warn"` emits a warning and returns `FALSE`; `"silent"` +#' returns `FALSE` quietly. +#' @param verbose Logical. When `TRUE`, prints a full diagnostic block +#' on success showing Python path, version, anndata version, numpy +#' version, `RETICULATE_PYTHON` and `CONDA_PREFIX`. Useful as a +#' one-stop "what does my env actually look like" call. +#' @return `TRUE` (invisibly) when all checks pass, `FALSE` when any +#' check fails and `action != "stop"`. +#' @examples +#' \dontrun{ +#' check_anndata_python() # stops with a helpful message +#' check_anndata_python(verbose = TRUE) # also prints env diagnostics +#' if (check_anndata_python(action = "silent")) { +#' adata <- anndata::read_h5ad(path) +#' } +#' } +#' @export +check_anndata_python <- function(action = c("stop", "warn", "silent"), + verbose = FALSE) { + action <- match.arg(action) + + emit <- function(msg) { + switch(action, + stop = stop(msg, call. = FALSE), + warn = { warning(msg, call. = FALSE); FALSE }, + silent = FALSE + ) + } + + python_path <- tryCatch( + reticulate::py_config()$python, + error = function(e) NA_character_ + ) + + # 1. Reachable Python + if (!isTRUE(reticulate::py_available(initialize = TRUE))) { + return(emit(paste0( + "No Python interpreter available to reticulate.\n", + "Try one of:\n", + " Sys.setenv(RETICULATE_PYTHON = '/path/to/python')\n", + " reticulate::use_condaenv('your-env')\n", + " convert2anndata::setup_anndata_python('your-env')\n", + " anndata::install_anndata() # let reticulate manage a venv" + ))) + } + + # 2. PYTHONPATH mismatch (HPC pollution detection) + pp_problem <- detect_pythonpath_mismatch(python_path) + if (!is.null(pp_problem)) { + return(emit(paste0( + "PYTHONPATH appears to point at a different Python's site-packages.\n", + " Selected Python: ", python_path, "\n", + " Conflicting PYTHONPATH entry: ", pp_problem, "\n", + "This is a common HPC failure mode (an R module pushes paths for\n", + "another Python version onto PYTHONPATH). It produces the cryptic\n", + "'Error importing numpy: you should not try to import numpy from\n", + "its source directory' error if you proceed.\n", + "Fix: clear PYTHONPATH before launching R:\n", + " unset PYTHONPATH && Rscript your-script.R\n", + "Or from inside R, before any reticulate import:\n", + " Sys.unsetenv('PYTHONPATH'); .rs.restartR() # or quit + restart" + ))) + } + + # 3. anndata module importable + if (!isTRUE(reticulate::py_module_available("anndata"))) { + return(emit(paste0( + "Python module 'anndata' is not importable.\n", + " Selected Python: ", python_path, "\n", + "Fixes:\n", + " - Install the module into the active Python:\n", + " reticulate::py_install('anndata') \n", + " - Or use the anndata R package's bundled installer:\n", + " anndata::install_anndata()\n", + " - Or point reticulate at a Python that already has anndata:\n", + " Sys.setenv(RETICULATE_PYTHON = '/path/to/python')" + ))) + } + + # 4. numpy import works + numpy_err <- tryCatch({ + reticulate::py_run_string("import numpy") + NULL + }, error = function(e) conditionMessage(e)) + + if (!is.null(numpy_err)) { + msg <- paste0( + "Python `import numpy` failed.\n", + " Selected Python: ", python_path, "\n", + " Error: ", numpy_err, "\n", + "If the error mentions 'do not try to import numpy from its source\n", + "directory', PYTHONPATH is likely polluted by an HPC module that\n", + "exposes site-packages for a different Python version. Clear it:\n", + " Sys.unsetenv('PYTHONPATH')\n", + "and start a fresh R session before retrying." + ) + return(emit(msg)) + } + + # 5. End-to-end smoke probe + smoke_err <- tryCatch({ + reticulate::py_run_string(paste( + "import anndata, numpy as np", + "_probe_ad = anndata.AnnData(X=np.array([[1.0,2.0],[3.0,4.0]]))", + "_probe_shape = _probe_ad.shape", + sep = "\n" + )) + NULL + }, error = function(e) conditionMessage(e)) + + if (!is.null(smoke_err)) { + return(emit(paste0( + "anndata smoke probe failed: a Python `anndata.AnnData(...)` call\n", + "did not complete cleanly.\n", + " Selected Python: ", python_path, "\n", + " Error: ", smoke_err, "\n", + "Likely causes: ABI mismatch between numpy and anndata, or a partial\n", + "install. Reinstall into a clean env:\n", + " anndata::install_anndata()" + ))) + } + + if (isTRUE(verbose)) { + print_python_env_diagnostics(python_path) + } + + invisible(TRUE) +} + +#' Print a diagnostic block for the currently-active Python env +#' +#' Convenience wrapper for `check_anndata_python(verbose = TRUE, +#' action = "silent")`. Always prints the diagnostic block; never +#' raises. Returns `TRUE` if the env is usable, `FALSE` otherwise. +#' +#' @return Logical, invisibly. +#' @export +diagnose_anndata_python <- function() { + ok <- suppressWarnings(check_anndata_python(action = "silent")) + python_path <- tryCatch( + reticulate::py_config()$python, + error = function(e) NA_character_ + ) + print_python_env_diagnostics(python_path, ok = ok) + invisible(ok) +} + +# ---- Internals ---------------------------------------------------------- + +#' @keywords internal +detect_pythonpath_mismatch <- function(python_path) { + pp <- Sys.getenv("PYTHONPATH", unset = "") + if (!nzchar(pp) || is.na(python_path) || !nzchar(python_path)) return(NULL) + + # Determine the Python major.minor of the selected interpreter from + # `py_config()` (most reliable) or the binary path. + cfg_ver <- tryCatch( + as.character(reticulate::py_config()$version), + error = function(e) NA_character_ + ) + active_mm <- if (!is.na(cfg_ver) && nzchar(cfg_ver)) { + sub("^(\\d+\\.\\d+).*$", "\\1", cfg_ver) + } else NA_character_ + + if (is.na(active_mm) || !nzchar(active_mm)) return(NULL) + + # Look for any entry on PYTHONPATH that pins a Python major.minor + # (e.g. `python3.10/site-packages`, `Python-3.11/site-packages`, + # `Python/3.11/...`) where the X.Y disagrees with the active + # interpreter. + patterns <- c( + "[Pp]ython(\\d+\\.\\d+)", # python3.10 Python3.11 + "[Pp]ython[-/](\\d+\\.\\d+)" # Python-3.11 Python/3.11 + ) + entries <- strsplit(pp, .Platform$path.sep, fixed = TRUE)[[1]] + for (entry in entries) { + for (pat in patterns) { + m <- regmatches(entry, regexec(pat, entry))[[1]] + if (length(m) >= 2 && nzchar(m[2]) && m[2] != active_mm) { + return(entry) + } + } + } + NULL +} + +#' @keywords internal +print_python_env_diagnostics <- function(python_path, ok = TRUE) { + cat("---- convert2anndata Python environment ----\n") + cat("status: ", if (isTRUE(ok)) "OK" else "BROKEN", "\n") + cat("python: ", python_path, "\n") + cfg_ver <- tryCatch(reticulate::py_config()$version, error = function(e) NA) + cat("python version: ", as.character(cfg_ver), "\n") + cat("RETICULATE_PYTHON:", Sys.getenv("RETICULATE_PYTHON", unset = ""), "\n") + cat("CONDA_PREFIX: ", Sys.getenv("CONDA_PREFIX", unset = ""), "\n") + pp <- Sys.getenv("PYTHONPATH", unset = "") + cat("PYTHONPATH: ", if (nchar(pp) > 80) paste0(substr(pp, 1, 77), "...") else pp, "\n") + ad_v <- tryCatch( + reticulate::py_run_string( + "import anndata, numpy; _v = (anndata.__version__, numpy.__version__)", + local = TRUE + )$`_v`, + error = function(e) NULL + ) + if (!is.null(ad_v)) { + cat("anndata (Python): ", ad_v[[1]], "\n") + cat("numpy: ", ad_v[[2]], "\n") + } else { + cat("anndata (Python): \n") + cat("numpy: \n") + } + cat("--------------------------------------------\n") +} diff --git a/R/cli_convert.R b/R/cli_convert.R index 9d6ab50..f6b520d 100644 --- a/R/cli_convert.R +++ b/R/cli_convert.R @@ -1,81 +1,103 @@ #' Command Line Interface for convert2anndata #' -#' This function serves as a command line interface for the convert2anndata package. -#' It parses command line arguments and calls the appropriate functions to convert -#' a SingleCellExperiment or Seurat object to an AnnData object. +#' Bidirectional CLI: dispatches on the input file extension. An `.rds` input +#' is treated as a Seurat or SingleCellExperiment object and converted to an +#' AnnData `.h5ad` file. An `.h5ad` input is read with `anndata::read_h5ad()` +#' and converted to a Seurat (default) or SingleCellExperiment object saved +#' to `.rds`. #' #' @import optparse -#' @importFrom anndata write_h5ad +#' @importFrom anndata write_h5ad read_h5ad #' @importFrom optparse make_option OptionParser parse_args #' @export cli_convert <- function() { anndata::write_h5ad + anndata::read_h5ad - # Description and help description <- paste( - "This script converts a potentially old Seurat object or a SingleCellExperiment", - "stored in an RDS file into an AnnData object stored as an H5AD file.", - "The user can specify input and output file paths, with an option to change", - "the output filename from .rds to .h5ad if no output is specified." + "Bidirectional converter between AnnData (.h5ad) and Seurat / SingleCellExperiment (.rds).", + "Input format is detected by extension: .rds -> .h5ad, .h5ad -> .rds." ) - # Set up command-line options option_list <- list( make_option(c("-i", "--input"), type = "character", default = NULL, - help = "Path to the input RDS file containing the SingleCellExperiment object. This option is required.", + help = "Path to the input file (.rds or .h5ad). Required.", metavar = "file" ), make_option(c("-o", "--output"), type = "character", default = NULL, help = paste( - "Path to the output H5AD file. If not specified,", - "the output path is derived by replacing the .rds", - "extension of the input path with .h5ad." + "Path to the output file. If not specified, the output path is derived", + "by swapping the extension (.rds <-> .h5ad)." ), metavar = "file" ), make_option(c("-a", "--assay"), type = "character", default = "counts", - help = "The assay to use as the main matrix (anndata.X). Defaults to 'counts'.", + help = paste( + "For .rds -> .h5ad: assay to use as anndata.X (defaults to 'counts').", + "For .h5ad -> .rds: layer name to use as the counts assay (defaults to 'counts')." + ), metavar = "assay_name" ), make_option(c("-d", "--disable-recursive-altExp"), action = "store_true", default = FALSE, help = "Disable recursive recovery of altExperiments and discard them instead.", metavar = "boolean" + ), + make_option(c("-t", "--target"), + type = "character", default = "seurat", + help = paste( + "For .h5ad -> .rds only: target object type, 'seurat' (default) or 'sce'." + ), + metavar = "type" ) ) - # Parse command-line arguments opt_parser <- OptionParser(option_list = option_list, description = description) opt <- parse_args(opt_parser) - # Check if input file is provided if (is.null(opt$input)) { - stop("No input file provided. Use --input to specify the RDS file.", - call. = FALSE - ) + stop("No input file provided. Use --input to specify the input file.", call. = FALSE) } - # Set output filename - if (is.null(opt$output)) { - opt$output <- sub("\\.[rR][dD][sS]$", ".h5ad", opt$input, ignore.case = TRUE) - } - - # Load data - timestamped_cat("Loading data from:", opt$input, "\n") - data <- readRDS(opt$input) - - # Convert to SingleCellExperiment if necessary - sce <- convert_seurat_to_sce(data) - timestamped_cat("Data loaded and converted successfully if needed.\n") + is_h5ad <- grepl("\\.h5ad$", opt$input, ignore.case = TRUE) + is_rds <- grepl("\\.[rR][dD][sS]$", opt$input) - # Convert SCE to AnnData - ad <- convert_to_anndata(sce, opt$assay, useAltExp = !isTRUE(opt$`disable-recursive-altExp`)) + if (!is_h5ad && !is_rds) { + stop("Input file must end in .rds or .h5ad. Got: ", opt$input, call. = FALSE) + } - # Save AnnData object - timestamped_cat("Saving the AnnData object to:", opt$output, "\n") - write_h5ad(ad, opt$output) - timestamped_cat("Conversion complete:", opt$output, "\n") + if (is_rds) { + if (is.null(opt$output)) { + opt$output <- sub("\\.[rR][dD][sS]$", ".h5ad", opt$input, ignore.case = TRUE) + } + timestamped_cat("Loading data from:", opt$input, "\n") + data <- readRDS(opt$input) + sce <- convert_seurat_to_sce(data) + timestamped_cat("Data loaded and converted successfully if needed.\n") + ad <- convert_to_anndata(sce, opt$assay, useAltExp = !isTRUE(opt$`disable-recursive-altExp`)) + timestamped_cat("Saving the AnnData object to:", opt$output, "\n") + write_h5ad(ad, opt$output) + timestamped_cat("Conversion complete:", opt$output, "\n") + } else { + if (is.null(opt$output)) { + opt$output <- sub("\\.h5ad$", ".rds", opt$input, ignore.case = TRUE) + } + target <- tolower(opt$target) + if (!target %in% c("seurat", "sce")) { + stop("--target must be 'seurat' or 'sce'. Got: ", opt$target, call. = FALSE) + } + timestamped_cat("Loading AnnData from:", opt$input, "\n") + adata <- anndata::read_h5ad(opt$input) + if (target == "seurat") { + out <- convert_anndata_to_seurat(adata, counts_layer = opt$assay) + } else { + out <- convert_anndata_to_sce(adata, counts_layer = opt$assay) + } + timestamped_cat("Saving", target, "object to:", opt$output, "\n") + saveRDS(out, opt$output) + timestamped_cat("Conversion complete:", opt$output, "\n") + } } diff --git a/R/convert_anndata_to_sce.R b/R/convert_anndata_to_sce.R new file mode 100644 index 0000000..10334dd --- /dev/null +++ b/R/convert_anndata_to_sce.R @@ -0,0 +1,117 @@ +#' Convert an AnnData object (or a .h5ad file) to a SingleCellExperiment +#' +#' Reverse direction of `convert_to_anndata()` for the SCE branch. +#' `adata$X` (or `adata$layers[[counts_layer]]` if present) becomes the +#' `counts` assay, additional layers become other assays, `adata$obs` is +#' written to `colData`, `adata$var` to `rowData`, and `adata$obsm` entries +#' to `reducedDims`. +#' +#' @param adata An AnnData object, OR a character path to an `.h5ad` file +#' (in which case it is read with `anndata::read_h5ad()`). +#' @param counts_layer Name(s) of the layer in `adata$layers` to use as the +#' `counts` assay. May be a single string or a character vector of +#' candidates; the first one present in `adata$layers` is used. If none +#' are present, `adata$X` is used as `counts`. Defaults to +#' `c("counts", "raw_counts", "raw_count")`. +#' @param conda_env Optional conda environment to activate before reading +#' the file. Equivalent to calling `setup_anndata_python(conda_env)` +#' first. Only relevant when `adata` is a path. +#' @return A SingleCellExperiment object. +#' @examples +#' \dontrun{ +#' # Path entrypoint: +#' sce <- convert_anndata_to_sce("pbmc.h5ad") +#' # Object entrypoint: +#' adata <- anndata::read_h5ad("pbmc.h5ad") +#' sce <- convert_anndata_to_sce(adata) +#' } +#' @importFrom SingleCellExperiment SingleCellExperiment reducedDims<- +#' @importFrom SummarizedExperiment colData<- rowData<- +#' @importFrom Matrix t +#' @importFrom methods as +#' @export +convert_anndata_to_sce <- function(adata, + counts_layer = c("counts", "raw_counts", "raw_count"), + conda_env = NULL) { + if (is.null(adata)) stop("`adata` is NULL.") + + if (is.character(adata) && length(adata) == 1) { + source_path <- adata + if (!file.exists(source_path)) { + stop("Input file does not exist: ", source_path) + } + if (!is.null(conda_env)) setup_anndata_python(conda_env) + check_anndata_python() + timestamped_cat(sprintf("Reading AnnData from '%s'.\n", source_path)) + adata <- tryCatch( + anndata::read_h5ad(source_path), + error = function(e) stop("Failed to read AnnData h5ad file: ", conditionMessage(e)) + ) + } + + timestamped_cat("Summary of AnnData object:\n\n") + print(adata) + cat("\n") + + names <- anndata_names(adata) + obs_names <- names$obs_names + var_names <- names$var_names + + selected_layer <- pick_first_layer(adata, counts_layer) + has_counts_layer <- !is.null(selected_layer) + + counts_mat <- extract_anndata_X( + adata, + layer = selected_layer, + obs_names = obs_names, + var_names = var_names + ) + # SCE/Seurat orientation: features x cells; AnnData is cells x features. + counts_mat <- Matrix::t(counts_mat) + + assays_list <- list(counts = counts_mat) + + if (has_counts_layer && !is.null(adata$X)) { + X_mat <- extract_anndata_X(adata, layer = NULL, obs_names = obs_names, var_names = var_names) + assays_list[["X"]] <- Matrix::t(X_mat) + } + + exclude_layers <- if (has_counts_layer) selected_layer else character(0) + other_layers <- extract_anndata_layers( + adata, + obs_names = obs_names, + var_names = var_names, + exclude = exclude_layers + ) + for (k in names(other_layers)) { + assays_list[[k]] <- Matrix::t(other_layers[[k]]) + } + + sce <- SingleCellExperiment::SingleCellExperiment(assays = assays_list) + + if (!is.null(adata$obs)) { + obs_df <- tryCatch(as.data.frame(adata$obs), error = function(e) NULL) + if (!is.null(obs_df) && nrow(obs_df) == ncol(sce)) { + if (!is.null(obs_names)) rownames(obs_df) <- obs_names + SummarizedExperiment::colData(sce) <- methods::as(obs_df, "DataFrame") + } + } + if (!is.null(adata$var)) { + var_df <- tryCatch(as.data.frame(adata$var), error = function(e) NULL) + if (!is.null(var_df) && nrow(var_df) == nrow(sce)) { + if (!is.null(var_names)) rownames(var_df) <- var_names + SummarizedExperiment::rowData(sce) <- methods::as(var_df, "DataFrame") + } + } + + obsm <- extract_anndata_obsm(adata, obs_names = obs_names) + if (length(obsm) > 0) { + # Strip leading "X_" so reducedDimNames are clean (e.g. "PCA", "UMAP"). + clean_names <- sub("^X_", "", names(obsm)) + names(obsm) <- toupper(clean_names) + SingleCellExperiment::reducedDims(sce) <- obsm + } + + timestamped_cat("Conversion to SingleCellExperiment complete.\n") + sce +} diff --git a/R/convert_anndata_to_seurat.R b/R/convert_anndata_to_seurat.R new file mode 100644 index 0000000..05286de --- /dev/null +++ b/R/convert_anndata_to_seurat.R @@ -0,0 +1,283 @@ +#' Convert an AnnData object (or a .h5ad file) to a Seurat object +#' +#' Reverse direction of `convert_to_anndata()`: takes an AnnData object +#' (typically loaded with `anndata::read_h5ad()`) or a path to an `.h5ad` +#' file and constructs a Seurat object. Counts go into the `counts` layer, +#' `adata$X` into the `data` layer, `adata$obs` into the meta.data, and +#' entries of `adata$obsm` are attached as `DimReduc` objects. +#' +#' @param adata An AnnData object, OR a character path to an `.h5ad` file +#' (in which case it is read with `anndata::read_h5ad()`). +#' @param counts_layer Name(s) of the layer in `adata$layers` to use as the +#' counts matrix. May be a single string or a character vector of +#' candidate names; the first one present in `adata$layers` is used. +#' If none are present, `adata$X` is used as counts and no separate +#' `data` layer is added. Defaults to +#' `c("counts", "raw_counts", "raw_count")`. +#' @param assay Name of the resulting Seurat assay. Defaults to `"RNA"`. +#' @param reduction_map Optional named list passed to +#' `attach_reductions_seurat()` to override or extend the default obsm-key +#' to Seurat-reduction mapping. See `default_reduction_map()`. +#' @param conda_env Optional conda environment to activate before reading +#' the file. Equivalent to calling `setup_anndata_python(conda_env)` +#' first. Only relevant when `adata` is a path. +#' @param orig.ident Optional value for the `orig.ident` column of the +#' resulting Seurat meta.data. If `NULL` (default), the value is taken +#' from `adata$uns$conversion_source` if present, otherwise from the file +#' basename when `adata` was passed as a path, otherwise `"AnnData"`. +#' @param use_raw How to handle `adata$raw` (a separate AnnData snapshot +#' often used by scanpy to retain raw counts after normalising `X`). +#' `"auto"` (default) uses `raw$X` as the counts layer when no +#' `counts_layer` candidate matched and `adata$raw` is set; `TRUE` +#' always uses `raw$X` as counts (and falls back to candidates / `X` +#' if it is missing); `FALSE` ignores raw entirely. +#' @param attach_obsp Logical. When `TRUE` (default), entries of +#' `adata$obsp` (cell x cell pairwise matrices, e.g. nearest-neighbor +#' graphs from `sc.pp.neighbors`) are attached as Seurat `Graphs`. +#' @return A Seurat object. +#' @details The returned object has the AnnData orientation transposed: Seurat +#' stores genes x cells while AnnData stores cells x genes. Cell and feature +#' names are taken from `adata$obs_names` and `adata$var_names`. +#' +#' `adata$obs` is attached as `meta.data`. `adata$var` is attached via +#' `seurat_obj[["RNA"]][[]] <- ...` when present and dimensionally +#' compatible. +#' @examples +#' \dontrun{ +#' # Path entrypoint: +#' seurat_obj <- convert_anndata_to_seurat("pbmc.h5ad") +#' +#' # Object entrypoint: +#' adata <- anndata::read_h5ad("pbmc.h5ad") +#' seurat_obj <- convert_anndata_to_seurat(adata) +#' +#' # Custom layer + extra reductions: +#' seurat_obj <- convert_anndata_to_seurat( +#' "pbmc.h5ad", +#' counts_layer = c("counts", "raw_counts"), +#' reduction_map = list(X_my_emb = list(name = "myemb", key = "MyEmb_")) +#' ) +#' } +#' @importFrom Seurat CreateSeuratObject SetAssayData +#' @importFrom Matrix t +#' @importFrom methods as +#' @export +convert_anndata_to_seurat <- function(adata, + counts_layer = c("counts", "raw_counts", "raw_count"), + assay = "RNA", + reduction_map = list(), + conda_env = NULL, + orig.ident = NULL, + use_raw = c("auto", "always", "never"), + attach_obsp = TRUE) { + if (is.logical(use_raw)) { + use_raw <- if (isTRUE(use_raw)) "always" else "never" + } + use_raw <- match.arg(use_raw) + if (is.null(adata)) stop("`adata` is NULL.") + + source_path <- NULL + if (is.character(adata) && length(adata) == 1) { + source_path <- adata + if (!file.exists(source_path)) { + stop("Input file does not exist: ", source_path) + } + if (!is.null(conda_env)) setup_anndata_python(conda_env) + # Fail fast with an actionable message if Python / anndata / numpy + # aren't wired up correctly -- otherwise read_h5ad surfaces a + # cryptic error from deep inside Python. + check_anndata_python() + timestamped_cat(sprintf("Reading AnnData from '%s'.\n", source_path)) + adata <- tryCatch( + anndata::read_h5ad(source_path), + error = function(e) stop("Failed to read AnnData h5ad file: ", conditionMessage(e)) + ) + } + + timestamped_cat("Summary of AnnData object:\n\n") + print(adata) + cat("\n") + + names <- anndata_names(adata) + obs_names <- names$obs_names + var_names <- names$var_names + + selected_layer <- pick_first_layer(adata, counts_layer) + has_counts_layer <- !is.null(selected_layer) + + raw_mat <- NULL + used_raw <- FALSE + if (use_raw %in% c("auto", "always")) { + use_when_no_layer <- !has_counts_layer && use_raw == "auto" + if (use_raw == "always" || use_when_no_layer) { + raw_mat <- extract_anndata_raw(adata, obs_names = obs_names) + # In "auto" mode, only use raw when it has the same gene set as X. + # The typical scanpy pattern that "auto" is meant to support is + # `adata.raw = adata` *before* normalisation, in which case raw and + # adata share genes and raw is just the un-normalised counts. When + # raw has a *different* (usually larger, unfiltered) gene set, the + # analysis-ready data is adata$X, so prefer that and ignore raw. + if (use_raw == "auto" && !is.null(raw_mat) && + ncol(raw_mat) != as.integer(adata$n_vars)) { + timestamped_cat(sprintf( + "adata$raw has %d genes vs adata$X's %d; raw is the unfiltered set. Using adata$X (use_raw='always' to override).\n", + ncol(raw_mat), as.integer(adata$n_vars) + )) + raw_mat <- NULL + } + } + } + + if (!is.null(raw_mat)) { + used_raw <- TRUE + counts_mat <- raw_mat + timestamped_cat(sprintf( + "Using adata$raw$X (%d x %d) as counts.\n", + nrow(raw_mat), ncol(raw_mat) + )) + } else { + counts_mat <- extract_anndata_X( + adata, + layer = selected_layer, + obs_names = obs_names, + var_names = var_names + ) + } + + meta.data <- NULL + if (!is.null(adata$obs)) { + meta.data <- tryCatch(as.data.frame(adata$obs), error = function(e) NULL) + if (!is.null(meta.data)) { + if (!"orig.ident" %in% colnames(meta.data)) { + meta.data$orig.ident <- resolve_orig_ident(orig.ident, adata, source_path) + } + if (!is.null(obs_names)) rownames(meta.data) <- obs_names + } + } + + seurat_obj <- Seurat::CreateSeuratObject( + counts = Matrix::t(counts_mat), + meta.data = meta.data, + assay = assay + ) + + # Add adata$X as the data layer when: + # - we used a counts layer for counts (X is the normalised companion), OR + # - we used raw and the main X has the same gene set as raw + # If we used raw with a *different* gene set (typical scanpy pattern: raw + # is unfiltered), attaching X would fail because the dims don't match the + # Seurat object's gene axis. We skip in that case and emit a notice. + if ((has_counts_layer || used_raw) && !is.null(adata$X)) { + data_mat <- extract_anndata_X(adata, layer = NULL, obs_names = obs_names, var_names = var_names) + data_mat <- Matrix::t(data_mat) + if (nrow(data_mat) == nrow(seurat_obj) && ncol(data_mat) == ncol(seurat_obj)) { + # Seurat sanitizes feature names (e.g. 'gene_01' -> 'gene-01') when the + # object is created, so the data matrix's rownames/colnames must be + # realigned to whatever the seurat object actually carries before + # SetAssayData -- positions are preserved, only labels were rewritten. + rownames(data_mat) <- rownames(seurat_obj) + colnames(data_mat) <- colnames(seurat_obj) + seurat_obj <- Seurat::SetAssayData( + object = seurat_obj, + assay = assay, + layer = "data", + new.data = data_mat + ) + timestamped_cat(sprintf("Added adata$X to assay '%s' as 'data' layer.\n", assay)) + } else if (used_raw) { + timestamped_cat(sprintf( + "adata$X (%d x %d) has a different shape than adata$raw$X (%d x %d); skipping data layer.\n", + nrow(data_mat), ncol(data_mat), nrow(seurat_obj), ncol(seurat_obj) + )) + } + } + + obsm <- extract_anndata_obsm(adata, obs_names = obs_names) + seurat_obj <- attach_reductions_seurat( + seurat_obj, obsm, assay = assay, reduction_map = reduction_map + ) + + if (isTRUE(attach_obsp)) { + seurat_obj <- attach_obsp_graphs(seurat_obj, adata, obs_names = obs_names, + assay = assay) + } + + # Choose the right `var` source. When we used raw, the active feature + # space is raw.var; otherwise it's adata.var. + var_source <- if (used_raw) { + tryCatch(adata$raw$var, error = function(e) NULL) + } else { + tryCatch(adata$var, error = function(e) NULL) + } + if (!is.null(var_source)) { + var_df <- tryCatch(as.data.frame(var_source), error = function(e) NULL) + if (!is.null(var_df) && nrow(var_df) == nrow(seurat_obj)) { + # Same realignment story as the data matrix: Seurat may have + # sanitized feature names, and feature meta has to match exactly. + rownames(var_df) <- rownames(seurat_obj) + tryCatch( + seurat_obj[[assay]][[]] <- var_df, + error = function(e) { + timestamped_cat("WARNING: could not attach var as feature meta:", conditionMessage(e), "\n") + } + ) + } + } + + timestamped_cat("Conversion to Seurat complete.\n") + seurat_obj +} + +#' @keywords internal +attach_obsp_graphs <- function(seurat_obj, adata, obs_names = NULL, assay = "RNA") { + obsp <- tryCatch(extract_anndata_obsp(adata, obs_names = obs_names), + error = function(e) list()) + if (length(obsp) == 0) return(seurat_obj) + + for (k in names(obsp)) { + g <- obsp[[k]] + if (nrow(g) != ncol(seurat_obj)) { + warning(sprintf( + "obsp['%s'] has %d cells but the Seurat object has %d; skipping.", + k, nrow(g), ncol(seurat_obj) + )) + next + } + rownames(g) <- colnames(seurat_obj) + colnames(g) <- colnames(seurat_obj) + graph_obj <- tryCatch( + SeuratObject::as.Graph(g), + error = function(e) NULL + ) + if (is.null(graph_obj)) { + warning(sprintf("Could not coerce obsp['%s'] to a Seurat Graph; skipping.", k)) + next + } + SeuratObject::DefaultAssay(graph_obj) <- assay + # Seurat convention is "_" for graphs, so stay consistent. + graph_name <- paste0(assay, "_", k) + seurat_obj[[graph_name]] <- graph_obj + timestamped_cat(sprintf("Attached obsp['%s'] as Seurat graph '%s'.\n", k, graph_name)) + } + seurat_obj +} + +pick_first_layer <- function(adata, candidates) { + if (is.null(candidates)) return(NULL) + candidates <- candidates[nzchar(candidates)] + if (length(candidates) == 0) return(NULL) + layer_keys <- anndata_mapping_keys(adata$layers) + hit <- candidates[candidates %in% layer_keys] + if (length(hit) == 0) return(NULL) + hit[[1]] +} + +resolve_orig_ident <- function(orig.ident, adata, source_path) { + if (!is.null(orig.ident)) return(orig.ident) + src <- tryCatch(adata$uns[["conversion_source"]], error = function(e) NULL) + if (!is.null(src) && length(src) == 1) return(as.character(src)) + if (!is.null(source_path)) { + return(basename(tools::file_path_sans_ext(source_path))) + } + "AnnData" +} diff --git a/R/convert_seurat_to_sce.R b/R/convert_seurat_to_sce.R index 4abb345..8475d19 100644 --- a/R/convert_seurat_to_sce.R +++ b/R/convert_seurat_to_sce.R @@ -44,18 +44,21 @@ convert_seurat_to_sce <- function(data) { timestamped_cat("Found assays:", paste(assay_names, collapse = ", "), "\n") # Determine reference features and cell names from the default assay. + # These must NOT be overwritten per-iteration -- the whole point of the + # mismatch checks below is to compare each non-default assay against + # the *default* one and route it to altExps when it disagrees. default_assay_name <- DefaultAssay(data) default_assay <- data@assays[[default_assay_name]] - + ref_cells <- colnames(default_assay) + ref_features <- rownames(default_assay) + # Collect metadata (colData; one row per cell). metadata_data <- data@meta.data - + # Process each assay. for (assay_name in assay_names) { assay_object <- data@assays[[assay_name]] timestamped_cat("Processing assay:", assay_name, "\n") - ref_cells <- colnames(assay_object) - ref_features <- rownames(assay_object) if (inherits(assay_object, "Assay5")) { # Handle multi-layered assays. diff --git a/R/convert_to_anndata.R b/R/convert_to_anndata.R index 12a156d..b33fae1 100644 --- a/R/convert_to_anndata.R +++ b/R/convert_to_anndata.R @@ -36,8 +36,9 @@ convert_to_anndata <- function(sce, assayName = "counts", useAltExp = TRUE) { # Process the main assay X <- process_main_assay(sce, assayName) - # Process other assays - layers <- process_other_assays(sce) + # Process other assays (exclude the one already attached as X to avoid + # duplicating it under layers). + layers <- process_other_assays(sce, assayName = assayName) # Process dimensional reductions obsm <- process_dimensional_reductions(sce) diff --git a/R/extract_anndata_X.R b/R/extract_anndata_X.R new file mode 100644 index 0000000..c2bb75c --- /dev/null +++ b/R/extract_anndata_X.R @@ -0,0 +1,50 @@ +#' Extract the primary count matrix from an AnnData object +#' +#' Selects a layer from `adata$layers` if `layer` is supplied and present; +#' otherwise falls back to `adata$X`. The returned matrix is in +#' AnnData orientation (cells x features) and converted to CsparseMatrix. +#' Row and column names are set from `obs_names` / `var_names` when provided. +#' +#' @param adata An AnnData object. +#' @param layer Optional name of a layer in `adata$layers` to use as the +#' primary matrix. If `NULL` or missing, `adata$X` is used. +#' @param obs_names Character vector of cell names (rows). Optional. +#' @param var_names Character vector of feature names (columns). Optional. +#' @return A CsparseMatrix of shape `n_obs x n_vars` (cells x features). +#' @importFrom methods as +#' @export +extract_anndata_X <- function(adata, layer = NULL, obs_names = NULL, var_names = NULL) { + layer_keys <- anndata_mapping_keys(adata$layers) + + if (!is.null(layer) && nzchar(layer) && layer %in% layer_keys) { + timestamped_cat(sprintf("Using layer '%s' as primary matrix.\n", layer)) + mat <- adata$layers[[layer]] + } else { + if (!is.null(layer) && nzchar(layer)) { + timestamped_cat(sprintf( + "Layer '%s' not found in adata$layers; falling back to adata$X.\n", + layer + )) + } else { + timestamped_cat("Using adata$X as primary matrix.\n") + } + mat <- adata$X + } + + if (is.null(mat)) { + stop("Could not obtain a primary matrix from the AnnData object.") + } + + mat <- ensure_csparse_matrix(mat) + if (!methods::is(mat, "dgCMatrix")) { + mat <- methods::as(methods::as(mat, "CsparseMatrix"), "generalMatrix") + } + + if (!is.null(obs_names) && length(obs_names) == nrow(mat)) { + rownames(mat) <- obs_names + } + if (!is.null(var_names) && length(var_names) == ncol(mat)) { + colnames(mat) <- var_names + } + mat +} diff --git a/R/extract_anndata_layers.R b/R/extract_anndata_layers.R new file mode 100644 index 0000000..963d777 --- /dev/null +++ b/R/extract_anndata_layers.R @@ -0,0 +1,44 @@ +#' Extract layers from an AnnData object as a named list of matrices +#' +#' Each layer is converted to CsparseMatrix and given dimnames from +#' `obs_names` / `var_names`. The orientation is preserved as +#' AnnData-native (cells x features); callers that need genes x cells +#' (e.g. Seurat) should transpose. +#' +#' @param adata An AnnData object. +#' @param obs_names Character vector of cell names. +#' @param var_names Character vector of feature names. +#' @param exclude Character vector of layer names to skip (e.g. one already +#' used as the primary `X`). +#' @return A named list of CsparseMatrix objects (cells x features). Empty +#' list if the AnnData has no layers. +#' @importFrom methods as +#' @export +extract_anndata_layers <- function(adata, obs_names = NULL, var_names = NULL, exclude = character(0)) { + layer_keys <- anndata_mapping_keys(adata$layers) + layer_keys <- setdiff(layer_keys, exclude) + if (length(layer_keys) == 0) { + return(list()) + } + + out <- list() + for (k in layer_keys) { + mat <- tryCatch(adata$layers[[k]], error = function(e) NULL) + if (is.null(mat)) { + timestamped_cat(sprintf("WARNING: could not read layer '%s'; skipping.\n", k)) + next + } + mat <- ensure_csparse_matrix(mat) + if (!methods::is(mat, "dgCMatrix")) { + mat <- methods::as(methods::as(mat, "CsparseMatrix"), "generalMatrix") + } + if (!is.null(obs_names) && length(obs_names) == nrow(mat)) { + rownames(mat) <- obs_names + } + if (!is.null(var_names) && length(var_names) == ncol(mat)) { + colnames(mat) <- var_names + } + out[[k]] <- mat + } + out +} diff --git a/R/extract_anndata_obsm.R b/R/extract_anndata_obsm.R new file mode 100644 index 0000000..9dd42b5 --- /dev/null +++ b/R/extract_anndata_obsm.R @@ -0,0 +1,40 @@ +#' Extract obsm embeddings from an AnnData object +#' +#' Returns a named list of numeric matrices, one per `adata$obsm` key. +#' Each matrix has rows in `obs_names` order. Non-numeric or +#' dimension-mismatched entries are skipped with a warning. +#' +#' @param adata An AnnData object. +#' @param obs_names Character vector of cell names; used both for sanity-checking +#' row count and for setting rownames. +#' @return A named list of numeric matrices (cells x dim). +#' @export +extract_anndata_obsm <- function(adata, obs_names = NULL) { + obsm_keys <- anndata_mapping_keys(adata$obsm) + if (length(obsm_keys) == 0) { + return(list()) + } + + out <- list() + for (k in obsm_keys) { + mat <- tryCatch(as.matrix(adata$obsm[[k]]), error = function(e) NULL) + if (is.null(mat)) { + warning(sprintf("Could not convert obsm['%s'] to matrix; skipping.", k)) + next + } + if (!is.numeric(mat)) { + warning(sprintf("obsm['%s'] is not numeric (type: %s); skipping.", k, typeof(mat))) + next + } + if (!is.null(obs_names) && nrow(mat) != length(obs_names)) { + warning(sprintf( + "obsm['%s'] has %d rows but expected %d cells; skipping.", + k, nrow(mat), length(obs_names) + )) + next + } + if (!is.null(obs_names)) rownames(mat) <- obs_names + out[[k]] <- mat + } + out +} diff --git a/R/extract_anndata_obsp.R b/R/extract_anndata_obsp.R new file mode 100644 index 0000000..1f260cf --- /dev/null +++ b/R/extract_anndata_obsp.R @@ -0,0 +1,55 @@ +#' Extract obsp (cell x cell) matrices from an AnnData object +#' +#' AnnData stores cell-pair matrices (e.g. nearest-neighbor graphs from +#' `sc.pp.neighbors`: `connectivities`, `distances`) in `adata.obsp`. +#' This helper returns them as a named list of sparse cell x cell +#' CsparseMatrix objects, ready to be attached as Seurat `Graphs` or +#' SCE `colPairs`. +#' +#' @param adata An AnnData object. +#' @param obs_names Character vector of cell names; used both for sanity- +#' checking shape and for setting dimnames. +#' @return A named list of sparse matrices (`n_obs x n_obs`). Empty +#' list if `adata$obsp` is empty / missing. +#' @export +extract_anndata_obsp <- function(adata, obs_names = NULL) { + obsp <- tryCatch(adata$obsp, error = function(e) NULL) + if (is.null(obsp)) return(list()) + + keys <- anndata_mapping_keys(obsp) + if (length(keys) == 0) return(list()) + + out <- list() + for (k in keys) { + mat <- tryCatch(obsp[[k]], error = function(e) NULL) + if (is.null(mat)) { + warning(sprintf("Could not read obsp['%s']; skipping.", k)) + next + } + mat <- tryCatch(ensure_csparse_matrix(mat), error = function(e) NULL) + if (is.null(mat)) { + warning(sprintf("Could not coerce obsp['%s'] to CsparseMatrix; skipping.", k)) + next + } + if (nrow(mat) != ncol(mat)) { + warning(sprintf( + "obsp['%s'] is not square (%d x %d); skipping.", + k, nrow(mat), ncol(mat) + )) + next + } + if (!is.null(obs_names) && nrow(mat) != length(obs_names)) { + warning(sprintf( + "obsp['%s'] has %d rows but expected %d cells; skipping.", + k, nrow(mat), length(obs_names) + )) + next + } + if (!is.null(obs_names)) { + rownames(mat) <- obs_names + colnames(mat) <- obs_names + } + out[[k]] <- mat + } + out +} diff --git a/R/extract_anndata_raw.R b/R/extract_anndata_raw.R new file mode 100644 index 0000000..a537b92 --- /dev/null +++ b/R/extract_anndata_raw.R @@ -0,0 +1,41 @@ +#' Extract `adata.raw` as a CsparseMatrix +#' +#' `adata.raw` (a snapshot of an AnnData often used by scanpy to retain +#' raw counts after normalisation) is a separate AnnData-like object +#' with its own `.X`, `.var`, and `.var_names`. It may have a different +#' number of features from the main AnnData object. +#' +#' This helper returns the raw matrix in AnnData orientation (cells x +#' features), with cell rownames from `obs_names` (the active AnnData) +#' and feature column names from `adata$raw$var_names`. Returns `NULL` +#' if `adata$raw` is unset. +#' +#' @param adata An AnnData object. +#' @param obs_names Character vector of cell names. Optional. +#' @return A CsparseMatrix of shape `n_obs x n_raw_vars`, or `NULL`. +#' @export +extract_anndata_raw <- function(adata, obs_names = NULL) { + raw <- tryCatch(adata$raw, error = function(e) NULL) + if (is.null(raw)) return(NULL) + + raw_X <- tryCatch(raw$X, error = function(e) NULL) + if (is.null(raw_X)) return(NULL) + + mat <- ensure_csparse_matrix(raw_X) + if (!methods::is(mat, "dgCMatrix")) { + mat <- methods::as(methods::as(mat, "CsparseMatrix"), "generalMatrix") + } + + raw_var_names <- tryCatch({ + nm <- raw$var_names + if (is.character(nm)) nm else as.character(reticulate::import_builtins()$list(nm)) + }, error = function(e) NULL) + + if (!is.null(obs_names) && length(obs_names) == nrow(mat)) { + rownames(mat) <- obs_names + } + if (!is.null(raw_var_names) && length(raw_var_names) == ncol(mat)) { + colnames(mat) <- raw_var_names + } + mat +} diff --git a/R/extract_counts_matrix.R b/R/extract_counts_matrix.R index c4c387d..47fb031 100644 --- a/R/extract_counts_matrix.R +++ b/R/extract_counts_matrix.R @@ -17,7 +17,9 @@ extract_counts_matrix <- function(assay_object) { GetAssayData(assay_object) } } else if (inherits(assay_object, "Assay")) { - GetAssayData(assay_object, slot = "counts") + # SeuratObject >= 5.0.0: `slot` is defunct; use `layer`. Older + # versions accepted both. `layer` works on both. + GetAssayData(assay_object, layer = "counts") } else { stop("Unsupported assay class: ", class(assay_object)) } diff --git a/R/identify_alt_exps_seurat.R b/R/identify_alt_exps_seurat.R index 1221347..3d87c19 100644 --- a/R/identify_alt_exps_seurat.R +++ b/R/identify_alt_exps_seurat.R @@ -18,7 +18,7 @@ identify_alt_exps_seurat <- function(data) { counts <- GetAssayData(assay, layer = "counts") has_data <- (nrow(counts) > 0 && ncol(counts) > 0) || length(assay@layers) > 0 } else if (inherits(assay, "Assay")) { - counts <- GetAssayData(assay, slot = "counts") + counts <- GetAssayData(assay, layer = "counts") has_data <- nrow(counts) > 0 && ncol(counts) > 0 } else { # Unsupported assay type diff --git a/R/setup_anndata_python.R b/R/setup_anndata_python.R new file mode 100644 index 0000000..dd311ae --- /dev/null +++ b/R/setup_anndata_python.R @@ -0,0 +1,120 @@ +#' Configure the Python interpreter used by reticulate for AnnData I/O +#' +#' Call this once at the start of an R session to point reticulate at a +#' Python interpreter that has the `anndata` module installed. The +#' resolution order is: +#' +#' 1. `conda_env` (if supplied) — first tried as a conda environment name, +#' then as a path to an environment prefix, then as a path under +#' `~/micromamba/envs/` or `~/miniconda3/envs/`. +#' 2. `RETICULATE_PYTHON` environment variable — used directly as a Python +#' binary path if it exists. +#' 3. `CONDA_PREFIX` environment variable — treated as the active +#' conda/micromamba environment prefix. +#' 4. No-op (reticulate falls back to its own discovery). +#' +#' Failures emit a warning and continue rather than throwing, so users +#' who already have Python wired up don't have to special-case this call. +#' +#' @param conda_env Optional. A conda environment name (resolved against +#' common locations: micromamba, miniconda3, conda) or an absolute path +#' to an environment prefix. +#' @param required Logical; passed to the underlying reticulate calls. +#' Defaults to `FALSE` so that misconfiguration emits a warning rather +#' than aborting. +#' @param validate Logical. After setting the interpreter preference, run +#' `check_anndata_python(action = "warn")` to confirm the env actually +#' imports `anndata`. Defaults to `TRUE`. With `required = FALSE` +#' reticulate's own validation is a no-op, so this is the only +#' protection against silent typos in `conda_env`. +#' @return Invisibly returns a single-element character vector describing +#' which path was used: one of `"conda_env"`, `"conda_env_resolved"`, +#' `"reticulate_python"`, `"conda_prefix"`, or `"none"`. +#' @examples +#' \dontrun{ +#' setup_anndata_python("my-anndata-env") +#' adata <- anndata::read_h5ad("pbmc.h5ad") +#' } +#' @export +setup_anndata_python <- function(conda_env = NULL, required = FALSE, + validate = TRUE) { + source_label <- "none" + + if (!is.null(conda_env) && nzchar(conda_env)) { + source_label <- try_use_conda_env(conda_env, required = required) + } + + if (source_label == "none") { + python_path <- Sys.getenv("RETICULATE_PYTHON", unset = NA) + if (!is.na(python_path) && nzchar(python_path) && file.exists(python_path)) { + ok <- FALSE + tryCatch({ + reticulate::use_python(python_path, required = required) + ok <- TRUE + }, error = function(e) { + warning(sprintf( + "Could not use Python at '%s': %s", + python_path, conditionMessage(e) + )) + }) + if (ok) source_label <- "reticulate_python" + } + } + + if (source_label == "none") { + conda_prefix <- Sys.getenv("CONDA_PREFIX", unset = NA) + if (!is.na(conda_prefix) && nzchar(conda_prefix)) { + ok <- FALSE + tryCatch({ + reticulate::use_condaenv(conda_prefix, required = required) + ok <- TRUE + }, error = function(e) { + warning(sprintf( + "Could not activate CONDA_PREFIX '%s': %s", + conda_prefix, conditionMessage(e) + )) + }) + if (ok) source_label <- "conda_prefix" + } + } + + if (isTRUE(validate) && source_label != "none") { + suppressWarnings(check_anndata_python(action = "warn")) + } + + invisible(source_label) +} + +#' @keywords internal +try_use_conda_env <- function(conda_env, required = FALSE) { + attempt <- function(arg) { + tryCatch({ + reticulate::use_condaenv(arg, required = required) + TRUE + }, error = function(e) FALSE) + } + + # 1. Treat as a name first. + if (attempt(conda_env)) return("conda_env") + + # 2. Treat as a literal path. + if (file.exists(conda_env) && attempt(conda_env)) return("conda_env") + + # 3. Resolve against common env-store locations. + candidates <- c( + file.path(path.expand("~/micromamba/envs"), conda_env), + file.path(path.expand("~/miniconda3/envs"), conda_env), + file.path(path.expand("~/anaconda3/envs"), conda_env), + file.path(Sys.getenv("MAMBA_ROOT_PREFIX", unset = ""), "envs", conda_env) + ) + candidates <- candidates[nzchar(candidates) & dir.exists(candidates)] + for (cand in candidates) { + if (attempt(cand)) return("conda_env_resolved") + } + + warning(sprintf( + "Could not activate conda environment '%s' as a name, a literal path, or under standard env-store locations.", + conda_env + )) + "none" +} diff --git a/R/zzz.R b/R/zzz.R new file mode 100644 index 0000000..38e65e5 --- /dev/null +++ b/R/zzz.R @@ -0,0 +1,11 @@ +.onAttach <- function(libname, pkgname) { + # Don't fire diagnostics in non-interactive sessions (knitting, CI, batch + # scripts). The conversion entry points already self-check and emit a + # tailored error if anything is wrong. + if (!interactive()) return(invisible()) + msg <- paste0( + "convert2anndata uses reticulate to call Python anndata. ", + "Run convert2anndata::diagnose_anndata_python() if you hit env issues." + ) + packageStartupMessage(msg) +} diff --git a/Readme.md b/Readme.md index 3f0dcb6..e3a4914 100644 --- a/Readme.md +++ b/Readme.md @@ -2,7 +2,24 @@ [![Codecov test coverage](https://codecov.io/gh/settylab/convert2anndata/branch/main/graph/badge.svg)](https://codecov.io/gh/settylab/convert2anndata) -`convert2anndata` is an R package designed to seamlessly convert `SingleCellExperiment` and `Seurat` objects into the `AnnData` format, widely used in single-cell data analysis. The package supports the conversion of split layers (Seurat), assays, dimensional reductions, metadata, cell-to-cell pairing data (e.g., distances), and alternative experiments, ensuring a comprehensive transfer of information. If you encounter any issues or notice incomplete conversions, please feel free to report them on our [GitHub issue tracker](https://github.com/settylab/convert2anndata/issues) to help us continuously improve. +`convert2anndata` is an R package for **bidirectional conversion** between +`AnnData` (the canonical Python single-cell format) and either +`SingleCellExperiment` or `Seurat` objects. It handles split layers +(Seurat), assays, dimensional reductions (`obsm` ↔ `reducedDims` / +Seurat reductions), metadata (`obs`/`var` ↔ `colData`/`rowData`), layers, +and alternative experiments, aiming for a faithful roundtrip. If you +encounter any issues or notice incomplete conversions, please feel free +to report them on our +[GitHub issue tracker](https://github.com/settylab/convert2anndata/issues) +to help us continuously improve. + +### Direction reference + +| From → To | Function | +|---|---| +| Seurat / SCE → AnnData | `convert_to_anndata(sce, ...)` (Seurat first via `convert_seurat_to_sce()`) | +| AnnData → Seurat | `convert_anndata_to_seurat(adata, ...)` | +| AnnData → SCE | `convert_anndata_to_sce(adata, ...)` | ## Installation @@ -82,10 +99,20 @@ Now you can use the command line toole, explained under `Command Line Usage` bel ### Command Line Usage -You can use the `convert2anndata` package from the command line to convert `SingleCellExperiment` or `Seurat` objects stored in RDS files to `AnnData` format (H5AD files). Here is an example of how to use it: +The CLI dispatches on the input file extension. `.rds` input is treated +as a Seurat or SingleCellExperiment object and converted to `.h5ad`; +`.h5ad` input is read with `anndata::read_h5ad()` and converted to a +Seurat (default) or SCE object saved to `.rds`. ```sh -Rscript -e "convert2anndata::cli_convert()" -i /path/to/input_file.rds -o /path/to/output_file.h5ad +# RDS -> H5AD +Rscript -e "convert2anndata::cli_convert()" -i /path/to/input.rds -o /path/to/output.h5ad + +# H5AD -> RDS (Seurat by default) +Rscript -e "convert2anndata::cli_convert()" -i /path/to/input.h5ad -o /path/to/output.rds + +# H5AD -> RDS (SingleCellExperiment) +Rscript -e "convert2anndata::cli_convert()" -i /path/to/input.h5ad -t sce ``` If you set up an alias, as suggested in the `Installation` section, then you can also conviniently run @@ -96,17 +123,18 @@ c2a -i /path/to/input_file.rds -o /path/to/output_file.h5ad #### Command Line Options -- `-i`, `--input`: Path to the input RDS file containing the `SingleCellExperiment` or `Seurat` object. This option is required. -- `-o`, `--output`: Path to the output H5AD file. If not specified, the output path is derived by replacing the `.rds` extension of the input path with `.h5ad`. -- `-a`, `--assay`: The assay to use as the main matrix (`anndata.X`). Defaults to 'counts'. -- `-d`, `--disable-recursive-altExp`: Disable recursive recovery of `altExperiments` and discard them instead. -- `-h`, `--help`: Show a help massage and exit. +- `-i`, `--input`: Path to the input file (`.rds` or `.h5ad`). Required. +- `-o`, `--output`: Path to the output file. If not specified, the output path is derived by swapping the extension (`.rds` ↔ `.h5ad`). +- `-a`, `--assay`: For `.rds → .h5ad`, the assay to use as `anndata.X`. For `.h5ad → .rds`, the layer name to use as the counts assay. Defaults to `counts`. +- `-d`, `--disable-recursive-altExp`: Disable recursive recovery of `altExperiments` and discard them instead. Applies to `.rds → .h5ad`. +- `-t`, `--target`: For `.h5ad → .rds` only: target object type, `seurat` (default) or `sce`. +- `-h`, `--help`: Show a help message and exit. ### R Usage You can also use the `convert2anndata` package directly in R. Below are examples of how to convert `SingleCellExperiment` or `Seurat` objects to `AnnData` format within an R session. -#### Example +#### Seurat / SCE → AnnData ```r library(convert2anndata) @@ -125,9 +153,111 @@ ad <- convert_to_anndata(sce, assayName = "counts", useAltExp = TRUE) write_h5ad(ad, "/path/to/output_file.h5ad") ``` +#### AnnData → Seurat + +```r +library(convert2anndata) +library(anndata) + +# Read an .h5ad file as an AnnData R6 object +adata <- read_h5ad("/path/to/input.h5ad") + +# Convert to Seurat. counts_layer selects which layer becomes the counts +# assay; if the layer is missing, adata$X is used. +seurat_obj <- convert_anndata_to_seurat(adata, counts_layer = "counts") + +saveRDS(seurat_obj, "/path/to/output.rds") +``` + +#### AnnData → SingleCellExperiment + +```r +adata <- read_h5ad("/path/to/input.h5ad") +sce <- convert_anndata_to_sce(adata, counts_layer = "counts") +``` + Find the function documentation in the [reference manual](https://settylab.github.io/convert2anndata/reference/) or retrive the documentation through `?convert_to_anndata` for any of functions. +## Python environment + +`convert2anndata` is a thin R-side wrapper around the Python `anndata` +library, accessed through `reticulate`. **You must have a Python +interpreter that has `anndata` (and a compatible `numpy`) installed +before any conversion call will work.** The package does not bundle +Python. + +The simplest setup uses the bundled installer from the `anndata` R +package, which provisions a managed Python venv for you: + +```r +anndata::install_anndata() +``` + +If you already have a Python environment with `anndata` installed +(e.g. a conda/micromamba env, or a `uv` venv), point reticulate at it. +The package looks at, in order: + +1. an explicit `conda_env` argument (path entrypoints only), +2. `Sys.getenv("RETICULATE_PYTHON")`, +3. `Sys.getenv("CONDA_PREFIX")`. + +`setup_anndata_python()` is a small helper that runs the same +resolution chain and emits a warning instead of a hard error on +misconfiguration: + +```r +convert2anndata::setup_anndata_python("my-anndata-env") +# or: +Sys.setenv(RETICULATE_PYTHON = "/path/to/python") +``` + +Use `check_anndata_python()` at the start of a script to fail fast +with an actionable message if Python, the `anndata` module, or numpy +aren't reachable. The path entrypoints +(`convert_anndata_to_seurat("file.h5ad")` and +`convert_anndata_to_sce("file.h5ad")`) call this automatically. + +### Troubleshooting + +**`No Python interpreter available to reticulate.`** +Reticulate could not find any Python. Set `RETICULATE_PYTHON` or run +`anndata::install_anndata()`. + +**`Python module 'anndata' is not importable.`** +Reticulate found a Python, but `anndata` isn't installed in it. Either +install into the active env (`reticulate::py_install("anndata")`) or +point `RETICULATE_PYTHON` at a different one. + +**`ImportError: Error importing numpy: you should not try to import numpy from its source directory`** +Almost always caused by a polluted `PYTHONPATH`. Common on HPC where +loading an R module also exports site-packages paths for a *different* +Python version (e.g. `/app/software/SciPy-bundle/.../python3.11/...`). +The fix is to clear the variable before launching R: + +```sh +unset PYTHONPATH +Rscript your-script.R +``` + +Or from inside R, **before any reticulate import**: + +```r +Sys.unsetenv("PYTHONPATH") +``` + +(after which a fresh R session is safest). + +**`use_condaenv("missing-env")` silently "succeeds" then explodes +later.** With `required = FALSE` (the default), reticulate records +the preference without validating it. Either pass `required = TRUE` +or rely on `check_anndata_python()` to fail fast at the next call. + +**Differences between conda envs:** `anndata` 0.8 / 0.9 / 0.12 differ +slightly. The package is tested against 0.8.x. If you hit a method-not-found +error from inside conversion, check `reticulate::py_config()` and the +Python `anndata.__version__`. + ## License This project is licensed under the GPL-3 License - see the [LICENSE](LICENSE) file for details. diff --git a/_pkgdown.yml b/_pkgdown.yml index 0458869..bf91457 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -2,6 +2,7 @@ template: bootstrap: 5 url: https://settylab.github.io/convert2anndata + authors: - name: "Setty-Lab" href: http://setty-lab.org @@ -9,6 +10,74 @@ authors: home: title: "convert2anndata" description: > - An R package to convert SingeCellExperiment and Seurat objects into - anndata as comprehensively as possible. + An R package for bidirectional conversion between AnnData (.h5ad) and + either SingleCellExperiment or Seurat objects, aiming for a faithful + roundtrip of assays, layers, dimensional reductions, metadata, and + cell/feature graphs. path: README.md + +reference: + - title: "Bidirectional conversion" + desc: > + Main entry points. Converters accept either an in-memory object or + a path to an .h5ad / .rds file; cli_convert() is the command-line + dispatcher that picks a direction from the file extension. + contents: + - convert_anndata_to_seurat + - convert_anndata_to_sce + - convert_to_anndata + - convert_seurat_to_sce + - cli_convert + + - title: "Python environment" + desc: > + Helpers for wiring reticulate to a Python interpreter that has the + anndata module installed, and for diagnosing misconfiguration. + contents: + - setup_anndata_python + - check_anndata_python + - diagnose_anndata_python + + - title: "AnnData component extractors" + desc: > + Lower-level accessors that pull individual pieces out of an AnnData + object as R-native matrices / data frames. Useful when building a + custom conversion. + contents: + - extract_anndata_X + - extract_anndata_layers + - extract_anndata_obsm + - extract_anndata_obsp + - extract_anndata_raw + - anndata_names + + - title: "Attaching components to R objects" + contents: + - attach_reductions_seurat + - attach_alt_experiments_sce + - convert_graph_to_colPair + + - title: "Internal building blocks" + desc: > + Exported for advanced use and testing, but not part of the + recommended user-facing API. Most users only need the converters + in the first section. + contents: + - align_metadata_seurat + - convert_commands + - create_combined_assay + - default_reduction_map + - ensure_csparse_matrix + - extract_counts_matrix + - extract_data + - extract_pairs + - identify_alt_exps_seurat + - preserve_reductions_seurat + - process_alt_experiments + - process_dimensional_reductions + - process_layers + - process_main_assay + - process_metadata_and_pairwise + - process_other_assays + - timestamped_cat + - update_seurat_object diff --git a/man/anndata_mapping_keys.Rd b/man/anndata_mapping_keys.Rd new file mode 100644 index 0000000..6407ed9 --- /dev/null +++ b/man/anndata_mapping_keys.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/anndata_keys.R +\name{anndata_mapping_keys} +\alias{anndata_mapping_keys} +\title{Extract dict-like keys from an AnnData mapping (layers, obsm, uns, ...)} +\usage{ +anndata_mapping_keys(mapping) +} +\arguments{ +\item{mapping}{The mapping attribute (e.g. `adata$layers`).} +} +\value{ +A character vector of keys, or `character(0)` on failure. +} +\description{ +AnnData's mapping attributes (`layers`, `obsm`, `uns`, ...) expose a +`.keys()` method. Different combinations of `reticulate` and `anndata` +return that result in different shapes: +} +\details{ +- newer reticulate auto-converts the Python `KeysView` to an R character + vector (so `as.character(builtins$list(x$keys()))` runs `list()` on a + char-vec and splits each name into individual characters); +- older versions hand back a Python object that needs `builtins$list()` + to materialise. + +This helper handles both cases, plus the edge case where the mapping has +no `.keys()` method but `names()` works. +} +\keyword{internal} diff --git a/man/anndata_names.Rd b/man/anndata_names.Rd new file mode 100644 index 0000000..079adc2 --- /dev/null +++ b/man/anndata_names.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/anndata_names.R +\name{anndata_names} +\alias{anndata_names} +\title{Extract obs/var names from an AnnData object} +\usage{ +anndata_names(adata) +} +\arguments{ +\item{adata}{An AnnData object (e.g. as returned by `anndata::read_h5ad`).} +} +\value{ +A list with elements `obs_names` (cells) and `var_names` (features), + each a character vector or `NULL` if extraction failed. +} +\description{ +Cell and feature names on AnnData objects are Python pandas Indexes; +R's `rownames()`/`colnames()` do not return them reliably. This helper +coerces them to character vectors via Python's `list()` builtin, falling +back to `names()` if that path fails. +} diff --git a/man/attach_reductions_seurat.Rd b/man/attach_reductions_seurat.Rd new file mode 100644 index 0000000..f9a057f --- /dev/null +++ b/man/attach_reductions_seurat.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/attach_reductions_seurat.R +\name{attach_reductions_seurat} +\alias{attach_reductions_seurat} +\title{Attach obsm embeddings to a Seurat object as dimensional reductions} +\usage{ +attach_reductions_seurat( + seurat_obj, + obsm, + assay = "RNA", + reduction_map = list() +) +} +\arguments{ +\item{seurat_obj}{A Seurat object to attach reductions to.} + +\item{obsm}{Named list of numeric matrices (cells x dim), as returned by +`extract_anndata_obsm()`.} + +\item{assay}{Assay name to associate the reductions with. Defaults to "RNA".} + +\item{reduction_map}{Named list overriding or extending +`default_reduction_map()`. Each entry maps an obsm key (e.g. `"X_pca"`) +to a list with `name` and `key`. The supplied map is merged on top of +the defaults; pass an empty list to keep defaults, or pass an entry +with `name = NA` to disable a default.} +} +\value{ +The Seurat object with reductions attached. +} +\description{ +Uses an obsm-key -> (reduction-name, key-prefix) mapping. Keys not in the +mapping fall back to a derived name: leading `X_` stripped, lowercased, +with a sanitized key prefix. +} diff --git a/man/check_anndata_python.Rd b/man/check_anndata_python.Rd new file mode 100644 index 0000000..8c13e0a --- /dev/null +++ b/man/check_anndata_python.Rd @@ -0,0 +1,60 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/check_anndata_python.R +\name{check_anndata_python} +\alias{check_anndata_python} +\title{Check that reticulate has a working Python with the `anndata` module} +\usage{ +check_anndata_python(action = c("stop", "warn", "silent"), verbose = FALSE) +} +\arguments{ +\item{action}{What to do on failure: `"stop"` (default) raises an +error; `"warn"` emits a warning and returns `FALSE`; `"silent"` +returns `FALSE` quietly.} + +\item{verbose}{Logical. When `TRUE`, prints a full diagnostic block +on success showing Python path, version, anndata version, numpy +version, `RETICULATE_PYTHON` and `CONDA_PREFIX`. Useful as a +one-stop "what does my env actually look like" call.} +} +\value{ +`TRUE` (invisibly) when all checks pass, `FALSE` when any + check fails and `action != "stop"`. +} +\description{ +Reticulate-based packages typically fail late and cryptically when the +Python side isn't wired up correctly: the user gets a numpy import +error from deep inside Python, far from the call site, with no hint +as to what to fix. This helper runs a layered check at the start of +a conversion call and stops with a single message that says exactly +what is wrong and how to fix it. +} +\details{ +Checks performed, in order, with stop-at-first-failure semantics: + +1. **A Python interpreter is reachable.** If `reticulate::py_available()` + is `FALSE` even after initialisation, no Python could be located. +2. **`PYTHONPATH` does not point at a different Python's site-packages.** + A common HPC failure mode is that loading an R module exports + paths like `/app/.../Python-3.11/site-packages` while reticulate + has selected a 3.8 / 3.12 interpreter, producing the cryptic + "Error importing numpy: you should not try to import numpy from + its source directory" error. We detect this *before* it fires. +3. **The Python `anndata` module is importable.** +4. **`numpy` imports without complaint** (catches any remaining ABI + mismatches). +5. **An end-to-end smoke probe**: round-trip a 2x2 AnnData object + through reticulate. This is the only way to be sure the env is + actually usable for conversion work. + +Each failure produces a tailored error message including the Python +interpreter that was selected and a one-line suggested fix. +} +\examples{ +\dontrun{ +check_anndata_python() # stops with a helpful message +check_anndata_python(verbose = TRUE) # also prints env diagnostics +if (check_anndata_python(action = "silent")) { + adata <- anndata::read_h5ad(path) +} +} +} diff --git a/man/cli_convert.Rd b/man/cli_convert.Rd index d7731f1..0a2132c 100644 --- a/man/cli_convert.Rd +++ b/man/cli_convert.Rd @@ -7,7 +7,9 @@ cli_convert() } \description{ -This function serves as a command line interface for the convert2anndata package. -It parses command line arguments and calls the appropriate functions to convert -a SingleCellExperiment or Seurat object to an AnnData object. +Bidirectional CLI: dispatches on the input file extension. An `.rds` input +is treated as a Seurat or SingleCellExperiment object and converted to an +AnnData `.h5ad` file. An `.h5ad` input is read with `anndata::read_h5ad()` +and converted to a Seurat (default) or SingleCellExperiment object saved +to `.rds`. } diff --git a/man/convert_anndata_to_sce.Rd b/man/convert_anndata_to_sce.Rd new file mode 100644 index 0000000..315fe43 --- /dev/null +++ b/man/convert_anndata_to_sce.Rd @@ -0,0 +1,45 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/convert_anndata_to_sce.R +\name{convert_anndata_to_sce} +\alias{convert_anndata_to_sce} +\title{Convert an AnnData object (or a .h5ad file) to a SingleCellExperiment} +\usage{ +convert_anndata_to_sce( + adata, + counts_layer = c("counts", "raw_counts", "raw_count"), + conda_env = NULL +) +} +\arguments{ +\item{adata}{An AnnData object, OR a character path to an `.h5ad` file +(in which case it is read with `anndata::read_h5ad()`).} + +\item{counts_layer}{Name(s) of the layer in `adata$layers` to use as the +`counts` assay. May be a single string or a character vector of +candidates; the first one present in `adata$layers` is used. If none +are present, `adata$X` is used as `counts`. Defaults to +`c("counts", "raw_counts", "raw_count")`.} + +\item{conda_env}{Optional conda environment to activate before reading +the file. Equivalent to calling `setup_anndata_python(conda_env)` +first. Only relevant when `adata` is a path.} +} +\value{ +A SingleCellExperiment object. +} +\description{ +Reverse direction of `convert_to_anndata()` for the SCE branch. +`adata$X` (or `adata$layers[[counts_layer]]` if present) becomes the +`counts` assay, additional layers become other assays, `adata$obs` is +written to `colData`, `adata$var` to `rowData`, and `adata$obsm` entries +to `reducedDims`. +} +\examples{ +\dontrun{ +# Path entrypoint: +sce <- convert_anndata_to_sce("pbmc.h5ad") +# Object entrypoint: +adata <- anndata::read_h5ad("pbmc.h5ad") +sce <- convert_anndata_to_sce(adata) +} +} diff --git a/man/convert_anndata_to_seurat.Rd b/man/convert_anndata_to_seurat.Rd new file mode 100644 index 0000000..22903ac --- /dev/null +++ b/man/convert_anndata_to_seurat.Rd @@ -0,0 +1,90 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/convert_anndata_to_seurat.R +\name{convert_anndata_to_seurat} +\alias{convert_anndata_to_seurat} +\title{Convert an AnnData object (or a .h5ad file) to a Seurat object} +\usage{ +convert_anndata_to_seurat( + adata, + counts_layer = c("counts", "raw_counts", "raw_count"), + assay = "RNA", + reduction_map = list(), + conda_env = NULL, + orig.ident = NULL, + use_raw = c("auto", "always", "never"), + attach_obsp = TRUE +) +} +\arguments{ +\item{adata}{An AnnData object, OR a character path to an `.h5ad` file +(in which case it is read with `anndata::read_h5ad()`).} + +\item{counts_layer}{Name(s) of the layer in `adata$layers` to use as the +counts matrix. May be a single string or a character vector of +candidate names; the first one present in `adata$layers` is used. +If none are present, `adata$X` is used as counts and no separate +`data` layer is added. Defaults to +`c("counts", "raw_counts", "raw_count")`.} + +\item{assay}{Name of the resulting Seurat assay. Defaults to `"RNA"`.} + +\item{reduction_map}{Optional named list passed to +`attach_reductions_seurat()` to override or extend the default obsm-key +to Seurat-reduction mapping. See `default_reduction_map()`.} + +\item{conda_env}{Optional conda environment to activate before reading +the file. Equivalent to calling `setup_anndata_python(conda_env)` +first. Only relevant when `adata` is a path.} + +\item{orig.ident}{Optional value for the `orig.ident` column of the +resulting Seurat meta.data. If `NULL` (default), the value is taken +from `adata$uns$conversion_source` if present, otherwise from the file +basename when `adata` was passed as a path, otherwise `"AnnData"`.} + +\item{use_raw}{How to handle `adata$raw` (a separate AnnData snapshot +often used by scanpy to retain raw counts after normalising `X`). +`"auto"` (default) uses `raw$X` as the counts layer when no +`counts_layer` candidate matched and `adata$raw` is set; `TRUE` +always uses `raw$X` as counts (and falls back to candidates / `X` +if it is missing); `FALSE` ignores raw entirely.} + +\item{attach_obsp}{Logical. When `TRUE` (default), entries of +`adata$obsp` (cell x cell pairwise matrices, e.g. nearest-neighbor +graphs from `sc.pp.neighbors`) are attached as Seurat `Graphs`.} +} +\value{ +A Seurat object. +} +\description{ +Reverse direction of `convert_to_anndata()`: takes an AnnData object +(typically loaded with `anndata::read_h5ad()`) or a path to an `.h5ad` +file and constructs a Seurat object. Counts go into the `counts` layer, +`adata$X` into the `data` layer, `adata$obs` into the meta.data, and +entries of `adata$obsm` are attached as `DimReduc` objects. +} +\details{ +The returned object has the AnnData orientation transposed: Seurat + stores genes x cells while AnnData stores cells x genes. Cell and feature + names are taken from `adata$obs_names` and `adata$var_names`. + + `adata$obs` is attached as `meta.data`. `adata$var` is attached via + `seurat_obj[["RNA"]][[]] <- ...` when present and dimensionally + compatible. +} +\examples{ +\dontrun{ +# Path entrypoint: +seurat_obj <- convert_anndata_to_seurat("pbmc.h5ad") + +# Object entrypoint: +adata <- anndata::read_h5ad("pbmc.h5ad") +seurat_obj <- convert_anndata_to_seurat(adata) + +# Custom layer + extra reductions: +seurat_obj <- convert_anndata_to_seurat( + "pbmc.h5ad", + counts_layer = c("counts", "raw_counts"), + reduction_map = list(X_my_emb = list(name = "myemb", key = "MyEmb_")) +) +} +} diff --git a/man/default_reduction_map.Rd b/man/default_reduction_map.Rd new file mode 100644 index 0000000..3590075 --- /dev/null +++ b/man/default_reduction_map.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/attach_reductions_seurat.R +\name{default_reduction_map} +\alias{default_reduction_map} +\title{Default mapping from AnnData obsm keys to Seurat reduction (name, key) pairs} +\usage{ +default_reduction_map() +} +\value{ +A named list. Each name is an obsm key (e.g. `X_pca`); each + element is a list with `name` (Seurat reduction name, e.g. `"pca"`) + and `key` (column-name prefix, e.g. `"PC_"`). +} +\description{ +Provides the canonical Seurat naming for the obsm keys conventionally +produced by scanpy. Users who want extra mappings should pass a list with +the same shape into `attach_reductions_seurat()` (or its callers). +} diff --git a/man/diagnose_anndata_python.Rd b/man/diagnose_anndata_python.Rd new file mode 100644 index 0000000..7bb7480 --- /dev/null +++ b/man/diagnose_anndata_python.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/check_anndata_python.R +\name{diagnose_anndata_python} +\alias{diagnose_anndata_python} +\title{Print a diagnostic block for the currently-active Python env} +\usage{ +diagnose_anndata_python() +} +\value{ +Logical, invisibly. +} +\description{ +Convenience wrapper for `check_anndata_python(verbose = TRUE, +action = "silent")`. Always prints the diagnostic block; never +raises. Returns `TRUE` if the env is usable, `FALSE` otherwise. +} diff --git a/man/extract_anndata_X.Rd b/man/extract_anndata_X.Rd new file mode 100644 index 0000000..8d645da --- /dev/null +++ b/man/extract_anndata_X.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/extract_anndata_X.R +\name{extract_anndata_X} +\alias{extract_anndata_X} +\title{Extract the primary count matrix from an AnnData object} +\usage{ +extract_anndata_X(adata, layer = NULL, obs_names = NULL, var_names = NULL) +} +\arguments{ +\item{adata}{An AnnData object.} + +\item{layer}{Optional name of a layer in `adata$layers` to use as the +primary matrix. If `NULL` or missing, `adata$X` is used.} + +\item{obs_names}{Character vector of cell names (rows). Optional.} + +\item{var_names}{Character vector of feature names (columns). Optional.} +} +\value{ +A CsparseMatrix of shape `n_obs x n_vars` (cells x features). +} +\description{ +Selects a layer from `adata$layers` if `layer` is supplied and present; +otherwise falls back to `adata$X`. The returned matrix is in +AnnData orientation (cells x features) and converted to CsparseMatrix. +Row and column names are set from `obs_names` / `var_names` when provided. +} diff --git a/man/extract_anndata_layers.Rd b/man/extract_anndata_layers.Rd new file mode 100644 index 0000000..1527709 --- /dev/null +++ b/man/extract_anndata_layers.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/extract_anndata_layers.R +\name{extract_anndata_layers} +\alias{extract_anndata_layers} +\title{Extract layers from an AnnData object as a named list of matrices} +\usage{ +extract_anndata_layers( + adata, + obs_names = NULL, + var_names = NULL, + exclude = character(0) +) +} +\arguments{ +\item{adata}{An AnnData object.} + +\item{obs_names}{Character vector of cell names.} + +\item{var_names}{Character vector of feature names.} + +\item{exclude}{Character vector of layer names to skip (e.g. one already +used as the primary `X`).} +} +\value{ +A named list of CsparseMatrix objects (cells x features). Empty + list if the AnnData has no layers. +} +\description{ +Each layer is converted to CsparseMatrix and given dimnames from +`obs_names` / `var_names`. The orientation is preserved as +AnnData-native (cells x features); callers that need genes x cells +(e.g. Seurat) should transpose. +} diff --git a/man/extract_anndata_obsm.Rd b/man/extract_anndata_obsm.Rd new file mode 100644 index 0000000..c698709 --- /dev/null +++ b/man/extract_anndata_obsm.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/extract_anndata_obsm.R +\name{extract_anndata_obsm} +\alias{extract_anndata_obsm} +\title{Extract obsm embeddings from an AnnData object} +\usage{ +extract_anndata_obsm(adata, obs_names = NULL) +} +\arguments{ +\item{adata}{An AnnData object.} + +\item{obs_names}{Character vector of cell names; used both for sanity-checking +row count and for setting rownames.} +} +\value{ +A named list of numeric matrices (cells x dim). +} +\description{ +Returns a named list of numeric matrices, one per `adata$obsm` key. +Each matrix has rows in `obs_names` order. Non-numeric or +dimension-mismatched entries are skipped with a warning. +} diff --git a/man/extract_anndata_obsp.Rd b/man/extract_anndata_obsp.Rd new file mode 100644 index 0000000..65798b3 --- /dev/null +++ b/man/extract_anndata_obsp.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/extract_anndata_obsp.R +\name{extract_anndata_obsp} +\alias{extract_anndata_obsp} +\title{Extract obsp (cell x cell) matrices from an AnnData object} +\usage{ +extract_anndata_obsp(adata, obs_names = NULL) +} +\arguments{ +\item{adata}{An AnnData object.} + +\item{obs_names}{Character vector of cell names; used both for sanity- +checking shape and for setting dimnames.} +} +\value{ +A named list of sparse matrices (`n_obs x n_obs`). Empty + list if `adata$obsp` is empty / missing. +} +\description{ +AnnData stores cell-pair matrices (e.g. nearest-neighbor graphs from +`sc.pp.neighbors`: `connectivities`, `distances`) in `adata.obsp`. +This helper returns them as a named list of sparse cell x cell +CsparseMatrix objects, ready to be attached as Seurat `Graphs` or +SCE `colPairs`. +} diff --git a/man/extract_anndata_raw.Rd b/man/extract_anndata_raw.Rd new file mode 100644 index 0000000..ddaf56a --- /dev/null +++ b/man/extract_anndata_raw.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/extract_anndata_raw.R +\name{extract_anndata_raw} +\alias{extract_anndata_raw} +\title{Extract `adata.raw` as a CsparseMatrix} +\usage{ +extract_anndata_raw(adata, obs_names = NULL) +} +\arguments{ +\item{adata}{An AnnData object.} + +\item{obs_names}{Character vector of cell names. Optional.} +} +\value{ +A CsparseMatrix of shape `n_obs x n_raw_vars`, or `NULL`. +} +\description{ +`adata.raw` (a snapshot of an AnnData often used by scanpy to retain +raw counts after normalisation) is a separate AnnData-like object +with its own `.X`, `.var`, and `.var_names`. It may have a different +number of features from the main AnnData object. +} +\details{ +This helper returns the raw matrix in AnnData orientation (cells x +features), with cell rownames from `obs_names` (the active AnnData) +and feature column names from `adata$raw$var_names`. Returns `NULL` +if `adata$raw` is unset. +} diff --git a/man/setup_anndata_python.Rd b/man/setup_anndata_python.Rd new file mode 100644 index 0000000..e8cf0f1 --- /dev/null +++ b/man/setup_anndata_python.Rd @@ -0,0 +1,52 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/setup_anndata_python.R +\name{setup_anndata_python} +\alias{setup_anndata_python} +\title{Configure the Python interpreter used by reticulate for AnnData I/O} +\usage{ +setup_anndata_python(conda_env = NULL, required = FALSE, validate = TRUE) +} +\arguments{ +\item{conda_env}{Optional. A conda environment name (resolved against +common locations: micromamba, miniconda3, conda) or an absolute path +to an environment prefix.} + +\item{required}{Logical; passed to the underlying reticulate calls. +Defaults to `FALSE` so that misconfiguration emits a warning rather +than aborting.} + +\item{validate}{Logical. After setting the interpreter preference, run +`check_anndata_python(action = "warn")` to confirm the env actually +imports `anndata`. Defaults to `TRUE`. With `required = FALSE` +reticulate's own validation is a no-op, so this is the only +protection against silent typos in `conda_env`.} +} +\value{ +Invisibly returns a single-element character vector describing + which path was used: one of `"conda_env"`, `"conda_env_resolved"`, + `"reticulate_python"`, `"conda_prefix"`, or `"none"`. +} +\description{ +Call this once at the start of an R session to point reticulate at a +Python interpreter that has the `anndata` module installed. The +resolution order is: +} +\details{ +1. `conda_env` (if supplied) — first tried as a conda environment name, + then as a path to an environment prefix, then as a path under + `~/micromamba/envs/` or `~/miniconda3/envs/`. +2. `RETICULATE_PYTHON` environment variable — used directly as a Python + binary path if it exists. +3. `CONDA_PREFIX` environment variable — treated as the active + conda/micromamba environment prefix. +4. No-op (reticulate falls back to its own discovery). + +Failures emit a warning and continue rather than throwing, so users +who already have Python wired up don't have to special-case this call. +} +\examples{ +\dontrun{ +setup_anndata_python("my-anndata-env") +adata <- anndata::read_h5ad("pbmc.h5ad") +} +} diff --git a/tests/testthat/helper-roundtrip.R b/tests/testthat/helper-roundtrip.R new file mode 100644 index 0000000..ed9477b --- /dev/null +++ b/tests/testthat/helper-roundtrip.R @@ -0,0 +1,214 @@ +# Shared scaffolding for the complex round-trip test files +# (test-roundtrip_complex_{anndata,sce,seurat}.R). +# +# Reuses the same skip guard and seeded-RNG idiom already established across +# the suite (test-comprehensive_anndata_to_seurat.R, test-raw_and_obsp.R, +# test-realistic_scanpy_dataset.R), centralised here so the three round-trip +# files do not each rebuild the (large) complex fixtures. testthat sources +# helper-*.R before the test files, so these are visible everywhere. + +# ---- Guards / RNG (mirror the existing in-file definitions) ---------------- + +skip_if_no_anndata <- function() { + skip_if_not_installed("anndata") + skip_if_not_installed("reticulate") + if (!reticulate::py_available(initialize = TRUE)) { + skip("Python not available for reticulate.") + } + if (!reticulate::py_module_available("anndata")) { + skip("Python 'anndata' module not installed.") + } +} + +# Repeatable RNG without polluting the user's global stream. +with_seed <- function(seed, expr) { + if (exists(".Random.seed", envir = .GlobalEnv, inherits = FALSE)) { + old <- get(".Random.seed", envir = .GlobalEnv, inherits = FALSE) + on.exit(assign(".Random.seed", old, envir = .GlobalEnv)) + } else { + on.exit(suppressWarnings(rm(".Random.seed", envir = .GlobalEnv))) + } + set.seed(seed) + force(expr) +} + +# ---- Small utilities ------------------------------------------------------- + +# The conversion functions emit a lot of timestamped status via cat(); swallow +# stdout but let warnings/errors surface so the test log stays readable. +rt_quiet <- function(expr) { + utils::capture.output(res <- force(expr)) + res +} + +# Robust AnnData mapping keys (layers/obsm/obsp/varm/varp/uns) -> character. +rt_keys <- function(mapping) { + bi <- reticulate::import_builtins() + tryCatch(as.character(bi$list(mapping$keys())), + error = function(e) as.character(names(mapping))) +} + +# scanpy obsm keys carry an "X_" prefix; reducedDimNames do not. Normalise so +# X_pca / PCA / pca all compare equal. +rt_norm_key <- function(k) tolower(sub("^X_", "", k)) + +rt_max_abs_diff <- function(a, b) { + a <- as.matrix(a); b <- as.matrix(b) + if (!all(dim(a) == dim(b))) return(Inf) + max(abs(unname(a) - unname(b))) +} + +# Logical sparsity pattern (which entries are non-zero), for graph/obsp +# comparisons where edge *weights* may be dropped but connectivity must hold. +rt_pattern <- function(m) { + m <- as.matrix(m) + unname(m != 0) +} + +# Full AnnData round trip through an on-disk .h5ad (the realistic path). +rt_write_read <- function(adata_obj) { + tmp <- tempfile(fileext = ".h5ad") + on.exit(unlink(tmp), add = TRUE) + anndata::write_h5ad(adata_obj, tmp) + anndata::read_h5ad(tmp) +} + +# ---- Complex fixtures ------------------------------------------------------ + +# A COMPLEX AnnData exercising every component the converters touch: +# sparse OR dense X, two layers (counts/logcounts), two obsm reductions, +# varm gene loadings, two obsp graphs, a varp graph, raw, and obs/var with +# factor / NA / logical / character / numeric columns. +build_complex_anndata <- function(n_cells = 24L, n_genes = 12L, + sparse_X = TRUE, with_raw = TRUE, seed = 1L) { + with_seed(seed, { + np <- reticulate::import("numpy") + sp <- reticulate::import("scipy.sparse") + + X <- matrix(rpois(n_cells * n_genes, lambda = 3), n_cells, n_genes) + rownames(X) <- sprintf("cell-%02d", seq_len(n_cells)) + colnames(X) <- sprintf("gene-%02d", seq_len(n_genes)) + logX <- log1p(X) + + obs <- data.frame( + cell_type = factor(rep(c("A", "B", "C"), length.out = n_cells), + levels = c("A", "B", "C")), + n_counts = rowSums(X), + pct_mt = c(NA_real_, runif(n_cells - 1L, 0, 10)), + is_ctrl = rep(c(TRUE, FALSE), length.out = n_cells), + donor = sprintf("D%d", rep(1:2, length.out = n_cells)), + row.names = rownames(X), stringsAsFactors = FALSE + ) + var <- data.frame( + gene_class = factor(rep(c("pc", "lnc"), length.out = n_genes)), + highly_variable = rep(c(TRUE, FALSE), length.out = n_genes), + mean_expr = colMeans(X), + row.names = colnames(X), stringsAsFactors = FALSE + ) + + conn <- abs(Matrix::rsparsematrix(n_cells, n_cells, + density = 0.3, symmetric = TRUE)) + dimnames(conn) <- list(rownames(X), rownames(X)) + + X_in <- if (sparse_X) sp$csr_matrix(np$asarray(logX, dtype = "float32")) else logX + + ad <- anndata::AnnData( + X = X_in, + obs = obs, + var = var, + layers = list(counts = X, logcounts = logX), + obsm = list(X_pca = matrix(rnorm(n_cells * 5), n_cells, 5), + X_umap = matrix(rnorm(n_cells * 2), n_cells, 2)), + varm = list(PCs = matrix(rnorm(n_genes * 5), n_genes, 5)), + obsp = list(connectivities = conn, distances = conn * 2), + varp = list(corr = abs(Matrix::rsparsematrix(n_genes, n_genes, + density = 0.4, symmetric = TRUE))), + uns = list(conversion_source = "complex_fixture") + ) + if (with_raw) ad$raw <- anndata::AnnData(X = X) # same gene set raw counts + ad + }) +} + +# A COMPLEX SingleCellExperiment: three assays (sparse counts + dense +# logcounts + dense scaledata), two reducedDims, an altExp (multimodal ADT), +# a colPair kNN graph, colData (factor/NA/mixed) and rowData metadata. +build_complex_sce <- function(n_cells = 30L, n_genes = 15L, seed = 2L) { + with_seed(seed, { + cnt <- abs(Matrix::rsparsematrix(n_genes, n_cells, density = 0.5)) + dimnames(cnt) <- list(sprintf("g%02d", seq_len(n_genes)), + sprintf("c%02d", seq_len(n_cells))) + dense <- as.matrix(cnt) + logc <- log1p(dense) # dense + # A third, distinct dense assay. Deterministic and NaN-free (avoids the + # zero-variance-gene trap that scale() would hit on a sparse count matrix). + scaled <- (dense - rowMeans(dense)) / (apply(dense, 1, sd) + 1) + + cd <- S4Vectors::DataFrame( + cell_type = factor(rep(c("X", "Y"), length.out = n_cells)), + qc = c(NA_real_, runif(n_cells - 1L)), + batch = rep(c("b1", "b2", "b3"), length.out = n_cells), + row.names = colnames(cnt) + ) + rd <- S4Vectors::DataFrame( + symbol = sprintf("SYM%d", seq_len(n_genes)), + is_hvg = rep(c(TRUE, FALSE), length.out = n_genes), + row.names = rownames(cnt) + ) + pca <- matrix(rnorm(n_cells * 6), n_cells, 6, dimnames = list(colnames(cnt), NULL)) + umap <- matrix(rnorm(n_cells * 2), n_cells, 2, dimnames = list(colnames(cnt), NULL)) + + sce <- SingleCellExperiment::SingleCellExperiment( + assays = list(counts = cnt, logcounts = logc, scaledata = scaled), + colData = cd, rowData = rd, + reducedDims = list(PCA = pca, UMAP = umap) + ) + + adt <- matrix(abs(rnorm(4 * n_cells)), 4, n_cells, + dimnames = list(sprintf("ADT%d", 1:4), colnames(cnt))) + SingleCellExperiment::altExp(sce, "ADT") <- + SingleCellExperiment::SingleCellExperiment(assays = list(counts = adt)) + + knn <- BiocNeighbors::findKNN(pca, k = 5) + SingleCellExperiment::colPair(sce, "knn") <- S4Vectors::SelfHits( + from = rep(seq_len(n_cells), each = 5), + to = as.vector(t(knn$index)), + x = as.vector(t(knn$distance)), + nnode = n_cells + ) + S4Vectors::metadata(sce)$run_info <- "complex-sce" + sce + }) +} + +# A COMPLEX Seurat object: a processed RNA assay (normalised + scaled + PCA), +# a manually-attached UMAP reduction (avoids the uwot dependency / nondeterminism), +# nearest-neighbour graphs, a second ADT assay, and factor + NA cell metadata. +build_complex_seurat <- function(n_genes = 40L, n_cells = 60L, seed = 3L) { + with_seed(seed, { + cm <- matrix(rpois(n_genes * n_cells, lambda = 3), n_genes, n_cells, + dimnames = list(sprintf("g%02d", seq_len(n_genes)), + sprintf("c%02d", seq_len(n_cells)))) + seu <- Seurat::CreateSeuratObject(counts = cm) + seu <- Seurat::NormalizeData(seu, verbose = FALSE) + seu <- Seurat::FindVariableFeatures(seu, verbose = FALSE) + seu <- Seurat::ScaleData(seu, verbose = FALSE) + seu <- Seurat::RunPCA(seu, npcs = 10, verbose = FALSE) + seu <- Seurat::FindNeighbors(seu, dims = 1:10, verbose = FALSE) + + # Manual UMAP reduction from the PCA embeddings: deterministic, no uwot. + emb <- Seurat::Embeddings(seu, "pca")[, 1:2, drop = FALSE] + colnames(emb) <- c("UMAP_1", "UMAP_2") + seu[["umap"]] <- Seurat::CreateDimReducObject( + embeddings = emb, key = "UMAP_", assay = "RNA" + ) + + seu$cell_type <- factor(rep(c("T", "B", "NK"), length.out = n_cells)) + seu$qc_na <- c(NA_real_, runif(n_cells - 1L)) + + adt <- matrix(rpois(6 * n_cells, lambda = 5), 6, n_cells, + dimnames = list(sprintf("adt%d", 1:6), colnames(seu))) + seu[["ADT"]] <- Seurat::CreateAssayObject(counts = adt) + seu + }) +} diff --git a/tests/testthat/test-check_anndata_python.R b/tests/testthat/test-check_anndata_python.R new file mode 100644 index 0000000..a89c74a --- /dev/null +++ b/tests/testthat/test-check_anndata_python.R @@ -0,0 +1,142 @@ +library(testthat) +library(convert2anndata) + +test_that("check_anndata_python returns TRUE invisibly when env is fully working", { + skip_if_not_installed("reticulate") + if (!reticulate::py_module_available("anndata")) { + skip("anndata module not available; the success path can't be exercised.") + } + withr::with_envvar(c(PYTHONPATH = ""), { + expect_true(check_anndata_python()) + }) +}) + +test_that("check_anndata_python action='silent' returns FALSE when anndata is missing", { + local_mocked_bindings( + py_available = function(...) TRUE, + py_config = function(...) list(python = "/fake/py", version = "3.10"), + py_module_available = function(name) FALSE, + .package = "reticulate" + ) + withr::with_envvar(c(PYTHONPATH = ""), { + expect_false(check_anndata_python(action = "silent")) + }) +}) + +test_that("check_anndata_python action='warn' produces a friendly warning", { + local_mocked_bindings( + py_available = function(...) TRUE, + py_config = function(...) list(python = "/fake/py", version = "3.10"), + py_module_available = function(name) FALSE, + .package = "reticulate" + ) + withr::with_envvar(c(PYTHONPATH = ""), { + expect_warning( + res <- check_anndata_python(action = "warn"), + regexp = "anndata.*not importable" + ) + expect_false(res) + }) +}) + +test_that("check_anndata_python action='stop' raises with an actionable message", { + local_mocked_bindings( + py_available = function(...) FALSE, + py_config = function(...) list(python = "/fake/py", version = "3.10"), + .package = "reticulate" + ) + expect_error( + check_anndata_python(), + regexp = "No Python interpreter available" + ) +}) + +test_that("PYTHONPATH pointing at a different Python version is detected and reported", { + local_mocked_bindings( + py_available = function(...) TRUE, + py_config = function(...) list(python = "/fake/py3.8/bin/python", + version = "3.8.17"), + .package = "reticulate" + ) + withr::with_envvar(c(PYTHONPATH = "/app/Python/3.11/site-packages"), { + expect_error( + check_anndata_python(), + regexp = "PYTHONPATH appears to point at a different Python" + ) + }) +}) + +test_that("PYTHONPATH that matches the active Python version is allowed", { + local_mocked_bindings( + py_available = function(...) TRUE, + py_config = function(...) list(python = "/fake/py3.10/bin/python", + version = "3.10.12"), + py_module_available = function(name) TRUE, + py_run_string = function(...) invisible(NULL), + .package = "reticulate" + ) + withr::with_envvar(c(PYTHONPATH = "/some/path/python3.10/site-packages"), { + expect_true(check_anndata_python()) + }) +}) + +test_that("numpy import error gets a tailored hint", { + fake_run <- function(code, ...) { + if (grepl("import numpy", code, fixed = TRUE)) { + stop("ImportError: Error importing numpy: you should not try to import numpy from its source directory") + } + invisible(NULL) + } + local_mocked_bindings( + py_available = function(...) TRUE, + py_config = function(...) list(python = "/fake/py", version = "3.10"), + py_module_available = function(name) TRUE, + py_run_string = fake_run, + .package = "reticulate" + ) + withr::with_envvar(c(PYTHONPATH = ""), { + expect_error( + check_anndata_python(), + regexp = "PYTHONPATH is likely polluted" + ) + }) +}) + +test_that("anndata smoke-probe failure surfaces a tailored message", { + fake_run <- function(code, ...) { + if (grepl("anndata.AnnData", code, fixed = TRUE)) { + stop("ValueError: numpy.dtype size changed (ABI break)") + } + invisible(NULL) + } + local_mocked_bindings( + py_available = function(...) TRUE, + py_config = function(...) list(python = "/fake/py", version = "3.10"), + py_module_available = function(name) TRUE, + py_run_string = fake_run, + .package = "reticulate" + ) + withr::with_envvar(c(PYTHONPATH = ""), { + expect_error( + check_anndata_python(), + regexp = "smoke probe failed" + ) + }) +}) + +test_that("diagnose_anndata_python prints a diagnostic block and returns logical", { + out <- capture.output(res <- diagnose_anndata_python()) + expect_true(any(grepl("convert2anndata Python environment", out))) + expect_true(any(grepl("python:", out))) + expect_true(any(grepl("RETICULATE_PYTHON", out))) + expect_type(res, "logical") +}) + +test_that("verbose=TRUE on success prints the env diagnostic block", { + skip_if_not_installed("reticulate") + if (!reticulate::py_module_available("anndata")) skip("anndata not available") + withr::with_envvar(c(PYTHONPATH = ""), { + out <- capture.output(check_anndata_python(verbose = TRUE)) + expect_true(any(grepl("convert2anndata Python environment", out))) + }) +}) diff --git a/tests/testthat/test-cli_convert.R b/tests/testthat/test-cli_convert.R index 589b5dd..46820ca 100644 --- a/tests/testthat/test-cli_convert.R +++ b/tests/testthat/test-cli_convert.R @@ -1,6 +1,32 @@ library(testthat) library(withr) +# These tests spawn subprocess Rscripts that load `library(convert2anndata)`. +# That requires the package to be *installed* in a libpath the subprocess +# can see. Under `devtools::test()` (where the package is only loaded via +# load_all), the subprocess won't find it and every test errors. Skip the +# whole file in that situation rather than emitting noise. +package_installed <- nzchar(suppressWarnings( + system2( + file.path(R.home(), "bin", "Rscript"), + args = c("-e", "cat(requireNamespace('convert2anndata', quietly=TRUE))"), + stdout = TRUE, stderr = FALSE + )[1] +)) && identical( + suppressWarnings(system2( + file.path(R.home(), "bin", "Rscript"), + args = c("-e", "cat(requireNamespace('convert2anndata', quietly=TRUE))"), + stdout = TRUE, stderr = FALSE + )[1]), + "TRUE" +) +if (!package_installed) { + skip_file <- function() invisible(NULL) + test_that("cli_convert tests skipped (package not installed in subprocess libpath)", { + skip("convert2anndata is not installed in a libpath visible to subprocess Rscripts; run after R CMD INSTALL.") + }) +} else { + # Function to run shell commands and capture output run_shell_command <- function(cmd) { cat("Running: ", cmd, "\n") @@ -45,3 +71,5 @@ test_that("cli_convert stops without input", { cat("Result:\n", paste(result, collapse = "\n"), "\n") expect_true(any(grepl("Test passed: No input", result))) }) + +} # end if (package_installed) diff --git a/tests/testthat/test-comprehensive_anndata_to_seurat.R b/tests/testthat/test-comprehensive_anndata_to_seurat.R new file mode 100644 index 0000000..c285f68 --- /dev/null +++ b/tests/testthat/test-comprehensive_anndata_to_seurat.R @@ -0,0 +1,638 @@ +library(testthat) +library(convert2anndata) +library(Matrix) +library(anndata) +library(Seurat) + +# Comprehensive coverage of AnnData -> Seurat conversion. +# +# Each test builds a small AnnData with one feature exercised in +# isolation, then asserts that the Seurat object preserves the +# semantics that a downstream analyst would expect. + +skip_if_no_anndata <- function() { + skip_if_not_installed("anndata") + skip_if_not_installed("reticulate") + if (!reticulate::py_available(initialize = TRUE)) { + skip("Python not available for reticulate.") + } + if (!reticulate::py_module_available("anndata")) { + skip("Python 'anndata' module not installed.") + } +} + +# Repeatable RNG without polluting the user's global stream. +with_seed <- function(seed, expr) { + old <- .Random.seed + on.exit(assign(".Random.seed", old, envir = .GlobalEnv)) + set.seed(seed) + force(expr) +} + +build_dense_X <- function(n_cells, n_genes, lambda = 2.5, seed = 0L) { + with_seed(seed, { + X <- matrix(rpois(n_cells * n_genes, lambda), + nrow = n_cells, ncol = n_genes) + }) + # Names use hyphens, not underscores: Seurat rewrites underscores in + # feature names to dashes (a documented Seurat policy), which would + # confound assertions on label preservation. Cells are not sanitized + # but we keep them consistent. + rownames(X) <- sprintf("cell-%03d", seq_len(n_cells)) + colnames(X) <- sprintf("gene-%03d", seq_len(n_genes)) + X +} + +# ---------------------------------------------------------------------- +# A. Matrix variants: dense, CSR, CSC, dtypes +# ---------------------------------------------------------------------- + +test_that("dense X is preserved through the conversion", { + skip_if_no_anndata() + X <- build_dense_X(15, 10) + ad <- AnnData(X = X) + s <- convert_anndata_to_seurat(ad, counts_layer = "counts") + counts <- as.matrix(GetAssayData(s, layer = "counts")) + expect_equal(unname(counts), unname(t(X))) + expect_equal(rownames(s), colnames(X)) + expect_equal(colnames(s), rownames(X)) +}) + +test_that("CSR sparse X is preserved", { + skip_if_no_anndata() + sp <- reticulate::import("scipy.sparse") + np <- reticulate::import("numpy") + X <- build_dense_X(20, 12) + X_py <- np$asarray(X, dtype = "float32") + ad <- AnnData(X = sp$csr_matrix(X_py)) + s <- convert_anndata_to_seurat(ad) + counts <- as.matrix(GetAssayData(s, layer = "counts")) + expect_equal(unname(counts), unname(t(X))) +}) + +test_that("CSC sparse X is preserved", { + skip_if_no_anndata() + sp <- reticulate::import("scipy.sparse") + np <- reticulate::import("numpy") + X <- build_dense_X(20, 12) + X_py <- np$asarray(X, dtype = "float32") + ad <- AnnData(X = sp$csc_matrix(X_py)) + s <- convert_anndata_to_seurat(ad) + counts <- as.matrix(GetAssayData(s, layer = "counts")) + expect_equal(unname(counts), unname(t(X))) +}) + +test_that("float vs int dtype both round-trip", { + skip_if_no_anndata() + np <- reticulate::import("numpy") + X <- build_dense_X(10, 6) + ad_float <- AnnData(X = np$asarray(X, dtype = "float64")) + ad_int <- AnnData(X = np$asarray(X, dtype = "int32")) + s_f <- convert_anndata_to_seurat(ad_float) + s_i <- convert_anndata_to_seurat(ad_int) + expect_equal( + unname(as.matrix(GetAssayData(s_f, layer = "counts"))), + unname(as.matrix(GetAssayData(s_i, layer = "counts"))) + ) +}) + +# ---------------------------------------------------------------------- +# B. Metadata fidelity (obs / var) +# ---------------------------------------------------------------------- + +test_that("obs preserves character, numeric, logical, and categorical columns", { + skip_if_no_anndata() + X <- build_dense_X(12, 8) + obs <- data.frame( + sample_id = paste0("S", seq_len(12)), + n_counts = rowSums(X), + is_ctrl = rep(c(TRUE, FALSE), 6), + cell_type = factor(rep(c("A", "B", "C"), 4), + levels = c("A", "B", "C")), + stringsAsFactors = FALSE, + row.names = rownames(X) + ) + ad <- AnnData(X = X, obs = obs) + s <- convert_anndata_to_seurat(ad) + md <- s@meta.data + + expect_setequal( + setdiff(colnames(md), c("orig.ident", "nCount_RNA", "nFeature_RNA")), + c("sample_id", "n_counts", "is_ctrl", "cell_type") + ) + expect_equal(as.character(md$sample_id), as.character(obs$sample_id)) + expect_equal(unname(md$n_counts), unname(obs$n_counts)) + expect_equal(as.logical(md$is_ctrl), obs$is_ctrl) + # cell_type round-trips through pandas Categorical; we only require + # that the levels and per-cell labels are preserved, not the storage class. + expect_equal(as.character(md$cell_type), as.character(obs$cell_type)) +}) + +test_that("obs with NA values in numeric and categorical survive", { + skip_if_no_anndata() + X <- build_dense_X(12, 8) + obs <- data.frame( + n_counts = c(NA_real_, rowSums(X)[-1]), + cell_type = factor(c(rep("A", 6), rep(NA, 6)), levels = c("A", "B")), + row.names = rownames(X), + stringsAsFactors = FALSE + ) + ad <- AnnData(X = X, obs = obs) + s <- convert_anndata_to_seurat(ad) + md <- s@meta.data + expect_true(is.na(md$n_counts[1])) + expect_true(any(is.na(md$cell_type))) +}) + +test_that("an explicit orig.ident column in obs is preserved verbatim", { + skip_if_no_anndata() + X <- build_dense_X(8, 5) + obs <- data.frame( + orig.ident = paste0("donor_", rep(1:2, each = 4)), + row.names = rownames(X), + stringsAsFactors = FALSE + ) + ad <- AnnData(X = X, obs = obs) + s <- convert_anndata_to_seurat(ad) + expect_equal(as.character(s@meta.data$orig.ident), obs$orig.ident) +}) + +test_that("var attaches to the assay's feature meta and aligns to var_names", { + skip_if_no_anndata() + X <- build_dense_X(10, 6) + var <- data.frame( + gene_class = c("protein_coding", "lncRNA", "lncRNA", "protein_coding", + "protein_coding", "miRNA"), + highly_variable = c(TRUE, FALSE, TRUE, FALSE, TRUE, FALSE), + row.names = colnames(X), + stringsAsFactors = FALSE + ) + ad <- AnnData(X = X, var = var) + s <- convert_anndata_to_seurat(ad) + fm <- s[["RNA"]][[]] + expect_true("gene_class" %in% colnames(fm)) + expect_true("highly_variable" %in% colnames(fm)) + expect_equal(rownames(fm), colnames(X)) + expect_equal(as.character(fm$gene_class), var$gene_class) +}) + +test_that("special characters in obs/var names survive", { + skip_if_no_anndata() + n_cells <- 6L; n_genes <- 4L + X <- matrix(seq_len(n_cells * n_genes) + 0, + nrow = n_cells, ncol = n_genes) + rownames(X) <- c("cell-1", "cell.2", "cell_3", "cell:4", "cell 5", "cell/6") + colnames(X) <- c("ENSG-1", "MT-CO1", "RP/L7", "lncRNA.1") + ad <- AnnData(X = X) + s <- convert_anndata_to_seurat(ad) + expect_equal(colnames(s), rownames(X)) + expect_equal(rownames(s), colnames(X)) +}) + +# ---------------------------------------------------------------------- +# C. Layer handling and counts_layer candidate chain +# ---------------------------------------------------------------------- + +test_that("counts_layer = first match wins when multiple candidates are present", { + skip_if_no_anndata() + X <- build_dense_X(10, 6, lambda = 5) + X2 <- X * 2L + ad <- AnnData(X = X, layers = list(raw_counts = X, counts = X2)) + s <- convert_anndata_to_seurat( + ad, + counts_layer = c("counts", "raw_counts", "raw_count") + ) + # 'counts' is first in the candidate vector AND present, so it wins. + expect_equal( + unname(as.matrix(GetAssayData(s, layer = "counts"))), + unname(t(X2)) + ) +}) + +test_that("counts_layer falls through to a later candidate", { + skip_if_no_anndata() + X <- build_dense_X(10, 6) + ad <- AnnData(X = X, layers = list(raw_count = X)) + s <- convert_anndata_to_seurat( + ad, + counts_layer = c("counts", "raw_counts", "raw_count") + ) + expect_equal( + unname(as.matrix(GetAssayData(s, layer = "counts"))), + unname(t(X)) + ) +}) + +test_that("when no candidate matches, X becomes counts and no separate data layer is added", { + skip_if_no_anndata() + X <- build_dense_X(10, 6) + X_log <- log1p(X) + ad <- AnnData(X = X, layers = list(logcounts = X_log)) + s <- convert_anndata_to_seurat(ad, counts_layer = c("counts", "raw_counts")) + # counts == X + expect_equal(unname(as.matrix(GetAssayData(s, layer = "counts"))), + unname(t(X))) + # We did NOT explicitly call SetAssayData(layer = "data") because there + # was no separate counts layer to fall back to. In Seurat 5 that means the + # 'data' layer is absent (Layers() omits it). + expect_false("data" %in% SeuratObject::Layers(s[["RNA"]])) +}) + +test_that("when a counts layer is present, counts != data and adata$X becomes data", { + skip_if_no_anndata() + X <- build_dense_X(10, 6) + raw <- X * 3L + ad <- AnnData(X = X, layers = list(counts = raw)) + s <- convert_anndata_to_seurat(ad, counts_layer = "counts") + counts <- as.matrix(GetAssayData(s, layer = "counts")) + data <- as.matrix(GetAssayData(s, layer = "data")) + expect_equal(unname(counts), unname(t(raw))) + expect_equal(unname(data), unname(t(X))) + expect_false(isTRUE(all.equal(unname(counts), unname(data)))) +}) + +test_that("counts_layer = NULL or empty string falls back to X without warning", { + skip_if_no_anndata() + X <- build_dense_X(10, 6) + ad <- AnnData(X = X, layers = list(counts = X * 2L)) + + # Capture warnings without suppressing the function's normal status messages. + captured <- list() + run <- function(arg) { + withCallingHandlers( + suppressMessages(convert_anndata_to_seurat(ad, counts_layer = arg)), + warning = function(w) { + captured[[length(captured) + 1L]] <<- conditionMessage(w) + invokeRestart("muffleWarning") + } + ) + } + s1 <- run(NULL) + s2 <- run("") + expect_length(captured, 0L) + + # Both should use X as counts (since we asked for nothing in particular). + expect_equal(unname(as.matrix(GetAssayData(s1, layer = "counts"))), + unname(t(X))) + expect_equal(unname(as.matrix(GetAssayData(s2, layer = "counts"))), + unname(t(X))) +}) + +test_that("Seurat-policy: feature names with underscores get sanitized to dashes (documented)", { + skip_if_no_anndata() + # Seurat policy: feature names cannot contain '_'. The conversion still + # succeeds (the dash-rewritten names match between counts/data/var) and + # Seurat itself emits the warning. We assert that: + # - the warning fires (proving the policy is engaged), and + # - the data layer + var feature meta still attach successfully. + n_cells <- 8L; n_genes <- 4L + X <- matrix(seq_len(n_cells * n_genes) + 0, + nrow = n_cells, ncol = n_genes) + rownames(X) <- sprintf("cell_%02d", seq_len(n_cells)) + colnames(X) <- sprintf("gene_%02d", seq_len(n_genes)) + var <- data.frame( + gene_class = rep("protein_coding", n_genes), + row.names = colnames(X), + stringsAsFactors = FALSE + ) + ad <- AnnData(X = X, var = var, layers = list(counts = X * 2L)) + + warned <- FALSE + withCallingHandlers( + s <- suppressMessages(convert_anndata_to_seurat(ad, counts_layer = "counts")), + warning = function(w) { + if (grepl("underscore", conditionMessage(w))) warned <<- TRUE + invokeRestart("muffleWarning") + } + ) + expect_true(warned) + expect_true(all(grepl("^gene-", rownames(s)))) + # Both counts and data attached cleanly (the bug we fixed). + expect_true("counts" %in% SeuratObject::Layers(s[["RNA"]])) + expect_true("data" %in% SeuratObject::Layers(s[["RNA"]])) + fm <- s[["RNA"]][[]] + expect_equal(rownames(fm), rownames(s)) + expect_true("gene_class" %in% colnames(fm)) +}) + +# ---------------------------------------------------------------------- +# D. obsm / reductions +# ---------------------------------------------------------------------- + +test_that("default reduction_map maps standard scanpy keys", { + skip_if_no_anndata() + n <- 14 + X <- build_dense_X(n, 8) + obsm <- list( + X_pca = matrix(rnorm(n * 5), n, 5), + X_umap = matrix(rnorm(n * 2), n, 2), + X_tsne = matrix(rnorm(n * 2), n, 2) + ) + ad <- AnnData(X = X, obsm = obsm) + s <- convert_anndata_to_seurat(ad) + expect_setequal(names(s@reductions), c("pca", "umap", "tsne")) + expect_equal(s@reductions$pca@key, "PC_") + expect_equal(s@reductions$umap@key, "UMAP_") + expect_equal(s@reductions$tsne@key, "tSNE_") + expect_equal(unname(s@reductions$pca@cell.embeddings), unname(obsm$X_pca)) +}) + +test_that("non-standard obsm keys get a derived reduction name", { + skip_if_no_anndata() + n <- 10 + X <- build_dense_X(n, 5) + obsm <- list(X_my_emb = matrix(rnorm(n * 4), n, 4)) + ad <- AnnData(X = X, obsm = obsm) + s <- convert_anndata_to_seurat(ad) + expect_true("my_emb" %in% names(s@reductions)) + # Embeddings preserved + expect_equal(unname(s@reductions$my_emb@cell.embeddings), unname(obsm$X_my_emb)) +}) + +test_that("user-supplied reduction_map renames and adds embeddings", { + skip_if_no_anndata() + n <- 10 + X <- build_dense_X(n, 5) + obsm <- list( + X_pca = matrix(rnorm(n * 3), n, 3), + X_special = matrix(rnorm(n * 2), n, 2) + ) + ad <- AnnData(X = X, obsm = obsm) + s <- convert_anndata_to_seurat( + ad, + reduction_map = list( + X_pca = list(name = "renamedPCA", key = "rPCA_"), + X_special = list(name = "special", key = "Sp_") + ) + ) + expect_true(all(c("renamedPCA", "special") %in% names(s@reductions))) + expect_equal(s@reductions$renamedPCA@key, "rPCA_") + expect_equal(s@reductions$special@key, "Sp_") +}) + +test_that("reduction_map with name=NA disables a default", { + skip_if_no_anndata() + n <- 10 + X <- build_dense_X(n, 5) + obsm <- list( + X_pca = matrix(rnorm(n * 3), n, 3), + X_umap = matrix(rnorm(n * 2), n, 2) + ) + ad <- AnnData(X = X, obsm = obsm) + s <- convert_anndata_to_seurat( + ad, + reduction_map = list(X_umap = list(name = NA, key = NA)) + ) + expect_true("pca" %in% names(s@reductions)) + expect_false("umap" %in% names(s@reductions)) +}) + +# ---------------------------------------------------------------------- +# E. uns / orig.ident logic +# ---------------------------------------------------------------------- + +test_that("uns$conversion_source becomes orig.ident when no obs$orig.ident", { + skip_if_no_anndata() + X <- build_dense_X(8, 5) + ad <- AnnData(X = X, uns = list(conversion_source = "fixture42")) + s <- convert_anndata_to_seurat(ad) + expect_equal(unique(as.character(s@meta.data$orig.ident)), "fixture42") +}) + +test_that("uns with arbitrary unrelated keys does not break", { + skip_if_no_anndata() + X <- build_dense_X(8, 5) + ad <- AnnData(X = X, uns = list(some_param = 42, notes = "hello")) + s <- convert_anndata_to_seurat(ad) + expect_equal(unique(as.character(s@meta.data$orig.ident)), "AnnData") +}) + +test_that("explicit orig.ident parameter beats uns and file basename", { + skip_if_no_anndata() + X <- build_dense_X(6, 4) + ad <- AnnData(X = X, uns = list(conversion_source = "from_uns")) + s <- convert_anndata_to_seurat(ad, orig.ident = "explicit_value") + expect_equal(unique(as.character(s@meta.data$orig.ident)), "explicit_value") +}) + +# ---------------------------------------------------------------------- +# F. File-path entry +# ---------------------------------------------------------------------- + +test_that("path entry: orig.ident defaults to file basename when no uns", { + skip_if_no_anndata() + X <- build_dense_X(8, 4) + ad <- AnnData(X = X) + path <- file.path(tempdir(), "my_sample_42.h5ad") + on.exit(unlink(path), add = TRUE) + write_h5ad(ad, path) + s <- convert_anndata_to_seurat(path) + expect_equal(unique(as.character(s@meta.data$orig.ident)), "my_sample_42") +}) + +test_that("path entry: tilde expansion works", { + skip_if_no_anndata() + X <- build_dense_X(6, 4) + ad <- AnnData(X = X) + rel <- file.path("~", "convert2anndata_test_tilde.h5ad") + abs <- path.expand(rel) + on.exit(unlink(abs), add = TRUE) + write_h5ad(ad, abs) + s <- convert_anndata_to_seurat(rel) + expect_s4_class(s, "Seurat") +}) + +test_that("path entry: nonexistent file gives a clear error", { + expect_error( + convert_anndata_to_seurat("/nope/does/not/exist.h5ad"), + regexp = "does not exist" + ) +}) + +test_that("path entry: invalid h5ad file gives a wrapped error", { + skip_if_no_anndata() + bogus <- tempfile(fileext = ".h5ad") + writeLines("this is not an h5ad file", bogus) + on.exit(unlink(bogus), add = TRUE) + expect_error( + convert_anndata_to_seurat(bogus), + regexp = "Failed to read AnnData h5ad file" + ) +}) + +# ---------------------------------------------------------------------- +# G. Custom assay name +# ---------------------------------------------------------------------- + +test_that("assay name can be overridden and the data layer attaches to it", { + skip_if_no_anndata() + X <- build_dense_X(10, 6) + raw <- X * 2L + ad <- AnnData(X = X, layers = list(counts = raw)) + s <- convert_anndata_to_seurat(ad, counts_layer = "counts", assay = "ATAC") + # Use names(s@assays) instead of Seurat::Assays() because the latter + # generic gets shadowed by SummarizedExperiment in cross-file test_dir runs. + expect_true("ATAC" %in% names(s@assays)) + expect_false("RNA" %in% names(s@assays)) + counts <- as.matrix(GetAssayData(s, assay = "ATAC", layer = "counts")) + data <- as.matrix(GetAssayData(s, assay = "ATAC", layer = "data")) + expect_equal(unname(counts), unname(t(raw))) + expect_equal(unname(data), unname(t(X))) +}) + +test_that("reductions get the custom assay name when assay is overridden", { + skip_if_no_anndata() + n <- 10 + X <- build_dense_X(n, 6) + ad <- AnnData(X = X, obsm = list(X_pca = matrix(rnorm(n * 3), n, 3))) + s <- convert_anndata_to_seurat(ad, assay = "ATAC") + expect_equal(s@reductions$pca@assay.used, "ATAC") +}) + +# ---------------------------------------------------------------------- +# H. Round-trip integrity +# ---------------------------------------------------------------------- + +test_that("AnnData -> Seurat -> SCE -> AnnData -> Seurat preserves counts and obs", { + skip_if_no_anndata() + n <- 16; m <- 10 + X <- build_dense_X(n, m, lambda = 2) + obs <- data.frame( + cell_type = factor(rep(c("A", "B"), n / 2)), + n_counts = rowSums(X), + row.names = rownames(X) + ) + ad <- AnnData(X = X, obs = obs, + layers = list(counts = X), + obsm = list(X_pca = matrix(rnorm(n * 4), n, 4))) + + s1 <- convert_anndata_to_seurat(ad, counts_layer = "counts") + sce <- convert_seurat_to_sce(s1) + ad2 <- convert_to_anndata(sce, assayName = "counts") + s2 <- convert_anndata_to_seurat(ad2, counts_layer = "counts") + + # Counts preserved + expect_equal( + unname(as.matrix(GetAssayData(s1, layer = "counts"))), + unname(as.matrix(GetAssayData(s2, layer = "counts"))) + ) + # cell_type column survives + expect_equal( + as.character(s1@meta.data$cell_type), + as.character(s2@meta.data$cell_type) + ) + # obs/var names preserved + expect_equal(colnames(s1), colnames(s2)) + expect_equal(rownames(s1), rownames(s2)) +}) + +# ---------------------------------------------------------------------- +# I. Robustness +# ---------------------------------------------------------------------- + +test_that("empty obsm produces zero reductions", { + skip_if_no_anndata() + X <- build_dense_X(8, 5) + ad <- AnnData(X = X) + s <- convert_anndata_to_seurat(ad) + expect_length(s@reductions, 0) +}) + +test_that("obsm with mismatched cell count is skipped (warn) but valid ones attach", { + skip_if_no_anndata() + n <- 8 + X <- build_dense_X(n, 5) + good <- matrix(rnorm(n * 2), n, 2) + bad <- matrix(rnorm((n - 1) * 2), n - 1, 2) + ad <- AnnData(X = X, obsm = list(X_pca = good)) + + # extract_anndata_obsm + attach_reductions_seurat is the public path; bad + # obsm is normally rejected by AnnData itself, so we feed it directly to + # attach_reductions_seurat to verify the guard. + s <- convert_anndata_to_seurat(ad) + bad_named <- list(X_pca = good, X_bad = bad) + expect_warning( + s2 <- attach_reductions_seurat(s, bad_named), + regexp = "skipping" + ) + expect_true("pca" %in% names(s2@reductions)) +}) + +test_that("very small AnnData (3x2) and skinny (2x100) do not break the converter", { + skip_if_no_anndata() + ad_small <- AnnData(X = matrix(c(1, 2, 3, 4, 5, 6), 3, 2)) + s1 <- convert_anndata_to_seurat(ad_small) + expect_equal(ncol(s1), 3); expect_equal(nrow(s1), 2) + + ad_skinny <- AnnData(X = matrix(rpois(200, 1), 2, 100)) + s2 <- convert_anndata_to_seurat(ad_skinny) + expect_equal(ncol(s2), 2); expect_equal(nrow(s2), 100) +}) + +test_that("standalone shim and package function produce identical Seurat objects", { + skip_if_no_anndata() + src <- "/fh/fast/setty_m/user/yhuang2/sarah-nexus/work/convert2seurat/convert_anndata2seurat.R" + skip_if_not(file.exists(src), "standalone shim not available") + source(src, local = TRUE) + + X <- build_dense_X(20, 8) + ad <- AnnData(X = X, layers = list(counts = X * 2L), + obsm = list(X_pca = matrix(rnorm(60), 20, 3))) + path <- tempfile(fileext = ".h5ad") + on.exit(unlink(path), add = TRUE) + write_h5ad(ad, path) + + s_shim <- convert_anndata2seurat(path, raw_layer_name = "counts") + s_pkg <- convert_anndata_to_seurat(path, counts_layer = "counts") + + expect_equal(dim(s_shim), dim(s_pkg)) + expect_equal( + unname(as.matrix(GetAssayData(s_shim, layer = "counts"))), + unname(as.matrix(GetAssayData(s_pkg, layer = "counts"))) + ) + expect_equal( + unname(as.matrix(GetAssayData(s_shim, layer = "data"))), + unname(as.matrix(GetAssayData(s_pkg, layer = "data"))) + ) + expect_equal(sort(names(s_shim@reductions)), sort(names(s_pkg@reductions))) +}) + +# ---------------------------------------------------------------------- +# J. orig.ident precedence laddered +# ---------------------------------------------------------------------- + +test_that("orig.ident precedence: obs column > arg > uns > path-basename > 'AnnData'", { + skip_if_no_anndata() + X <- build_dense_X(6, 4) + + # 1. obs.orig.ident wins over everything (we don't override an existing column). + obs1 <- data.frame(orig.ident = paste0("from_obs_", 1:6), + row.names = rownames(X)) + ad1 <- AnnData(X = X, obs = obs1, uns = list(conversion_source = "from_uns")) + path1 <- tempfile(fileext = ".h5ad"); on.exit(unlink(path1), add = TRUE) + write_h5ad(ad1, path1) + s1 <- convert_anndata_to_seurat(path1, orig.ident = "from_arg") + expect_equal(s1@meta.data$orig.ident, paste0("from_obs_", 1:6)) + + # 2. With no obs.orig.ident, the explicit arg wins over uns and path. + ad2 <- AnnData(X = X, uns = list(conversion_source = "from_uns")) + path2 <- tempfile(fileext = ".h5ad"); on.exit(unlink(path2), add = TRUE) + write_h5ad(ad2, path2) + s2 <- convert_anndata_to_seurat(path2, orig.ident = "from_arg") + expect_equal(unique(s2@meta.data$orig.ident), "from_arg") + + # 3. With no obs and no arg, uns wins over path. + s3 <- convert_anndata_to_seurat(path2) + expect_equal(unique(s3@meta.data$orig.ident), "from_uns") + + # 4. Without uns either, path basename wins. + ad4 <- AnnData(X = X) + path4 <- file.path(tempdir(), "named_sample.h5ad"); on.exit(unlink(path4), add = TRUE) + write_h5ad(ad4, path4) + s4 <- convert_anndata_to_seurat(path4) + expect_equal(unique(s4@meta.data$orig.ident), "named_sample") + + # 5. No obs, no uns, no path -> "AnnData". + s5 <- convert_anndata_to_seurat(ad4) + expect_equal(unique(s5@meta.data$orig.ident), "AnnData") +}) diff --git a/tests/testthat/test-convert_anndata_to_seurat.R b/tests/testthat/test-convert_anndata_to_seurat.R new file mode 100644 index 0000000..1780fa1 --- /dev/null +++ b/tests/testthat/test-convert_anndata_to_seurat.R @@ -0,0 +1,127 @@ +library(testthat) +library(convert2anndata) +library(SingleCellExperiment) +library(SummarizedExperiment) +library(S4Vectors) +library(Matrix) +library(anndata) + +skip_if_no_anndata <- function() { + skip_if_not_installed("anndata") + skip_if_not_installed("reticulate") + if (!reticulate::py_available(initialize = TRUE)) { + skip("Python not available for reticulate.") + } + if (!reticulate::py_module_available("anndata")) { + skip("Python 'anndata' module not installed.") + } +} + +mock_h5ad_path <- function() { + skip_if_no_anndata() + n_cells <- 20 + n_genes <- 30 + X <- matrix(rpois(n_cells * n_genes, lambda = 3), nrow = n_cells, ncol = n_genes) + rownames(X) <- paste0("cell", seq_len(n_cells)) + colnames(X) <- paste0("gene", seq_len(n_genes)) + + obs <- data.frame( + cell_type = rep(c("A", "B"), n_cells / 2), + n_counts = rowSums(X), + row.names = rownames(X) + ) + var <- data.frame( + gene_type = rep(c("g1", "g2"), n_genes / 2), + row.names = colnames(X) + ) + pca <- matrix(runif(n_cells * 3), nrow = n_cells, ncol = 3) + rownames(pca) <- rownames(X) + + adata <- AnnData( + X = X, + obs = obs, + var = var, + layers = list(counts = X), + obsm = list(X_pca = pca, X_umap = pca[, 1:2]) + ) + path <- tempfile(fileext = ".h5ad") + write_h5ad(adata, path) + path +} + +test_that("convert_anndata_to_seurat preserves dims, metadata, and reductions", { + skip_if_not_installed("Seurat") + path <- mock_h5ad_path() + adata <- read_h5ad(path) + + seurat_obj <- convert_anndata_to_seurat(adata, counts_layer = "counts") + + expect_s4_class(seurat_obj, "Seurat") + expect_equal(ncol(seurat_obj), nrow(adata)) + expect_equal(nrow(seurat_obj), ncol(adata)) + expect_equal(colnames(seurat_obj), as.character(adata$obs_names)) + expect_equal(rownames(seurat_obj), as.character(adata$var_names)) + + expect_true("cell_type" %in% colnames(seurat_obj@meta.data)) + expect_equal( + as.character(seurat_obj@meta.data$cell_type), + as.character(adata$obs$cell_type) + ) + + expect_true("pca" %in% names(seurat_obj@reductions)) + expect_true("umap" %in% names(seurat_obj@reductions)) + expect_equal( + unname(seurat_obj@reductions$pca@cell.embeddings), + unname(as.matrix(adata$obsm[["X_pca"]])) + ) +}) + +test_that("convert_anndata_to_sce preserves dims and reducedDims", { + path <- mock_h5ad_path() + adata <- read_h5ad(path) + + sce <- convert_anndata_to_sce(adata, counts_layer = "counts") + + expect_s4_class(sce, "SingleCellExperiment") + expect_equal(ncol(sce), nrow(adata)) + expect_equal(nrow(sce), ncol(adata)) + expect_true("PCA" %in% names(reducedDims(sce))) + expect_equal( + unname(reducedDim(sce, "PCA")), + unname(as.matrix(adata$obsm[["X_pca"]])) + ) + expect_equal(as.character(colData(sce)$cell_type), as.character(adata$obs$cell_type)) +}) + +test_that("anndata -> seurat -> anndata roundtrip preserves counts", { + skip_if_not_installed("Seurat") + path <- mock_h5ad_path() + adata <- read_h5ad(path) + + seurat_obj <- convert_anndata_to_seurat(adata, counts_layer = "counts") + sce_back <- convert_seurat_to_sce(seurat_obj) + adata_back <- convert_to_anndata(sce_back, assayName = "counts") + + expect_equal(dim(adata_back$X), dim(adata$X)) + expect_equal( + unname(as.matrix(adata_back$X)), + unname(as.matrix(adata$X)) + ) +}) + +test_that("convert_anndata_to_seurat handles missing counts layer", { + skip_if_not_installed("Seurat") + skip_if_no_anndata() + n_cells <- 10 + n_genes <- 12 + X <- matrix(rpois(n_cells * n_genes, lambda = 2), nrow = n_cells, ncol = n_genes) + rownames(X) <- paste0("c", seq_len(n_cells)) + colnames(X) <- paste0("g", seq_len(n_genes)) + adata <- AnnData(X = X) + + seurat_obj <- convert_anndata_to_seurat(adata, counts_layer = "counts") + + expect_s4_class(seurat_obj, "Seurat") + expect_equal(ncol(seurat_obj), n_cells) + expect_equal(nrow(seurat_obj), n_genes) +}) diff --git a/tests/testthat/test-ensure_csparse_matrix.R b/tests/testthat/test-ensure_csparse_matrix.R index 36d4a64..a25a1be 100644 --- a/tests/testthat/test-ensure_csparse_matrix.R +++ b/tests/testthat/test-ensure_csparse_matrix.R @@ -13,9 +13,12 @@ test_that("ensure_csparse_matrix handles various matrix types correctly", { expect_true(methods::is(result_dgc, "CsparseMatrix")) # Test with dgTMatrix (triplet format, not CsparseMatrix) + # Matrix >= 1.5 returns dgCMatrix by default from sparseMatrix(); use + # repr = "T" to force the triplet representation we actually want to test. ij <- expand.grid(i = 1:5, j = 1:5) ij <- ij[sample(nrow(ij), 10), ] - dgt_mat <- Matrix::sparseMatrix(i = ij$i, j = ij$j, x = 1:10, dims = c(5, 5)) + dgt_mat <- Matrix::sparseMatrix(i = ij$i, j = ij$j, x = 1:10, + dims = c(5, 5), repr = "T") expect_false(methods::is(dgt_mat, "CsparseMatrix")) result_dgt <- ensure_csparse_matrix(dgt_mat) expect_true(methods::is(result_dgt, "CsparseMatrix")) diff --git a/tests/testthat/test-flexible_anndata_to_seurat.R b/tests/testthat/test-flexible_anndata_to_seurat.R new file mode 100644 index 0000000..a493175 --- /dev/null +++ b/tests/testthat/test-flexible_anndata_to_seurat.R @@ -0,0 +1,174 @@ +library(testthat) +library(convert2anndata) +library(Matrix) +library(anndata) + +skip_if_no_anndata <- function() { + skip_if_not_installed("anndata") + skip_if_not_installed("reticulate") + if (!reticulate::py_available(initialize = TRUE)) { + skip("Python not available for reticulate.") + } + if (!reticulate::py_module_available("anndata")) { + skip("Python 'anndata' module not installed.") + } +} + +build_adata <- function(layer_names = "counts", + obsm_keys = c("X_pca"), + n_cells = 12, n_genes = 8) { + X <- matrix(rpois(n_cells * n_genes, lambda = 3), + nrow = n_cells, ncol = n_genes) + rownames(X) <- paste0("cell", seq_len(n_cells)) + colnames(X) <- paste0("gene", seq_len(n_genes)) + + layers <- if (length(layer_names)) { + setNames(replicate(length(layer_names), X, simplify = FALSE), layer_names) + } else NULL + + obsm <- if (length(obsm_keys)) { + setNames( + lapply(obsm_keys, function(k) matrix(rnorm(n_cells * 3), n_cells, 3)), + obsm_keys + ) + } else NULL + + AnnData(X = X, obs = data.frame(cell_type = rep(c("A", "B"), n_cells / 2), + row.names = rownames(X)), + var = data.frame(gene_type = rep("g", n_genes), row.names = colnames(X)), + layers = layers, obsm = obsm) +} + +test_that("counts_layer accepts a vector of candidates", { + skip_if_no_anndata() + skip_if_not_installed("Seurat") + ad <- build_adata(layer_names = "raw_counts") + s <- convert_anndata_to_seurat( + ad, + counts_layer = c("counts", "raw_counts", "raw_count") + ) + expect_s4_class(s, "Seurat") + counts <- as.matrix(Seurat::GetAssayData(s, layer = "counts")) + raw <- as.matrix(ad$layers[["raw_counts"]]) + expect_equal(unname(counts), unname(t(raw))) +}) + +test_that("counts_layer falls back to X when no candidate matches", { + skip_if_no_anndata() + skip_if_not_installed("Seurat") + ad <- build_adata(layer_names = "logcounts") # neither counts nor raw_counts + s <- convert_anndata_to_seurat( + ad, counts_layer = c("counts", "raw_counts") + ) + expect_s4_class(s, "Seurat") + # No data layer should be added because we used X as counts. + layers <- SeuratObject::Layers(s[["RNA"]]) + expect_false("data" %in% layers && length(layers) > 1 && + !identical(unname(as.matrix(Seurat::GetAssayData(s, layer = "data"))), + unname(as.matrix(Seurat::GetAssayData(s, layer = "counts"))))) +}) + +test_that("convert_anndata_to_seurat accepts a file path and reads it", { + skip_if_no_anndata() + skip_if_not_installed("Seurat") + ad <- build_adata() + path <- tempfile(fileext = ".h5ad") + write_h5ad(ad, path) + + s <- convert_anndata_to_seurat(path, counts_layer = "counts") + expect_s4_class(s, "Seurat") + expect_equal(ncol(s), 12) + expect_equal(nrow(s), 8) + # orig.ident should be the file basename when no uns$conversion_source is set + expect_true("orig.ident" %in% colnames(s@meta.data)) + expect_equal(unique(as.character(s@meta.data$orig.ident)), + tools::file_path_sans_ext(basename(path))) +}) + +test_that("convert_anndata_to_seurat respects an explicit orig.ident", { + skip_if_no_anndata() + skip_if_not_installed("Seurat") + ad <- build_adata() + s <- convert_anndata_to_seurat(ad, counts_layer = "counts", + orig.ident = "my_sample") + expect_equal(unique(as.character(s@meta.data$orig.ident)), "my_sample") +}) + +test_that("missing input file raises a clear error", { + expect_error( + convert_anndata_to_seurat("/no/such/file.h5ad"), + "does not exist" + ) +}) + +test_that("default_reduction_map covers common scanpy embeddings", { + m <- default_reduction_map() + expect_true(all(c("X_pca", "X_umap", "X_tsne") %in% names(m))) + expect_equal(m$X_pca$name, "pca") + expect_equal(m$X_pca$key, "PC_") +}) + +test_that("reduction_map overrides defaults and adds new keys", { + skip_if_no_anndata() + skip_if_not_installed("Seurat") + ad <- build_adata(obsm_keys = c("X_pca", "X_my_emb")) + custom <- list( + X_pca = list(name = "PCA_renamed", key = "Renamed_"), + X_my_emb = list(name = "myemb", key = "MyEmb_") + ) + s <- convert_anndata_to_seurat(ad, counts_layer = "counts", + reduction_map = custom) + expect_true("PCA_renamed" %in% names(s@reductions)) + expect_true("myemb" %in% names(s@reductions)) + expect_equal(s@reductions$PCA_renamed@key, "Renamed_") + expect_equal(s@reductions$myemb@key, "MyEmb_") +}) + +test_that("reduction_map can disable a default mapping with name = NA", { + skip_if_no_anndata() + skip_if_not_installed("Seurat") + ad <- build_adata(obsm_keys = c("X_pca", "X_umap")) + s <- convert_anndata_to_seurat( + ad, counts_layer = "counts", + reduction_map = list(X_umap = list(name = NA, key = NA)) + ) + expect_true("pca" %in% names(s@reductions)) + expect_false("umap" %in% names(s@reductions)) +}) + +test_that("convert_anndata_to_sce accepts a file path", { + skip_if_no_anndata() + ad <- build_adata() + path <- tempfile(fileext = ".h5ad") + write_h5ad(ad, path) + + sce <- convert_anndata_to_sce(path, counts_layer = "counts") + expect_s4_class(sce, "SingleCellExperiment") +}) + +test_that("convert_anndata_to_sce counts_layer accepts a vector of candidates", { + skip_if_no_anndata() + ad <- build_adata(layer_names = "raw_count") + sce <- convert_anndata_to_sce( + ad, counts_layer = c("counts", "raw_count") + ) + expect_s4_class(sce, "SingleCellExperiment") + expect_true("counts" %in% SummarizedExperiment::assayNames(sce)) +}) + +test_that("pick_first_layer returns NULL on empty/NULL candidates", { + skip_if_no_anndata() + ad <- build_adata() + expect_null(convert2anndata:::pick_first_layer(ad, NULL)) + expect_null(convert2anndata:::pick_first_layer(ad, character(0))) + expect_null(convert2anndata:::pick_first_layer(ad, "")) +}) + +test_that("orig.ident derives from uns$conversion_source when present", { + skip_if_no_anndata() + skip_if_not_installed("Seurat") + ad <- build_adata() + ad$uns[["conversion_source"]] <- "labeled_in_uns" + s <- convert_anndata_to_seurat(ad, counts_layer = "counts") + expect_equal(unique(as.character(s@meta.data$orig.ident)), "labeled_in_uns") +}) diff --git a/tests/testthat/test-process_layers.R b/tests/testthat/test-process_layers.R index 2562c6d..679830d 100644 --- a/tests/testthat/test-process_layers.R +++ b/tests/testthat/test-process_layers.R @@ -27,7 +27,7 @@ test_that("process_layers combines split layers with collisions correctly", { expect_equal(as.matrix(combined_counts), counts_matrix) # Check that the original split layers are removed - remaining_layers <- Layers(result$assay_object) + remaining_layers <- SeuratObject::Layers(result$assay_object) expect_false("counts.A" %in% remaining_layers) expect_false("counts.B" %in% remaining_layers) expect_true("counts.1" %in% remaining_layers) @@ -58,7 +58,7 @@ test_that("process_layers combines split layers without collisions correctly", { expect_equal(as.matrix(combined_counts), counts_matrix) # Check that the original split layers are removed - remaining_layers <- Layers(result$assay_object) + remaining_layers <- SeuratObject::Layers(result$assay_object) expect_false("counts.A" %in% remaining_layers) expect_false("counts.B" %in% remaining_layers) expect_false("counts.1" %in% remaining_layers) @@ -86,7 +86,7 @@ test_that("process_layers handles non-split layers correctly", { expect_equal(result$split_names, list()) # Validate that the non-split layer remains intact - remaining_layers <- Layers(result$assay_object) + remaining_layers <- SeuratObject::Layers(result$assay_object) expect_true("data" %in% remaining_layers) expect_equal(length(remaining_layers), 2) }) @@ -120,7 +120,7 @@ test_that("process_layers handles mixed split and non-split layers", { expect_equal(as.matrix(combined_counts), counts_matrix) # Validate that the non-split layer remains intact - remaining_layers <- Layers(result$assay_object) + remaining_layers <- SeuratObject::Layers(result$assay_object) expect_true("scale.data" %in% remaining_layers) expect_true("counts" %in% remaining_layers) expect_true("test" %in% remaining_layers) diff --git a/tests/testthat/test-process_main_assay.R b/tests/testthat/test-process_main_assay.R index 1433995..9f50fe8 100644 --- a/tests/testthat/test-process_main_assay.R +++ b/tests/testthat/test-process_main_assay.R @@ -2,12 +2,14 @@ test_that("process_main_assay converts non-CsparseMatrix to CsparseMatrix", { library(SingleCellExperiment) library(Matrix) - # Create an assay with a triplet sparse matrix (dgTMatrix) + # Create an assay with a triplet sparse matrix (dgTMatrix). Matrix >= 1.5 + # defaults sparseMatrix() to dgCMatrix; force triplet via repr = "T". triplet_matrix <- Matrix::sparseMatrix( i = c(1, 3, 5, 7), j = c(2, 4, 6, 8), x = 1:4, - dims = c(10, 10) + dims = c(10, 10), + repr = "T" ) expect_false(methods::is(triplet_matrix, "CsparseMatrix")) @@ -23,11 +25,17 @@ test_that("process_main_assay converts non-CsparseMatrix to CsparseMatrix", { # Test that the result is a CsparseMatrix expect_true(methods::is(X, "CsparseMatrix")) - # Test that data is preserved - expected_values <- c(0, 1, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 4, 0, 0) + # Test that data is preserved. process_main_assay transposes + # (genes x cells -> cells x genes) before returning, so non-zeros at + # M[1,2]=1, M[3,4]=2, M[5,6]=3, M[7,8]=4 become X[2,1]=1, X[4,3]=2, + # X[6,5]=3, X[8,7]=4. Column-major linear indices: 2, 24, 46, 68 — only + # the first two land in [1:40]. + expected_values <- c( + 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, # col 1 -> X[2,1]=1 + rep(0, 10), # col 2 + 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, # col 3 -> X[4,3]=2 + rep(0, 10) # col 4 + ) expect_equal(as.vector(as.matrix(X))[1:40], expected_values) # Test with a diagonal matrix diff --git a/tests/testthat/test-raw_and_obsp.R b/tests/testthat/test-raw_and_obsp.R new file mode 100644 index 0000000..d4bdb45 --- /dev/null +++ b/tests/testthat/test-raw_and_obsp.R @@ -0,0 +1,198 @@ +library(testthat) +library(convert2anndata) +library(Matrix) +library(anndata) +library(Seurat) + +skip_if_no_anndata <- function() { + skip_if_not_installed("anndata") + skip_if_not_installed("reticulate") + if (!reticulate::py_available(initialize = TRUE)) { + skip("Python not available for reticulate.") + } + if (!reticulate::py_module_available("anndata")) { + skip("Python 'anndata' module not installed.") + } +} + +build_X <- function(n_cells, n_genes) { + X <- matrix(rpois(n_cells * n_genes, 3), + nrow = n_cells, ncol = n_genes) + rownames(X) <- sprintf("cell-%03d", seq_len(n_cells)) + colnames(X) <- sprintf("gene-%03d", seq_len(n_genes)) + X +} + +# ---------------------------------------------------------------------- +# adata.raw +# ---------------------------------------------------------------------- + +test_that("use_raw='auto' uses raw when no counts_layer matches", { + skip_if_no_anndata() + X <- build_X(12, 8) + log_X <- log1p(X) + ad <- AnnData(X = log_X) + ad$raw <- AnnData(X = X) + + s <- convert_anndata_to_seurat(ad, counts_layer = c("counts","raw_counts")) + counts <- as.matrix(GetAssayData(s, layer = "counts")) + expect_equal(unname(counts), unname(t(X))) + data <- as.matrix(GetAssayData(s, layer = "data")) + expect_equal(unname(data), unname(t(log_X))) +}) + +test_that("use_raw='auto' does NOT override an existing counts_layer match", { + skip_if_no_anndata() + X <- build_X(10, 6) + log_X <- log1p(X) + layer_X <- X * 7L + ad <- AnnData(X = log_X, layers = list(counts = layer_X)) + ad$raw <- AnnData(X = X) + + s <- convert_anndata_to_seurat(ad, counts_layer = "counts") + counts <- as.matrix(GetAssayData(s, layer = "counts")) + # The layer matched, so it wins -- raw should be ignored. + expect_equal(unname(counts), unname(t(layer_X))) +}) + +test_that("use_raw='always' uses raw even when a counts_layer matches", { + skip_if_no_anndata() + X <- build_X(10, 6) + log_X <- log1p(X) + layer_X <- X * 7L + ad <- AnnData(X = log_X, layers = list(counts = layer_X)) + ad$raw <- AnnData(X = X) + + s <- convert_anndata_to_seurat(ad, counts_layer = "counts", use_raw = "always") + counts <- as.matrix(GetAssayData(s, layer = "counts")) + expect_equal(unname(counts), unname(t(X))) +}) + +test_that("use_raw='never' ignores raw entirely", { + skip_if_no_anndata() + X <- build_X(10, 6) + ad <- AnnData(X = log1p(X)) + ad$raw <- AnnData(X = X) + + s <- convert_anndata_to_seurat(ad, counts_layer = c("counts"), use_raw = "never") + counts <- as.matrix(GetAssayData(s, layer = "counts")) + # Raw was ignored; no counts layer matched; X (log1p) became counts. + expect_equal(unname(counts), unname(t(log1p(X)))) +}) + +test_that("use_raw can be passed as logical for back-compat", { + skip_if_no_anndata() + X <- build_X(10, 6) + ad <- AnnData(X = log1p(X)) + ad$raw <- AnnData(X = X) + + s_t <- convert_anndata_to_seurat(ad, use_raw = TRUE) + s_f <- convert_anndata_to_seurat(ad, use_raw = FALSE) + expect_equal(unname(as.matrix(GetAssayData(s_t, layer = "counts"))), unname(t(X))) + expect_equal(unname(as.matrix(GetAssayData(s_f, layer = "counts"))), unname(t(log1p(X)))) +}) + +test_that("raw with a different gene set: counts uses raw shape; data layer skipped", { + skip_if_no_anndata() + # Typical scanpy pattern: adata is filtered to highly-variable genes + # (here 6 of 12); adata.raw retains all 12 genes with raw counts. + X_full <- build_X(10, 12) + hv <- 1:6 + X_filt <- log1p(X_full[, hv]) + colnames(X_filt) <- colnames(X_full)[hv] + + ad <- AnnData(X = X_filt) + ad$raw <- AnnData(X = X_full) + + s <- convert_anndata_to_seurat(ad, use_raw = "always") + expect_equal(nrow(s), ncol(X_full)) # full gene set from raw + expect_equal(ncol(s), nrow(X_full)) # cells unchanged + # Data layer was skipped because the shapes differ. + expect_false("data" %in% SeuratObject::Layers(s[["RNA"]])) +}) + +test_that("raw without an obs/var attached still produces a valid Seurat object", { + skip_if_no_anndata() + X <- build_X(8, 5) + ad <- AnnData(X = log1p(X)) + ad$raw <- AnnData(X = X) + s <- convert_anndata_to_seurat(ad, use_raw = "always") + expect_s4_class(s, "Seurat") + expect_equal(ncol(s), 8) +}) + +# ---------------------------------------------------------------------- +# adata.obsp -> Seurat Graphs +# ---------------------------------------------------------------------- + +build_obsp_graph <- function(n) { + np <- reticulate::import("numpy") + sp <- reticulate::import("scipy.sparse") + G <- matrix(0, n, n) + set.seed(0) + for (i in seq_len(n)) { + nbrs <- sample(setdiff(seq_len(n), i), 2) + G[i, nbrs] <- runif(2) + } + sp$csr_matrix(np$asarray(G, dtype = "float64")) +} + +test_that("obsp connectivities and distances become Seurat Graphs", { + skip_if_no_anndata() + n <- 12 + X <- build_X(n, 8) + ad <- AnnData(X = X) + ad$obsp <- list( + connectivities = build_obsp_graph(n), + distances = build_obsp_graph(n) + ) + + s <- convert_anndata_to_seurat(ad) + graph_names <- names(s@graphs) + expect_true("RNA_connectivities" %in% graph_names) + expect_true("RNA_distances" %in% graph_names) + g <- s@graphs[["RNA_connectivities"]] + expect_equal(nrow(g), n) + expect_equal(ncol(g), n) + expect_equal(rownames(g), colnames(s)) +}) + +test_that("obsp uses the custom assay name in graph names", { + skip_if_no_anndata() + n <- 8 + X <- build_X(n, 5) + ad <- AnnData(X = X) + ad$obsp <- list(connectivities = build_obsp_graph(n)) + s <- convert_anndata_to_seurat(ad, assay = "ATAC") + expect_true("ATAC_connectivities" %in% names(s@graphs)) +}) + +test_that("attach_obsp = FALSE skips obsp attachment", { + skip_if_no_anndata() + n <- 8 + X <- build_X(n, 5) + ad <- AnnData(X = X) + ad$obsp <- list(connectivities = build_obsp_graph(n)) + s <- convert_anndata_to_seurat(ad, attach_obsp = FALSE) + expect_length(s@graphs, 0) +}) + +test_that("extract_anndata_obsp guards against non-square or shape-mismatched matrices", { + skip_if_no_anndata() + n <- 8 + # Use a fake adata-like environment that exposes obsp via its dollar-list. + # We build a non-square matrix directly and route it through the helper. + fake_obsp <- list( + bad_rect = Matrix::Matrix(matrix(runif(n * (n - 1)), n, n - 1), sparse = TRUE), + good_square = Matrix::Matrix(matrix(runif(n * n), n, n), sparse = TRUE) + ) + fake_adata <- list(obsp = fake_obsp) + + # Stub anndata_mapping_keys to read names() instead of py-keys() + res <- expect_warning( + extract_anndata_obsp(fake_adata, obs_names = sprintf("c%02d", seq_len(n))), + regexp = "not square" + ) + expect_named(res, "good_square") + expect_equal(nrow(res$good_square), n) +}) diff --git a/tests/testthat/test-realistic_scanpy_dataset.R b/tests/testthat/test-realistic_scanpy_dataset.R new file mode 100644 index 0000000..84a9f84 --- /dev/null +++ b/tests/testthat/test-realistic_scanpy_dataset.R @@ -0,0 +1,177 @@ +library(testthat) +library(convert2anndata) +library(Matrix) +library(anndata) +library(Seurat) + +skip_if_no_anndata <- function() { + skip_if_not_installed("anndata") + skip_if_not_installed("reticulate") + if (!reticulate::py_available(initialize = TRUE)) { + skip("Python not available for reticulate.") + } + if (!reticulate::py_module_available("anndata")) { + skip("Python 'anndata' module not installed.") + } +} + +# Build a 1000-cell, 500-gene synthetic AnnData that mirrors the typical +# scanpy-pipeline output: filtered + normalised X, raw counts in +# adata.raw, layers for counts/logcounts, multiple obsm reductions, an +# obsp NN graph, multi-column categorical obs, and uns metadata. +build_scanpy_like <- function(n_cells = 1000L, n_genes_full = 500L, + n_genes_hv = 200L, seed = 42L) { + set.seed(seed) + np <- reticulate::import("numpy") + sp <- reticulate::import("scipy.sparse") + + raw_dense <- matrix(rpois(n_cells * n_genes_full, lambda = 1.5), + nrow = n_cells, ncol = n_genes_full) + rownames(raw_dense) <- sprintf("cell-%04d", seq_len(n_cells)) + colnames(raw_dense) <- sprintf("ENSG-%04d", seq_len(n_genes_full)) + + hv_idx <- sort(sample.int(n_genes_full, n_genes_hv)) + X_filt <- log1p(raw_dense[, hv_idx]) + filt_genes <- colnames(raw_dense)[hv_idx] + colnames(X_filt) <- filt_genes + + raw_sparse <- sp$csr_matrix(np$asarray(raw_dense, dtype = "float32")) + X_sparse <- sp$csr_matrix(np$asarray(X_filt, dtype = "float32")) + + obs <- data.frame( + sample = sample(paste0("S", 1:5), n_cells, replace = TRUE), + cell_type = factor(sample(c("T", "B", "NK", "Mono", "DC"), + n_cells, replace = TRUE)), + n_counts = rowSums(raw_dense), + n_genes = rowSums(raw_dense > 0), + pct_mt = runif(n_cells, 0, 10), + is_doublet = sample(c(FALSE, TRUE), n_cells, + replace = TRUE, prob = c(0.95, 0.05)), + row.names = rownames(raw_dense), + stringsAsFactors = FALSE + ) + var_full <- data.frame( + highly_variable = colnames(raw_dense) %in% filt_genes, + mean_counts = colMeans(raw_dense), + n_cells = colSums(raw_dense > 0), + row.names = colnames(raw_dense), + stringsAsFactors = FALSE + ) + var_filt <- var_full[filt_genes, , drop = FALSE] + + X_pca <- matrix(rnorm(n_cells * 30), n_cells, 30) + X_umap <- matrix(rnorm(n_cells * 2), n_cells, 2) + X_tsne <- matrix(rnorm(n_cells * 2), n_cells, 2) + X_harm <- matrix(rnorm(n_cells * 30), n_cells, 30) + + # kNN graph (k = 10, sparse, square) + G <- matrix(0, n_cells, n_cells) + for (i in seq_len(n_cells)) { + nbrs <- sample(setdiff(seq_len(n_cells), i), 10) + G[i, nbrs] <- runif(10) + } + conn <- sp$csr_matrix(np$asarray(G, dtype = "float32")) + dist <- sp$csr_matrix(np$asarray(G * 5, dtype = "float32")) + + ad <- AnnData( + X = X_sparse, + obs = obs, + var = var_filt, + obsm = list(X_pca = X_pca, X_umap = X_umap, + X_tsne = X_tsne, X_harmony = X_harm), + obsp = list(connectivities = conn, distances = dist), + uns = list( + conversion_source = "scanpy_synth", + neighbors = list(method = "umap", n_neighbors = 10L), + pca = list(variance_ratio = runif(30)) + ), + layers = list( + counts = X_sparse, + logcounts = X_sparse + ) + ) + ad$raw <- AnnData(X = raw_sparse, var = var_full) + ad +} + +test_that("realistic scanpy-like dataset converts and round-trips key invariants", { + skip_if_no_anndata() + ad <- build_scanpy_like(n_cells = 600L, n_genes_full = 300L, n_genes_hv = 120L) + s <- convert_anndata_to_seurat(ad, counts_layer = "counts") + + expect_s4_class(s, "Seurat") + expect_equal(ncol(s), as.integer(ad$n_obs)) + expect_equal(nrow(s), as.integer(ad$n_vars)) # X.shape, not raw.shape + + # All four reductions show up. + expect_setequal( + intersect(names(s@reductions), c("pca","umap","tsne","harmony")), + c("pca","umap","tsne","harmony") + ) + + # Both obsp graphs attached. + expect_true(all(c("RNA_connectivities", "RNA_distances") %in% names(s@graphs))) + + # Categorical obs survives. + expect_true("cell_type" %in% colnames(s@meta.data)) + obs_in <- as.data.frame(ad$obs) + expect_equal(as.character(s@meta.data$cell_type), + as.character(obs_in$cell_type)) + + # Feature meta got attached. + fm <- s[["RNA"]][[]] + expect_true("highly_variable" %in% colnames(fm)) + + # uns$conversion_source becomes orig.ident. + expect_equal(unique(as.character(s@meta.data$orig.ident)), "scanpy_synth") +}) + +test_that("realistic dataset with use_raw='always' yields the unfiltered gene set", { + skip_if_no_anndata() + ad <- build_scanpy_like(n_cells = 400L, n_genes_full = 200L, n_genes_hv = 80L) + s <- convert_anndata_to_seurat(ad, use_raw = "always") + expect_equal(nrow(s), as.integer(ad$raw$shape[2])) + # Data layer should be skipped because raw and X have different gene sets. + expect_false("data" %in% SeuratObject::Layers(s[["RNA"]])) +}) + +test_that("realistic dataset converts in reasonable time", { + skip_if_no_anndata() + ad <- build_scanpy_like(n_cells = 1000L, n_genes_full = 500L, n_genes_hv = 200L) + t <- system.time({ + s <- convert_anndata_to_seurat(ad, counts_layer = "counts") + })[["elapsed"]] + # Generous bound: this is on shared infrastructure. 90s is a sanity check + # against quadratic-blowup regressions, not a perf benchmark. + expect_lt(t, 90) + expect_equal(ncol(s), 1000L) +}) + +# ---------------------------------------------------------------------- +# backed='r' mode +# ---------------------------------------------------------------------- + +test_that("backed='r' read still produces an in-memory Seurat object", { + skip_if_no_anndata() + ad <- build_scanpy_like(n_cells = 200L, n_genes_full = 150L, n_genes_hv = 60L) + path <- tempfile(fileext = ".h5ad") + on.exit(unlink(path), add = TRUE) + write_h5ad(ad, path) + + ad_backed <- tryCatch( + anndata::read_h5ad(path, backed = "r"), + error = function(e) { + skip(paste("backed read not supported by this anndata:", + conditionMessage(e))) + } + ) + + s <- tryCatch( + convert_anndata_to_seurat(ad_backed, counts_layer = "counts"), + error = function(e) { + skip(paste("backed conversion not supported:", conditionMessage(e))) + } + ) + expect_s4_class(s, "Seurat") + expect_equal(ncol(s), 200L) +}) diff --git a/tests/testthat/test-roundtrip_complex_anndata.R b/tests/testthat/test-roundtrip_complex_anndata.R new file mode 100644 index 0000000..8c1194b --- /dev/null +++ b/tests/testthat/test-roundtrip_complex_anndata.R @@ -0,0 +1,203 @@ +library(testthat) +library(convert2anndata) +library(Matrix) +library(anndata) +library(Seurat) +library(SingleCellExperiment) +library(SummarizedExperiment) + +# ====================================================================== +# Complex AnnData round-trip fidelity (issue #18). +# +# Two full cycles are exercised on a COMPLEX synthetic AnnData (two layers, +# PCA + UMAP obsm, varm, two obsp graphs, varp, raw, and obs/var carrying +# factor / NA / logical / character / numeric columns): +# +# Path A AnnData -> SCE -> AnnData (SCE-mediated, faithful) +# Path B AnnData -> Seurat -> SCE -> AnnData (Seurat-mediated, lossy) +# +# Fixtures and helpers live in helper-roundtrip.R. Path A writes through an +# on-disk .h5ad so serialisation is part of the test. Components that the +# converters do not (yet) carry in a given direction are documented as +# skip()'d tests with a TODO(#18) so the gap is recorded, not silent. +# ====================================================================== + +TOL <- 1e-6 # exact-ish for values that stay float64 across the trip +TOL_F32 <- 1e-4 # looser bound where a value passes through float32 + +# ---------------------------------------------------------------------- +# Path A: AnnData -> SCE -> AnnData (the faithful, SCE-mediated path) +# ---------------------------------------------------------------------- + +test_that("Path A: complex AnnData round-trips X/counts through SCE and h5ad", { + skip_if_no_anndata() + ad <- build_complex_anndata() + sce <- rt_quiet(convert_anndata_to_sce(ad, counts_layer = "counts")) + ad_rt <- rt_write_read(rt_quiet( + convert_to_anndata(sce, assayName = "counts", useAltExp = TRUE) + )) + + # The original counts layer should land in the round-tripped X. + expect_equal(rt_max_abs_diff(ad$layers[["counts"]], ad_rt$X), 0) + # obs/var names survive the full trip. + expect_identical(as.character(ad$obs_names), as.character(ad_rt$obs_names)) + expect_identical(as.character(ad$var_names), as.character(ad_rt$var_names)) +}) + +test_that("Path A: extra layers survive by name (counts + logcounts)", { + skip_if_no_anndata() + ad <- build_complex_anndata() + sce <- rt_quiet(convert_anndata_to_sce(ad, counts_layer = "counts")) + ad_rt <- rt_write_read(rt_quiet(convert_to_anndata(sce, assayName = "counts"))) + + expect_true("logcounts" %in% rt_keys(ad_rt$layers)) + expect_lte(rt_max_abs_diff(ad$layers[["logcounts"]], + ad_rt$layers[["logcounts"]]), TOL) +}) + +test_that("Path A: obsm reductions (PCA + UMAP) survive within tolerance", { + skip_if_no_anndata() + ad <- build_complex_anndata() + sce <- rt_quiet(convert_anndata_to_sce(ad, counts_layer = "counts")) + ad_rt <- rt_write_read(rt_quiet(convert_to_anndata(sce, assayName = "counts"))) + + rt_map <- setNames(rt_keys(ad_rt$obsm), rt_norm_key(rt_keys(ad_rt$obsm))) + for (k in c("X_pca", "X_umap")) { + nk <- rt_norm_key(k) + expect_true(nk %in% names(rt_map), + info = sprintf("obsm '%s' missing after round trip", k)) + expect_lte(rt_max_abs_diff(ad$obsm[[k]], ad_rt$obsm[[rt_map[[nk]]]]), TOL_F32) + } +}) + +test_that("Path A: obs metadata (factor, NA, logical, character) survives", { + skip_if_no_anndata() + ad <- build_complex_anndata() + sce <- rt_quiet(convert_anndata_to_sce(ad, counts_layer = "counts")) + ad_rt <- rt_write_read(rt_quiet(convert_to_anndata(sce, assayName = "counts"))) + + obs0 <- as.data.frame(ad$obs) + obs1 <- as.data.frame(ad_rt$obs) + + # Categorical preserved (levels + per-cell labels), NA kept as NA. + expect_equal(as.character(obs1$cell_type), as.character(obs0$cell_type)) + expect_true(is.na(obs1$pct_mt[1])) + expect_equal(obs1$n_counts, obs0$n_counts) + expect_equal(as.logical(obs1$is_ctrl), obs0$is_ctrl) + expect_equal(as.character(obs1$donor), obs0$donor) +}) + +test_that("Path A: var metadata (factor, logical, numeric) survives", { + skip_if_no_anndata() + ad <- build_complex_anndata() + sce <- rt_quiet(convert_anndata_to_sce(ad, counts_layer = "counts")) + ad_rt <- rt_write_read(rt_quiet(convert_to_anndata(sce, assayName = "counts"))) + + var0 <- as.data.frame(ad$var) + var1 <- as.data.frame(ad_rt$var) + expect_equal(as.character(var1$gene_class), as.character(var0$gene_class)) + expect_equal(as.logical(var1$highly_variable), var0$highly_variable) + expect_equal(var1$mean_expr, var0$mean_expr, tolerance = TOL_F32) +}) + +test_that("Path A: both sparse and dense X inputs round-trip counts faithfully", { + skip_if_no_anndata() + for (sparse in c(TRUE, FALSE)) { + ad <- build_complex_anndata(sparse_X = sparse) + sce <- rt_quiet(convert_anndata_to_sce(ad, counts_layer = "counts")) + ad_rt <- rt_write_read(rt_quiet(convert_to_anndata(sce, assayName = "counts"))) + expect_equal(rt_max_abs_diff(ad$layers[["counts"]], ad_rt$X), 0, + info = sprintf("sparse_X = %s", sparse)) + } +}) + +# ---------------------------------------------------------------------- +# AnnData -> Seurat: complex components attach directly +# ---------------------------------------------------------------------- + +test_that("complex AnnData attaches reductions, obsp graphs, and var feature-meta to Seurat", { + skip_if_no_anndata() + ad <- build_complex_anndata() + s <- rt_quiet(convert_anndata_to_seurat(ad, counts_layer = "counts")) + + expect_setequal(names(s@reductions), c("pca", "umap")) + expect_true(all(c("RNA_connectivities", "RNA_distances") %in% names(s@graphs))) + fm <- s[["RNA"]][[]] + expect_true(all(c("gene_class", "highly_variable", "mean_expr") %in% colnames(fm))) + # PCA embeddings preserved on the direct AnnData -> Seurat hop. + expect_lte(rt_max_abs_diff(s@reductions$pca@cell.embeddings, ad$obsm[["X_pca"]]), + TOL_F32) +}) + +test_that("Path B: obsp connectivity survives the Seurat-mediated round trip", { + skip_if_no_anndata() + # AnnData obsp -> Seurat Graph -> SCE colPair -> AnnData obsp. Edge *weights* + # are dropped by convert_graph_to_colPair (connectivity is binarised), so we + # assert the non-zero PATTERN is preserved rather than the weights. + ad <- build_complex_anndata() + s <- rt_quiet(convert_anndata_to_seurat(ad, counts_layer = "counts")) + sce <- rt_quiet(convert_seurat_to_sce(s)) + ad_b <- rt_quiet(convert_to_anndata(sce, assayName = "counts")) + + # The Seurat hop prefixes the graph with the assay name + # (connectivities -> RNA_connectivities), so look it up by the prefixed key. + expect_true("RNA_connectivities" %in% rt_keys(ad_b$obsp)) + orig <- as.matrix(ad$obsp[["connectivities"]]) + back <- as.matrix(ad_b$obsp[["RNA_connectivities"]]) + expect_equal(dim(back), dim(orig)) + expect_equal(rt_pattern(back), rt_pattern(orig)) +}) + +# ---------------------------------------------------------------------- +# Known gaps (documented, not silently missing) -- see TODO(#18) +# ---------------------------------------------------------------------- + +test_that("[known gap] Path B drops extra AnnData layers (Seurat assay model)", { + # TODO(#18): a Seurat assay has only counts/data/scale.data slots, so an + # extra layer such as 'logcounts' has nowhere to live on the Seurat hop. + # Use the SCE-mediated path (Path A) when verbatim layers must survive. + skip("Known structural gap: Seurat-mediated path cannot carry extra layers.") + ad <- build_complex_anndata() + s <- convert_anndata_to_seurat(ad, counts_layer = "counts") + sce <- convert_seurat_to_sce(s) + ad_b <- convert_to_anndata(sce, assayName = "counts") + expect_true("logcounts" %in% rt_keys(ad_b$layers)) +}) + +test_that("[known gap] Path B drops var feature metadata", { + # TODO(#18): convert_anndata_to_seurat() attaches var to the Seurat feature + # meta, but convert_seurat_to_sce() does not lift feature meta back into + # rowData, so 'var' is lost across AnnData -> Seurat -> SCE -> AnnData. + # Use the SCE-mediated path (Path A) when var metadata must survive. + skip("Known structural gap: Seurat-mediated path does not preserve var.") + ad <- build_complex_anndata() + s <- convert_anndata_to_seurat(ad, counts_layer = "counts") + sce <- convert_seurat_to_sce(s) + ad_b <- convert_to_anndata(sce, assayName = "counts") + expect_true("gene_class" %in% rt_keys(ad_b$var)) +}) + +test_that("[known gap] convert_anndata_to_sce does not wire obsp into colPairs", { + # TODO(#18): convert_anndata_to_seurat() attaches obsp as Graphs and + # extract_anndata_obsp() already returns the matrices, but + # convert_anndata_to_sce() has no obsp -> colPairs step, so the SCE-mediated + # AnnData round trip loses obsp. A weight-preserving SelfHits construction + # would close this (see the issue report). + skip("Known gap: convert_anndata_to_sce() has no obsp -> colPairs step.") + ad <- build_complex_anndata() + sce <- convert_anndata_to_sce(ad, counts_layer = "counts") + expect_gt(length(SingleCellExperiment::colPairNames(sce)), 0) +}) + +test_that("[known gap] convert_anndata_to_sce does not carry varm / varp / raw", { + # TODO(#18): convert_anndata_to_sce() reads only X/layers/obs/var/obsm, so + # varm (gene loadings), varp (feature graphs) and adata.raw are dropped on + # the AnnData -> SCE hop and therefore do not survive the SCE-mediated round + # trip. These are unimplemented reverse directions, not regressions. + skip("Known gap: convert_anndata_to_sce() ignores varm/varp/raw.") + ad <- build_complex_anndata() + sce <- convert_anndata_to_sce(ad, counts_layer = "counts") + ad_rt <- convert_to_anndata(sce, assayName = "counts") + expect_true("PCs" %in% rt_keys(ad_rt$varm)) + expect_true("corr" %in% rt_keys(ad_rt$varp)) +}) diff --git a/tests/testthat/test-roundtrip_complex_sce.R b/tests/testthat/test-roundtrip_complex_sce.R new file mode 100644 index 0000000..4bd06ee --- /dev/null +++ b/tests/testthat/test-roundtrip_complex_sce.R @@ -0,0 +1,138 @@ +library(testthat) +library(convert2anndata) +library(Matrix) +library(anndata) +library(SingleCellExperiment) +library(SummarizedExperiment) +library(S4Vectors) +library(BiocNeighbors) + +# ====================================================================== +# Complex SingleCellExperiment round-trip fidelity (issue #18). +# +# SCE -> AnnData -> SCE +# +# on a COMPLEX SCE: sparse counts + two dense assays, PCA + UMAP reducedDims, +# an altExp (multimodal ADT), a colPair kNN graph, and colData/rowData with +# factor / NA / character columns. Fixture in helper-roundtrip.R. +# +# The round trip is done in memory (not through .h5ad): convert_to_anndata() +# stores altExps as nested AnnData objects under uns$altExperiments, which is +# not a goal of HDF5 serialisation. Components carried only one way are +# documented as skip()'d tests with TODO(#18). +# ====================================================================== + +TOL_F32 <- 1e-4 # counts pass through float32 inside AnnData + +rt_sce <- function(sce, assayName = "counts") { + ad <- rt_quiet(convert_to_anndata(sce, assayName = assayName, useAltExp = TRUE)) + rt_quiet(convert_anndata_to_sce(ad, counts_layer = "counts")) +} + +# ---------------------------------------------------------------------- +# SCE -> AnnData -> SCE +# ---------------------------------------------------------------------- + +test_that("complex SCE round-trips all assays (sparse counts + dense layers)", { + skip_if_no_anndata() + skip_if_not_installed("BiocNeighbors") + sce <- build_complex_sce() + back <- rt_sce(sce) + + expect_true(all(c("counts", "logcounts", "scaledata") %in% assayNames(back))) + expect_lte(rt_max_abs_diff(assay(sce, "counts"), assay(back, "counts")), TOL_F32) + expect_lte(rt_max_abs_diff(assay(sce, "logcounts"), assay(back, "logcounts")), TOL_F32) + expect_lte(rt_max_abs_diff(assay(sce, "scaledata"), assay(back, "scaledata")), TOL_F32) +}) + +test_that("complex SCE round-trips reducedDims (PCA + UMAP)", { + skip_if_no_anndata() + skip_if_not_installed("BiocNeighbors") + sce <- build_complex_sce() + back <- rt_sce(sce) + + expect_true(all(c("PCA", "UMAP") %in% reducedDimNames(back))) + expect_lte(rt_max_abs_diff(reducedDim(sce, "PCA"), reducedDim(back, "PCA")), TOL_F32) + expect_lte(rt_max_abs_diff(reducedDim(sce, "UMAP"), reducedDim(back, "UMAP")), TOL_F32) +}) + +test_that("complex SCE round-trips colData (factor, NA, character) and rowData", { + skip_if_no_anndata() + skip_if_not_installed("BiocNeighbors") + sce <- build_complex_sce() + back <- rt_sce(sce) + + cd0 <- colData(sce); cd1 <- colData(back) + expect_equal(as.character(cd1$cell_type), as.character(cd0$cell_type)) + expect_true(is.na(cd1$qc[1])) + expect_equal(as.character(cd1$batch), as.character(cd0$batch)) + + rd0 <- rowData(sce); rd1 <- rowData(back) + expect_equal(as.character(rd1$symbol), as.character(rd0$symbol)) + expect_equal(as.logical(rd1$is_hvg), rd0$is_hvg) + + # Cell / feature names survive. + expect_equal(colnames(back), colnames(sce)) + expect_equal(rownames(back), rownames(sce)) +}) + +# ---------------------------------------------------------------------- +# SCE -> AnnData: multimodal / graph data carried one direction +# ---------------------------------------------------------------------- + +test_that("SCE colPair becomes obsp on the AnnData side", { + skip_if_no_anndata() + skip_if_not_installed("BiocNeighbors") + sce <- build_complex_sce() + ad <- rt_quiet(convert_to_anndata(sce, assayName = "counts", useAltExp = TRUE)) + # extract_pairs turns the single-mcol SelfHits into a weighted sparse matrix. + expect_true("knn" %in% rt_keys(ad$obsp)) + expect_equal(dim(as.matrix(ad$obsp[["knn"]])), c(ncol(sce), ncol(sce))) +}) + +test_that("SCE altExp is preserved into AnnData uns$altExperiments", { + skip_if_no_anndata() + skip_if_not_installed("BiocNeighbors") + sce <- build_complex_sce() + ad <- rt_quiet(convert_to_anndata(sce, assayName = "counts", useAltExp = TRUE)) + + expect_true("altExperiments" %in% rt_keys(ad$uns)) + expect_true("ADT" %in% names(ad$uns[["altExperiments"]])) + alt <- ad$uns[["altExperiments"]][["ADT"]] + # The ADT altExp is itself an AnnData (4 ADT features x n_cells). + expect_equal(as.integer(alt$n_vars), 4L) + expect_equal(as.integer(alt$n_obs), ncol(sce)) +}) + +test_that("useAltExp = FALSE drops altExps instead of storing them", { + skip_if_no_anndata() + skip_if_not_installed("BiocNeighbors") + sce <- build_complex_sce() + ad <- rt_quiet(convert_to_anndata(sce, assayName = "counts", useAltExp = FALSE)) + expect_false("altExperiments" %in% rt_keys(ad$uns)) +}) + +# ---------------------------------------------------------------------- +# Known gaps (documented, not silently missing) -- see TODO(#18) +# ---------------------------------------------------------------------- + +test_that("[known gap] altExps are not restored as SCE altExps on the return leg", { + # TODO(#18): SCE altExp -> AnnData uns$altExperiments works, but + # convert_anndata_to_sce() does not read uns$altExperiments back, so altExps + # do not survive SCE -> AnnData -> SCE. A reverse altExp restore step would + # close this. + skip("Known gap: convert_anndata_to_sce() does not restore uns$altExperiments.") + sce <- build_complex_sce() + back <- rt_sce(sce) + expect_true("ADT" %in% altExpNames(back)) +}) + +test_that("[known gap] colPairs are not restored on the return leg", { + # TODO(#18): SCE colPair -> AnnData obsp works, but convert_anndata_to_sce() + # has no obsp -> colPairs step, so colPairs do not survive + # SCE -> AnnData -> SCE (mirror of the AnnData-side obsp gap). + skip("Known gap: convert_anndata_to_sce() has no obsp -> colPairs step.") + sce <- build_complex_sce() + back <- rt_sce(sce) + expect_gt(length(colPairNames(back)), 0) +}) diff --git a/tests/testthat/test-roundtrip_complex_seurat.R b/tests/testthat/test-roundtrip_complex_seurat.R new file mode 100644 index 0000000..e74d683 --- /dev/null +++ b/tests/testthat/test-roundtrip_complex_seurat.R @@ -0,0 +1,113 @@ +library(testthat) +library(convert2anndata) +library(Matrix) +library(anndata) +library(Seurat) +library(SingleCellExperiment) +library(SummarizedExperiment) + +# ====================================================================== +# Complex Seurat object round-trip fidelity (issue #18). +# +# Seurat -> SCE -> AnnData -> Seurat +# +# (there is no direct Seurat<->AnnData; the cycle goes through SCE both ways). +# COMPLEX Seurat: processed RNA assay (normalised/scaled/PCA), a UMAP +# reduction, nearest-neighbour graphs, a second ADT assay, and factor + NA +# cell metadata. Fixture in helper-roundtrip.R. +# ====================================================================== + +TOL_F32 <- 1e-4 + +# Full Seurat -> SCE -> AnnData -> Seurat cycle (in memory). +rt_seu <- function(seu) { + sce <- rt_quiet(convert_seurat_to_sce(seu)) + ad <- rt_quiet(convert_to_anndata(sce, assayName = "RNA_counts", useAltExp = TRUE)) + rt_quiet(convert_anndata_to_seurat(ad, counts_layer = "RNA_counts")) +} + +# ---------------------------------------------------------------------- +# Seurat -> SCE -> AnnData -> Seurat +# ---------------------------------------------------------------------- + +test_that("complex Seurat round-trips RNA counts and dimensions", { + skip_if_no_anndata() + seu <- build_complex_seurat() + back <- rt_seu(seu) + + expect_s4_class(back, "Seurat") + expect_equal(ncol(back), ncol(seu)) # cells + expect_equal(nrow(back), nrow(seu)) # RNA features + c0 <- as.matrix(GetAssayData(seu, assay = "RNA", layer = "counts")) + c1 <- as.matrix(GetAssayData(back, layer = "counts")) + expect_lte(rt_max_abs_diff(c0, c1), TOL_F32) +}) + +test_that("complex Seurat round-trips reductions (pca + umap)", { + skip_if_no_anndata() + seu <- build_complex_seurat() + back <- rt_seu(seu) + + expect_true(all(c("pca", "umap") %in% names(back@reductions))) + expect_lte(rt_max_abs_diff(Embeddings(seu, "pca"), + Embeddings(back, "pca")), TOL_F32) + expect_lte(rt_max_abs_diff(Embeddings(seu, "umap"), + Embeddings(back, "umap")), TOL_F32) +}) + +test_that("complex Seurat round-trips cell metadata (factor + NA)", { + skip_if_no_anndata() + seu <- build_complex_seurat() + back <- rt_seu(seu) + + expect_true(all(c("cell_type", "qc_na") %in% colnames(back@meta.data))) + expect_equal(as.character(back@meta.data$cell_type), + as.character(seu@meta.data$cell_type)) + expect_true(is.na(back@meta.data$qc_na[1])) +}) + +test_that("complex Seurat round-trips NN graphs as connectivity", { + skip_if_no_anndata() + # Seurat Graph -> SCE colPair -> AnnData obsp -> Seurat Graph. Edge weights + # are dropped by convert_graph_to_colPair (binarised), and the obsp key gains + # the assay prefix (RNA_nn -> RNA_RNA_nn), so we assert that *some* returned + # graph reproduces the source nn connectivity pattern. + seu <- build_complex_seurat() + back <- rt_seu(seu) + + expect_gt(length(back@graphs), 0) + src <- rt_pattern(seu@graphs[["RNA_nn"]]) + matches <- vapply(back@graphs, function(g) { + p <- rt_pattern(g) + is.matrix(p) && all(dim(p) == dim(src)) && all(p == src) + }, logical(1)) + expect_true(any(matches), + info = "no round-tripped graph matched the source RNA_nn pattern") +}) + +# ---------------------------------------------------------------------- +# Seurat -> SCE: a secondary assay becomes an altExp +# ---------------------------------------------------------------------- + +test_that("a secondary Seurat assay (ADT) becomes an SCE altExp", { + skip_if_no_anndata() + seu <- build_complex_seurat() + sce <- rt_quiet(convert_seurat_to_sce(seu)) + expect_true("ADT" %in% altExpNames(sce)) + expect_equal(nrow(altExp(sce, "ADT")), 6L) # 6 ADT features +}) + +# ---------------------------------------------------------------------- +# Known gap (documented, not silently missing) -- see TODO(#18) +# ---------------------------------------------------------------------- + +test_that("[known gap] a secondary Seurat assay is not restored as a Seurat assay", { + # TODO(#18): a 2nd Seurat assay -> SCE altExp -> AnnData uns$altExperiments, + # but convert_anndata_to_seurat() does not rebuild Seurat assays from + # uns$altExperiments, so ADT does not return as a Seurat assay across the + # full cycle. + skip("Known gap: convert_anndata_to_seurat() does not restore altExperiments.") + seu <- build_complex_seurat() + back <- rt_seu(seu) + expect_true("ADT" %in% names(back@assays)) +}) diff --git a/tests/testthat/test-setup_anndata_python.R b/tests/testthat/test-setup_anndata_python.R new file mode 100644 index 0000000..dded74c --- /dev/null +++ b/tests/testthat/test-setup_anndata_python.R @@ -0,0 +1,59 @@ +library(testthat) +library(convert2anndata) + +test_that("setup_anndata_python returns 'none' when no source supplied", { + withr::with_envvar( + c(RETICULATE_PYTHON = "", CONDA_PREFIX = ""), + expect_equal(setup_anndata_python(validate = FALSE), "none") + ) +}) + +test_that("setup_anndata_python with a bogus conda_env warns and returns 'none'", { + bogus <- "/definitely/not/a/conda/env/path/xyz" + withr::with_envvar( + c(RETICULATE_PYTHON = "", CONDA_PREFIX = ""), + { + expect_warning( + res <- setup_anndata_python(bogus, required = TRUE, validate = FALSE), + regexp = "Could not activate conda environment" + ) + expect_equal(res, "none") + } + ) +}) + +test_that("setup_anndata_python prefers existing RETICULATE_PYTHON path over CONDA_PREFIX", { + skip_if_not(file.exists(Sys.which("python") |> as.character()), + "no python on PATH") + withr::with_envvar( + c(RETICULATE_PYTHON = as.character(Sys.which("python")), CONDA_PREFIX = "/nonexistent"), + { + res <- suppressWarnings(setup_anndata_python(validate = FALSE)) + expect_true(res %in% c("reticulate_python", "none")) + } + ) +}) + +test_that("setup_anndata_python resolves a name against ~/micromamba/envs/", { + # We don't actually point at a working env here -- just verify that the + # name-resolution path is exercised when a real directory exists at the + # expected location. We create a fake env directory and rely on + # use_condaenv(required=FALSE) accepting the path without validating it. + fake_root <- file.path(tempdir(), "fake_micromamba", "envs", "synthetic_env") + dir.create(fake_root, recursive = TRUE, showWarnings = FALSE) + on.exit(unlink(dirname(dirname(fake_root)), recursive = TRUE), add = TRUE) + + withr::with_envvar( + c(RETICULATE_PYTHON = "", CONDA_PREFIX = "", + MAMBA_ROOT_PREFIX = file.path(tempdir(), "fake_micromamba")), + { + res <- suppressWarnings( + setup_anndata_python("synthetic_env", validate = FALSE) + ) + # If reticulate accepted the directory, label is conda_env_resolved. + # If it didn't, label falls through to 'none'. Either is acceptable; + # what matters is that we didn't crash. + expect_true(res %in% c("conda_env_resolved", "conda_env", "none")) + } + ) +}) diff --git a/tests/testthat/test-sparse_matrix_conversion.R b/tests/testthat/test-sparse_matrix_conversion.R index 52345f4..cd0479c 100644 --- a/tests/testthat/test-sparse_matrix_conversion.R +++ b/tests/testthat/test-sparse_matrix_conversion.R @@ -27,22 +27,26 @@ test_that("sparse matrix conversion works in integration", { diag_counts = diag_counts )) - # Check initial matrix types + # Check initial matrix types. Older Matrix kept dgTMatrix and ddiMatrix + # distinct; newer Matrix sometimes auto-coerces sliced Diagonals to + # CsparseMatrix when they're stored on an SCE assay. We only require the + # *triplet* (logcounts) entry to start out non-Csparse. expect_true(methods::is(assay(sce, "counts"), "CsparseMatrix")) expect_false(methods::is(assay(sce, "logcounts"), "CsparseMatrix")) - expect_false(methods::is(assay(sce, "diag_counts"), "CsparseMatrix")) # Set up a dimensional reduction reducedDim(sce, "PCA") <- matrix(rnorm(50 * 10), 50, 10) - # Add metadata that's a sparse matrix (not CsparseMatrix) + # Add metadata that's a sparse matrix (not CsparseMatrix). Force triplet + # via repr = "T" since Matrix >= 1.5 defaults sparseMatrix() to dgC. int_metadata(sce)$sparse_meta <- Matrix::sparseMatrix( i = sample(1:50, 20), j = sample(1:50, 20), x = 1:20, - dims = c(50, 50) + dims = c(50, 50), + repr = "T" ) - + # Verify it's not a CsparseMatrix expect_false(methods::is(int_metadata(sce)$sparse_meta, "CsparseMatrix"))