Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 7 additions & 4 deletions funannotate2/align.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
import sys
import tempfile
import uuid

from gapmm2.align import aligner as transcript_aligner
Expand Down Expand Up @@ -66,7 +67,7 @@ def align_transcripts(
cpus=1,
max_intron=3000,
log=sys.stderr.write,
tmpdir="/tmp",
tmpdir=None,
):
"""
Run spliced transcript alignment using gapmm2 (mm2 + edlib refinement).
Expand All @@ -82,7 +83,7 @@ def align_transcripts(
cpus (int, optional): Number of CPUs to use for alignment. Defaults to 1.
max_intron (int, optional): Maximum intron length allowed. Defaults to 3000.
log (function, optional): Logging function. Defaults to sys.stderr.write.
tmpdir (str, optional): Temporary directory path. Defaults to "/tmp".
tmpdir (str, optional): Accepted for API parity with ``align_proteins``; currently unused because the gapmm2 backend writes directly to ``output=evidence``. Defaults to ``None`` (resolves to ``tempfile.gettempdir()`` when relied upon).
"""
# function to run spliced transcript alignment using gapmm2 (mm2 + edlib refinement)
# generate an EVM compatible GFF alignment file, parse data and pull out any full length gene models
Expand Down Expand Up @@ -114,7 +115,7 @@ def align_proteins(
cpus=1,
max_intron=3000,
log=sys.stderr.write,
tmpdir="/tmp",
tmpdir=None,
):
"""
Align protein evidence to the genome assembly using miniprot.
Expand All @@ -129,13 +130,15 @@ def align_proteins(
cpus (int, optional): Number of CPUs to use for alignment. Defaults to 1.
max_intron (int, optional): Maximum intron length for alignment. Defaults to 3000.
log (function, optional): Logger function for info and error messages. Defaults to sys.stderr.write.
tmpdir (str, optional): Temporary directory path for storing intermediate files. Defaults to "/tmp".
tmpdir (str, optional): Temporary directory path for storing intermediate files. Defaults to the system temporary directory (``tempfile.gettempdir()``), which honors ``$TMPDIR``.

Returns:
None
"""
# function to align protein evidence to genome with miniprot
# generate evidence alignments and then parse any full length models
if tmpdir is None:
tmpdir = tempfile.gettempdir()
mini_tmp = os.path.join(tmpdir, f"{uuid.uuid1()}.miniprot.out")
# miniprot --outn=4 --gff-only -t 8 valid_contigs.fasta /usr/local/share/funannotate/uniprot_sprot.fasta > uniprot.mini.gff3
log.info("Aligning protein evidence to the genome assembly with miniprot")
Expand Down
4 changes: 2 additions & 2 deletions funannotate2/annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,7 +310,7 @@ def annotate(args):
"Annotating proteome with diamond against the UniProtKB/Swiss-Prot database"
)
start = time.time()
swiss = swissprot_blast(Proteins, cpus=args.cpus)
swiss = swissprot_blast(Proteins, cpus=args.cpus, tmpdir=tmp_dir)
end = time.time()
logger.info(
f"UniProtKB/Swiss-Prot search resulted in {len(swiss)} hits and finished in {round(end - start, 2)} seconds"
Expand All @@ -330,7 +330,7 @@ def annotate(args):
"Annotating proteome with diamond against the MEROPS protease database"
)
start = time.time()
merops = merops_blast(Proteins, cpus=args.cpus)
merops = merops_blast(Proteins, cpus=args.cpus, tmpdir=tmp_dir)
end = time.time()
logger.info(
f"MEROPS search resulted in {len(merops)} hits and finished in {round(end - start, 2)} seconds"
Expand Down
2 changes: 2 additions & 0 deletions funannotate2/predict.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,7 @@ def predict(args):
cpus=args.cpus,
max_intron=args.max_intron,
log=logger,
tmpdir=tmp_dir,
)
elif checkfile(TranAlign) and checkfile(TranGenes):
log("Existing transcript alignments found, will re-use and continue")
Expand Down Expand Up @@ -304,6 +305,7 @@ def predict(args):
cpus=args.cpus,
max_intron=args.max_intron,
log=logger,
tmpdir=tmp_dir,
)
elif checkfile(ProtAlign) and checkfile(ProtGenes):
log("Existing protein alignments found, will re-use and continue")
Expand Down
33 changes: 27 additions & 6 deletions funannotate2/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import re
import subprocess
import sys
import tempfile
import uuid

import json_repair
Expand Down Expand Up @@ -316,7 +317,7 @@ def dbcan2tsv(results, output, annots):
return a


def diamond_blast(db, query, cpus=1, evalue=10.0, max_target_seqs=1, tmpdir="/tmp"):
def diamond_blast(db, query, cpus=1, evalue=10.0, max_target_seqs=1, tmpdir=None):
"""
Run a protein BLAST using Diamond.

Expand All @@ -328,12 +329,14 @@ def diamond_blast(db, query, cpus=1, evalue=10.0, max_target_seqs=1, tmpdir="/tm
cpus (int, optional): Number of CPUs to use. Defaults to 1.
evalue (float, optional): E-value threshold. Defaults to 10.0.
max_target_seqs (int, optional): Maximum number of target sequences to report. Defaults to 1.
tmpdir (str, optional): Temporary directory to store intermediate files. Defaults to "/tmp".
tmpdir (str, optional): Temporary directory to store intermediate files. Defaults to the system temporary directory (``tempfile.gettempdir()``), which honors ``$TMPDIR``.

Returns:
list: JSON formatted results of the BLAST search.
"""
# use diamond as protein blast
if tmpdir is None:
tmpdir = tempfile.gettempdir()
tmpfile = os.path.join(tmpdir, f"diamond_{uuid.uuid4()}")
cmd = [
"diamond",
Expand Down Expand Up @@ -377,7 +380,7 @@ def diamond_blast(db, query, cpus=1, evalue=10.0, max_target_seqs=1, tmpdir="/tm
return results


def merops_blast(query, evalue=1e-5, cpus=1, max_target_seqs=1):
def merops_blast(query, evalue=1e-5, cpus=1, max_target_seqs=1, tmpdir=None):
"""
Run a BLAST search against the MEROPS database using Diamond.

Expand All @@ -388,6 +391,7 @@ def merops_blast(query, evalue=1e-5, cpus=1, max_target_seqs=1):
evalue (float, optional): E-value threshold for reporting hits. Defaults to 1e-5.
cpus (int, optional): Number of CPUs to use. Defaults to 1.
max_target_seqs (int, optional): Maximum number of target sequences to report. Defaults to 1.
tmpdir (str, optional): Temporary directory forwarded to ``diamond_blast`` for intermediate files. Defaults to the system temporary directory.

Returns:
list: A list of dictionaries containing information about the hits found, including coverage and family details.
Expand All @@ -397,7 +401,12 @@ def merops_blast(query, evalue=1e-5, cpus=1, max_target_seqs=1):
if checkfile(merops_db):
results = []
raw_results = diamond_blast(
merops_db, query, cpus=cpus, evalue=evalue, max_target_seqs=max_target_seqs
merops_db,
query,
cpus=cpus,
evalue=evalue,
max_target_seqs=max_target_seqs,
tmpdir=tmpdir,
)
for res in raw_results:
coverage = (res["length"] / res["qlen"]) * 100
Expand Down Expand Up @@ -440,7 +449,13 @@ def merops2tsv(results, output, annots):


def swissprot_blast(
query, evalue=1e-5, cpus=1, min_pident=60, min_cov=60, max_target_seqs=1
query,
evalue=1e-5,
cpus=1,
min_pident=60,
min_cov=60,
max_target_seqs=1,
tmpdir=None,
):
"""
Perform a BLAST search against the SwissProt database using Diamond.
Expand All @@ -454,6 +469,7 @@ def swissprot_blast(
min_pident (int, optional): Minimum percentage identity required for a hit. Defaults to 60.
min_cov (int, optional): Minimum coverage percentage required for a hit. Defaults to 60.
max_target_seqs (int, optional): Maximum number of target sequences to report. Defaults to 1.
tmpdir (str, optional): Temporary directory forwarded to ``diamond_blast`` for intermediate files. Defaults to the system temporary directory.

Returns:
list: A list of dictionaries containing information about the filtered hits, including coverage, percentage identity, and alignment details.
Expand All @@ -463,7 +479,12 @@ def swissprot_blast(
if checkfile(uniprot_db):
results = []
raw_results = diamond_blast(
uniprot_db, query, cpus=cpus, evalue=evalue, max_target_seqs=max_target_seqs
uniprot_db,
query,
cpus=cpus,
evalue=evalue,
max_target_seqs=max_target_seqs,
tmpdir=tmpdir,
)
for res in raw_results:
if res["pident"] >= min_pident:
Expand Down
30 changes: 13 additions & 17 deletions funannotate2/utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import socket
import subprocess
import sys
import tempfile
import textwrap
import time
import uuid
Expand Down Expand Up @@ -531,32 +532,27 @@ def human_readable_size(size, decimal_places=2):

def create_tmpdir(outdir, base="predict"):
"""
Create a temporary directory for storing files.
Create a unique temporary directory for storing intermediate files.

This function generates a temporary directory with a unique name based on the provided
base name and a UUID. If an output directory is specified, the temporary directory is
created within it. If the output directory is "/tmp", a subdirectory is created; otherwise,
the specified directory is used directly. If no output directory is provided, the temporary
directory is created in the current working directory.
A ``<base>_<uuid>`` subdirectory is always created inside the supplied
volume (``outdir``). If ``outdir`` is falsy, the system temporary directory
(``tempfile.gettempdir()``, which honors the ``$TMPDIR`` environment
variable) is used as the parent. Using a unique subdirectory avoids
collisions between concurrent runs sharing a scratch volume and prevents
downstream cleanup from inadvertently removing a user-supplied directory.

Parameters:
- outdir (str): The output directory path.
- outdir (str or None): The parent directory under which the unique tmpdir
will be created. If falsy, ``tempfile.gettempdir()`` is used.
- base (str, optional): The base name for the temporary directory (default is "predict").

Returns:
- str: The absolute path of the created temporary directory.
"""
# create a tmpdir for some files
tmpdirslug = "{}_{}".format(base, str(uuid.uuid4()))
if outdir:
if outdir == "/tmp":
tmpdir = os.path.join(outdir, tmpdirslug)
else:
tmpdir = outdir
else:
tmpdir = tmpdirslug
if not os.path.isdir(tmpdir):
os.makedirs(tmpdir)
parent = outdir if outdir else tempfile.gettempdir()
tmpdir = os.path.join(parent, tmpdirslug)
os.makedirs(tmpdir, exist_ok=True)
return os.path.abspath(tmpdir)


Expand Down
70 changes: 70 additions & 0 deletions tests/unit/test_predict.py
Original file line number Diff line number Diff line change
Expand Up @@ -317,3 +317,73 @@ def fake_run_snap(*args, **kwargs):
assert not any("memory prediction for scaffold_1.fasta" in message for message in debug_messages)
assert stats["tools_run"] == ["snap"]
assert stats["contig_length"] == 4



class TestAlignProteinsTmpdir:
"""Tests verifying align_proteins honors the tmpdir kwarg (issue #52 follow-up)."""

def _capture_stdout_path(self):
captured = {}

def fake_run_subprocess(cmd, log, stdout=None, **kwargs):
captured["stdout"] = stdout
captured["cmd"] = cmd
return 0

return captured, fake_run_subprocess

@patch("funannotate2.align.os.remove")
@patch("funannotate2.align.split_evidence_and_genes")
@patch("funannotate2.align.runSubprocess")
def test_align_proteins_uses_provided_tmpdir(
self, mock_run_subprocess, mock_split, mock_remove, tmp_path
):
from funannotate2.align import align_proteins

captured, fake = self._capture_stdout_path()
mock_run_subprocess.side_effect = fake
mock_split.return_value = (0, 0)

logger = MagicMock()
align_proteins(
"genome.fasta",
"proteins.fasta",
"evidence.gff3",
"genes.gff3",
cpus=1,
log=logger,
tmpdir=str(tmp_path),
)

assert captured["stdout"] is not None
assert captured["stdout"].startswith(str(tmp_path) + os.sep)
assert captured["stdout"].endswith(".miniprot.out")

@patch("funannotate2.align.os.remove")
@patch("funannotate2.align.split_evidence_and_genes")
@patch("funannotate2.align.runSubprocess")
def test_align_proteins_defaults_to_system_tmp(
self, mock_run_subprocess, mock_split, mock_remove
):
import tempfile

from funannotate2.align import align_proteins

captured, fake = self._capture_stdout_path()
mock_run_subprocess.side_effect = fake
mock_split.return_value = (0, 0)

logger = MagicMock()
align_proteins(
"genome.fasta",
"proteins.fasta",
"evidence.gff3",
"genes.gff3",
cpus=1,
log=logger,
)

assert captured["stdout"] is not None
assert captured["stdout"].startswith(tempfile.gettempdir() + os.sep)
assert captured["stdout"].endswith(".miniprot.out")
Loading
Loading