Skip to content

Commit 8607c59

Browse files
committed
refine pipeline scripts and configuration: update .gitignore, modify output handling in annotate.pl and match.pl, and add query size to config.yaml
1 parent 6eb20b8 commit 8607c59

File tree

6 files changed

+60
-76
lines changed

6 files changed

+60
-76
lines changed

.gitignore

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,3 @@
1-
data/
1+
data/
2+
results/
3+
.snakemake/

Snakefile

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,11 @@ rule match:
1616
output:
1717
temp("results/{sample}.matched")
1818
params:
19-
script=config["scripts_dir"] + "/match.pl"
19+
script=config["scripts_dir"] + "/match.pl",
20+
query= lambda wc: str(config["querysize"])
2021
shell:
2122
"""
22-
{params.script} {input.reads} {input.ref} > {output}
23+
perl {params.script} {input.reads} {input.ref} {params.query} > {output}
2324
"""
2425

2526
rule annotate:
@@ -35,5 +36,5 @@ rule annotate:
3536
script=config["scripts_dir"] + "/annotate.pl"
3637
shell:
3738
"""
38-
{params.script} {input.matches} {input.gff} {input.tss} {input.cpg} {input.repeat} > {output}
39+
perl {params.script} {input.matches} {input.gff} {input.tss} {input.cpg} {input.repeat} > {output}
3940
"""

config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
1+
querysize: 100
22
read: data/reads/*.fasta.gz
33
reference: data/ref/hg38_partial.fasta.gz
44
annotate: data/annotations

docs/quickstart.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,8 @@ cd grmap
1616

1717
```bash
1818
conda create -n grmap -c conda-forge -c bioconda snakemake matplotlib numpy=1.26
19-
conda activate grmap
19+
conda activate grmap
20+
sudo cpan Parallel::ForkManager IO::Uncompress::Gunzip
2021
```
2122

2223
4. **Run the pipeline**

scripts/annotate.pl

100644100755
Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,9 @@
3030
# Global data structures for genomic features
3131
my (%genes, %tss, %cpg, %repeats, %sorted_cpg, %sorted_repeats);
3232

33+
# Output goes to STDOUT by default
34+
my $out_fh = *STDOUT;
35+
3336
# Detect number of CPU cores for parallel processing
3437
my $num_cores = `lscpu -p | grep -v '^#' | wc -l`;
3538
chomp($num_cores);
@@ -272,8 +275,9 @@ sub process_matched_sequences {
272275
my ($input_file) = @_;
273276
open my $fh, "<", $input_file or die "Cannot open input file ($input_file): $!\n";
274277

275-
my $output_file = "matchedseqs_annotate.txt";
276-
open my $out_fh, ">", $output_file or die "Cannot open output file ($output_file): $!\n";
278+
# my $output_file = "matchedseqs_annotate.txt";
279+
# open my $out_fh, ">", $output_file or die "Cannot open output file ($output_file): $!\n";
280+
277281
# Output header with separate columns for each annotation source
278282
print $out_fh join("\t",
279283
"Start", "End", "Strand", "MatchedSeq", "Occurrences", "Chromosome",
@@ -282,7 +286,7 @@ sub process_matched_sequences {
282286
"InCpG", "CpG_GC_Content", "CpG_ObsExp",
283287
"Repeat_Name", "Repeat_Class", "Repeat_Length"
284288
), "\n";
285-
close $out_fh;
289+
# close $out_fh;
286290

287291
my $temp_dir = "/tmp/perl_parallel";
288292
mkdir $temp_dir unless -d $temp_dir;
@@ -324,7 +328,7 @@ sub process_matched_sequences {
324328

325329
# Merge temporary files into the final output file
326330
opendir my $dir, $temp_dir or die "Cannot open temp dir ($temp_dir): $!\n";
327-
open $out_fh, ">>", $output_file or die "Cannot open output file ($output_file): $!\n";
331+
# open $out_fh, ">>", $output_file or die "Cannot open output file ($output_file): $!\n";
328332
while (my $file = readdir($dir)) {
329333
next unless $file =~ /^process_\d+\.txt$/;
330334
open my $in_fh, "<", "$temp_dir/$file" or next;
@@ -333,7 +337,7 @@ sub process_matched_sequences {
333337
unlink "$temp_dir/$file";
334338
}
335339
closedir $dir;
336-
close $out_fh;
340+
# close $out_fh;
337341
}
338342

339343
############################################
@@ -350,5 +354,5 @@ sub process_matched_sequences {
350354
load_repeatmasker_data($repeat_file);
351355
process_matched_sequences($input_file);
352356

353-
print "Done.\n";
357+
#print "Done.\n";
354358
exit 0;

scripts/match.pl

100644100755
Lines changed: 40 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -26,67 +26,36 @@
2626
use strict;
2727
use warnings;
2828
use diagnostics;
29-
use bytes; # Force byte semantics for DNA sequences
29+
use bytes;
3030
use Time::HiRes qw(gettimeofday tv_interval);
3131

32-
#-----------------------------------------------------
33-
# Usage message and command-line argument processing
34-
#-----------------------------------------------------
35-
sub usage {
36-
print <<"END_USAGE";
37-
Usage: $0 <reads_choice> <query_choice>
38-
39-
<reads_choice> options:
40-
1 Use illumina_reads_40.fasta.gz
41-
2 Use illumina_reads_60.fasta.gz
42-
3 Use illumina_reads_80.fasta.gz
43-
4 Use illumina_reads_100.fasta.gz
44-
45-
<query_choice> options:
46-
1 Load 1,000 queries
47-
2 Load 10,000 queries
48-
3 Load 100,000 queries
49-
4 Load 1,000,000 queries
50-
51-
Example:
52-
$0 2 3
53-
=> Use illumina_reads_60.fasta.gz and load 100,000 queries.
54-
55-
END_USAGE
56-
exit 1;
57-
}
58-
59-
# If no arguments or help flag provided, show usage.
60-
if ( @ARGV != 2 or $ARGV[0] eq '--help' or $ARGV[0] eq '-h' ) {
61-
usage();
62-
}
63-
64-
my ($reads_choice, $query_choice) = @ARGV;
65-
6632
#-----------------------------------------------------
6733
# Record initial time and memory usage
6834
#-----------------------------------------------------
6935
my $start_time = [gettimeofday()];
7036
my $start_timestamp = scalar(localtime());
7137
my $start_mem = get_memory_usage();
7238

73-
#-----------------------------------------------------
74-
# File and Parameter Setup
75-
#-----------------------------------------------------
76-
my $ref_file = 'hg38_partial.fasta.gz';
77-
my %reads_map = (
78-
1 => 'illumina_reads_40.fasta.gz',
79-
2 => 'illumina_reads_60.fasta.gz',
80-
3 => 'illumina_reads_80.fasta.gz',
81-
4 => 'illumina_reads_100.fasta.gz',
82-
);
83-
my @query_opts = ( 1000, 10_000, 100_000, 1_000_000 );
84-
my $result_file = 'matchedseqs.txt';
85-
86-
# Validate command-line arguments.
87-
exists $reads_map{$reads_choice} or die "Invalid read file choice: $reads_choice\n";
88-
$query_choice =~ /^[1-4]$/ or die "Invalid query choice: $query_choice\n";
89-
my $num_queries = $query_opts[$query_choice - 1];
39+
#----------------------------------------
40+
# Usage: match.pl <reads.fasta.gz> <ref.fasta.gz> [<max_queries>]
41+
#----------------------------------------
42+
sub usage {
43+
die <<"USAGE";
44+
Usage: $0 <reads_file> <reference_file> [<max_queries>]
45+
<reads_file> FASTA or FASTA.GZ of your markers
46+
<reference_file> FASTA or FASTA.GZ of your reference
47+
<max_queries> (optional) number of reads to load; default = all
48+
USAGE
49+
}
50+
51+
# require at least two arguments
52+
usage() if @ARGV < 2;
53+
54+
# assign files directly
55+
my ($reads_file, $ref_file, $num_queries) = @ARGV;
56+
$num_queries //= 0; # zero or undef -> load _all_ sequences
57+
#my $result_file = 'matchedseqs.txt';
58+
my $OUT = *STDOUT; # send everything to STDOUT
9059

9160
#-----------------------------------------------------
9261
# Read Reference Genome
@@ -98,12 +67,13 @@ sub usage {
9867
# Load marker sequences (queries)
9968
# Now each query is stored as a hash with key 'id' and 'seq'
10069
#-----------------------------------------------------
101-
my $reads_file = $reads_map{$reads_choice};
102-
my @queries = read_n_sequences($reads_file, $num_queries);
70+
my @queries = $num_queries
71+
? read_n_sequences($reads_file, $num_queries)
72+
: read_n_sequences($reads_file); # load all if no limit
10373
my $actual_queries = scalar @queries;
10474

10575
#-----------------------------------------------------
106-
# Detect number of cores (using external command)
76+
# Detect number of cores
10777
#-----------------------------------------------------
10878
my $num_cores = `lscpu -p | grep -v '^#' | wc -l`;
10979
chomp($num_cores);
@@ -148,9 +118,11 @@ sub usage {
148118
my $end_mem = get_memory_usage();
149119

150120
#-----------------------------------------------------
151-
# Combine temporary files into final output file with header
121+
# Combine temporary files into final output
152122
#-----------------------------------------------------
153-
open(my $OUT, '>', $result_file) or die "Cannot open $result_file: $!";
123+
124+
#open(my $OUT, '>', $result_file) or die "Cannot open $result_file: $!";
125+
154126
print $OUT "# ReferenceFile: $ref_file\n";
155127
print $OUT "# QueryFile: $reads_file\n";
156128
print $OUT "# Total Queries Processed: $actual_queries\n";
@@ -176,9 +148,11 @@ sub usage {
176148
unlink $tmp_file or warn "Couldn't remove $tmp_file: $!";
177149
}
178150
}
179-
close $OUT or die "Cannot close $result_file: $!";
180-
print "\nDone. $total_matched match(es) reported.\n";
181-
print "See '$result_file' for the matched markers.\n";
151+
#-----------------------------------------------------
152+
#close $OUT or die "Cannot close $result_file: $!";
153+
#print "\nDone. $total_matched match(es) reported.\n";
154+
#print "See '$result_file' for the matched markers.\n";
155+
#-----------------------------------------------------
182156
exit 0;
183157

184158
#=====================================================
@@ -195,8 +169,8 @@ sub get_memory_usage {
195169
my $mem = `ps -o rss= -p $pid`;
196170
chomp($mem);
197171
return $mem; # in KB
198-
}
199-
172+
}
173+
200174
#-----------------------------------------------------
201175
# Subroutine: format_memory_usage
202176
# Converts a memory value in KB into a human-readable string,
@@ -260,7 +234,8 @@ sub read_n_sequences {
260234
if ($line =~ /^>(.*)/) {
261235
if ($seq ne '') {
262236
push @list, { id => $id, seq => $seq };
263-
last if @list == $max;
237+
# only stop early if the caller really passed a max>0
238+
last if defined($max) && $max > 0 && @list == $max;
264239
$seq = '';
265240
}
266241
$id = $1; # capture header (without '>')
@@ -269,7 +244,8 @@ sub read_n_sequences {
269244
$seq .= $line;
270245
}
271246
}
272-
if ($seq ne '' and @list < $max) {
247+
# always push the final sequence
248+
if ($seq ne '') {
273249
push @list, { id => $id, seq => $seq };
274250
}
275251
close $fh;

0 commit comments

Comments
 (0)