-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathset_SNP_fasta.py
More file actions
executable file
·77 lines (65 loc) · 2.89 KB
/
set_SNP_fasta.py
File metadata and controls
executable file
·77 lines (65 loc) · 2.89 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
#!/usr/bin/env python3
import sys
from collections import defaultdict
def create_utils_lookup(utils_file):
lookup = {}
with open(utils_file) as f:
for line_num, line in enumerate(f, 1):
try:
# Parse the line
coords, sequence = line.strip().split('\t')
chrom_range, seq = coords, sequence
# Extract chromosome and position
chrom, pos_range = chrom_range.split(':')
start, end = map(int, pos_range.split('-'))
mid_pos = (start + end) // 2
# Find the variant position and alleles
left_bracket = sequence.find('[')
right_bracket = sequence.find(']')
if left_bracket != -1 and right_bracket != -1:
alleles = sequence[left_bracket+1:right_bracket]
ref, alt = alleles.split('/')
key = f"{chrom}_{mid_pos}"
lookup[key] = (ref, alt)
except Exception as e:
print(f"Error on line {line_num}: {line.strip()}", file=sys.stderr)
print(f"Error details: {str(e)}", file=sys.stderr)
print(f"Loaded {len(lookup)} positions from utils file", file=sys.stderr)
return lookup
def process_fasta(fasta_file, utils_lookup):
missing_pos = defaultdict(int)
processed = 0
replaced = 0
with open(fasta_file) as f:
while True:
header = f.readline().strip()
if not header:
break
seq = f.readline().strip()
# Process header to get middle position
header = header[1:] # Remove '>'
chrom, range_part = header.split(':')
start, end = map(int, range_part.split('-'))
mid_pos = (start + end) // 2
# Create lookup key
lookup_key = f"{chrom}_{mid_pos}"
# Get reference nucleotide from utils lookup
n_pos = seq.find('N')
if n_pos != -1:
if lookup_key in utils_lookup:
ref_nuc = utils_lookup[lookup_key][0] # Use reference nucleotide
seq = seq[:n_pos] + ref_nuc + seq[n_pos + 1:]
replaced += 1
else:
missing_pos[chrom] += 1
print(f">{chrom}_{mid_pos}")
print(seq)
processed += 1
print(f"\nProcessed {processed} sequences", file=sys.stderr)
print(f"Successfully replaced {replaced} N's with reference nucleotides", file=sys.stderr)
print("\nMissing positions by chromosome:", file=sys.stderr)
for chrom, count in sorted(missing_pos.items()):
print(f"{chrom}: {count} positions not found", file=sys.stderr)
if __name__ == '__main__':
utils_lookup = create_utils_lookup('SSW_snps_morex_v2_SNP_Utils.txt')
process_fasta('SSW_snps_final_morex_v2_renamed.fasta', utils_lookup)