-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathancestral_state.py
More file actions
123 lines (109 loc) · 3.59 KB
/
ancestral_state.py
File metadata and controls
123 lines (109 loc) · 3.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/usr/bin/env python
"""
Pull ancestral state of variants from a list and create input for plotting
the unfolded SFS and identifying ancestral state for each genotype at each
position.
This script is lightly rewritten from a version from Tom Kono written for
barley ancestral state inference:
https://github.com/MorrellLAB/Deleterious_GP/blob/master/Analysis_Scripts/Data_Handling/H_murinum_Ancestral.py
"""
import sys
import gzip
def store_anc(a):
"""Read through list of position and ancestral states and store.
This will be in a dictionary keyed on chr:pos.
"""
anc = {}
with gzip.open(a, 'rt') as f:
for line in f:
if line.startswith('chr'):
continue
tmp = line.strip().split('\t')
k = tmp[0] + ':' + tmp[1]
v = tmp[2]
anc[k] = v
return anc
def polarize(ref, alt, anc, samples):
"""Return a list of the polarized genotypes in the samples.
We will use A for ancestral allele, D for derived allele, and N for
unknown. The genotypes will be diploid.
"""
pol = []
# First, if ancestral is N, then we return N for all samples
if anc == 'N':
der = 'N'
for s in samples:
pol.append('NN')
return (pol, der)
if anc == ref:
der = alt
a = '0'
d = '1'
else:
der = ref
a = '1'
d = '0'
for s in samples:
gt = s.split(':')[0]
if gt in ('.', './.'):
pol.append('NN')
elif gt == a + '/' + a:
pol.append('AA')
elif gt in (a + '/' + d, d + '/' + a):
pol.append('AD')
elif gt == d + '/' + d:
pol.append('DD')
else:
pol.append('NN')
return (pol, der)
def main(anc, snps_vcf):
"""Main function."""
# Parse and store the ancestral states in the H. hurinum VCF
astates = store_anc(anc)
# Then iterate through the derived VCF and print out the relevant fields
with gzip.open(snps_vcf, 'rt') as f:
for line in f:
if line.startswith('##'):
continue
if line.startswith('#CHROM'):
samples = line.strip().split('\t')[9:]
# Print the header here
header_cols = (
['Chromosome', 'Pos', 'SNPID', 'Ancestral',
'Derived', 'Reference'] + samples
)
print('\t'.join(header_cols))
continue
tmp = line.strip().split()
key = tmp[0] + ':' + tmp[1]
snpid = tmp[0] + '_' + tmp[1]
ref = tmp[3]
alt = tmp[4]
samp = tmp[9:]
ancestral = astates.get(key, 'N')
if ancestral == 'N':
morex = 'N'
elif ancestral == ref:
morex = 'A'
else:
morex = 'D'
genos, derived = polarize(ref, alt, ancestral, samp)
# Print chrom, pos, ancestral, derived, morex state, and
# polarized letters for samples
output_cols = (
[tmp[0], tmp[1], snpid, ancestral, derived, morex]
+ genos
)
print('\t'.join(output_cols))
if len(sys.argv) != 3:
print(
"""Print out the ancestral/derived alleles for each SNP, and
which samples have the derived or ancestral alleles at each SNP. Both
files must be anchored on the pseudomolecule assembly. Takes two
arguments:
1) Ancestral state list (gzipped)
2) Non-ancestral VCF (gzipped)"""
)
sys.exit(1)
else:
main(sys.argv[1], sys.argv[2])