forked from z0on/2bRAD_denovo
-
Notifications
You must be signed in to change notification settings - Fork 0
/
zooxType.pl
executable file
·150 lines (134 loc) · 3.55 KB
/
zooxType.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#!/usr/bin/env perl
my $usage="
zooxType3.pl:
Calculates relative representations of zooxanthellae clades (A,B,C,D)
using SAM files with reads mapped to combined reference in which concatenated
symbiont genomes or transcriptomes are represented as single chromosome each
By default:
A: chr11
B: chr12
C: chr13
D: chr14
If you want to use different chr range, specify it using argument \'symgenomes\'
Calculates number of high-quality (= high uniqueness) mappings a proxy of relative
clade abundance in each SAM file.
arguments and defaults:
ext=sam : extension of SAM files
minq=40 : minimum mapping quality
host=\"chr1\": name of host contig to use for calculating zoox prevalence
symgenomes=11-14 : range of \'chr\' numbers containing concatenated symbiont references
in the order A, B, C, D
output:
tab-delimited table (printed to STDOUT) of relative proportions of clades,
in the form of
sample\tsitesHost\tsitesA\tsitesB\tsitesC\tsitesD\tcountHost\tcountA\tcountB\tcountC\tcountD\tfracZoox\tfracA\tfracB\tfracC\tfracD
";
my $ext="sam";
if (" @ARGV "=~/glob=(\S+)/) { $ext=$1;}
my $minq=40;
if (" @ARGV "=~/minq=(\d+)/) { $minq=$1;}
my $host="chr1";
if (" @ARGV "=~/host=(\S+)/) { $host=$1;}
my $syms="11-14";
if (" @ARGV "=~/symgenomes=(\d+-\d+)/) { $syms=$1;}
my @sg=();
(my $s1, my $s2)=split(/-/,$syms);
for (my $i=0;$i<=$s2-$s1;$i++) {
$sg[$i]="chr".($s1+$i);
}
opendir THIS, ".";
my @files=grep/\.$ext$/, readdir THIS;
@files=sort {$a cmp $b} @files;
if ($#files==-1) { die $usage; }
print "sample\tsitesH\tsitesA\tsitesB\tsitesC\tsitesD\tcountH\tcountA\tcountB\tcountC\tcountD\tfracZ\tfracA\tfracB\tfracC\tfracD\n";
foreach my $file (@files){
my $sH=0;
my $sA=0;
my $sB=0;
my $sC=0;
my $sD=0;
my $nH=0;
my $nA=0;
my $nB=0;
my $nC=0;
my $nD=0;
my $fZ=0;
my $fA=0;
my $fB=0;
my $fC=0;
my $fD=0;
my %seenH={};
my %seenA={};
my %seenB={};
my %seenC={};
my %seenD={};
my $site;
my $qual;
next if ($file=~/HASH/);
open INF, $file or die "$usage\n\nCannot open sam file $file\n\n";
while (<INF>) {
if ($_=~/\t$host\t(\d+)\t(\d+)/) {
$qual=$2;
$site=$1;
next if ($qual<$minq);
#warn "$file:A:Q=$qual:S=$site\n$_";
$nH++;
if (!$seenH{$site}) {
$seenH{$site}=1;
$sH++;
}
}
elsif ($_=~/\t$sg[0]\t(\d+)\t(\d+)/) {
$qual=$2;
$site=$1;
next if ($qual<$minq);
#warn "$file:A:Q=$qual:S=$site\n$_";
$nA++;
if (!$seenA{$site}) {
$seenA{$site}=1;
$sA++;
}
}
elsif ($_=~/\t$sg[1]\t(\d+)\t(\d+)/) {
$qual=$2;
$site=$1;
next if ($qual<$minq);
$nB++;
if (!$seenB{$site}) {
$seenB{$site}=1;
$sB++;
}
}
elsif ($_=~/\t$sg[2]\t(\d+)\t(\d+)/) {
$qual=$2;
$site=$1;
next if ($qual<$minq);
$nC++;
if (!$seenC{$site}) {
$seenC{$site}=1;
$sC++;
}
}
elsif ($_=~/\t$sg[3]\t(\d+)\t(\d+)/) {
$qual=$2;
$site=$1;
next if ($qual<$minq);
$nD++;
if (!$seenD{$site}) {
$seenD{$site}=1;
$sD++;
}
}
}
my $sumf=$nA+$nB+$nC+$nD;
if ($sumf==0) {
print "$file\t$sH\t$sA\t$sB\t$sC\t$sD\t$nH\t$nA\t$nB\t$nC\t$nD\tNA\tNA\tNA\tNA\tNA\n";
next;
}
my $fZ=sprintf("%.5f",$sumf/($sumf+$nH));
$fA=sprintf("%.3f",$nA/$sumf);
$fB=sprintf("%.3f",$nB/$sumf);
$fC=sprintf("%.3f",$nC/$sumf);
$fD=sprintf("%.3f",$nD/$sumf);
print "$file\t$sH\t$sA\t$sB\t$sC\t$sD\t$nH\t$nA\t$nB\t$nC\t$nD\t$fZ\t$fA\t$fB\t$fC\t$fD\n";
}