diff --git a/svtyper b/svtyper index e06a5f3..556e369 100755 --- a/svtyper +++ b/svtyper @@ -8,7 +8,7 @@ from collections import Counter from argparse import RawTextHelpFormatter __author__ = "Colby Chiang (colbychiang@wustl.edu)" -__version__ = "v0.1.0" +__version__ = "v0.1.1" # -------------------------------------- # define functions @@ -358,7 +358,7 @@ def bayes_gt(ref, alt, is_dup): total = ref + alt log_combo = log_choose(total, alt) - + lp_homref = log_combo + alt * math.log(p_alt[0], 10) + ref * math.log(1 - p_alt[0], 10) lp_het = log_combo + alt * math.log(p_alt[1], 10) + ref * math.log(1 - p_alt[1], 10) lp_homalt = log_combo + alt * math.log(p_alt[2], 10) + ref * math.log(1 - p_alt[2], 10) @@ -736,7 +736,7 @@ class Sample(object): genome_size = float(sum(self.bam.lengths)) weighted_mean_span = sum([(lib.mean - 2 * lib.read_length + 2 * min_aligned) * lib.prevalence for lib in self.lib_dict.values()]) exp_spanning_depth = (weighted_mean_span * self.bam_mapped) / genome_size - + self.exp_spanning_depth = exp_spanning_depth return @@ -779,7 +779,7 @@ class SamFragment(object): return else: self.read_set.add(read_hash) - + if is_primary(read): # add the primary reads self.primary_reads.append(read) @@ -880,7 +880,7 @@ class SamFragment(object): return False return True - + # calculate the probability that a read pair is concordant at a breakpoint, # given the putative variant size and insert distribution of the library. def p_concordant(self, var_length=None): @@ -1008,10 +1008,13 @@ class SplitRead(object): # set query_left and query_right splitter by alignment position on the reference # this is used for non-overlap and off-diagonal filtering # (query_left and query_right are random when on different chromosomes - if (self.read.reference_name == mate_chrom - and self.read.pos > mate_pos): - self.query_left = b - self.query_right = a + if self.read.reference_name == mate_chrom: + if self.read.pos > mate_pos: + self.query_left = b + self.query_right = a + else: + self.query_left = a + self.query_right = b else: self.set_order_by_clip(a, b) @@ -1059,11 +1062,26 @@ class SplitRead(object): return non_overlap + @staticmethod + def check_split_support(split, chrom, pos, is_reverse, split_slop): + if split.chrom != chrom: + return False + + if is_reverse: + coord = split.reference_start + else: + coord = split.reference_end + + if (coord > pos + split_slop + or coord < pos - split_slop): + return False + return True + def is_split_straddle(self, chromA, posA, ciA, chromB, posB, ciB, o1_is_reverse, o2_is_reverse, - split_slop): + svtype, split_slop): # arrange the SV breakends from left to right if (chromA != chromB @@ -1087,44 +1105,56 @@ class SplitRead(object): is_reverse_right = o2_is_reverse - left_split = True - right_split = True - # check split chromosomes against variant - if self.query_left.chrom != chrom_left: - left_split = False - if self.query_right.chrom != chrom_right: - right_split = False - - ''' Check left side of variant ''' - # left side of variant is negative strand - if is_reverse_left: - if self.query_left.reference_start > pos_left + split_slop: - left_split = False - if self.query_left.reference_start < pos_left - split_slop: - left_split = False - - # left side of variant is positive strand - else: - if self.query_left.reference_end > pos_left + split_slop: - left_split = False - if self.query_left.reference_end < pos_left - split_slop: - left_split = False - - ''' Check right side of variant ''' - # right side of variant is negative strand - if is_reverse_right: - if self.query_right.reference_start > pos_right + split_slop: - right_split = False - if self.query_right.reference_start < pos_right - split_slop: - right_split = False - - # right side of variant is positive strand - else: - if self.query_right.reference_end < pos_right - split_slop: - right_split = False - if self.query_right.reference_end > pos_right + split_slop: - right_split = False + left_split = False + right_split = False + + if (not self.is_soft_clip) or svtype == 'DEL' or svtype == 'INS': + left_split = self.check_split_support(self.query_left, + chrom_left, + pos_left, + is_reverse_left, + split_slop) + right_split = self.check_split_support(self.query_right, + chrom_right, + pos_right, + is_reverse_right, + split_slop) + elif svtype == 'DUP': + left_split = self.check_split_support(self.query_left, + chrom_right, + pos_right, + is_reverse_right, + split_slop) + right_split = self.check_split_support(self.query_right, + chrom_left, + pos_left, + is_reverse_left, + split_slop) + elif svtype == 'INV': + # check all possible sides + left_split_left = self.check_split_support(self.query_left, + chrom_left, + pos_left, + is_reverse_left, + split_slop) + left_split_right = self.check_split_support(self.query_left, + chrom_right, + pos_right, + is_reverse_right, + split_slop) + left_split = left_split_left or left_split_right + right_split_left = self.check_split_support(self.query_right, + chrom_left, + pos_left, + is_reverse_left, + split_slop) + right_split_right = self.check_split_support(self.query_right, + chrom_right, + pos_right, + is_reverse_right, + split_slop) + right_split = right_split_left or right_split_right return (left_split, right_split) @@ -1136,12 +1166,12 @@ class SplitRead(object): value = 'A' else: value = 'R' - + self.read.set_tag('XV', value) return - def set_order_by_clip(self, a, b): + def set_order_by_clip(self, a, b): ''' Determine which SplitPiece is the leftmost based on the side of the longest clipping operation @@ -1331,7 +1361,7 @@ def sv_genotype(bam_string, else: sys.stderr.write('Reading library metrics from %s...' % lib_info_path) sample = Sample.from_lib_info(bam_list[i], lib_info, min_lib_prevalence) - + sample.set_exp_seq_depth(min_aligned) sample.set_exp_spanning_depth(min_aligned) sample_list.append(sample) @@ -1416,7 +1446,7 @@ def sv_genotype(bam_string, sys.stderr.write('Warning: Unsupported SV type at variant %s (%s). Skipping.\n' % (var.var_id, svtype)) vcf_out.write(var.get_var_string() + '\n') continue - + if svtype == 'BND': if var.info['MATEID'] in breakend_dict: var2 = var @@ -1436,7 +1466,7 @@ def sv_genotype(bam_string, if var2.alt[-1] == '[' or var2.alt[-1] == ']': o2_is_reverse = False else: o2_is_reverse = True - + # remove the BND from the breakend_dict # to free up memory del breakend_dict[var.var_id] @@ -1504,7 +1534,7 @@ def sv_genotype(bam_string, split_lr = split.is_split_straddle(chromA, posA, ciA, chromB, posB, ciB, o1_is_reverse, o2_is_reverse, - split_slop) + svtype, split_slop) # p_alt = prob_mapq(split.query_left) * prob_mapq(split.query_right) p_alt = (prob_mapq(split.query_left) * split_lr[0] + prob_mapq(split.query_right) * split_lr[1]) / 2.0 if split.is_soft_clip: @@ -1519,7 +1549,7 @@ def sv_genotype(bam_string, # ------------------------------------- # Check for paired-end evidence # ------------------------------------- - + # tally spanning alternate pairs if svtype == 'DEL' and posB - posA < 2 * fragment.lib.sd: alt_straddle = False @@ -1588,7 +1618,7 @@ def sv_genotype(bam_string, if p_conc is not None: p_reference = p_conc * prob_mapq(fragment.readA) * prob_mapq(fragment.readB) ref_span += (ref_straddle_A + ref_straddle_B) * p_reference / 2 - + fragment.tag_span(1 - p_conc) write_fragment = True diff --git a/test/example.gt.vcf b/test/example.gt.vcf index e1d5a47..0c8d68a 100644 --- a/test/example.gt.vcf +++ b/test/example.gt.vcf @@ -1,5 +1,5 @@ ##fileformat=VCFv4.1 -##fileDate=20161116 +##fileDate=20161212 ##reference=/shared/genomes/b37/full/human_g1k_v37.fasta ##INFO= ##INFO= @@ -268,7 +268,7 @@ 3 20311457 156765 A 0.00 . TOOL=LUMPY;SVTYPE=DEL;SVLEN=-6276;END=20317733;STR=+-:20;IMPRECISE;CIPOS=-3,35;CIEND=0,0;EVENT=156765;SUP=20;PESUP=20;SRSUP=0;EVTYPE=PE;PRIN GT:GQ:DP:RO:AO:SQ:GL:QR:QA:RS:AS:ASC:RP:AP:AB 0/0:200:115:115:0:0.00:-0,-34,-114:114:0:50:0:0:64:0:0 3 20492792 156827 G 0.00 . TOOL=LUMPY;SVTYPE=DEL;SVLEN=-65;END=20492857;STR=+-:11;CIPOS=0,0;CIEND=0,0;EVENT=156827;SUP=11;PESUP=0;SRSUP=11;EVTYPE=SR;PRIN GT:GQ:DP:RO:AO:SQ:GL:QR:QA:RS:AS:ASC:RP:AP:AB 0/0:147:49:49:0:0.00:-0,-15,-49:49:0:49:0:0:0:0:0 3 20919493 156974 T 0.00 . TOOL=LUMPY;SVTYPE=DEL;SVLEN=-272;END=20919765;STR=+-:20;CIPOS=0,0;CIEND=0,0;EVENT=156974;SUP=20;PESUP=6;SRSUP=14;EVTYPE=PE,SR;PRIN GT:GQ:DP:RO:AO:SQ:GL:QR:QA:RS:AS:ASC:RP:AP:AB 0/0:200:115:114:0:0.00:-0,-34,-113:113:0:53:0:0:60:0:0 -3 21723705 157253 T 650.38 . TOOL=LUMPY;SVTYPE=DEL;SVLEN=-2260;END=21725965;STR=+-:167;IMPRECISE;CIPOS=-1,8;CIEND=-8,0;EVENT=157253;SUP=167;PESUP=167;SRSUP=0;EVTYPE=PE;PRIN GT:GQ:DP:RO:AO:SQ:GL:QR:QA:RS:AS:ASC:RP:AP:AB 0/1:200:86:54:31:650.38:-67,-2,-32:53:30:27:1:2:26:27:0.36 +3 21723705 157253 T 650.38 . TOOL=LUMPY;SVTYPE=DEL;SVLEN=-2260;END=21725965;STR=+-:167;IMPRECISE;CIPOS=-1,8;CIEND=-8,0;EVENT=157253;SUP=167;PESUP=167;SRSUP=0;EVTYPE=PE;PRIN GT:GQ:DP:RO:AO:SQ:GL:QR:QA:RS:AS:ASC:RP:AP:AB 0/1:200:86:54:31:650.38:-67,-2,-32:53:30:27:0:2:26:27:0.36 3 25030887 158374 A 797.16 . TOOL=LUMPY;SVTYPE=DEL;SVLEN=-1361;END=25032248;STR=+-:515;CIPOS=0,0;CIEND=0,0;EVENT=158374;SUP=515;PESUP=378;SRSUP=137;EVTYPE=PE,SR;PRIN GT:GQ:DP:RO:AO:SQ:GL:QR:QA:RS:AS:ASC:RP:AP:AB 0/1:200:117:77:39:797.16:-84,-4,-47:76:38:36:4:5:40:28:0.33 3 26123848 158740 A 649.93 . TOOL=LUMPY;SVTYPE=DEL;SVLEN=-126;END=26123974;STR=+-:212;CIPOS=0,0;CIEND=0,0;EVENT=158740;SUP=212;PESUP=3;SRSUP=209;EVTYPE=PE,SR;PRIN GT:GQ:DP:RO:AO:SQ:GL:QR:QA:RS:AS:ASC:RP:AP:AB 1/1:56:22:0:22:649.93:-66,-7,-1:0:22:0:22:0:0:0:1 3 26450970 158848 T 935.17 . TOOL=LUMPY;SVTYPE=DEL;SVLEN=-1327;END=26452297;STR=+-:907;CIPOS=0,0;CIEND=0,0;EVENT=158848;SUP=907;PESUP=654;SRSUP=253;EVTYPE=PE,SR;PRIN GT:GQ:DP:RO:AO:SQ:GL:QR:QA:RS:AS:ASC:RP:AP:AB 0/1:200:110:67:43:935.17:-96,-2,-38:66:42:32:8:2:34:31:0.39 @@ -287,7 +287,7 @@ 3 61666417 170381 G 0.00 . TOOL=LUMPY;SVTYPE=DEL;SVLEN=-267;END=61666684;STR=+-:93;CIPOS=0,0;CIEND=0,0;EVENT=170381;SUP=93;PESUP=20;SRSUP=73;EVTYPE=PE,SR;PRIN GT:GQ:DP:RO:AO:SQ:GL:QR:QA:RS:AS:ASC:RP:AP:AB 0/0:200:126:126:0:0.00:-0,-38,-125:125:0:60:0:0:65:0:0 3 65188871 172115 T 644.36 . TOOL=LUMPY;SVTYPE=DEL;SVLEN=-25876;END=65214747;STR=+-:227;CIPOS=0,0;CIEND=0,0;EVENT=172115;SUP=227;PESUP=191;SRSUP=36;EVTYPE=PE,SR;PRIN GT:GQ:DP:RO:AO:SQ:GL:QR:QA:RS:AS:ASC:RP:AP:AB 0/1:150:88:56:31:644.36:-67,-3,-33:55:30:25:7:1:30:22:0.35 3 68634773 173189 T 959.09 . TOOL=LUMPY;SVTYPE=DEL;SVLEN=-5479;END=68640252;STR=+-:333;CIPOS=0,0;CIEND=0,0;EVENT=173189;SUP=333;PESUP=258;SRSUP=75;EVTYPE=PE,SR;PRIN GT:GQ:DP:RO:AO:SQ:GL:QR:QA:RS:AS:ASC:RP:AP:AB 0/1:149:123:77:45:959.09:-99,-3,-45:76:44:36:11:3:40:29:0.37 -3 68739682 173219 A 812.31 . TOOL=LUMPY;SVTYPE=DEL;SVLEN=-8180;END=68747862;STR=+-:687;CIPOS=0,0;CIEND=0,0;EVENT=173219;SUP=687;PESUP=531;SRSUP=156;EVTYPE=PE,SR;PRIN GT:GQ:DP:RO:AO:SQ:GL:QR:QA:RS:AS:ASC:RP:AP:AB 0/1:200:91:54:37:812.31:-83,-2,-30:53:36:25:10:2:28:23:0.4 +3 68739682 173219 A 785.32 . TOOL=LUMPY;SVTYPE=DEL;SVLEN=-8180;END=68747862;STR=+-:687;CIPOS=0,0;CIEND=0,0;EVENT=173219;SUP=687;PESUP=531;SRSUP=156;EVTYPE=PE,SR;PRIN GT:GQ:DP:RO:AO:SQ:GL:QR:QA:RS:AS:ASC:RP:AP:AB 0/1:152:91:54:36:785.32:-80,-2,-30:53:35:25:9:2:28:23:0.4 3 73063077 174617 C 1244.15 . TOOL=LUMPY;SVTYPE=DEL;SVLEN=-476;END=73063553;STR=+-:66;CIPOS=0,0;CIEND=0,0;EVENT=174617;SUP=66;PESUP=0;SRSUP=66;EVTYPE=SR;PRIN GT:GQ:DP:RO:AO:SQ:GL:QR:QA:RS:AS:ASC:RP:AP:AB 0/1:200:107:54:53:1244.15:-126,-1,-25:53:52:21:13:2:32:36:0.5 3 73939035 174881 A 0.00 . TOOL=LUMPY;SVTYPE=DEL;SVLEN=-342;END=73939377;STR=+-:219;CIPOS=0,0;CIEND=0,0;EVENT=174881;SUP=219;PESUP=118;SRSUP=101;EVTYPE=PE,SR;PRIN GT:GQ:DP:RO:AO:SQ:GL:QR:QA:RS:AS:ASC:RP:AP:AB 0/0:200:140:140:0:0.00:-0,-42,-139:139:0:65:0:0:74:0:0 3 78779423 176268 A 2008.88 . TOOL=LUMPY;SVTYPE=DEL;SVLEN=-294;END=78779717;STR=+-:686;CIPOS=0,0;CIEND=0,0;EVENT=176268;SUP=686;PESUP=333;SRSUP=353;EVTYPE=PE,SR;PRIN GT:GQ:DP:RO:AO:SQ:GL:QR:QA:RS:AS:ASC:RP:AP:AB 1/1:200:69:0:69:2008.88:-204,-20,-3:0:68:0:20:3:0:44:1 diff --git a/test/test.sh b/test/test.sh index 5edb293..4e35960 100755 --- a/test/test.sh +++ b/test/test.sh @@ -1,5 +1,7 @@ #!/usr/bin/env bash +set -e + # # make truth set and sort the BAMs # BAM=/gscmnt/gc2802/halllab/sv_aggregate/MISC/realigned_BAMs/NA12878/NA12878.bam # BAM_BASE=`basename $BAM .bam`