Skip to content

Commit

Permalink
Merge pull request #27 from chrovis/fix/mpileup-so
Browse files Browse the repository at this point in the history
Fix bugs in mpileup.
  • Loading branch information
totakke committed Feb 10, 2017
2 parents d7e8c79 + eef7841 commit cbe3cee
Show file tree
Hide file tree
Showing 5 changed files with 208 additions and 67 deletions.
39 changes: 39 additions & 0 deletions src/cljam/cigar.clj
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,45 @@
(for [[_ n op] (re-seq #"([0-9]*)([MIDNSHP=X])" s)]
[(Integer/parseInt n) (first op)]))

(defn simplify
"Merge contiguous same operations of parsed CIGAR."
[cigs]
(loop [[[^long l op :as x] & xs] cigs result (transient [])]
(if (and l op)
(let [[^long nl nop] (first xs)]
(if (= op nop)
(recur (cons [(+ l nl) op] (next xs)) result)
(recur xs (conj! result x))))
(persistent! result))))

(defn- concat! [v coll]
(reduce conj! v coll))

(defn- update-last! [coll f]
(let [c (dec (count coll))]
(if (neg? c)
coll
(let [[op x] (get coll c)]
(if (= :m op)
(assoc! coll c (f x))
coll)))))

(defn to-index*
"Convert CIGAR string to sequence of indices."
[^String s]
(let [cigs (simplify (remove (comp #{\P \H} second) (parse s)))]
(loop [[[^long l op] & xs] cigs r 0 s 0 idx (transient [])]
(if (and l op)
(condp get op
#{\M \= \X} (recur xs (+ l r) (+ l s) (concat! idx (map (fn [x] [:m x]) (range s (+ l s)))))
#{\D} (recur xs (+ r l) s (concat! (update-last! idx (fn [x] [:d x (range r (+ l r))])) (repeat l [:m \*])))
#{\N} (recur xs (+ r l) s (concat! idx (repeat l [:m \>])))
#{\S} (recur xs r (+ s l) idx)
#{\I} (recur xs r (+ s l) (update-last! idx (fn [x] [:i x (range s (+ l s))]))))
(persistent! idx)))))

(def to-index (memoize to-index*))

(defn count-op
"Returns length of CIGAR operations."
[^String s]
Expand Down
2 changes: 1 addition & 1 deletion src/cljam/cli.clj
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,7 @@
:seq (cstr/join (% line))
(% line)) [:rname :pos :ref :count :seq :qual]))))))
([rdr rname start end]
(doseq [line (plp/mpileup rdr rname start end)]
(doseq [line (plp/mpileup nil rdr rname start end)]
(if-not (zero? (:count line))
(println (cstr/join \tab (map #(case %
:qual (cstr/join (% line))
Expand Down
109 changes: 57 additions & 52 deletions src/cljam/pileup/mpileup.clj
Original file line number Diff line number Diff line change
Expand Up @@ -7,72 +7,77 @@
[cljam.sequence :as cseq]
[cljam.io :as io]
[cljam.fasta :as fa]
[cljam.cigar :as cig]
[cljam.pileup.common :refer [window-width step center]]
[cljam.pileup.pileup :refer [rpositions]]))

(defn- append-seq
[op target current]
(case op
\M (apply conj current (map str (:seq target)))
\I (if (seq current)
(update-in current [(dec (count current))]
vector
(str "+" (:n target) (apply str (:seq target))))
current)
\D (apply conj current (map str (:seq target)))
\N (apply conj current (map str (:seq target)))
current))
(defn to-mpileup
"Stringify mpileup sequence."
[x]
(if (vector? x)
(let [[y op xs] x] (apply str y op (count xs) xs))
(str x)))

(defn encode-seq [seq*]
(loop [[f & r] (filter #(nil? (#{\P} (:op %))) seq*)
ret []
op nil
tmp {:n 0, :op nil, :seq nil}]
(if (nil? f)
(append-seq op tmp ret)
(if (nil? op)
(recur r ret (:op f) f)
(if (= (:op f) op)
(recur r ret (:op f) (-> tmp
(update-in [:n] + (:n f))
(assoc :op (:op f))
(update-in [:seq] (partial apply conj) (:seq f))))
(let [new-ret (append-seq op tmp ret)]
(recur r new-ret (:op f) f)))))))
(defn substitute-seq
"Substitute sequence with mpileup index."
[[^Character r & refs] ^String s [op x xs]]
(letfn [(get-char [i] (if (number? i) (let [c (.charAt s i)] (if (= c r) \. c)) i))]
(case op
:m (get-char x)
:d [(get-char x) \- (take (count xs) refs)]
:i [(get-char x) \+ (map #(.charAt s %) xs)]
x)))

(defn substitute-qual
"Substitute base quality with mpileup index."
[^String q [op x]]
(if (and (number? x) (not= q "*")) (.charAt q x) \~))

(defn pileup-seq
"Returns a lazy sequence that each element contains reads piled-up at the locus."
[^long start ^long end reads]
(letfn [(cover-locus? [pos aln]
(<= (:pos aln) pos (sam-util/get-end aln)))
(step [[pos buf alns]]
(let [[i o] (split-with (partial cover-locus? (inc pos)) alns)]
[(inc pos) (concat (filter (partial cover-locus? (inc pos)) buf) i) o]))]
(map second (rest (iterate step [(dec start) [] (remove (comp empty? :cigar) reads)])))))
(<= (:pos aln) pos (:end aln)))
(step [pos buf alns]
(lazy-seq
(when (< pos end)
(let [[i o] (split-with (partial cover-locus? (inc pos)) alns)
b (doall (concat (filter #(<= (inc pos) (:end %)) buf) i))]
(cons b (step (inc pos) b o))))))]
(->> reads
(sequence
(comp
(remove #(and (empty? (:cigar %)) (nil? (:cigar-bytes (:meta %)))))
(map #(assoc % :end (sam-util/get-end %)))
(drop-while #(< (:end %) start))))
(step (dec start) []))))

(defrecord MPileupElement [^String rname ^long pos ^Character ref ^long count seq qual reads])

(defn- gen-mpileup
(defn gen-mpileup
"Compute mpileup info from piled-up reads and reference."
[^String rname ^long locus ^Character reference reads]
(let [seq (mapv (fn [{:keys [^long pos ^String seq ^String cigar] :as aln}]
(let [s (nth (encode-seq (cseq/parse seq cigar)) (- locus pos))]
(cond
(vector? s) (second s)
(= reference (first s)) "."
:else s))) reads)
qual (mapv (fn [{:keys [^long pos ^String qual] :as aln}]
(if (= qual "*") \~ (nth qual (- locus pos) \~))) reads)]
(MPileupElement. rname locus reference (count reads) seq qual reads)))
[^String rname ^long locus [^Character ref-base :as refs] reads]
(let [seqs (map (fn gen-mpileup-seq [{:keys [^long pos ^String seq cig-index]}]
(substitute-seq refs seq (nth cig-index (- locus pos)))) reads)
qual (map (fn gen-mpileup-qual [{:keys [^long pos ^String qual cig-index]}]
(substitute-qual qual (nth cig-index (- locus pos)))) reads)]
(MPileupElement. rname locus ref-base (count reads) seqs qual reads)))

(defn pileup*
"Internal pileup function independent from I/O."
[refseq alns rname start end]
(map
(partial gen-mpileup rname)
(range start (inc end))
refseq
(pileup-seq start end alns)))
"Internal mpileup function independent from I/O.
Can take multiple alignments seqs."
[refseq rname start end & aln-seqs]
(->> aln-seqs
(map
(fn [alns]
(->> alns
(sequence (map (fn [a] (assoc a :cig-index (cig/to-index (:cigar a))))))
(pileup-seq start end))))
(apply map
(fn [index refs & plps]
(map (fn [plp] (gen-mpileup rname index refs plp)) plps))
(range start (inc end))
(partition 10 1 (concat refseq (repeat \N))))))

(defn pileup
"Returns a lazy sequence of MPileupElement calculated from FASTA and BAM."
Expand All @@ -89,7 +94,7 @@
(fa/read-sequence fa-reader {:chr rname :start s :end e})
(repeat \N))
alns (io/read-alignments bam-reader {:chr rname :start s :end e :depth :deep})]
(pileup* refseq alns rname s e)))
(map (fn [p] (update (first p) :seq (fn [s] (map to-mpileup s)))) (pileup* refseq rname s e alns))))
(catch bgzf4j.BGZFException _
(throw (RuntimeException. "Invalid file format"))))))

Expand Down
36 changes: 36 additions & 0 deletions test/cljam/t_cigar.clj
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,39 @@

(fact "about parse"
(cgr/parse "1S2I6M1P11I") => '([1 \S] [2 \I] [6 \M] [1 \P] [11 \I]))

(tabular
(fact
"cigar to index"
(map (fn [[op x :as xs]] (if (= op :m) x xs)) (cgr/to-index* ?cigar)) => ?index)
?cigar ?index
"4M" [0 1 2 3]
"1M3I" [[:i 0 [1 2 3]]]
"1M3D" [[:d 0 [1 2 3]] \* \* \*]
"2M3I" [0 [:i 1 [2 3 4]]]
"4D5M" [\* \* \* \* 0 1 2 3 4] ;; ^]* * * * A T G C A$
"4D5I" [\* \* \* [:i \* [0 1 2 3 4]]] ;; TODO: ^]* * * *+5TGCA=$
"4S4M" [4 5 6 7]
"4H4M" [0 1 2 3]
"1M3I1M" [[:i 0 [1 2 3]] 4]
"2M3I1M" [0 [:i 1 [2 3 4]] 5]
"4I2D5M" [\* \* 4 5 6 7 8] ;; ^]* * A T G C A$
"1M4D5I" [[:d 0 [1 2 3 4]] \* \* \* [:i \* [1 2 3 4 5]]] ;; TODO: ^]* * * *+5TGCA=$
"1M3D3M" [[:d 0 [1 2 3]] \* \* \* 1 2 3]
"1M2D1I2M" [[:d 0 [1 2]] \* [:i \* [1]] 2 3] ;; TODO: ^]A-2NN * *+1G G C$
"1M2N1I2M" [0 \> [:i \> [1]] 2 3] ;; TODO: ^]A > >+1G G C$
"1M1I2D2M" [[:i 0 [1]] \* \* 2 3] ;; ^]A+1T * * G C$
"1M1I2N2M" [[:i 0 [1]] \> \> 2 3] ;; ^]A+1T > > G C$
"1M4I2D5M" [[:i 0 [1 2 3 4]] \* \* 5 6 7 8 9] ;; ^]A+4TGCA * * C A T G C$
"1S4I2D5M" [\* \* 5 6 7 8 9] ;; ^]* * T G C A T$
"3M2P2I3M" [0 1 [:i 2 [3 4]] 5 6 7] ;; ^]A T G+2CA T G C$
"1P4I1D5M" [\* 4 5 6 7 8] ;; ^]* A T G C A$
"5M1D3M4S" [0 1 2 3 [:d 4 [5]] \* 5 6 7] ;; ^]A T G C A-1N * T G C$
"5M1N3M4S" [0 1 2 3 4 \> 5 6 7] ;; ^]A T G C A > T G C$
"3H2S3M1D2S4H" [2 3 [:d 4 [3]] \*] ;; ^]G C A-1N \*$
"1M1P1I1P1I2M" [[:i 0 [1 2]] 3 4] ;; ^]A+2TG C A$
"2M1I1D2I2D1M" [0 [:i 1 [2]] [:i \* [3 4]] \* \* 5] ;; TODO: ^]A T+1G *+2AT * * T$
"1S2I6M1P1I1P1I4M2I" [3 4 5 6 7 [:i 8 [9 10]] 11 12 13 [:i 14 [15 16]]] ;; ^]C A T G C A+2TG C A T G+2CA$
"6M14D1I5M" [0 1 2 3 4 [:d 5 [6 7 8 9 10 11 12 13 14 15 16 17 18 19]] \* \* \* \* \* \* \* \* \* \* \* \* \* [:i \* [6]] 7 8 9 10 11] ;; ^]A T G C A T-14NNNNNNNNNNNNNN * * * * * * * * * * * * * *+1C C A T G C$
"6M14N1I5M" [0 1 2 3 4 5 \> \> \> \> \> \> \> \> \> \> \> \> \> [:i \> [6]] 7 8 9 10 11] ;; ^]A T G C A T > > > > > > > > > > > > > >+1C C A T G C$
"1S2M2D3M1D4M3D2M3D3M3I4M3D2M3S" [1 [:d 2 [2 3]] \* \* 3 4 [:d 5 [7]] \* 6 7 8 [:d 9 [12 13 14]] \* \* \* 10 [:d 11 [17 18 19]] \* \* \* 12 13 [:i 14 [15 16 17]] 18 19 20 [:d 21 [27 28 29]] \* \* \* 22 23])
89 changes: 75 additions & 14 deletions test/cljam/t_pileup.clj
Original file line number Diff line number Diff line change
Expand Up @@ -3,22 +3,30 @@
cljam.t-common)
(:require [cljam.bam :as bam]
[cljam.fasta :as fa]
[cljam.pileup :as plp]))
[cljam.pileup :as plp]
[cljam.pileup.mpileup :as mplp]))

(def test-bam-pileup-ref '(0 0 0 0 0 0 1 1 3 3 3 3 3 3 2 3 3 3 2 2 2 2 1 1 1 1 1 1 2 2 2 2 2 1 1 1 2 2 2 2 1 1 1 1 1))
(def test-bam-pileup-ref2 '(1 2 2 2 2 3 3 3 3 4 4 5 5 6 6 6 6 6 6 6 5 5 4 4 4 4 4 3 3 3 3 3 3 3 2 1 0 0 0 0))
(def test-bam-mpileup-seq-ref
'(() () () () () () ("T") ("T") ("A" "A" "A") ("G" "G" "G") ("A" "A" "C") ("T" "T" "T")
("A" "A" "A") ("+4AGAG" "+2GG" "A") ("G" "G") ("A" "A" "A") ("T" "T" "T") ("A" "+2AA" "A")
("*" "G") ("C" "C") ("T" "T") ("G" ">") (">") (">") (">") (">") (">") (">") (">" "T") (">" "A")
(">" "G") (">" "G") (">" "C") (">") ("+1C") ("T") ("C" "C") ("A" "A") ("G" "G") ("C" "C")
("G") ("C") ("C") ("A") ("T")))
'(() () () () () () ("T") ("T") ("A" "A" "A") ("G" "G" "G") ("A" "A" "C") ("T" "T" "T") ("A" "A" "A")
("A+4AGAG" "A+2GG" "A") ("G" "G") ("A" "A" "A") ("T" "T" "T") ("A-1N" "A+2AA" "A") ("*" "G") ("C" "C")
("T" "T") ("G" ">") (">") (">") (">") (">") (">") (">") (">" "T") (">" "A") (">" "G") (">" "G") (">" "C")
(">") (">+1C") ;; adjacent indel >+1T
("T") ("C" "C") ("A" "A") ("G" "G") ("C" "C") ("G") ("C") ("C") ("A") ("T")))
(def test-bam-mpileup-seq-ref2
'(() () () () () () (".") (".") ("." "." ".") ("." "." ".") ("." "." "C") ("." "." ".")
("." "." ".") ("+4AGAG" "+2GG" ".") ("." ".") ("." "." ".") ("." "." ".") ("." "+2AA" ".")
("*" ".") ("." ".") ("." ".") ("." ">") (">") (">") (">") (">") (">") (">") (">" ".") (">" ".")
(">" ".") (">" ".") (">" ".") (">") ("+1C") (".") ("." ".") ("." ".") ("." ".") ("." ".")
(".") (".") (".") (".") (".")))
'(("A") ("G" "G") ("G" "G") ("T" "T") ("T" "T") ("T" "T" "T") ("T" "T" "T") ("A" "A" "A") ("T" "T" "T")
("A" "A" "A" "C") ("A" "A" "A" "A") ("A" "A" "A" "A" "A") ("A" "A" "A" "A" "A") ("C" "C" "C+4AAAT" "T" "T" "T")
("A" "A" "A" "A" "A" "A") ("A" "A" "A" "A" "A" "A") ("A" "A" "T" "T" "T" "T") ("T" "T" "T" "T" "T" "T")
("A" "A" "A" "A" "A" "A") ("A" "A" "A" "A" "A" "A") ("T" "G" "G" "G" "G") ("T" "T" "T" "T" "T")
("C" "C" "C" "C") ("T" "T" "T" "T") ("A" "A" "A" "A") ("C" "C" "C" "C") ("A" "A" "A" "A") ("G" "G" "G")
("A" "A" "A") ("G" "G" "G") ("C" "C" "C") ("A" "A" "A") ("A" "A" "A") ("C" "C" "C") ("T" "T") ("A") () () () ()))
(def test-bam-mpileup-seq-ref-with-ref
'(() () () () () () (".") (".") ("." "." ".") ("." "." ".") ("." "." "C") ("." "." ".") ("." "." ".")
(".+4AGAG" ".+2GG" ".") ("." ".") ("." "." ".") ("." "." ".") (".-1G" ".+2AA" ".") ("*" ".") ("." ".")
("." ".") ("." ">") (">") (">") (">") (">") (">") (">") (">" ".") (">" ".") (">" ".") (">" ".") (">" ".")
(">") (">+1C") ;; adjacent indel >+1T
(".") ("." ".") ("." ".") ("." ".") ("." ".") (".") (".") (".") (".") (".")))
(def test-bam-mpileup-qual-ref
'([] [] [] [] [] [] [\~] [\~] [\~ \~ \~] [\~ \~ \~] [\~ \~ \~] [\~ \~ \~] [\~ \~ \~] [\~ \~ \~]
[\~ \~] [\~ \~ \~] [\~ \~ \~] [\~ \~ \~] [\~ \~] [\~ \~] [\~ \~] [\~ \~] [\~] [\~] [\~] [\~]
Expand All @@ -43,6 +51,58 @@
(fact "about first-pos"
(plp/first-pos (bam/reader test-sorted-bam-file) "ref2" 0 64) => 1)

(fact
"about pileup-seq"

;; ----------
;; 1234567890...
(map
(fn [xs] (map #(dissoc % :end) xs))
(mplp/pileup-seq 1 2 [{:pos 1 :cigar "10M"}]))
=> [[{:pos 1 :cigar "10M"}] [{:pos 1 :cigar "10M"}]]

;; ----------
;; ----------
;; ----------
;; ----------
;; 1234567890123...
(map
count
(mplp/pileup-seq 1 20 (map #(hash-map :pos (inc %) :cigar "10M") (range))))
=> [1 2 3 4 5 6 7 8 9 10 10 10 10 10 10 10 10 10 10 10]
(map
count
(mplp/pileup-seq 101 120 (map #(hash-map :pos (inc %) :cigar "10M") (range))))
=> [10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10]
(map
:pos
(last (mplp/pileup-seq 1 1000000 (map #(hash-map :pos (inc %) :cigar "10M") (range)))))
=> [999991 999992 999993 999994 999995 999996 999997 999998 999999 1000000]

;; -----
;; ----
;; ---
;; --
;; -
;; 1234567890...
(map
count
(mplp/pileup-seq 1 10 (map #(hash-map :pos (inc %) :cigar (str (inc %) "M")) (range))))
=> [1 1 2 2 3 3 4 4 5 5]

;; --------
;; ----------
;; --
;; ----
;; ------
;; --------
;; ----------
;; 1234567890...
(map
count
(mplp/pileup-seq 1 10 (map #(hash-map :pos (inc %) :cigar (str (- 10 (* (mod % 5) 2)) "M")) (range))))
=> [1 2 3 4 5 6 6 6 6 6])

(fact "about mpileup"
(with-open [br (bam/reader test-sorted-bam-file)
fr (fa/reader test-fa-file)]
Expand All @@ -54,15 +114,16 @@
(map :pos mplp-ref) => (range 1 46)
(map :seq mplp-ref) => test-bam-mpileup-seq-ref
(map :qual mplp-ref) => test-bam-mpileup-qual-ref
(map :count mplp-ref2) => test-bam-pileup-ref2)
(map :count mplp-ref2) => test-bam-pileup-ref2
(map :seq mplp-ref2) => test-bam-mpileup-seq-ref2)

(let [mplp-ref (doall (plp/mpileup fr br "ref"))]
(map :rname mplp-ref) => (repeat 45 "ref")
;; 123456789012345678901234567890123456789012345
(apply str (map :ref mplp-ref)) => "AGCATGTTAGATAAGATAGCTGTGCTAGTAGGCAGTCAGCGCCAT"
(map :count mplp-ref) => test-bam-pileup-ref
(map :pos mplp-ref) => (range 1 46)
(map :seq mplp-ref) => test-bam-mpileup-seq-ref2
(map :seq mplp-ref) => test-bam-mpileup-seq-ref-with-ref
(map :qual mplp-ref) => test-bam-mpileup-qual-ref)))

(fact "mpileup region"
Expand All @@ -75,6 +136,6 @@
(apply str (map :ref mplp-ref1)) => "AGCATGTTAGATAAGATAGCTGTGCTAGTAGGCAGTCAGC"
(map :count mplp-ref1) => (take 40 test-bam-pileup-ref)
(map :pos mplp-ref1) => (range 1 41)
(map :seq mplp-ref1) => (take 40 test-bam-mpileup-seq-ref2)
(map :seq mplp-ref1) => (take 40 test-bam-mpileup-seq-ref-with-ref)
(apply str (map :ref mplp-ref2)) => "AGGTTTTATAAAACAATTAAGTCTACAGAGCAACTACGCG")))

0 comments on commit cbe3cee

Please sign in to comment.