-
-
Notifications
You must be signed in to change notification settings - Fork 73
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
created mash function for sketching sequences. (#344)
- Loading branch information
1 parent
eacc869
commit 2271b5d
Showing
4 changed files
with
190 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,147 @@ | ||
/* | ||
Package mash is for sketching sequence data to make it easier to compare to other sequence. | ||
The package is named mash after the mash sketching algorithm, which is based on the MinHash algorithm. | ||
Mash: fast genome and metagenome distance estimation using MinHash. | ||
Ondov, B.D., Treangen, T.J., Melsted, P. et al. | ||
Genome Biol 17, 132 (2016). | ||
https://doi.org/10.1186/s13059-016-0997-x | ||
Mash Screen: high-throughput sequence containment estimation for genome discovery. | ||
Ondov, B., Starrett, G., Sappington, A. et al. | ||
Genome Biol 20, 232 (2019). | ||
https://doi.org/10.1186/s13059-019-1841-x | ||
tl;dr for the papers above: | ||
Comparing biological sequences is really hard because similar sequences can have frameshifts that make it impossible | ||
to measure similarity using more common metric distances like hamming distance or levenshtein distance. | ||
Bioinformatics and nlp researchers have come up with tons of string comparison algorithms that are better suited for | ||
comparing biological sequences. For example poly already implements a few of them like the Needleman-Wunsch and Smith-Waterman | ||
algorithms in our "align" package. | ||
Mash is a different approach to comparing biological sequences. It uses a technique called sketching to reduce the | ||
complexity of the sequence to a vector of hashes. The hashes are generated by sliding a window of size k along the | ||
sequence and hashing each kmer. The hash is then stored in a vector of size s. The vector is sorted and the smallest | ||
hash is kept. The process is repeated until the vector is full. The vector of hashes is the sketch. | ||
The sketch is then compared to other sketches by counting the number of hashes that are the same between the two sketches. | ||
The number of hashes that are the same is divided by the size of the sketch to get a distance between 0 and 1. | ||
Hash vectors can only be compared to other hash vectors that use the same sliding window size. | ||
Sketch size limits how many hashes can be stored in the vector and the return vector | ||
will always be of length of the sketch size and filled the smallest hashes that were generated | ||
and sorted from smallest to largest. | ||
The larger the sketch size the more accurate the distance calculation will be but the longer it will take to calculate. | ||
TTFN, | ||
Tim | ||
*/ | ||
package mash | ||
|
||
import ( | ||
"sort" | ||
|
||
"github.com/spaolacci/murmur3" | ||
) // murmur3 is a fast non-cryptographic hash algorithm that was also used in the original papers-> https://github.com/shenwei356/go-hashing-kmer-bench | ||
|
||
// Mash is a collection of hashes of kmers from a given sequence. | ||
type Mash struct { | ||
KmerSize int // The kmer size is the size of the sliding window that is used to generate the hashes. | ||
SketchSize int // The sketch size is the number of hashes to store. | ||
Sketches []uint32 // The sketches are the hashes of the kmers that we can compare to other sketches. | ||
} | ||
|
||
// NewMash initializes a new mash sketch. | ||
func NewMash(kmerSize int, sketchSize int) *Mash { | ||
return &Mash{ | ||
KmerSize: kmerSize, | ||
SketchSize: sketchSize, | ||
Sketches: make([]uint32, sketchSize), | ||
} | ||
} | ||
|
||
// Sketch generates a mash sketch of the sequence. | ||
func (mash *Mash) Sketch(sequence string) { | ||
// the sketch size is the number of hashes to store. Pre-shifted to avoid off-by-one errors. | ||
maxShiftedSketchSize := mash.SketchSize - 1 | ||
|
||
// slide a window of size k along the sequence | ||
for kmerStart := 0; kmerStart < len(sequence)-mash.KmerSize; kmerStart++ { | ||
kmer := sequence[kmerStart : kmerStart+mash.KmerSize] | ||
// hash the kmer to a 32 bit number | ||
hash := murmur3.Sum32([]byte(kmer)) | ||
// keep the minimum hash value of all the kmers in the window up to a given sketch size | ||
// the sketch is a vector of the minimum hash values | ||
|
||
// if the sketch is not full, store the hash in the sketch | ||
if kmerStart < maxShiftedSketchSize { | ||
mash.Sketches[kmerStart] = hash | ||
continue | ||
} | ||
|
||
// if the sketch has just been filled add the hash to the sketch and sort the sketch | ||
if kmerStart == maxShiftedSketchSize { | ||
// sort the sketch from smallest to largest | ||
mash.Sketches[maxShiftedSketchSize] = hash | ||
sort.Slice(mash.Sketches, func(i, j int) bool { return mash.Sketches[i] < mash.Sketches[j] }) | ||
continue | ||
} | ||
|
||
// if the sketch is full and the new hash is smaller than the largest hash in the sketch, | ||
// replace the largest hash with the new hash and sort the sketch if the new hash is smaller than the second largest hash in the sketch | ||
if kmerStart > maxShiftedSketchSize && mash.Sketches[maxShiftedSketchSize] > hash { | ||
mash.Sketches[maxShiftedSketchSize] = hash | ||
if hash < mash.Sketches[maxShiftedSketchSize-1] { // if the new hash is smaller than the second largest hash in the sketch, sort the sketch | ||
sort.Slice(mash.Sketches, func(i, j int) bool { return mash.Sketches[i] < mash.Sketches[j] }) | ||
} | ||
continue | ||
} | ||
} | ||
} | ||
|
||
// Similarity returns the Jaccard similarity between two sketches (number of matching hashes / sketch size) | ||
func (mash *Mash) Similarity(other *Mash) float64 { | ||
var sameHashes int | ||
|
||
var largerSketch *Mash | ||
var smallerSketch *Mash | ||
|
||
if mash.SketchSize > other.SketchSize { | ||
largerSketch = mash | ||
smallerSketch = other | ||
} else { | ||
largerSketch = other | ||
smallerSketch = mash | ||
} | ||
|
||
largerSketchSizeShifted := largerSketch.SketchSize - 1 | ||
smallerSketchSizeShifted := smallerSketch.SketchSize - 1 | ||
|
||
// if the largest hash in the larger sketch is smaller than the smallest hash in the smaller sketch, the distance is 1 | ||
if largerSketch.Sketches[largerSketchSizeShifted] < smallerSketch.Sketches[0] { | ||
return 0 | ||
} | ||
|
||
// if the largest hash in the smaller sketch is smaller than the smallest hash in the larger sketch, the distance is 1 | ||
if smallerSketch.Sketches[smallerSketchSizeShifted] < largerSketch.Sketches[0] { | ||
return 0 | ||
} | ||
|
||
for _, hash := range smallerSketch.Sketches { | ||
ind := sort.Search(largerSketchSizeShifted, func(ind int) bool { return largerSketch.Sketches[ind] <= hash }) | ||
if largerSketch.Sketches[ind] == hash { | ||
sameHashes++ | ||
} | ||
} | ||
|
||
return float64(sameHashes) / float64(smallerSketch.SketchSize) | ||
} | ||
|
||
// Distance returns the Jaccard distance between two sketches (1 - similarity) | ||
func (mash *Mash) Distance(other *Mash) float64 { | ||
return 1 - mash.Similarity(other) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
package mash_test | ||
|
||
import ( | ||
"testing" | ||
|
||
"github.com/TimothyStiles/poly/mash" | ||
) | ||
|
||
func TestMash(t *testing.T) { | ||
fingerprint1 := mash.NewMash(17, 10) | ||
fingerprint1.Sketch("ATGCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGA") | ||
|
||
fingerprint2 := mash.NewMash(17, 9) | ||
fingerprint2.Sketch("ATGCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGA") | ||
|
||
distance := fingerprint1.Distance(fingerprint2) | ||
if distance != 0 { | ||
t.Errorf("Expected distance to be 0, got %f", distance) | ||
} | ||
|
||
distance = fingerprint2.Distance(fingerprint1) | ||
if distance != 0 { | ||
t.Errorf("Expected distance to be 0, got %f", distance) | ||
} | ||
|
||
spoofedFingerprint := mash.NewMash(17, 10) | ||
spoofedFingerprint.Sketches[0] = 0 | ||
|
||
distance = fingerprint1.Distance(spoofedFingerprint) | ||
if distance != 1 { | ||
t.Errorf("Expected distance to be 1, got %f", distance) | ||
} | ||
|
||
spoofedFingerprint = mash.NewMash(17, 9) | ||
|
||
distance = fingerprint1.Distance(spoofedFingerprint) | ||
if distance != 1 { | ||
t.Errorf("Expected distance to be 1, got %f", distance) | ||
} | ||
} |