created mash function for sketching sequences. (#344)

bebop · Oct 5, 2023 · 2271b5d · 2271b5d
1 parent eacc869
commit 2271b5d
Show file tree

Hide file tree

Showing 4 changed files with 190 additions and 0 deletions.
diff --git a/go.mod b/go.mod
@@ -16,6 +16,7 @@ require (
 require (
 	github.com/davecgh/go-spew v1.1.1 // indirect
 	github.com/mattn/go-sqlite3 v1.14.13 // indirect
+	github.com/spaolacci/murmur3 v1.1.0 // indirect
 )
 
 require (

diff --git a/go.sum b/go.sum
@@ -22,6 +22,8 @@ github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZb
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
 github.com/sergi/go-diff v1.2.0 h1:XU+rvMAioB0UC3q1MFrIQy4Vo5/4VsRDQQXHsEya6xQ=
 github.com/sergi/go-diff v1.2.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNXdaHfM=
+github.com/spaolacci/murmur3 v1.1.0 h1:7c1g84S4BPRrfL5Xrdp6fOJ206sU9y293DDHaoy0bLI=
+github.com/spaolacci/murmur3 v1.1.0/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA=
 github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
 github.com/stretchr/testify v1.4.0 h1:2E4SXV/wtOkTonXsotYi4li6zVWxYlZuYNCXe9XRJyk=
 github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=

diff --git a/mash/mash.go b/mash/mash.go
@@ -0,0 +1,147 @@
+/*
+Package mash is for sketching sequence data to make it easier to compare to other sequence.
+
+The package is named mash after the mash sketching algorithm, which is based on the MinHash algorithm.
+
+Mash: fast genome and metagenome distance estimation using MinHash.
+Ondov, B.D., Treangen, T.J., Melsted, P. et al.
+Genome Biol 17, 132 (2016).
+https://doi.org/10.1186/s13059-016-0997-x
+
+Mash Screen: high-throughput sequence containment estimation for genome discovery.
+Ondov, B., Starrett, G., Sappington, A. et al.
+Genome Biol 20, 232 (2019).
+https://doi.org/10.1186/s13059-019-1841-x
+
+tl;dr for the papers above:
+
+Comparing biological sequences is really hard because similar sequences can have frameshifts that make it impossible
+to measure similarity using more common metric distances like hamming distance or levenshtein distance.
+
+Bioinformatics and nlp researchers have come up with tons of string comparison algorithms that are better suited for
+comparing biological sequences. For example poly already implements a few of them like the Needleman-Wunsch and Smith-Waterman
+algorithms in our "align" package.
+
+Mash is a different approach to comparing biological sequences. It uses a technique called sketching to reduce the
+complexity of the sequence to a vector of hashes. The hashes are generated by sliding a window of size k along the
+sequence and hashing each kmer. The hash is then stored in a vector of size s. The vector is sorted and the smallest
+hash is kept. The process is repeated until the vector is full. The vector of hashes is the sketch.
+
+The sketch is then compared to other sketches by counting the number of hashes that are the same between the two sketches.
+The number of hashes that are the same is divided by the size of the sketch to get a distance between 0 and 1.
+
+Hash vectors can only be compared to other hash vectors that use the same sliding window size.
+Sketch size limits how many hashes can be stored in the vector and the return vector
+will always be of length of the sketch size and filled the smallest hashes that were generated
+and sorted from smallest to largest.
+
+The larger the sketch size the more accurate the distance calculation will be but the longer it will take to calculate.
+
+TTFN,
+Tim
+*/
+package mash
+
+import (
+	"sort"
+
+	"github.com/spaolacci/murmur3"
+) // murmur3 is a fast non-cryptographic hash algorithm that was also used in the original papers-> https://github.com/shenwei356/go-hashing-kmer-bench
+
+// Mash is a collection of hashes of kmers from a given sequence.
+type Mash struct {
+	KmerSize   int      // The kmer size is the size of the sliding window that is used to generate the hashes.
+	SketchSize int      // The sketch size is the number of hashes to store.
+	Sketches   []uint32 // The sketches are the hashes of the kmers that we can compare to other sketches.
+}
+
+// NewMash initializes a new mash sketch.
+func NewMash(kmerSize int, sketchSize int) *Mash {
+	return &Mash{
+		KmerSize:   kmerSize,
+		SketchSize: sketchSize,
+		Sketches:   make([]uint32, sketchSize),
+	}
+}
+
+// Sketch generates a mash sketch of the sequence.
+func (mash *Mash) Sketch(sequence string) {
+	// the sketch size is the number of hashes to store. Pre-shifted to avoid off-by-one errors.
+	maxShiftedSketchSize := mash.SketchSize - 1
+
+	// slide a window of size k along the sequence
+	for kmerStart := 0; kmerStart < len(sequence)-mash.KmerSize; kmerStart++ {
+		kmer := sequence[kmerStart : kmerStart+mash.KmerSize]
+		// hash the kmer to a 32 bit number
+		hash := murmur3.Sum32([]byte(kmer))
+		// keep the minimum hash value of all the kmers in the window up to a given sketch size
+		// the sketch is a vector of the minimum hash values
+
+		// if the sketch is not full, store the hash in the sketch
+		if kmerStart < maxShiftedSketchSize {
+			mash.Sketches[kmerStart] = hash
+			continue
+		}
+
+		// if the sketch has just been filled add the hash to the sketch and sort the sketch
+		if kmerStart == maxShiftedSketchSize {
+			// sort the sketch from smallest to largest
+			mash.Sketches[maxShiftedSketchSize] = hash
+			sort.Slice(mash.Sketches, func(i, j int) bool { return mash.Sketches[i] < mash.Sketches[j] })
+			continue
+		}
+
+		// if the sketch is full and the new hash is smaller than the largest hash in the sketch,
+		// replace the largest hash with the new hash and sort the sketch if the new hash is smaller than the second largest hash in the sketch
+		if kmerStart > maxShiftedSketchSize && mash.Sketches[maxShiftedSketchSize] > hash {
+			mash.Sketches[maxShiftedSketchSize] = hash
+			if hash < mash.Sketches[maxShiftedSketchSize-1] { // if the new hash is smaller than the second largest hash in the sketch, sort the sketch
+				sort.Slice(mash.Sketches, func(i, j int) bool { return mash.Sketches[i] < mash.Sketches[j] })
+			}
+			continue
+		}
+	}
+}
+
+// Similarity returns the Jaccard similarity between two sketches (number of matching hashes / sketch size)
+func (mash *Mash) Similarity(other *Mash) float64 {
+	var sameHashes int
+
+	var largerSketch *Mash
+	var smallerSketch *Mash
+
+	if mash.SketchSize > other.SketchSize {
+		largerSketch = mash
+		smallerSketch = other
+	} else {
+		largerSketch = other
+		smallerSketch = mash
+	}
+
+	largerSketchSizeShifted := largerSketch.SketchSize - 1
+	smallerSketchSizeShifted := smallerSketch.SketchSize - 1
+
+	// if the largest hash in the larger sketch is smaller than the smallest hash in the smaller sketch, the distance is 1
+	if largerSketch.Sketches[largerSketchSizeShifted] < smallerSketch.Sketches[0] {
+		return 0
+	}
+
+	// if the largest hash in the smaller sketch is smaller than the smallest hash in the larger sketch, the distance is 1
+	if smallerSketch.Sketches[smallerSketchSizeShifted] < largerSketch.Sketches[0] {
+		return 0
+	}
+
+	for _, hash := range smallerSketch.Sketches {
+		ind := sort.Search(largerSketchSizeShifted, func(ind int) bool { return largerSketch.Sketches[ind] <= hash })
+		if largerSketch.Sketches[ind] == hash {
+			sameHashes++
+		}
+	}
+
+	return float64(sameHashes) / float64(smallerSketch.SketchSize)
+}
+
+// Distance returns the Jaccard distance between two sketches (1 - similarity)
+func (mash *Mash) Distance(other *Mash) float64 {
+	return 1 - mash.Similarity(other)
+}
diff --git a/mash/mash_test.go b/mash/mash_test.go
@@ -0,0 +1,40 @@
+package mash_test
+
+import (
+	"testing"
+
+	"github.com/TimothyStiles/poly/mash"
+)
+
+func TestMash(t *testing.T) {
+	fingerprint1 := mash.NewMash(17, 10)
+	fingerprint1.Sketch("ATGCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGA")
+
+	fingerprint2 := mash.NewMash(17, 9)
+	fingerprint2.Sketch("ATGCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGA")
+
+	distance := fingerprint1.Distance(fingerprint2)
+	if distance != 0 {
+		t.Errorf("Expected distance to be 0, got %f", distance)
+	}
+
+	distance = fingerprint2.Distance(fingerprint1)
+	if distance != 0 {
+		t.Errorf("Expected distance to be 0, got %f", distance)
+	}
+
+	spoofedFingerprint := mash.NewMash(17, 10)
+	spoofedFingerprint.Sketches[0] = 0
+
+	distance = fingerprint1.Distance(spoofedFingerprint)
+	if distance != 1 {
+		t.Errorf("Expected distance to be 1, got %f", distance)
+	}
+
+	spoofedFingerprint = mash.NewMash(17, 9)
+
+	distance = fingerprint1.Distance(spoofedFingerprint)
+	if distance != 1 {
+		t.Errorf("Expected distance to be 1, got %f", distance)
+	}
+}