-
Notifications
You must be signed in to change notification settings - Fork 76
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[cudamapper] Filtering for self mappings, identical overlaps, and highly-similar overlaps #563
base: dev
Are you sure you want to change the base?
Changes from all commits
73c9f9f
4aa4709
8a22d38
cf1698c
d77d052
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -38,6 +38,18 @@ namespace details | |
namespace overlapper | ||
{ | ||
|
||
/// | ||
/// \brief Removes overlaps which cover more than a certain percentage of a read. | ||
/// | ||
/// \param overlaps A vector of overlaps. | ||
/// \param query_parser A FastaParser containing query sequences. | ||
/// \param target_parser A FastaParser containing target sequences. | ||
/// \param max_percent_overlap The maximum percent overlap allowed before an overlap is removed. | ||
void filter_self_mappings(std::vector<Overlap>& overlaps, | ||
const io::FastaParser& query_parser, | ||
const io::FastaParser& target_parser, | ||
const double max_percent_overlap); | ||
|
||
/// \brief Extends a single overlap at its ends if the similarity of the query and target sequences is above a specified threshold. | ||
/// \param overlap An Overlap which is modified in place. Any of the query_start_position_in_read, query_end_position_in_read, | ||
/// target_start_position_in_read, and target_end_position_in_read fields may be modified. | ||
|
@@ -98,7 +110,8 @@ class Overlapper | |
/// \brief Identified overlaps which can be combined into a larger overlap and add them to the input vector | ||
/// \param overlaps reference to vector of Overlaps. New overlaps (result of fusing) are added to this vector | ||
/// \param drop_fused_overlaps If true, remove overlaps that are fused into larger overlaps in output. | ||
static void post_process_overlaps(std::vector<Overlap>& overlaps, bool drop_fused_overlaps = false); | ||
/// \param max_reciprocal Maximum reciprocal identity between two overlaps before one is filtered. Default (< 0.0) means no filtering; 0.0 removes only identical adjacent overlaps. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not sure how complicated would it be to specify which overlap gets filtered out (e.g. the one with smaller starting position, longer/shorter one or something) |
||
static void post_process_overlaps(std::vector<Overlap>& overlaps, bool drop_fused_overlaps = false, const double max_reciprocal = -1.0); | ||
|
||
/// \brief Given a vector of overlaps, extend the start/end of the overlaps based on the sequence similarity of the query and target. | ||
/// \param overlaps A vector of overlaps. This is modified in-place; query_start_position_in_read_, query_end_position_in_read_, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -51,6 +51,8 @@ ApplicationParameters::ApplicationParameters(int argc, char* argv[]) | |
{"min-overlap-fraction", required_argument, 0, 'z'}, | ||
{"rescue-overlap-ends", no_argument, 0, 'R'}, | ||
{"drop-fused-overlaps", no_argument, 0, 'D'}, | ||
{"preserve-self-mappings", no_argument, 0, 'S'}, | ||
{"max-reciprocal", required_argument, 0, 'Z'}, | ||
Comment on lines
+54
to
+55
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These parameters should also be added to |
||
{"query-indices-in-host-memory", required_argument, 0, 'Q'}, | ||
{"query-indices-in-device-memory", required_argument, 0, 'q'}, | ||
{"target-indices-in-host-memory", required_argument, 0, 'C'}, | ||
|
@@ -59,7 +61,7 @@ ApplicationParameters::ApplicationParameters(int argc, char* argv[]) | |
{"help", no_argument, 0, 'h'}, | ||
}; | ||
|
||
std::string optstring = "k:w:d:m:i:t:F:a:r:l:b:z:RDQ:q:C:c:vh"; | ||
std::string optstring = "k:w:d:m:i:t:F:a:r:l:b:z:Z:RDSQ:q:C:c:vh"; | ||
|
||
bool target_indices_in_host_memory_set = false; | ||
bool target_indices_in_device_memory_set = false; | ||
|
@@ -111,12 +113,18 @@ ApplicationParameters::ApplicationParameters(int argc, char* argv[]) | |
case 'z': | ||
min_overlap_fraction = std::stof(optarg); | ||
break; | ||
case 'Z': | ||
max_reciprocal = std::stof(optarg); | ||
break; | ||
case 'R': | ||
perform_overlap_end_rescue = true; | ||
break; | ||
case 'D': | ||
drop_fused_overlaps = true; | ||
break; | ||
case 'S': | ||
filter_self_mappings = false; | ||
break; | ||
case 'Q': | ||
query_indices_in_host_memory = std::stoi(optarg); | ||
break; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -53,8 +53,10 @@ class ApplicationParameters | |
int32_t min_overlap_len = 250; // l, recommended range: 100 - 1000 | ||
int32_t min_bases_per_residue = 1000; // b | ||
float min_overlap_fraction = 0.8; // z | ||
float max_reciprocal = -1.0; // Z, < 0 : no filtering. 0.0: only identical filtering. > 1.0: filter by percent reciprocity | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For |
||
bool perform_overlap_end_rescue = false; // R | ||
bool drop_fused_overlaps = false; // D | ||
bool filter_self_mappings = true; // S (to turn off) | ||
int32_t query_indices_in_host_memory = 10; // Q | ||
int32_t query_indices_in_device_memory = 5; // q | ||
int32_t target_indices_in_host_memory = 10; // C | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -356,7 +356,13 @@ void postprocess_and_write_thread_function(const int32_t device_id, | |
{ | ||
GW_NVTX_RANGE(profiler, "main::postprocess_and_write_thread::postprocessing"); | ||
// Overlap post processing - add overlaps which can be combined into longer ones. | ||
Overlapper::post_process_overlaps(data_to_write->overlaps, application_parameters.drop_fused_overlaps); | ||
Overlapper::post_process_overlaps(data_to_write->overlaps, application_parameters.drop_fused_overlaps, application_parameters.max_reciprocal); | ||
} | ||
|
||
if (application_parameters.all_to_all && application_parameters.filter_self_mappings) | ||
{ | ||
GW_NVTX_RANGE(profiler, "main::postprocess_and_write_thread::remove_self_mappings"); | ||
::claraparabricks::genomeworks::cudamapper::details::overlapper::filter_self_mappings(overlaps, *application_parameters.query_parser, *application_parameters.target_parser, 0.9); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is it necessary to qualify the function this day? Also, is function really a detail if it is used in a completely different (top level) file |
||
} | ||
|
||
if (application_parameters.perform_overlap_end_rescue) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -98,6 +98,51 @@ bool overlaps_mergable(const claraparabricks::genomeworks::cudamapper::Overlap o | |
return short_gap_relative_to_length; | ||
} | ||
|
||
inline bool overlaps_identical(const claraparabricks::genomeworks::cudamapper::Overlap& a, const claraparabricks::genomeworks::cudamapper::Overlap& b) | ||
{ | ||
return a.query_read_id_ == b.query_read_id_ && | ||
a.target_read_id_ == b.target_read_id_ && | ||
a.query_start_position_in_read_ == b.query_start_position_in_read_ && | ||
a.query_end_position_in_read_ == b.query_end_position_in_read_ && | ||
a.target_start_position_in_read_ == b.target_start_position_in_read_ && | ||
a.target_end_position_in_read_ == b.target_end_position_in_read_; | ||
} | ||
|
||
inline double percent_reciprocal_overlap(const claraparabricks::genomeworks::cudamapper::Overlap& a, const claraparabricks::genomeworks::cudamapper::Overlap& b) | ||
{ | ||
if (a.query_read_id_ != b.query_read_id_ || a.target_read_id_ != b.target_read_id_ || a.relative_strand != b.relative_strand) | ||
{ | ||
return 0.0; | ||
} | ||
int32_t query_overlap = std::min(a.query_end_position_in_read_, b.query_end_position_in_read_) - std::max(a.query_start_position_in_read_, b.query_start_position_in_read_); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This and similar should be |
||
int32_t target_overlap; | ||
if (a.relative_strand == claraparabricks::genomeworks::cudamapper::RelativeStrand::Forward && b.relative_strand == claraparabricks::genomeworks::cudamapper::RelativeStrand::Forward) | ||
{ | ||
target_overlap = std::min(a.target_end_position_in_read_, b.target_end_position_in_read_) - std::max(a.target_start_position_in_read_, b.target_start_position_in_read_); | ||
} | ||
else | ||
{ | ||
target_overlap = std::max(a.target_start_position_in_read_, b.target_start_position_in_read_) - std::min(a.target_end_position_in_read_, b.target_end_position_in_read_); | ||
} | ||
|
||
int32_t query_total_length = std::max(a.query_end_position_in_read_, b.query_end_position_in_read_) - std::min(a.query_start_position_in_read_, b.query_start_position_in_read_); | ||
int32_t target_total_length; | ||
if (a.relative_strand == claraparabricks::genomeworks::cudamapper::RelativeStrand::Forward && b.relative_strand == claraparabricks::genomeworks::cudamapper::RelativeStrand::Forward) | ||
{ | ||
target_total_length = std::max(a.target_end_position_in_read_, b.target_end_position_in_read_) - std::min(a.target_start_position_in_read_, b.target_start_position_in_read_); | ||
} | ||
else | ||
{ | ||
target_total_length = std::min(a.target_start_position_in_read_, b.target_start_position_in_read_) - std::max(a.target_end_position_in_read_, b.target_end_position_in_read_); | ||
} | ||
return static_cast<double>(query_overlap + target_overlap) / static_cast<double>(query_total_length + target_total_length); | ||
} | ||
|
||
bool overlaps_reciprocal(const claraparabricks::genomeworks::cudamapper::Overlap& a, const claraparabricks::genomeworks::cudamapper::Overlap& b, const double threshold) | ||
{ | ||
return percent_reciprocal_overlap(a, b) > threshold; | ||
} | ||
|
||
// Reverse complement lookup table | ||
static char complement_array[26] = { | ||
84, 66, 71, 68, 69, | ||
|
@@ -132,7 +177,7 @@ namespace genomeworks | |
namespace cudamapper | ||
{ | ||
|
||
void Overlapper::post_process_overlaps(std::vector<Overlap>& overlaps, const bool drop_fused_overlaps) | ||
void Overlapper::post_process_overlaps(std::vector<Overlap>& overlaps, const bool drop_fused_overlaps, const double max_reciprocal) | ||
{ | ||
const auto num_overlaps = get_size(overlaps); | ||
bool in_fuse = false; | ||
|
@@ -143,7 +188,7 @@ void Overlapper::post_process_overlaps(std::vector<Overlap>& overlaps, const boo | |
int num_residues = 0; | ||
Overlap prev_overlap; | ||
std::vector<bool> drop_overlap_mask; | ||
if (drop_fused_overlaps) | ||
if (drop_fused_overlaps || max_reciprocal >= 0.0) | ||
{ | ||
drop_overlap_mask.resize(overlaps.size()); | ||
} | ||
|
@@ -152,6 +197,14 @@ void Overlapper::post_process_overlaps(std::vector<Overlap>& overlaps, const boo | |
{ | ||
prev_overlap = overlaps[i - 1]; | ||
const Overlap current_overlap = overlaps[i]; | ||
if (max_reciprocal == 0.0 && overlaps_identical(prev_overlap, current_overlap)) | ||
{ | ||
drop_overlap_mask[i - 1] = true; | ||
} | ||
else if (max_reciprocal > 0.0 && percent_reciprocal_overlap(prev_overlap, current_overlap)) | ||
{ | ||
drop_overlap_mask[i - 1] = true; | ||
} | ||
//Check if previous overlap can be merged into the current one | ||
if (overlaps_mergable(prev_overlap, current_overlap)) | ||
{ | ||
|
@@ -229,7 +282,7 @@ void Overlapper::post_process_overlaps(std::vector<Overlap>& overlaps, const boo | |
overlaps.push_back(fused_overlap); | ||
} | ||
|
||
if (drop_fused_overlaps) | ||
if (drop_fused_overlaps || max_reciprocal >= 0.0) | ||
{ | ||
details::overlapper::drop_overlaps_by_mask(overlaps, drop_overlap_mask); | ||
} | ||
|
@@ -239,6 +292,27 @@ namespace details | |
{ | ||
namespace overlapper | ||
{ | ||
|
||
void filter_self_mappings(std::vector<Overlap>& overlaps, | ||
const io::FastaParser& query_parser, | ||
const io::FastaParser& target_parser, | ||
const double max_percent_overlap) | ||
{ | ||
|
||
auto remove_self_helper = [&query_parser, &target_parser, &max_percent_overlap](const Overlap& o) { | ||
claraparabricks::genomeworks::io::FastaSequence query = query_parser.get_sequence_by_id(o.query_read_id_); | ||
claraparabricks::genomeworks::io::FastaSequence target = target_parser.get_sequence_by_id(o.target_read_id_); | ||
if (query.name != target.name) | ||
return false; | ||
std::size_t read_len = query.seq.size(); | ||
std::int32_t overlap_length = abs(o.query_end_position_in_read_ - o.query_start_position_in_read_); | ||
double percent_overlap = static_cast<double>(overlap_length) / static_cast<double>(read_len); | ||
return percent_overlap >= max_percent_overlap; | ||
}; | ||
|
||
overlaps.erase(std::remove_if(begin(overlaps), end(overlaps), remove_self_helper), end(overlaps)); | ||
} | ||
|
||
void drop_overlaps_by_mask(std::vector<claraparabricks::genomeworks::cudamapper::Overlap>& overlaps, const std::vector<bool>& mask) | ||
{ | ||
std::size_t i = 0; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I find
0.0
a bit confusing. If I understand it correctly passing 0.75 would mean that any pair that has more than 75% overlap will get filtered out (i.e. one of those two overlaps). Wouldn't it than make sense to say that specifying 1.0 means only identical (= 100% overlapped) overlaps are filtered out and >1.0 that nothing gets filtered out as two overlaps cannot overlap more than 100%?