Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[cudamapper] Filtering for self mappings, identical overlaps, and highly-similar overlaps #563

Open
wants to merge 5 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,18 @@ namespace details
namespace overlapper
{

///
/// \brief Removes overlaps which cover more than a certain percentage of a read.
///
/// \param overlaps A vector of overlaps.
/// \param query_parser A FastaParser containing query sequences.
/// \param target_parser A FastaParser containing target sequences.
/// \param max_percent_overlap The maximum percent overlap allowed before an overlap is removed.
void filter_self_mappings(std::vector<Overlap>& overlaps,
const io::FastaParser& query_parser,
const io::FastaParser& target_parser,
const double max_percent_overlap);

/// \brief Extends a single overlap at its ends if the similarity of the query and target sequences is above a specified threshold.
/// \param overlap An Overlap which is modified in place. Any of the query_start_position_in_read, query_end_position_in_read,
/// target_start_position_in_read, and target_end_position_in_read fields may be modified.
Expand Down Expand Up @@ -98,7 +110,8 @@ class Overlapper
/// \brief Identified overlaps which can be combined into a larger overlap and add them to the input vector
/// \param overlaps reference to vector of Overlaps. New overlaps (result of fusing) are added to this vector
/// \param drop_fused_overlaps If true, remove overlaps that are fused into larger overlaps in output.
static void post_process_overlaps(std::vector<Overlap>& overlaps, bool drop_fused_overlaps = false);
/// \param max_reciprocal Maximum reciprocal identity between two overlaps before one is filtered. Default (< 0.0) means no filtering; 0.0 removes only identical adjacent overlaps.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I find 0.0 a bit confusing. If I understand it correctly passing 0.75 would mean that any pair that has more than 75% overlap will get filtered out (i.e. one of those two overlaps). Wouldn't it than make sense to say that specifying 1.0 means only identical (= 100% overlapped) overlaps are filtered out and >1.0 that nothing gets filtered out as two overlaps cannot overlap more than 100%?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure how complicated would it be to specify which overlap gets filtered out (e.g. the one with smaller starting position, longer/shorter one or something)

static void post_process_overlaps(std::vector<Overlap>& overlaps, bool drop_fused_overlaps = false, const double max_reciprocal = -1.0);

/// \brief Given a vector of overlaps, extend the start/end of the overlaps based on the sequence similarity of the query and target.
/// \param overlaps A vector of overlaps. This is modified in-place; query_start_position_in_read_, query_end_position_in_read_,
Expand Down
10 changes: 9 additions & 1 deletion cudamapper/src/application_parameters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ ApplicationParameters::ApplicationParameters(int argc, char* argv[])
{"min-overlap-fraction", required_argument, 0, 'z'},
{"rescue-overlap-ends", no_argument, 0, 'R'},
{"drop-fused-overlaps", no_argument, 0, 'D'},
{"preserve-self-mappings", no_argument, 0, 'S'},
{"max-reciprocal", required_argument, 0, 'Z'},
Comment on lines +54 to +55
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These parameters should also be added to ApplicationParameters::help()

{"query-indices-in-host-memory", required_argument, 0, 'Q'},
{"query-indices-in-device-memory", required_argument, 0, 'q'},
{"target-indices-in-host-memory", required_argument, 0, 'C'},
Expand All @@ -59,7 +61,7 @@ ApplicationParameters::ApplicationParameters(int argc, char* argv[])
{"help", no_argument, 0, 'h'},
};

std::string optstring = "k:w:d:m:i:t:F:a:r:l:b:z:RDQ:q:C:c:vh";
std::string optstring = "k:w:d:m:i:t:F:a:r:l:b:z:Z:RDSQ:q:C:c:vh";

bool target_indices_in_host_memory_set = false;
bool target_indices_in_device_memory_set = false;
Expand Down Expand Up @@ -111,12 +113,18 @@ ApplicationParameters::ApplicationParameters(int argc, char* argv[])
case 'z':
min_overlap_fraction = std::stof(optarg);
break;
case 'Z':
max_reciprocal = std::stof(optarg);
break;
case 'R':
perform_overlap_end_rescue = true;
break;
case 'D':
drop_fused_overlaps = true;
break;
case 'S':
filter_self_mappings = false;
break;
case 'Q':
query_indices_in_host_memory = std::stoi(optarg);
break;
Expand Down
2 changes: 2 additions & 0 deletions cudamapper/src/application_parameters.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,10 @@ class ApplicationParameters
int32_t min_overlap_len = 250; // l, recommended range: 100 - 1000
int32_t min_bases_per_residue = 1000; // b
float min_overlap_fraction = 0.8; // z
float max_reciprocal = -1.0; // Z, < 0 : no filtering. 0.0: only identical filtering. > 1.0: filter by percent reciprocity
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For -F we use 0.0 - 1.0 range, not 1.0 - 100.0 (which should actually probably be 0.0 - 100.0)

bool perform_overlap_end_rescue = false; // R
bool drop_fused_overlaps = false; // D
bool filter_self_mappings = true; // S (to turn off)
int32_t query_indices_in_host_memory = 10; // Q
int32_t query_indices_in_device_memory = 5; // q
int32_t target_indices_in_host_memory = 10; // C
Expand Down
8 changes: 7 additions & 1 deletion cudamapper/src/main.cu
Original file line number Diff line number Diff line change
Expand Up @@ -356,7 +356,13 @@ void postprocess_and_write_thread_function(const int32_t device_id,
{
GW_NVTX_RANGE(profiler, "main::postprocess_and_write_thread::postprocessing");
// Overlap post processing - add overlaps which can be combined into longer ones.
Overlapper::post_process_overlaps(data_to_write->overlaps, application_parameters.drop_fused_overlaps);
Overlapper::post_process_overlaps(data_to_write->overlaps, application_parameters.drop_fused_overlaps, application_parameters.max_reciprocal);
}

if (application_parameters.all_to_all && application_parameters.filter_self_mappings)
{
GW_NVTX_RANGE(profiler, "main::postprocess_and_write_thread::remove_self_mappings");
::claraparabricks::genomeworks::cudamapper::details::overlapper::filter_self_mappings(overlaps, *application_parameters.query_parser, *application_parameters.target_parser, 0.9);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it necessary to qualify the function this day? Also, is function really a detail if it is used in a completely different (top level) file

}

if (application_parameters.perform_overlap_end_rescue)
Expand Down
80 changes: 77 additions & 3 deletions cudamapper/src/overlapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,51 @@ bool overlaps_mergable(const claraparabricks::genomeworks::cudamapper::Overlap o
return short_gap_relative_to_length;
}

inline bool overlaps_identical(const claraparabricks::genomeworks::cudamapper::Overlap& a, const claraparabricks::genomeworks::cudamapper::Overlap& b)
{
return a.query_read_id_ == b.query_read_id_ &&
a.target_read_id_ == b.target_read_id_ &&
a.query_start_position_in_read_ == b.query_start_position_in_read_ &&
a.query_end_position_in_read_ == b.query_end_position_in_read_ &&
a.target_start_position_in_read_ == b.target_start_position_in_read_ &&
a.target_end_position_in_read_ == b.target_end_position_in_read_;
}

inline double percent_reciprocal_overlap(const claraparabricks::genomeworks::cudamapper::Overlap& a, const claraparabricks::genomeworks::cudamapper::Overlap& b)
{
if (a.query_read_id_ != b.query_read_id_ || a.target_read_id_ != b.target_read_id_ || a.relative_strand != b.relative_strand)
{
return 0.0;
}
int32_t query_overlap = std::min(a.query_end_position_in_read_, b.query_end_position_in_read_) - std::max(a.query_start_position_in_read_, b.query_start_position_in_read_);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This and similar should be position_in_read_t or even better number_of_basepairs_t

int32_t target_overlap;
if (a.relative_strand == claraparabricks::genomeworks::cudamapper::RelativeStrand::Forward && b.relative_strand == claraparabricks::genomeworks::cudamapper::RelativeStrand::Forward)
{
target_overlap = std::min(a.target_end_position_in_read_, b.target_end_position_in_read_) - std::max(a.target_start_position_in_read_, b.target_start_position_in_read_);
}
else
{
target_overlap = std::max(a.target_start_position_in_read_, b.target_start_position_in_read_) - std::min(a.target_end_position_in_read_, b.target_end_position_in_read_);
}

int32_t query_total_length = std::max(a.query_end_position_in_read_, b.query_end_position_in_read_) - std::min(a.query_start_position_in_read_, b.query_start_position_in_read_);
int32_t target_total_length;
if (a.relative_strand == claraparabricks::genomeworks::cudamapper::RelativeStrand::Forward && b.relative_strand == claraparabricks::genomeworks::cudamapper::RelativeStrand::Forward)
{
target_total_length = std::max(a.target_end_position_in_read_, b.target_end_position_in_read_) - std::min(a.target_start_position_in_read_, b.target_start_position_in_read_);
}
else
{
target_total_length = std::min(a.target_start_position_in_read_, b.target_start_position_in_read_) - std::max(a.target_end_position_in_read_, b.target_end_position_in_read_);
}
return static_cast<double>(query_overlap + target_overlap) / static_cast<double>(query_total_length + target_total_length);
}

bool overlaps_reciprocal(const claraparabricks::genomeworks::cudamapper::Overlap& a, const claraparabricks::genomeworks::cudamapper::Overlap& b, const double threshold)
{
return percent_reciprocal_overlap(a, b) > threshold;
}

// Reverse complement lookup table
static char complement_array[26] = {
84, 66, 71, 68, 69,
Expand Down Expand Up @@ -132,7 +177,7 @@ namespace genomeworks
namespace cudamapper
{

void Overlapper::post_process_overlaps(std::vector<Overlap>& overlaps, const bool drop_fused_overlaps)
void Overlapper::post_process_overlaps(std::vector<Overlap>& overlaps, const bool drop_fused_overlaps, const double max_reciprocal)
{
const auto num_overlaps = get_size(overlaps);
bool in_fuse = false;
Expand All @@ -143,7 +188,7 @@ void Overlapper::post_process_overlaps(std::vector<Overlap>& overlaps, const boo
int num_residues = 0;
Overlap prev_overlap;
std::vector<bool> drop_overlap_mask;
if (drop_fused_overlaps)
if (drop_fused_overlaps || max_reciprocal >= 0.0)
{
drop_overlap_mask.resize(overlaps.size());
}
Expand All @@ -152,6 +197,14 @@ void Overlapper::post_process_overlaps(std::vector<Overlap>& overlaps, const boo
{
prev_overlap = overlaps[i - 1];
const Overlap current_overlap = overlaps[i];
if (max_reciprocal == 0.0 && overlaps_identical(prev_overlap, current_overlap))
{
drop_overlap_mask[i - 1] = true;
}
else if (max_reciprocal > 0.0 && percent_reciprocal_overlap(prev_overlap, current_overlap))
{
drop_overlap_mask[i - 1] = true;
}
//Check if previous overlap can be merged into the current one
if (overlaps_mergable(prev_overlap, current_overlap))
{
Expand Down Expand Up @@ -229,7 +282,7 @@ void Overlapper::post_process_overlaps(std::vector<Overlap>& overlaps, const boo
overlaps.push_back(fused_overlap);
}

if (drop_fused_overlaps)
if (drop_fused_overlaps || max_reciprocal >= 0.0)
{
details::overlapper::drop_overlaps_by_mask(overlaps, drop_overlap_mask);
}
Expand All @@ -239,6 +292,27 @@ namespace details
{
namespace overlapper
{

void filter_self_mappings(std::vector<Overlap>& overlaps,
const io::FastaParser& query_parser,
const io::FastaParser& target_parser,
const double max_percent_overlap)
{

auto remove_self_helper = [&query_parser, &target_parser, &max_percent_overlap](const Overlap& o) {
claraparabricks::genomeworks::io::FastaSequence query = query_parser.get_sequence_by_id(o.query_read_id_);
claraparabricks::genomeworks::io::FastaSequence target = target_parser.get_sequence_by_id(o.target_read_id_);
if (query.name != target.name)
return false;
std::size_t read_len = query.seq.size();
std::int32_t overlap_length = abs(o.query_end_position_in_read_ - o.query_start_position_in_read_);
double percent_overlap = static_cast<double>(overlap_length) / static_cast<double>(read_len);
return percent_overlap >= max_percent_overlap;
};

overlaps.erase(std::remove_if(begin(overlaps), end(overlaps), remove_self_helper), end(overlaps));
}

void drop_overlaps_by_mask(std::vector<claraparabricks::genomeworks::cudamapper::Overlap>& overlaps, const std::vector<bool>& mask)
{
std::size_t i = 0;
Expand Down