Skip to content

Commit

Permalink
r1105: avoid long running time occasionally (lh3#771)
Browse files Browse the repository at this point in the history
Caused by highly repetitive minimizers on a query sequence. The solution is to
filter out these query minimizers.
  • Loading branch information
lh3 committed Aug 15, 2021
1 parent e3d1dc6 commit 6c5bc22
Show file tree
Hide file tree
Showing 8 changed files with 40 additions and 2 deletions.
4 changes: 3 additions & 1 deletion main.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
#include "mmpriv.h"
#include "ketopt.h"

#define MM_VERSION "2.22-r1101"
#define MM_VERSION "2.22-r1105-dirty"

#ifdef __linux__
#include <sys/resource.h>
Expand Down Expand Up @@ -74,6 +74,7 @@ static ko_longopt_t long_options[] = {
{ "rmq", ko_optional_argument, 347 },
{ "qstrand", ko_no_argument, 348 },
{ "cap-kalloc", ko_required_argument, 349 },
{ "q-occ-frac", ko_required_argument, 350 },
{ "help", ko_no_argument, 'h' },
{ "max-intron-len", ko_required_argument, 'G' },
{ "version", ko_no_argument, 'V' },
Expand Down Expand Up @@ -229,6 +230,7 @@ int main(int argc, char *argv[])
else if (c == 346) opt.mask_len = mm_parse_num(o.arg); // --mask-len
else if (c == 348) opt.flag |= MM_F_QSTRAND | MM_F_NO_INV; // --qstrand
else if (c == 349) opt.cap_kalloc = mm_parse_num(o.arg); // --cap-kalloc
else if (c == 350) opt.q_occ_frac = atof(o.arg); // --q-occ-frac
else if (c == 330) {
fprintf(stderr, "[WARNING] \033[1;31m --lj-min-ratio has been deprecated.\033[0m\n");
} else if (c == 314) { // --frag
Expand Down
1 change: 1 addition & 0 deletions map.c
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,7 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **
hash = __ac_Wang_hash(hash);

collect_minimizers(b->km, opt, mi, n_segs, qlens, seqs, &mv);
if (opt->q_occ_frac > 0.0f) mm_seed_mz_flt(b->km, &mv, opt->mid_occ, opt->q_occ_frac);
if (opt->flag & MM_F_HEAP_SORT) a = collect_seed_hits_heap(b->km, opt, opt->mid_occ, mi, qname, &mv, qlen_sum, &n_a, &rep_len, &n_mini_pos, &mini_pos);
else a = collect_seed_hits(b->km, opt, opt->mid_occ, mi, qname, &mv, qlen_sum, &n_a, &rep_len, &n_mini_pos, &mini_pos);

Expand Down
1 change: 1 addition & 0 deletions minimap.h
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@ typedef struct {
int pe_ori, pe_bonus;

float mid_occ_frac; // only used by mm_mapopt_update(); see below
float q_occ_frac;
int32_t min_mid_occ, max_mid_occ;
int32_t mid_occ; // ignore seeds with occurrences above this threshold
int32_t max_occ, max_max_occ, occ_dist;
Expand Down
8 changes: 7 additions & 1 deletion minimap2.1
Original file line number Diff line number Diff line change
Expand Up @@ -151,10 +151,16 @@ Lower and upper bounds of k-mer occurrences [10,1000000]. The final k-mer occurr
.BR -f }}.
This option prevents excessively small or large
.B -f
estimated from the input reference. It deprecates
estimated from the input reference. Available since r1034 and deprecating
.B --min-occ-floor
in earlier versions of minimap2.
.TP
.BI --q-occ-frac \ FLOAT
Discard a query minimizer if its occurrence is higher than
.I FLOAT
fraction of query minimizers and than the reference occurrence threshold
[0.02]. Set 0 to disable. Available since r1105.
.TP
.BI -e \ INT
Sample a high-frequency minimizer every
.I INT
Expand Down
1 change: 1 addition & 0 deletions mmpriv.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk);
void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, int is_hpc, mm128_v *p);

mm_seed_t *mm_collect_matches(void *km, int *_n_m, int qlen, int max_occ, int max_max_occ, int dist, const mm_idx_t *mi, const mm128_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, uint64_t **mini_pos);
void mm_seed_mz_flt(void *km, mm128_v *mv, int32_t q_occ_max, float q_occ_frac);

double mm_event_identity(const mm_reg1_t *r);
int mm_write_sam_hdr(const mm_idx_t *mi, const char *rg, const char *ver, int argc, char *argv[]);
Expand Down
1 change: 1 addition & 0 deletions options.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ void mm_mapopt_init(mm_mapopt_t *opt)
opt->min_mid_occ = 10;
opt->max_mid_occ = 1000000;
opt->sdust_thres = 0; // no SDUST masking
opt->q_occ_frac = 0.02f;

opt->min_cnt = 3;
opt->min_chain_score = 40;
Expand Down
1 change: 1 addition & 0 deletions python/cmappy.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ cdef extern from "minimap.h":
int pe_ori, pe_bonus

float mid_occ_frac
float q_occ_frac
int32_t min_mid_occ
int32_t mid_occ
int32_t max_occ
Expand Down
25 changes: 25 additions & 0 deletions seed.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,31 @@
#include "kalloc.h"
#include "ksort.h"

void mm_seed_mz_flt(void *km, mm128_v *mv, int32_t q_occ_max, float q_occ_frac)
{
mm128_t *a;
size_t i, j, st;
if (mv->n <= q_occ_max || q_occ_frac <= 0.0f || q_occ_max <= 0) return;
KMALLOC(km, a, mv->n);
for (i = 0; i < mv->n; ++i)
a[i].x = mv->a[i].x, a[i].y = i;
radix_sort_128x(a, a + mv->n);
for (st = 0, i = 1; i <= mv->n; ++i) {
if (i == mv->n || a[i].x != a[st].x) {
int32_t cnt = i - st;
if (cnt > q_occ_max && cnt > mv->n * q_occ_frac)
for (j = st; j < i; ++j)
mv->a[a[j].y].x = 0;
st = i;
}
}
kfree(km, a);
for (i = j = 0; i < mv->n; ++i)
if (mv->a[i].x != 0)
mv->a[j++] = mv->a[i];
mv->n = j;
}

mm_seed_t *mm_seed_collect_all(void *km, const mm_idx_t *mi, const mm128_v *mv, int32_t *n_m_)
{
mm_seed_t *m;
Expand Down

0 comments on commit 6c5bc22

Please sign in to comment.