diff --git a/README.md b/README.md index a3d3a91..8d9f9aa 100644 --- a/README.md +++ b/README.md @@ -65,6 +65,8 @@ ClairS is now available for early access to interested and experienced users. Yo ## Latest Updates +*v0.1.2 (May 17)* : Added HiFi Revio model, renamed HiFi Sequel II model from `hifi` to `hifi_sequel2`. + *v0.1.1 (Apr 30)* : 1. Added the "command line used" to VCF header. 2. Added `NAU`, `NCU`, `NGU`, and `NTU` tags (#reads supporting the four bases in normal) to the output. 3. Hybrid calling mode now outputs three VCFs, ClairS somatic variant calls, Clair3 normal germline variant calls, and Clair3 tumor germline variant calls. 4. Added the `--enable_clair3_germline_output` option to also output Clair3 normal germline variant calls, and Clair3 tumor germline variant calls (even when hybrid calling mode is not enabled). Running time will increase by ~40%. *v0.1.0 (Mar 24)* : 1. Added support for Indel calling. ClairS Indel calling currently only supports ONT R10 data. To enable, use the `--enable_indel_calling` option. The Indel F1-score is ~73% with 50x/50x HCC1395/BL data. 2. Added an experimental `--normal_vcf_fn` to skip germline variant calling on normal BAM ([#7](https://github.com/HKU-BAL/ClairS/pull/7), contributor @[Xingyao](https://github.com/xingyaoc)). 3. Added `--hybrid_mode_vcf_fn` option to enable hybrid calling mode that combines de novo calling results and genotyping results without running the tool twice. Renamed the `--vcf_fn` to `--genotyping_mode_vcf_fn` for clarification. 4. Fixed a memory issue, memory consumption is now sub 100G for high coverage samples. 5. Fixed a conda environment issue in Singularity ([#3](https://github.com/HKU-BAL/ClairS/issues/3)). 6. Fixed zero division when no SNV was found ([#2](https://github.com/HKU-BAL/ClairS/issues/2), [#5](https://github.com/HKU-BAL/ClairS/issues/5)). 7. Added `AD` tag in the output. 
diff --git a/clairs/utils.py b/clairs/utils.py index 5b0243e..ee98496 100644 --- a/clairs/utils.py +++ b/clairs/utils.py @@ -504,6 +504,10 @@ def get_training_array(args, label = [0, 1, 0] else: label = [1, 0, 0] + + if args.use_reference_candidates_only and label[0] != 1: + continue + total_compressed = write_table_dict(table_dict=table_dict, normal_matrix=normal_tensor, tumor_matrix=tumor_tensor, diff --git a/src/create_bin.py b/src/create_bin.py index 79a7a45..74d8bf0 100644 --- a/src/create_bin.py +++ b/src/create_bin.py @@ -118,7 +118,7 @@ def main(): parser.add_argument('--phase_tumor', type=str2bool, default=False, help=SUPPRESS) - parser.add_argument('--ref_only', type=str2bool, default=False, + parser.add_argument('--use_reference_candidates_only', type=str2bool, default=False, help=SUPPRESS) args = parser.parse_args() diff --git a/src/get_candidates.py b/src/get_candidates.py index aea06bf..e4c396d 100644 --- a/src/get_candidates.py +++ b/src/get_candidates.py @@ -458,7 +458,7 @@ def get_candidates(args): random.seed(0) hetero_germline = random.sample(hetero_germline, int(len(hetero_germline) * maximum_non_variant_ratio)) - if args.ref_only: + if args.use_reference_candidates_only: homo_germline = [] hetero_germline = [] @@ -470,7 +470,7 @@ def get_candidates(args): # skip hetero variant here hetero_somatic = [(item, 'hetero_somatic') for item in hetero_somatic_set] if add_hetero_pos else [] - if args.ref_only: + if args.use_reference_candidates_only: homo_somatic = [] hetero_somatic = [] @@ -654,7 +654,7 @@ def main(): parser.add_argument('--exclude_flanking_truth', type=str2bool, default=1, help="Exclude truths in a flanking window into training") - parser.add_argument('--ref_only', type=str2bool, default=0, + parser.add_argument('--use_reference_candidates_only', type=str2bool, default=0, help="Exclude truths in a flanking window into training") ## Output VCF path