From 04fcab203acb2aaeeed4c8726d30b02eccff11d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= Date: Wed, 13 Dec 2023 15:22:20 +0100 Subject: [PATCH 01/15] refactor: compare alignments between mapped reads only --- htsinfer/get_library_type.py | 61 +++++++++++++++++++++--------------- 1 file changed, 36 insertions(+), 25 deletions(-) diff --git a/htsinfer/get_library_type.py b/htsinfer/get_library_type.py index 4d4257c..545c4d2 100644 --- a/htsinfer/get_library_type.py +++ b/htsinfer/get_library_type.py @@ -144,6 +144,8 @@ def _align_mates(self): samfile1 = pysam.AlignmentFile(str(alignment_1), 'r') samfile2 = pysam.AlignmentFile(str(alignment_2), 'r') + seq_id1 = None + seq_id2 = None previous_seq_id1 = None previous_seq_id2 = None @@ -154,43 +156,52 @@ def _align_mates(self): concordant = 0 for read1 in samfile1: - seq_id1 = read1.query_name - if seq_id1 != previous_seq_id1 \ - and previous_seq_id1 is not None: - mate1.append(reads1.copy()) - reads1.clear() - if read1.reference_end: - reads1.append(read1) + if not read1.flag & (1 << 2) and isinstance(read1.query_name, str): + seq_id1 = read1.query_name + if ( + seq_id1 != previous_seq_id1 and + previous_seq_id1 is not None + ): + mate1.append(reads1.copy()) + reads1.clear() + if read1.reference_end: + reads1.append(read1) previous_seq_id1 = seq_id1 mate1.append(reads1.copy()) read_counter = 0 for read2 in samfile2: - seq_id2 = read2.query_name - if seq_id2 != previous_seq_id2 \ - and previous_seq_id2 is not None: - if self._compare_alignments(mate1[read_counter], reads2): - concordant += 1 - reads2.clear() - read_counter += 1 - if read2.reference_end: - reads2.append(read2) + if not read2.flag & (1 << 2) and isinstance(read2.query_name, str): + seq_id2 = read2.query_name + if seq_id2 != previous_seq_id2 \ + and previous_seq_id2 is not None: + if self._compare_alignments(mate1[read_counter], reads2): + concordant += 1 + reads2.clear() + read_counter += 1 + if read2.reference_end: + reads2.append(read2) previous_seq_id2 = seq_id2 if self._compare_alignments(mate1[read_counter], reads2): concordant += 1 - if (concordant / read_counter) >= self.cutoff: - self.results.relationship = ( - StatesTypeRelationship.split_mates - ) - self.mapping.library_type.relationship \ - = StatesTypeRelationship.split_mates - self.mapping.mapped = False - self.mapping.star_dirs = [] + if read_counter > 0: + if (concordant / read_counter) >= self.cutoff: + self.results.relationship = ( + StatesTypeRelationship.split_mates + ) + self.mapping.library_type.relationship \ + = StatesTypeRelationship.split_mates + self.mapping.mapped = False + self.mapping.star_dirs = [] + else: + self.results.relationship = ( + StatesTypeRelationship.not_mates + ) else: self.results.relationship = ( - StatesTypeRelationship.not_mates + StatesTypeRelationship.not_available ) samfile1.close() From 455e98368aac67a76f6103a4a9cb8f749545de8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= Date: Wed, 3 Jan 2024 13:33:28 +0100 Subject: [PATCH 02/15] fix: update get lib type test --- htsinfer/get_library_type.py | 5 +++-- tests/test_get_library_type.py | 35 +++++++++++++++++++++++++++++++--- 2 files changed, 35 insertions(+), 5 deletions(-) diff --git a/htsinfer/get_library_type.py b/htsinfer/get_library_type.py index 545c4d2..cc4bb1e 100644 --- a/htsinfer/get_library_type.py +++ b/htsinfer/get_library_type.py @@ -135,6 +135,7 @@ def _evaluate_mate_relationship( self.mapping.evaluate() self._align_mates() + # pylint: disable=R0912 def _align_mates(self): """Decide mate relationship by alignment.""" @@ -186,7 +187,7 @@ def _align_mates(self): if self._compare_alignments(mate1[read_counter], reads2): concordant += 1 - if read_counter > 0: + try: if (concordant / read_counter) >= self.cutoff: self.results.relationship = ( StatesTypeRelationship.split_mates @@ -199,7 +200,7 @@ def _align_mates(self): self.results.relationship = ( StatesTypeRelationship.not_mates ) - else: + except ZeroDivisionError: self.results.relationship = ( StatesTypeRelationship.not_available ) diff --git a/tests/test_get_library_type.py b/tests/test_get_library_type.py index dd52dfa..8555694 100644 --- a/tests/test_get_library_type.py +++ b/tests/test_get_library_type.py @@ -112,6 +112,35 @@ def test_evaluate_mate_relationship_split_mates(self): ) def test_evaluate_mate_relationship_not_mates(self, tmpdir): + """Test mate relationship evaluation logic with input files that are + mates, but the relationship is not enough to trigger split_mates. + """ + CONFIG.args.path_1_processed = FILE_MATE_1 + CONFIG.args.path_2_processed = FILE_MATE_2 + CONFIG.args.t_file_processed = FILE_TRANSCRIPTS + CONFIG.args.tmp_dir = tmpdir + MAPPING.paths = (FILE_MATE_1, FILE_MATE_2) + MAPPING.transcripts_file = FILE_TRANSCRIPTS + MAPPING.tmp_dir = tmpdir + + test_instance = GetLibType(config=CONFIG, mapping=MAPPING) + test_instance.results.file_1 = StatesType.not_available + test_instance.results.file_2 = StatesType.not_available + + # Set the cutoff such that it's not enough to trigger split_mates + test_instance.cutoff = 300 + + # Call the _evaluate_mate_relationship method + test_instance._evaluate_mate_relationship( + ids_1=["A", "B", "C"], ids_2=["A", "B", "C"] + ) + + assert ( + test_instance.results.relationship == + StatesTypeRelationship.not_mates + ) + + def test_evaluate_mate_relationship_not_available(self, tmpdir): """Test mate relationship evaluation logic with input files that are not mates from a paired-end library. """ @@ -124,12 +153,12 @@ def test_evaluate_mate_relationship_not_mates(self, tmpdir): MAPPING.tmp_dir = tmpdir test_instance = GetLibType(config=CONFIG, mapping=MAPPING) - test_instance.results.file_1 = StatesType.first_mate - test_instance.results.file_2 = StatesType.second_mate + test_instance.results.file_1 = StatesType.not_available + test_instance.results.file_2 = StatesType.not_available test_instance.evaluate() assert ( test_instance.results.relationship == - StatesTypeRelationship.not_mates + StatesTypeRelationship.not_available ) def test_evaluate_split_mates_not_matching_ids(self, tmpdir): From 806420d64f022ca912111a9c28baf155eb5d00b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= Date: Wed, 3 Jan 2024 15:24:25 +0100 Subject: [PATCH 03/15] refactor: helper function for getlibtype --- htsinfer/get_library_type.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/htsinfer/get_library_type.py b/htsinfer/get_library_type.py index cc4bb1e..825a66a 100644 --- a/htsinfer/get_library_type.py +++ b/htsinfer/get_library_type.py @@ -135,7 +135,6 @@ def _evaluate_mate_relationship( self.mapping.evaluate() self._align_mates() - # pylint: disable=R0912 def _align_mates(self): """Decide mate relationship by alignment.""" @@ -187,6 +186,13 @@ def _align_mates(self): if self._compare_alignments(mate1[read_counter], reads2): concordant += 1 + self._update_relationship(concordant, read_counter) + + samfile1.close() + samfile2.close() + + def _update_relationship(self, concordant, read_counter): + """Helper function to update relationship based on alignment.""" try: if (concordant / read_counter) >= self.cutoff: self.results.relationship = ( @@ -205,9 +211,6 @@ def _align_mates(self): StatesTypeRelationship.not_available ) - samfile1.close() - samfile2.close() - class AlignedSegment: """Placeholder class for mypy "Missing attribute" error in _compare_alignments(), the actual object used From 782971a5ea4e8020e521d315388ac7fc8814759e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= Date: Thu, 4 Jan 2024 18:27:48 +0100 Subject: [PATCH 04/15] feat: only map when lib source is inferred --- htsinfer/get_library_type.py | 8 +++++++- htsinfer/get_read_orientation.py | 9 ++++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/htsinfer/get_library_type.py b/htsinfer/get_library_type.py index 825a66a..0daa737 100644 --- a/htsinfer/get_library_type.py +++ b/htsinfer/get_library_type.py @@ -127,13 +127,19 @@ def _evaluate_mate_relationship( self.mapping.library_type.relationship = ( StatesTypeRelationship.split_mates ) - else: + elif self.library_source.file_1.short_name is not None \ + and self.library_source.file_2.short_name is not None: self.mapping.library_type.relationship \ = StatesTypeRelationship.not_available self.mapping.library_source = self.library_source self.mapping.paths = self.path_1, self.path_2 self.mapping.evaluate() self._align_mates() + else: + LOGGER.warning( + "Library source is not determined, " + "mate relationship cannot be inferred by alignment." + ) def _align_mates(self): """Decide mate relationship by alignment.""" diff --git a/htsinfer/get_read_orientation.py b/htsinfer/get_read_orientation.py index d76b092..c7dd1ff 100644 --- a/htsinfer/get_read_orientation.py +++ b/htsinfer/get_read_orientation.py @@ -75,8 +75,15 @@ def evaluate(self) -> ResultsOrientation: self.mapping.transcripts_file = self.transcripts_file self.mapping.tmp_dir = self.tmp_dir - if not self.mapping.mapped: + if not self.mapping.mapped \ + and self.library_source.file_1.short_name is not None \ + and self.library_source.file_2.short_name is not None: self.mapping.evaluate() + else: + LOGGER.warning( + "Library source is not determined, " + "read orientation cannot be inferred by alignment." + ) return self.process_alignments(star_dirs=self.mapping.star_dirs) From 28284debc61a48ef8c36c511a4c43525f41d8dcb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= Date: Fri, 5 Jan 2024 11:12:25 +0100 Subject: [PATCH 05/15] update tests --- htsinfer/get_read_orientation.py | 2 +- tests/test_get_library_type.py | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/htsinfer/get_read_orientation.py b/htsinfer/get_read_orientation.py index c7dd1ff..a9ccd36 100644 --- a/htsinfer/get_read_orientation.py +++ b/htsinfer/get_read_orientation.py @@ -77,7 +77,7 @@ def evaluate(self) -> ResultsOrientation: if not self.mapping.mapped \ and self.library_source.file_1.short_name is not None \ - and self.library_source.file_2.short_name is not None: + or self.library_source.file_2.short_name is not None: self.mapping.evaluate() else: LOGGER.warning( diff --git a/tests/test_get_library_type.py b/tests/test_get_library_type.py index 8555694..738840a 100644 --- a/tests/test_get_library_type.py +++ b/tests/test_get_library_type.py @@ -13,7 +13,9 @@ GetFastqType, ) from htsinfer.models import ( + ResultsSource, ResultsType, + Source, SeqIdFormats, StatesType, StatesTypeRelationship, @@ -147,6 +149,10 @@ def test_evaluate_mate_relationship_not_available(self, tmpdir): CONFIG.args.path_1_processed = FILE_IDS_NOT_MATCH_1 CONFIG.args.path_2_processed = FILE_MATE_2 CONFIG.args.t_file_processed = FILE_TRANSCRIPTS + CONFIG.results.library_source = ResultsSource( + file_1=Source(short_name="hsapiens", taxon_id=9606), + file_2=Source(short_name="hsapiens", taxon_id=9606), + ) CONFIG.args.tmp_dir = tmpdir MAPPING.paths = (FILE_IDS_NOT_MATCH_1, FILE_MATE_2) MAPPING.transcripts_file = FILE_TRANSCRIPTS @@ -167,6 +173,10 @@ def test_evaluate_split_mates_not_matching_ids(self, tmpdir): """ CONFIG.args.path_1_processed = FILE_IDS_NOT_MATCH_1 CONFIG.args.path_2_processed = FILE_IDS_NOT_MATCH_2 + CONFIG.results.library_source = ResultsSource( + file_1=Source(short_name="hsapiens", taxon_id=9606), + file_2=Source(short_name="hsapiens", taxon_id=9606), + ) CONFIG.args.tmp_dir = tmpdir MAPPING.paths = (FILE_IDS_NOT_MATCH_1, FILE_IDS_NOT_MATCH_2) MAPPING.tmp_dir = tmpdir From dae137d6c23f34bd6d037f7f4c3d9b8f34398ae1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= Date: Fri, 5 Jan 2024 11:45:40 +0100 Subject: [PATCH 06/15] fix orientation tests --- tests/test_get_read_orientation.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/test_get_read_orientation.py b/tests/test_get_read_orientation.py index 1089433..ddd9fb6 100644 --- a/tests/test_get_read_orientation.py +++ b/tests/test_get_read_orientation.py @@ -154,7 +154,10 @@ def test_evaluate_paired_unmapped(self, tmpdir): CONFIG.args.path_1_processed = FILE_UNMAPPED_PAIRED_1 CONFIG.args.path_2_processed = FILE_UNMAPPED_PAIRED_2 CONFIG.args.tmp_dir = tmpdir - CONFIG.results.library_source = ResultsSource() + CONFIG.results.library_source = ResultsSource( + file_1=Source(short_name="hsapiens", taxon_id=9606), + file_2=Source(short_name="hsapiens", taxon_id=9606) + ) CONFIG.results.library_type = ResultsType( relationship=StatesTypeRelationship.split_mates, ) @@ -259,7 +262,7 @@ def test_evaluate_paired_not_mates_unmapped(self, tmpdir): ) CONFIG.results.library_source = ResultsSource( file_1=Source(), - file_2=Source(), + file_2=Source(short_name="hsapiens", taxon_id=9606), ) CONFIG.args.tmp_dir = tmpdir MAPPING.mapped = False From bf7bea12885064946d70d79d9c454ce92c6f09e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= Date: Fri, 5 Jan 2024 13:41:41 +0100 Subject: [PATCH 07/15] fix orientation tests --- tests/test_get_read_orientation.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test_get_read_orientation.py b/tests/test_get_read_orientation.py index ddd9fb6..b73b8e2 100644 --- a/tests/test_get_read_orientation.py +++ b/tests/test_get_read_orientation.py @@ -73,6 +73,10 @@ def test_init_all(self, tmpdir): CONFIG.args.path_2_processed = FILE_MATE_2 CONFIG.args.t_file_processed = FILE_TRANSCRIPTS CONFIG.args.tmp_dir = tmpdir + CONFIG.results.library_source = ResultsSource( + file_1=Source(), + file_2=Source() + ) test_instance = GetOrientation(config=CONFIG, mapping=MAPPING) assert test_instance.paths[0] == FILE_MATE_1 From 5f8f62d113a970579b544a487f2ce08d265fd539 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= Date: Fri, 5 Jan 2024 15:48:55 +0100 Subject: [PATCH 08/15] refactor scripts --- htsinfer/get_library_type.py | 4 ++-- htsinfer/get_read_orientation.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/htsinfer/get_library_type.py b/htsinfer/get_library_type.py index 0daa737..0edfc36 100644 --- a/htsinfer/get_library_type.py +++ b/htsinfer/get_library_type.py @@ -127,8 +127,8 @@ def _evaluate_mate_relationship( self.mapping.library_type.relationship = ( StatesTypeRelationship.split_mates ) - elif self.library_source.file_1.short_name is not None \ - and self.library_source.file_2.short_name is not None: + elif (self.library_source.file_1.short_name is not None + and self.library_source.file_2.short_name is not None): self.mapping.library_type.relationship \ = StatesTypeRelationship.not_available self.mapping.library_source = self.library_source diff --git a/htsinfer/get_read_orientation.py b/htsinfer/get_read_orientation.py index a9ccd36..6e3c7a7 100644 --- a/htsinfer/get_read_orientation.py +++ b/htsinfer/get_read_orientation.py @@ -76,8 +76,8 @@ def evaluate(self) -> ResultsOrientation: self.mapping.tmp_dir = self.tmp_dir if not self.mapping.mapped \ - and self.library_source.file_1.short_name is not None \ - or self.library_source.file_2.short_name is not None: + and (self.library_source.file_1.short_name is not None + or self.library_source.file_2.short_name is not None): self.mapping.evaluate() else: LOGGER.warning( From 1e992e1cda3ec16c73d1704417e06310a7167fed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= Date: Mon, 8 Jan 2024 11:05:54 +0100 Subject: [PATCH 09/15] update get lib type and get read orient --- htsinfer/get_library_type.py | 65 +++++++++++++++++++------------- htsinfer/get_read_orientation.py | 2 +- tests/test_get_library_type.py | 6 --- 3 files changed, 40 insertions(+), 33 deletions(-) diff --git a/htsinfer/get_library_type.py b/htsinfer/get_library_type.py index 0edfc36..ca68c4e 100644 --- a/htsinfer/get_library_type.py +++ b/htsinfer/get_library_type.py @@ -113,7 +113,7 @@ def _evaluate_mate_relationship( ids_2: As `ids_1` but for the putative second mate file. """ self.results.relationship = StatesTypeRelationship.not_mates - if ids_1 == ids_2: + if ids_1 == ids_2 and len(ids_1) > 0 and len(ids_2) > 0: if ( self.results.file_1 == StatesType.first_mate and self.results.file_2 == StatesType.second_mate @@ -136,7 +136,7 @@ def _evaluate_mate_relationship( self.mapping.evaluate() self._align_mates() else: - LOGGER.warning( + LOGGER.debug( "Library source is not determined, " "mate relationship cannot be inferred by alignment." ) @@ -181,7 +181,9 @@ def _align_mates(self): seq_id2 = read2.query_name if seq_id2 != previous_seq_id2 \ and previous_seq_id2 is not None: - if self._compare_alignments(mate1[read_counter], reads2): + if read_counter < len(mate1) and self._compare_alignments( + mate1[read_counter], reads2 + ): concordant += 1 reads2.clear() read_counter += 1 @@ -189,9 +191,13 @@ def _align_mates(self): reads2.append(read2) previous_seq_id2 = seq_id2 - if self._compare_alignments(mate1[read_counter], reads2): + if read_counter < len(mate1) and self._compare_alignments( + mate1[read_counter], reads2 + ): concordant += 1 - + LOGGER.debug(f"Number of mapped reads file 1: {len(mate1)}") + LOGGER.debug(f"Number of mapped reads file 2: {read_counter}") + LOGGER.debug(f"Number of concordant reads: {concordant}") self._update_relationship(concordant, read_counter) samfile1.close() @@ -325,12 +331,14 @@ def evaluate(self) -> None: if self.seq_id_format is None: self.result = StatesType.not_available - raise MetadataWarning( + LOGGER.warning( "Could not determine sequence identifier format." ) - LOGGER.debug( - f"Sequence identifier format: {self.seq_id_format.name}" - ) + else: + LOGGER.debug( + "Sequence identifier format: " + f"{self.seq_id_format.name}" + ) # Ensure that remaining records are compatible with sequence # identifier format and library type determined from first @@ -339,28 +347,33 @@ def evaluate(self) -> None: "Checking consistency of remaining reads with initially " "determined identifier format and library type..." ) - for record in seq_iter: - records += 1 - try: - self._get_read_type( - seq_id=record[0], - regex=self.seq_id_format.value, - ) - except ( - InconsistentFastqIdentifiers, - UnknownFastqIdentifier, - ) as exc: - self.result = StatesType.not_available - raise MetadataWarning( - f"{type(exc).__name__}: {str(exc)}" - ) from exc + if self.seq_id_format is not None: + for record in seq_iter: + records += 1 + try: + self._get_read_type( + seq_id=record[0], + regex=self.seq_id_format.value, + ) + except ( + InconsistentFastqIdentifiers, + UnknownFastqIdentifier, + ) as exc: + self.result = StatesType.not_available + raise MetadataWarning( + f"{type(exc).__name__}: {str(exc)}" + ) from exc + LOGGER.debug(f"Total records processed: {records}") + else: + LOGGER.debug( + "Could not determine sequence identifier format. " + "Skipping consistency check for the remaining reads." + ) except (OSError, ValueError) as exc: self.result = StatesType.not_available raise FileProblem(f"{type(exc).__name__}: {str(exc)}") from exc - LOGGER.debug(f"Total records processed: {records}") - def _get_read_type( self, seq_id: str, diff --git a/htsinfer/get_read_orientation.py b/htsinfer/get_read_orientation.py index 6e3c7a7..93b7c01 100644 --- a/htsinfer/get_read_orientation.py +++ b/htsinfer/get_read_orientation.py @@ -80,7 +80,7 @@ def evaluate(self) -> ResultsOrientation: or self.library_source.file_2.short_name is not None): self.mapping.evaluate() else: - LOGGER.warning( + LOGGER.debug( "Library source is not determined, " "read orientation cannot be inferred by alignment." ) diff --git a/tests/test_get_library_type.py b/tests/test_get_library_type.py index 738840a..9682bf7 100644 --- a/tests/test_get_library_type.py +++ b/tests/test_get_library_type.py @@ -214,12 +214,6 @@ def test_evaluate_single(self): test_instance.evaluate() assert test_instance.result == StatesType.single - def test_evaluate_unknown_seq_id(self): - """Evaluate file with identifiers of an unknown format.""" - test_instance = GetFastqType(path=FILE_UNKNOWN_SEQ_ID) - with pytest.raises(MetadataWarning): - test_instance.evaluate() - def test_evaluate_inconsistent_identifiers_single_mate(self): """Raise ``MetadataWarning`` by passing a file with inconsistent identifiers, suggesting a single-end library first, then a paired-end From 3fbaceb511b6b3aedcebf0cd73217f763a428e95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= Date: Mon, 8 Jan 2024 14:02:31 +0100 Subject: [PATCH 10/15] refactor get lib type --- htsinfer/get_library_type.py | 14 ++++++++------ tests/test_get_library_type.py | 1 - 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/htsinfer/get_library_type.py b/htsinfer/get_library_type.py index ca68c4e..19665c9 100644 --- a/htsinfer/get_library_type.py +++ b/htsinfer/get_library_type.py @@ -206,7 +206,13 @@ def _align_mates(self): def _update_relationship(self, concordant, read_counter): """Helper function to update relationship based on alignment.""" try: - if (concordant / read_counter) >= self.cutoff: + ratio = concordant / read_counter + except ZeroDivisionError: + self.results.relationship = ( + StatesTypeRelationship.not_available + ) + else: + if ratio >= self.cutoff: self.results.relationship = ( StatesTypeRelationship.split_mates ) @@ -218,10 +224,6 @@ def _update_relationship(self, concordant, read_counter): self.results.relationship = ( StatesTypeRelationship.not_mates ) - except ZeroDivisionError: - self.results.relationship = ( - StatesTypeRelationship.not_available - ) class AlignedSegment: """Placeholder class for mypy "Missing attribute" @@ -331,7 +333,7 @@ def evaluate(self) -> None: if self.seq_id_format is None: self.result = StatesType.not_available - LOGGER.warning( + LOGGER.debug( "Could not determine sequence identifier format." ) else: diff --git a/tests/test_get_library_type.py b/tests/test_get_library_type.py index 9682bf7..2ae8c96 100644 --- a/tests/test_get_library_type.py +++ b/tests/test_get_library_type.py @@ -33,7 +33,6 @@ FILE_IDS_NOT_MATCH_2, FILE_TRANSCRIPTS, FILE_SINGLE, - FILE_UNKNOWN_SEQ_ID, RaiseError, SEQ_ID_DUMMY, SEQ_ID_MATE_1, From 70f425214c77be96c3cc01a38ced88cf2e0193fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= Date: Mon, 8 Jan 2024 16:16:25 +0100 Subject: [PATCH 11/15] refactor: update get lib type and tests --- htsinfer/get_library_type.py | 35 +++++++++-------- htsinfer/get_read_orientation.py | 7 ++-- tests/test_get_library_type.py | 65 +++++++++++++++++++++++++++++++- 3 files changed, 85 insertions(+), 22 deletions(-) diff --git a/htsinfer/get_library_type.py b/htsinfer/get_library_type.py index 19665c9..19d870f 100644 --- a/htsinfer/get_library_type.py +++ b/htsinfer/get_library_type.py @@ -127,8 +127,10 @@ def _evaluate_mate_relationship( self.mapping.library_type.relationship = ( StatesTypeRelationship.split_mates ) - elif (self.library_source.file_1.short_name is not None - and self.library_source.file_2.short_name is not None): + elif ( + self.library_source.file_1.short_name is not None and + self.library_source.file_2.short_name is not None + ): self.mapping.library_type.relationship \ = StatesTypeRelationship.not_available self.mapping.library_source = self.library_source @@ -136,6 +138,7 @@ def _evaluate_mate_relationship( self.mapping.evaluate() self._align_mates() else: + self.results.relationship = StatesTypeRelationship.not_available LOGGER.debug( "Library source is not determined, " "mate relationship cannot be inferred by alignment." @@ -195,8 +198,8 @@ def _align_mates(self): mate1[read_counter], reads2 ): concordant += 1 - LOGGER.debug(f"Number of mapped reads file 1: {len(mate1)}") - LOGGER.debug(f"Number of mapped reads file 2: {read_counter}") + LOGGER.debug(f"Number of aligned reads file 1: {len(mate1)}") + LOGGER.debug(f"Number of aligned reads file 2: {read_counter}") LOGGER.debug(f"Number of concordant reads: {concordant}") self._update_relationship(concordant, read_counter) @@ -331,25 +334,26 @@ def evaluate(self) -> None: self.result = StatesType.not_available raise FileProblem(f"File is empty: {self.path}") from exc - if self.seq_id_format is None: - self.result = StatesType.not_available + if self.seq_id_format is not None: LOGGER.debug( - "Could not determine sequence identifier format." + "Sequence identifier format: " + f"{self.seq_id_format.name}" ) else: + self.result = StatesType.not_available LOGGER.debug( - "Sequence identifier format: " - f"{self.seq_id_format.name}" + "Could not determine sequence identifier format." ) # Ensure that remaining records are compatible with sequence # identifier format and library type determined from first # record - LOGGER.debug( - "Checking consistency of remaining reads with initially " - "determined identifier format and library type..." - ) if self.seq_id_format is not None: + LOGGER.debug( + "Checking consistency of remaining reads with " + "initially determined identifier format " + "and library type..." + ) for record in seq_iter: records += 1 try: @@ -366,11 +370,6 @@ def evaluate(self) -> None: f"{type(exc).__name__}: {str(exc)}" ) from exc LOGGER.debug(f"Total records processed: {records}") - else: - LOGGER.debug( - "Could not determine sequence identifier format. " - "Skipping consistency check for the remaining reads." - ) except (OSError, ValueError) as exc: self.result = StatesType.not_available diff --git a/htsinfer/get_read_orientation.py b/htsinfer/get_read_orientation.py index 93b7c01..f66794b 100644 --- a/htsinfer/get_read_orientation.py +++ b/htsinfer/get_read_orientation.py @@ -75,9 +75,10 @@ def evaluate(self) -> ResultsOrientation: self.mapping.transcripts_file = self.transcripts_file self.mapping.tmp_dir = self.tmp_dir - if not self.mapping.mapped \ - and (self.library_source.file_1.short_name is not None - or self.library_source.file_2.short_name is not None): + if not self.mapping.mapped and ( + self.library_source.file_1.short_name is not None or + self.library_source.file_2.short_name is not None + ): self.mapping.evaluate() else: LOGGER.debug( diff --git a/tests/test_get_library_type.py b/tests/test_get_library_type.py index 2ae8c96..e1a0dce 100644 --- a/tests/test_get_library_type.py +++ b/tests/test_get_library_type.py @@ -29,6 +29,7 @@ FILE_INCONSISTENT_IDS_SINGLE_OLD_NEW, FILE_MATE_1, FILE_MATE_2, + FILE_UNKNOWN_SEQ_ID, FILE_IDS_NOT_MATCH_1, FILE_IDS_NOT_MATCH_2, FILE_TRANSCRIPTS, @@ -148,11 +149,11 @@ def test_evaluate_mate_relationship_not_available(self, tmpdir): CONFIG.args.path_1_processed = FILE_IDS_NOT_MATCH_1 CONFIG.args.path_2_processed = FILE_MATE_2 CONFIG.args.t_file_processed = FILE_TRANSCRIPTS + CONFIG.args.tmp_dir = tmpdir CONFIG.results.library_source = ResultsSource( file_1=Source(short_name="hsapiens", taxon_id=9606), file_2=Source(short_name="hsapiens", taxon_id=9606), ) - CONFIG.args.tmp_dir = tmpdir MAPPING.paths = (FILE_IDS_NOT_MATCH_1, FILE_MATE_2) MAPPING.transcripts_file = FILE_TRANSCRIPTS MAPPING.tmp_dir = tmpdir @@ -166,6 +167,62 @@ def test_evaluate_mate_relationship_not_available(self, tmpdir): StatesTypeRelationship.not_available ) + def test_update_relationship_not_mates(self, tmpdir): + """Test update_relationship logic.""" + CONFIG.args.path_1_processed = FILE_IDS_NOT_MATCH_1 + CONFIG.args.path_2_processed = FILE_MATE_2 + CONFIG.args.t_file_processed = FILE_TRANSCRIPTS + CONFIG.args.tmp_dir = tmpdir + MAPPING.paths = (FILE_IDS_NOT_MATCH_1, FILE_MATE_2) + MAPPING.transcripts_file = FILE_TRANSCRIPTS + MAPPING.tmp_dir = tmpdir + + test_instance = GetLibType(config=CONFIG, mapping=MAPPING) + test_instance.results.file_1 = StatesType.not_available + test_instance.results.file_2 = StatesType.not_available + + # Simulate a scenario where ratio is below the cutoff + concordant = 0 + read_counter = 20 + + # Call the _update_relationship method + test_instance._update_relationship(concordant, read_counter) + + assert ( + test_instance.results.relationship == + StatesTypeRelationship.not_mates + ) + assert ( + test_instance.mapping.library_type.relationship == + StatesTypeRelationship.not_available + ) + + def test_evaluate_mate_relationship_not_determined(self, tmpdir): + """Test mate relationship evaluation logic when + library source is not determined. + """ + CONFIG.args.path_1_processed = FILE_MATE_1 + CONFIG.args.path_2_processed = FILE_MATE_2 + CONFIG.args.t_file_processed = FILE_TRANSCRIPTS + CONFIG.args.tmp_dir = tmpdir + CONFIG.results.library_source = ResultsSource( + file_1=Source(), + file_2=Source(), + ) + test_instance = GetLibType(config=CONFIG, mapping=MAPPING) + test_instance.results.file_1 = StatesType.not_available + test_instance.results.file_2 = StatesType.not_available + + # Call the _evaluate_mate_relationship method + test_instance._evaluate_mate_relationship( + ids_1=["A", "B", "C"], ids_2=["D", "E", "F"] + ) + + assert ( + test_instance.results.relationship == + StatesTypeRelationship.not_available + ) + def test_evaluate_split_mates_not_matching_ids(self, tmpdir): """Test mate relationship evaluation logic with input files that are not mates from a paired-end library. @@ -304,6 +361,12 @@ def test_get_read_type_no_match(self): regex=SeqIdFormats['Casava >=1.8'].value, ) + def test_evaluate_unknown_identifier_format(self): + """Test scenario where seq_id format cannot be determined.""" + test_instance = GetFastqType(path=FILE_UNKNOWN_SEQ_ID) + test_instance.evaluate() + assert test_instance.result == StatesType.not_available + def test_get_read_type_single_pass(self): """Read identifier is consistent with previous state.""" test_instance = GetFastqType(path=FILE_DUMMY) From b5b80ca72ef69276fdffb257caff4bb850ede1d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= Date: Mon, 15 Jan 2024 14:00:41 +0100 Subject: [PATCH 12/15] refactor: update lib type and orientation --- htsinfer/get_library_type.py | 12 +++++++----- htsinfer/get_read_orientation.py | 3 +-- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/htsinfer/get_library_type.py b/htsinfer/get_library_type.py index 19d870f..3781179 100644 --- a/htsinfer/get_library_type.py +++ b/htsinfer/get_library_type.py @@ -113,7 +113,7 @@ def _evaluate_mate_relationship( ids_2: As `ids_1` but for the putative second mate file. """ self.results.relationship = StatesTypeRelationship.not_mates - if ids_1 == ids_2 and len(ids_1) > 0 and len(ids_2) > 0: + if ids_1 and ids_2 and ids_1 == ids_2: if ( self.results.file_1 == StatesType.first_mate and self.results.file_2 == StatesType.second_mate @@ -140,8 +140,7 @@ def _evaluate_mate_relationship( else: self.results.relationship = StatesTypeRelationship.not_available LOGGER.debug( - "Library source is not determined, " - "mate relationship cannot be inferred by alignment." + "Mate relationship cannot be determined." ) def _align_mates(self): @@ -165,6 +164,7 @@ def _align_mates(self): concordant = 0 for read1 in samfile1: + # ensure that "unmapped" flag is not set and query name is set if not read1.flag & (1 << 2) and isinstance(read1.query_name, str): seq_id1 = read1.query_name if ( @@ -180,6 +180,7 @@ def _align_mates(self): read_counter = 0 for read2 in samfile2: + # ensure that "unmapped" flag is not set and query name is set if not read2.flag & (1 << 2) and isinstance(read2.query_name, str): seq_id2 = read2.query_name if seq_id2 != previous_seq_id2 \ @@ -219,8 +220,9 @@ def _update_relationship(self, concordant, read_counter): self.results.relationship = ( StatesTypeRelationship.split_mates ) - self.mapping.library_type.relationship \ - = StatesTypeRelationship.split_mates + self.mapping.library_type.relationship = ( + StatesTypeRelationship.split_mates + ) self.mapping.mapped = False self.mapping.star_dirs = [] else: diff --git a/htsinfer/get_read_orientation.py b/htsinfer/get_read_orientation.py index f66794b..b5cce78 100644 --- a/htsinfer/get_read_orientation.py +++ b/htsinfer/get_read_orientation.py @@ -82,8 +82,7 @@ def evaluate(self) -> ResultsOrientation: self.mapping.evaluate() else: LOGGER.debug( - "Library source is not determined, " - "read orientation cannot be inferred by alignment." + "Read orientation cannot be determined." ) return self.process_alignments(star_dirs=self.mapping.star_dirs) From 31af8ec271d1ade32cd792ea1de51e53c87a16c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= Date: Wed, 17 Jan 2024 11:46:07 +0100 Subject: [PATCH 13/15] refactor: update concordant read counting, mapping --- htsinfer/get_library_type.py | 71 ++++++++++++++++++++---------------- htsinfer/mapping.py | 1 + tests/test_mapping.py | 3 +- 3 files changed, 42 insertions(+), 33 deletions(-) diff --git a/htsinfer/get_library_type.py b/htsinfer/get_library_type.py index 3781179..3dd562c 100644 --- a/htsinfer/get_library_type.py +++ b/htsinfer/get_library_type.py @@ -158,59 +158,66 @@ def _align_mates(self): previous_seq_id2 = None reads1 = [] # List to store alignments for one read from file1 - mate1 = [] # List to store alignments for each read + mate1 = [] # List to store alignments for each read from file1 reads2 = [] # List to store alignments for one read from file2 + mate2 = [] # List to store alignments for each read from file2 concordant = 0 for read1 in samfile1: - # ensure that "unmapped" flag is not set and query name is set - if not read1.flag & (1 << 2) and isinstance(read1.query_name, str): - seq_id1 = read1.query_name - if ( - seq_id1 != previous_seq_id1 and - previous_seq_id1 is not None - ): - mate1.append(reads1.copy()) - reads1.clear() - if read1.reference_end: - reads1.append(read1) + seq_id1 = read1.query_name + if ( + seq_id1 != previous_seq_id1 and + previous_seq_id1 is not None + ): + mate1.append(reads1.copy()) + reads1.clear() + if read1.reference_end: + reads1.append(read1) previous_seq_id1 = seq_id1 mate1.append(reads1.copy()) read_counter = 0 for read2 in samfile2: - # ensure that "unmapped" flag is not set and query name is set - if not read2.flag & (1 << 2) and isinstance(read2.query_name, str): - seq_id2 = read2.query_name - if seq_id2 != previous_seq_id2 \ - and previous_seq_id2 is not None: - if read_counter < len(mate1) and self._compare_alignments( - mate1[read_counter], reads2 - ): - concordant += 1 - reads2.clear() - read_counter += 1 - if read2.reference_end: - reads2.append(read2) + seq_id2 = read2.query_name + if ( + seq_id2 != previous_seq_id2 and + previous_seq_id2 is not None + ): + mate2.append(reads2.copy()) + if self._compare_alignments( + mate1[read_counter], reads2 + ): + concordant += 1 + reads2.clear() + read_counter += 1 + if read2.reference_end: + reads2.append(read2) previous_seq_id2 = seq_id2 - - if read_counter < len(mate1) and self._compare_alignments( + mate2.append(reads2.copy()) + if self._compare_alignments( mate1[read_counter], reads2 ): concordant += 1 - LOGGER.debug(f"Number of aligned reads file 1: {len(mate1)}") - LOGGER.debug(f"Number of aligned reads file 2: {read_counter}") + + aligned_mate1 = len(list(filter(None, mate1))) + aligned_mate2 = len(list(filter(None, mate2))) + + LOGGER.debug(f"Number of aligned reads file 1: {aligned_mate1}") + LOGGER.debug(f"Number of aligned reads file 2: {aligned_mate2}") LOGGER.debug(f"Number of concordant reads: {concordant}") - self._update_relationship(concordant, read_counter) + + self._update_relationship( + concordant, min(aligned_mate1, aligned_mate2) + ) samfile1.close() samfile2.close() - def _update_relationship(self, concordant, read_counter): + def _update_relationship(self, concordant, aligned_reads): """Helper function to update relationship based on alignment.""" try: - ratio = concordant / read_counter + ratio = concordant / aligned_reads except ZeroDivisionError: self.results.relationship = ( StatesTypeRelationship.not_available diff --git a/htsinfer/mapping.py b/htsinfer/mapping.py index cb53f7a..e1cef46 100644 --- a/htsinfer/mapping.py +++ b/htsinfer/mapping.py @@ -299,6 +299,7 @@ def build_star_command( "--runThreadN", f"{str(self.threads_star)}", "--genomeDir", f"{str(index_dir)}", "--outFilterMultimapNmax", "50", + "--outSAMorder", "PairedKeepInputOrder", "--outSAMunmapped", "Within", "KeepPairs", ] cmd: List[str] = cmd_base[:] diff --git a/tests/test_mapping.py b/tests/test_mapping.py index 33798be..1bdecbb 100644 --- a/tests/test_mapping.py +++ b/tests/test_mapping.py @@ -187,7 +187,8 @@ def test_prepare_star_alignment_commands(self, tmpdir): file1_alignment_path = tmpdir / 'alignments/file_1' cmd = "STAR --alignIntronMax 1 --alignEndsType Local --runThreadN 1" \ + " --genomeDir " + str(index_dir) + " --outFilterMultimapNmax " \ - + "50 --outSAMunmapped Within KeepPairs --readFilesIn " \ + + "50 --outSAMorder PairedKeepInputOrder " \ + + "--outSAMunmapped Within KeepPairs --readFilesIn " \ + str(FILE_2000_RECORDS) + " --outFileNamePrefix " \ + str(file1_alignment_path) + "/" results = test_instance.prepare_star_alignment_commands( From b90893054d0a350ef7a2557765bb6db526cf2885 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= Date: Wed, 17 Jan 2024 13:59:24 +0100 Subject: [PATCH 14/15] update debug messages --- htsinfer/get_library_type.py | 6 ++++-- htsinfer/get_read_orientation.py | 5 +---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/htsinfer/get_library_type.py b/htsinfer/get_library_type.py index 3dd562c..37dad4f 100644 --- a/htsinfer/get_library_type.py +++ b/htsinfer/get_library_type.py @@ -128,9 +128,10 @@ def _evaluate_mate_relationship( StatesTypeRelationship.split_mates ) elif ( - self.library_source.file_1.short_name is not None and + self.library_source.file_1.short_name is not None or self.library_source.file_2.short_name is not None ): + LOGGER.debug("Determining mate relationship by alignment...") self.mapping.library_type.relationship \ = StatesTypeRelationship.not_available self.mapping.library_source = self.library_source @@ -140,7 +141,8 @@ def _evaluate_mate_relationship( else: self.results.relationship = StatesTypeRelationship.not_available LOGGER.debug( - "Mate relationship cannot be determined." + "Sequence IDs and library source are not determined, " + "mate relationship cannot be inferred." ) def _align_mates(self): diff --git a/htsinfer/get_read_orientation.py b/htsinfer/get_read_orientation.py index b5cce78..4b36e8b 100644 --- a/htsinfer/get_read_orientation.py +++ b/htsinfer/get_read_orientation.py @@ -79,11 +79,8 @@ def evaluate(self) -> ResultsOrientation: self.library_source.file_1.short_name is not None or self.library_source.file_2.short_name is not None ): + LOGGER.debug("Determining read relationship by alignment...") self.mapping.evaluate() - else: - LOGGER.debug( - "Read orientation cannot be determined." - ) return self.process_alignments(star_dirs=self.mapping.star_dirs) From 8fe0cbe78965f54f7c293fccded1801f6367b322 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= Date: Thu, 18 Jan 2024 11:10:07 +0100 Subject: [PATCH 15/15] update comments in mapping --- htsinfer/mapping.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/htsinfer/mapping.py b/htsinfer/mapping.py index e1cef46..52780f3 100644 --- a/htsinfer/mapping.py +++ b/htsinfer/mapping.py @@ -51,11 +51,7 @@ def __init__( self.star_dirs: List[Path] = [] def evaluate(self): - """Infer read orientation. - - Returns: - Orientation results object. - """ + """Align FASTQ files to reference transcripts with STAR.""" # get transcripts for current organims transcripts = self.subset_transcripts_by_organism() @@ -270,6 +266,10 @@ def prepare_star_alignment_commands( ) -> Dict[Path, List[str]]: """Prepare STAR alignment commands. + Input FASTQ files are assumed to be sorted according to reference names + or coordinates, the order of input reads is kept with the option + "PairedKeepInputOrder", no additional sorting of aligned reads is done. + Args: index_dir: Path to directory containing STAR index.