Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: infer library type relationship #157

Merged
merged 15 commits into from
Jan 21, 2024
Merged
149 changes: 92 additions & 57 deletions htsinfer/get_library_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ def _evaluate_mate_relationship(
ids_2: As `ids_1` but for the putative second mate file.
"""
self.results.relationship = StatesTypeRelationship.not_mates
if ids_1 == ids_2:
if ids_1 == ids_2 and len(ids_1) > 0 and len(ids_2) > 0:
balajtimate marked this conversation as resolved.
Show resolved Hide resolved
if (
self.results.file_1 == StatesType.first_mate and
self.results.file_2 == StatesType.second_mate
Expand All @@ -127,13 +127,22 @@ def _evaluate_mate_relationship(
self.mapping.library_type.relationship = (
StatesTypeRelationship.split_mates
)
else:
elif (
self.library_source.file_1.short_name is not None and
self.library_source.file_2.short_name is not None
):
self.mapping.library_type.relationship \
= StatesTypeRelationship.not_available
self.mapping.library_source = self.library_source
self.mapping.paths = self.path_1, self.path_2
self.mapping.evaluate()
self._align_mates()
else:
self.results.relationship = StatesTypeRelationship.not_available
LOGGER.debug(
"Library source is not determined, "
"mate relationship cannot be inferred by alignment."
)
balajtimate marked this conversation as resolved.
Show resolved Hide resolved

def _align_mates(self):
"""Decide mate relationship by alignment."""
Expand All @@ -144,6 +153,8 @@ def _align_mates(self):
samfile1 = pysam.AlignmentFile(str(alignment_1), 'r')
samfile2 = pysam.AlignmentFile(str(alignment_2), 'r')

seq_id1 = None
seq_id2 = None
previous_seq_id1 = None
previous_seq_id2 = None

Expand All @@ -154,47 +165,68 @@ def _align_mates(self):
concordant = 0

for read1 in samfile1:
seq_id1 = read1.query_name
if seq_id1 != previous_seq_id1 \
and previous_seq_id1 is not None:
mate1.append(reads1.copy())
reads1.clear()
if read1.reference_end:
reads1.append(read1)
if not read1.flag & (1 << 2) and isinstance(read1.query_name, str):
balajtimate marked this conversation as resolved.
Show resolved Hide resolved
seq_id1 = read1.query_name
if (
seq_id1 != previous_seq_id1 and
previous_seq_id1 is not None
):
mate1.append(reads1.copy())
reads1.clear()
if read1.reference_end:
reads1.append(read1)
previous_seq_id1 = seq_id1
mate1.append(reads1.copy())

read_counter = 0
for read2 in samfile2:
seq_id2 = read2.query_name
if seq_id2 != previous_seq_id2 \
and previous_seq_id2 is not None:
if self._compare_alignments(mate1[read_counter], reads2):
concordant += 1
reads2.clear()
read_counter += 1
if read2.reference_end:
reads2.append(read2)
if not read2.flag & (1 << 2) and isinstance(read2.query_name, str):
seq_id2 = read2.query_name
if seq_id2 != previous_seq_id2 \
and previous_seq_id2 is not None:
if read_counter < len(mate1) and self._compare_alignments(
mate1[read_counter], reads2
):
concordant += 1
reads2.clear()
read_counter += 1
if read2.reference_end:
reads2.append(read2)
previous_seq_id2 = seq_id2

if self._compare_alignments(mate1[read_counter], reads2):
if read_counter < len(mate1) and self._compare_alignments(
balajtimate marked this conversation as resolved.
Show resolved Hide resolved
mate1[read_counter], reads2
):
concordant += 1
LOGGER.debug(f"Number of aligned reads file 1: {len(mate1)}")
LOGGER.debug(f"Number of aligned reads file 2: {read_counter}")
LOGGER.debug(f"Number of concordant reads: {concordant}")
self._update_relationship(concordant, read_counter)

if (concordant / read_counter) >= self.cutoff:
samfile1.close()
samfile2.close()

def _update_relationship(self, concordant, read_counter):
"""Helper function to update relationship based on alignment."""
try:
ratio = concordant / read_counter
except ZeroDivisionError:
self.results.relationship = (
StatesTypeRelationship.split_mates
StatesTypeRelationship.not_available
)
self.mapping.library_type.relationship \
= StatesTypeRelationship.split_mates
self.mapping.mapped = False
self.mapping.star_dirs = []
else:
self.results.relationship = (
StatesTypeRelationship.not_mates
)

samfile1.close()
samfile2.close()
if ratio >= self.cutoff:
self.results.relationship = (
StatesTypeRelationship.split_mates
)
self.mapping.library_type.relationship \
= StatesTypeRelationship.split_mates
balajtimate marked this conversation as resolved.
Show resolved Hide resolved
self.mapping.mapped = False
self.mapping.star_dirs = []
else:
self.results.relationship = (
StatesTypeRelationship.not_mates
)

class AlignedSegment:
"""Placeholder class for mypy "Missing attribute"
Expand Down Expand Up @@ -302,44 +334,47 @@ def evaluate(self) -> None:
self.result = StatesType.not_available
raise FileProblem(f"File is empty: {self.path}") from exc

if self.seq_id_format is None:
if self.seq_id_format is not None:
LOGGER.debug(
"Sequence identifier format: "
f"{self.seq_id_format.name}"
)
else:
self.result = StatesType.not_available
raise MetadataWarning(
LOGGER.debug(
"Could not determine sequence identifier format."
)
LOGGER.debug(
f"Sequence identifier format: {self.seq_id_format.name}"
)

# Ensure that remaining records are compatible with sequence
# identifier format and library type determined from first
# record
LOGGER.debug(
"Checking consistency of remaining reads with initially "
"determined identifier format and library type..."
)
for record in seq_iter:
records += 1
try:
self._get_read_type(
seq_id=record[0],
regex=self.seq_id_format.value,
)
except (
InconsistentFastqIdentifiers,
UnknownFastqIdentifier,
) as exc:
self.result = StatesType.not_available
raise MetadataWarning(
f"{type(exc).__name__}: {str(exc)}"
) from exc
if self.seq_id_format is not None:
LOGGER.debug(
"Checking consistency of remaining reads with "
"initially determined identifier format "
"and library type..."
)
for record in seq_iter:
records += 1
try:
self._get_read_type(
seq_id=record[0],
regex=self.seq_id_format.value,
)
except (
InconsistentFastqIdentifiers,
UnknownFastqIdentifier,
) as exc:
self.result = StatesType.not_available
raise MetadataWarning(
f"{type(exc).__name__}: {str(exc)}"
) from exc
LOGGER.debug(f"Total records processed: {records}")

except (OSError, ValueError) as exc:
self.result = StatesType.not_available
raise FileProblem(f"{type(exc).__name__}: {str(exc)}") from exc

LOGGER.debug(f"Total records processed: {records}")

def _get_read_type(
self,
seq_id: str,
Expand Down
10 changes: 9 additions & 1 deletion htsinfer/get_read_orientation.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,16 @@ def evaluate(self) -> ResultsOrientation:
self.mapping.transcripts_file = self.transcripts_file
self.mapping.tmp_dir = self.tmp_dir

if not self.mapping.mapped:
if not self.mapping.mapped and (
self.library_source.file_1.short_name is not None or
self.library_source.file_2.short_name is not None
):
self.mapping.evaluate()
else:
LOGGER.debug(
"Library source is not determined, "
"read orientation cannot be inferred by alignment."
)
balajtimate marked this conversation as resolved.
Show resolved Hide resolved

return self.process_alignments(star_dirs=self.mapping.star_dirs)

Expand Down
Loading
Loading