From b4711d673003dad6158784b458ca41ac291525a2 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Fri, 20 Sep 2024 18:14:42 +0300 Subject: [PATCH] import exon information from reference annotation --- src/gene_info.py | 29 ++++++++++++++++++----------- src/transcript_printer.py | 15 ++++++++++----- 2 files changed, 28 insertions(+), 16 deletions(-) diff --git a/src/gene_info.py b/src/gene_info.py index 469bceae..a3f1e067 100644 --- a/src/gene_info.py +++ b/src/gene_info.py @@ -183,7 +183,7 @@ def __init__(self, gene_db_list, db, delta=0, prepare_profiles=True): self.set_sources() self.gene_id_map = {} self.set_gene_ids() - self.gene_attributes = {} + self.feature_attributes = {} self.set_gene_attributes() if prepare_profiles: self.exon_property_map = self.set_feature_properties(self.all_isoforms_exons, self.exon_profiles) @@ -208,7 +208,7 @@ def from_models(cls, transcript_model_storage, delta=0): gene_info.sources = {} gene_info.other_features = {} gene_info.gene_id_map = {} - gene_info.gene_attributes = {} + gene_info.feature_attributes = {} introns = set() exons = set() @@ -294,7 +294,7 @@ def from_model(cls, transcript_model, delta=0): transcript_model.gene_id: transcript_model.source} gene_info.other_features = {transcript_model.transcript_id: transcript_model.other_features} gene_info.gene_id_map = {transcript_model.transcript_id: transcript_model.gene_id} - gene_info.gene_attributes = {} + gene_info.feature_attributes = {} gene_info.regions_for_bam_fetch = [(gene_info.start, gene_info.end)] gene_info.exon_property_map = None @@ -332,7 +332,7 @@ def from_region(cls, chr_id, start, end, delta=0, chr_record=None): gene_info.other_features = {} gene_info.sources = {} gene_info.gene_id_map = {} - gene_info.gene_attributes = {} + gene_info.feature_attributes = {} gene_info.regions_for_bam_fetch = [(start, end)] gene_info.exon_property_map = None gene_info.intron_property_map = None @@ -390,7 +390,7 @@ def deserialize(cls, infile, genedb): gene_info.set_sources() gene_info.gene_id_map = {} gene_info.set_gene_ids() - gene_info.gene_attributes = {} + gene_info.feature_attributes = {} gene_info.set_gene_attributes() gene_info.exon_property_map = gene_info.set_feature_properties(gene_info.all_isoforms_exons, gene_info.exon_profiles) gene_info.intron_property_map = gene_info.set_feature_properties(gene_info.all_isoforms_introns, gene_info.intron_profiles) @@ -475,19 +475,26 @@ def set_gene_ids(self): self.gene_id_map[t.id] = gene_db.id def set_gene_attributes(self): - self.gene_attributes = defaultdict(str) + self.feature_attributes = defaultdict(str) for gene_db in self.gene_db_list: for attr in gene_db.attributes.keys(): - if attr in ['gene_id', 'ID', 'level']: + if attr in ['gene_id', 'ID', 'level', 'Parent']: continue if gene_db.attributes[attr]: - self.gene_attributes[gene_db.id] += '%s "%s"; ' % (attr, gene_db.attributes[attr][0]) - for t in self.db.children(gene_db, featuretype=('transcript', 'mRNA'), order_by='start'): + self.feature_attributes[gene_db.id] += '%s "%s"; ' % (attr, gene_db.attributes[attr][0]) + for t in self.db.children(gene_db, featuretype=('transcript', 'mRNA')): for attr in t.attributes.keys(): - if attr in ['transcript_id', 'gene_id', 'ID', 'level', 'exons']: + if attr in ['transcript_id', 'gene_id', 'ID', 'level', 'exons', 'Parent']: continue if t.attributes[attr]: - self.gene_attributes[t.id] += '%s "%s"; ' % (attr, t.attributes[attr][0]) + self.feature_attributes[t.id] += '%s "%s"; ' % (attr, t.attributes[attr][0]) + for e in self.db.children(gene_db, featuretype=('exon')): + exon_id = t.id + "_%d_%d_%s" % (e.start, e.end, e.strand) + for attr in t.attributes.keys(): + if attr in ['transcript_id', 'gene_id', 'ID', 'Parent', 'level', 'exon_id', 'exon', 'exon_number']: + continue + if t.attributes[attr]: + self.feature_attributes[exon_id] += '%s "%s"; ' % (attr, t.attributes[attr][0]) # assigns an ordered list of all known exons and introns to self.exons and self.introns # returns 2 maps, isoform id -> intron / exon list diff --git a/src/transcript_printer.py b/src/transcript_printer.py index e84b8a52..3e00aa2e 100644 --- a/src/transcript_printer.py +++ b/src/transcript_printer.py @@ -99,8 +99,8 @@ def dump(self, gene_info, transcript_model_storage): for gene_id, coords in gene_order: if gene_id not in self.printed_gene_ids: gene_additiional_info = "" - if gene_info and gene_id in gene_info.gene_attributes: - gene_additiional_info = gene_info.gene_attributes[gene_id] + if gene_info and gene_id in gene_info.feature_attributes: + gene_additiional_info = gene_info.feature_attributes[gene_id] source = "IsoQuant" if gene_info and gene_id in gene_info.sources: source = gene_info.sources[gene_id] @@ -117,8 +117,8 @@ def dump(self, gene_info, transcript_model_storage): if not model.check_additional("exons"): model.add_additional_attribute("exons", str(len(model.exon_blocks))) transcript_additiional_info = "" - if gene_info and model.transcript_id in gene_info.gene_attributes: - transcript_additiional_info = " " + gene_info.gene_attributes[model.transcript_id] + if gene_info and model.transcript_id in gene_info.feature_attributes: + transcript_additiional_info = " " + gene_info.feature_attributes[model.transcript_id] transcript_line = '%s\t%s\ttranscript\t%d\t%d\t.\t%s\t.\tgene_id "%s"; transcript_id "%s"; %s\n' \ % (model.chr_id, model.source, model.exon_blocks[0][0], model.exon_blocks[-1][1], @@ -137,9 +137,14 @@ def dump(self, gene_info, transcript_model_storage): exons_to_print = sorted(exons_to_print, reverse=True) if model.strand == '-' else sorted(exons_to_print) for i, e in enumerate(exons_to_print): exon_str_id = self.exon_id_storage.get_id(model.chr_id, e, model.strand) + + exon_id = model.transcript_id + "_%d_%d_%s" % (e[0], e[1], model.strand) + exon_additiional_info = "" + if gene_info and exon_id in gene_info.feature_attributes: + exon_additiional_info = " " + gene_info.feature_attributes[model.transcript_id] feature_type = e[2] self.out_gff.write(prefix_columns + "%s\t%d\t%d\t" % (feature_type, e[0], e[1]) + suffix_columns + - ' exon "%d"; exon_id "%s";\n' % ((i + 1), exon_str_id)) + ' exon_number "%d"; exon_id "%s"; %s\n' % ((i + 1), exon_str_id, exon_additiional_info)) self.out_gff.flush() def dump_read_assignments(self, transcript_model_constructor):