Skip to content

Commit

Permalink
import exon information from reference annotation
Browse files Browse the repository at this point in the history
  • Loading branch information
andrewprzh committed Sep 25, 2024
1 parent 074a949 commit b4711d6
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 16 deletions.
29 changes: 18 additions & 11 deletions src/gene_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ def __init__(self, gene_db_list, db, delta=0, prepare_profiles=True):
self.set_sources()
self.gene_id_map = {}
self.set_gene_ids()
self.gene_attributes = {}
self.feature_attributes = {}
self.set_gene_attributes()
if prepare_profiles:
self.exon_property_map = self.set_feature_properties(self.all_isoforms_exons, self.exon_profiles)
Expand All @@ -208,7 +208,7 @@ def from_models(cls, transcript_model_storage, delta=0):
gene_info.sources = {}
gene_info.other_features = {}
gene_info.gene_id_map = {}
gene_info.gene_attributes = {}
gene_info.feature_attributes = {}
introns = set()
exons = set()

Expand Down Expand Up @@ -294,7 +294,7 @@ def from_model(cls, transcript_model, delta=0):
transcript_model.gene_id: transcript_model.source}
gene_info.other_features = {transcript_model.transcript_id: transcript_model.other_features}
gene_info.gene_id_map = {transcript_model.transcript_id: transcript_model.gene_id}
gene_info.gene_attributes = {}
gene_info.feature_attributes = {}

gene_info.regions_for_bam_fetch = [(gene_info.start, gene_info.end)]
gene_info.exon_property_map = None
Expand Down Expand Up @@ -332,7 +332,7 @@ def from_region(cls, chr_id, start, end, delta=0, chr_record=None):
gene_info.other_features = {}
gene_info.sources = {}
gene_info.gene_id_map = {}
gene_info.gene_attributes = {}
gene_info.feature_attributes = {}
gene_info.regions_for_bam_fetch = [(start, end)]
gene_info.exon_property_map = None
gene_info.intron_property_map = None
Expand Down Expand Up @@ -390,7 +390,7 @@ def deserialize(cls, infile, genedb):
gene_info.set_sources()
gene_info.gene_id_map = {}
gene_info.set_gene_ids()
gene_info.gene_attributes = {}
gene_info.feature_attributes = {}
gene_info.set_gene_attributes()
gene_info.exon_property_map = gene_info.set_feature_properties(gene_info.all_isoforms_exons, gene_info.exon_profiles)
gene_info.intron_property_map = gene_info.set_feature_properties(gene_info.all_isoforms_introns, gene_info.intron_profiles)
Expand Down Expand Up @@ -475,19 +475,26 @@ def set_gene_ids(self):
self.gene_id_map[t.id] = gene_db.id

def set_gene_attributes(self):
self.gene_attributes = defaultdict(str)
self.feature_attributes = defaultdict(str)
for gene_db in self.gene_db_list:
for attr in gene_db.attributes.keys():
if attr in ['gene_id', 'ID', 'level']:
if attr in ['gene_id', 'ID', 'level', 'Parent']:
continue
if gene_db.attributes[attr]:
self.gene_attributes[gene_db.id] += '%s "%s"; ' % (attr, gene_db.attributes[attr][0])
for t in self.db.children(gene_db, featuretype=('transcript', 'mRNA'), order_by='start'):
self.feature_attributes[gene_db.id] += '%s "%s"; ' % (attr, gene_db.attributes[attr][0])
for t in self.db.children(gene_db, featuretype=('transcript', 'mRNA')):
for attr in t.attributes.keys():
if attr in ['transcript_id', 'gene_id', 'ID', 'level', 'exons']:
if attr in ['transcript_id', 'gene_id', 'ID', 'level', 'exons', 'Parent']:
continue
if t.attributes[attr]:
self.gene_attributes[t.id] += '%s "%s"; ' % (attr, t.attributes[attr][0])
self.feature_attributes[t.id] += '%s "%s"; ' % (attr, t.attributes[attr][0])
for e in self.db.children(gene_db, featuretype=('exon')):
exon_id = t.id + "_%d_%d_%s" % (e.start, e.end, e.strand)
for attr in t.attributes.keys():
if attr in ['transcript_id', 'gene_id', 'ID', 'Parent', 'level', 'exon_id', 'exon', 'exon_number']:
continue
if t.attributes[attr]:
self.feature_attributes[exon_id] += '%s "%s"; ' % (attr, t.attributes[attr][0])

# assigns an ordered list of all known exons and introns to self.exons and self.introns
# returns 2 maps, isoform id -> intron / exon list
Expand Down
15 changes: 10 additions & 5 deletions src/transcript_printer.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,8 @@ def dump(self, gene_info, transcript_model_storage):
for gene_id, coords in gene_order:
if gene_id not in self.printed_gene_ids:
gene_additiional_info = ""
if gene_info and gene_id in gene_info.gene_attributes:
gene_additiional_info = gene_info.gene_attributes[gene_id]
if gene_info and gene_id in gene_info.feature_attributes:
gene_additiional_info = gene_info.feature_attributes[gene_id]
source = "IsoQuant"
if gene_info and gene_id in gene_info.sources:
source = gene_info.sources[gene_id]
Expand All @@ -117,8 +117,8 @@ def dump(self, gene_info, transcript_model_storage):
if not model.check_additional("exons"):
model.add_additional_attribute("exons", str(len(model.exon_blocks)))
transcript_additiional_info = ""
if gene_info and model.transcript_id in gene_info.gene_attributes:
transcript_additiional_info = " " + gene_info.gene_attributes[model.transcript_id]
if gene_info and model.transcript_id in gene_info.feature_attributes:
transcript_additiional_info = " " + gene_info.feature_attributes[model.transcript_id]

transcript_line = '%s\t%s\ttranscript\t%d\t%d\t.\t%s\t.\tgene_id "%s"; transcript_id "%s"; %s\n' \
% (model.chr_id, model.source, model.exon_blocks[0][0], model.exon_blocks[-1][1],
Expand All @@ -137,9 +137,14 @@ def dump(self, gene_info, transcript_model_storage):
exons_to_print = sorted(exons_to_print, reverse=True) if model.strand == '-' else sorted(exons_to_print)
for i, e in enumerate(exons_to_print):
exon_str_id = self.exon_id_storage.get_id(model.chr_id, e, model.strand)

exon_id = model.transcript_id + "_%d_%d_%s" % (e[0], e[1], model.strand)
exon_additiional_info = ""
if gene_info and exon_id in gene_info.feature_attributes:
exon_additiional_info = " " + gene_info.feature_attributes[model.transcript_id]
feature_type = e[2]
self.out_gff.write(prefix_columns + "%s\t%d\t%d\t" % (feature_type, e[0], e[1]) + suffix_columns +
' exon "%d"; exon_id "%s";\n' % ((i + 1), exon_str_id))
' exon_number "%d"; exon_id "%s"; %s\n' % ((i + 1), exon_str_id, exon_additiional_info))
self.out_gff.flush()

def dump_read_assignments(self, transcript_model_constructor):
Expand Down

0 comments on commit b4711d6

Please sign in to comment.