From 074a94910111e4c169964a168378e8c9a13e0f41 Mon Sep 17 00:00:00 2001
From: Andrey Prjibelski <andrewprzh@gmail.com>
Date: Fri, 20 Sep 2024 17:44:26 +0300
Subject: [PATCH] GFF3 checker

---
 src/gtf2db.py | 75 ++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 68 insertions(+), 7 deletions(-)

diff --git a/src/gtf2db.py b/src/gtf2db.py
index ef6fc4fa..ee82a43d 100755
--- a/src/gtf2db.py
+++ b/src/gtf2db.py
@@ -59,6 +59,8 @@ def get_color(transcript_kind):
                 gene_name = record["gene_name"][0]
             elif "gene_id" in record.attributes:
                 gene_name = record["gene_id"][0]
+            elif "Parent" in record.attributes:
+                gene_name = record["Parent"][0]
             else:
                 gene_name = "unknown_gene"
             transcript_name = record.id + "|" + transcript_type + "|" + gene_name
@@ -97,12 +99,13 @@ def check_input_gtf(gtf, db, complete_db):
     gtf_is_correct, corrected_gtf, out_fname, has_meta_features = check_gtf_duplicates(gtf)
     if not gtf_is_correct:
         outdir = os.path.dirname(db)
-        new_gtf_path = os.path.join(outdir, out_fname)
-        with open(new_gtf_path, "w") as out_gtf:
-            out_gtf.write(corrected_gtf)
         logger.error("Input GTF seems to be corrupted (see warnings above).")
-        logger.error("An attempt to correct this GTF was made, the result is written to %s" % new_gtf_path)
-        logger.error("NB! some transcript / gene ids in the corrected annotation are modified.")
+        if out_fname and corrected_gtf:
+            new_gtf_path = os.path.join(outdir, out_fname)
+            with open(new_gtf_path, "w") as out_gtf:
+                out_gtf.write(corrected_gtf)
+            logger.error("An attempt to correct this GTF was made, the result is written to %s" % new_gtf_path)
+            logger.error("NB! some transcript / gene ids in the corrected annotation are modified.")
         logger.error("Provide a correct GTF by fixing the original input GTF or checking the corrected one.")
         exit(-3)
     else:
@@ -167,6 +170,10 @@ def check_gtf_duplicates(gtf):
         handle = open(gtf, "rt")
         inner_ext = outer_ext
 
+    if inner_ext.lower() == 'gff3':
+        return check_gff3_duplicates(handle)
+
+    gff3_checked = False
     for l in handle.readlines():
         line_count += 1
         if l.startswith("#"):
@@ -177,8 +184,14 @@ def check_gtf_duplicates(gtf):
             corrected_gtf += l
             continue
 
-        feature_type = v[2]
-        attrs = v[8].split(" ")
+        attribute_column = v[8]
+        if not gff3_checked:
+            gff3_checked = True
+            if attribute_column.find("ID=") != -1:
+                handle.seek(0)
+                return check_gff3_duplicates(handle)
+
+        attrs = attribute_column.split(" ")
 
         gene_id_pos = -1
         for i in range(len(attrs)):
@@ -190,6 +203,7 @@ def check_gtf_duplicates(gtf):
             gtf_correct = False
             continue
 
+        feature_type = v[2]
         gene_str = attrs[gene_id_pos + 1]
         start_pos = gene_str.find('"')
         end_pos = gene_str.rfind('"')
@@ -259,6 +273,53 @@ def check_gtf_duplicates(gtf):
     return gtf_correct, corrected_gtf, gtf_name + ".corrected" + inner_ext.lower(), complete_genedb
 
 
+def check_gff3_duplicates(handle):
+    gtf_correct = True
+    gene_count = 0
+    transcript_count = 0
+    line_count = 0
+    feature_ids = {}
+
+    for l in handle.readlines():
+        line_count += 1
+        if l.startswith("#"):
+            continue
+        v = l.strip().split("\t")
+        if len(v) < 9:
+            continue
+
+        feature_type = v[2]
+        if feature_type == 'gene':
+            gene_count += 1
+        elif feature_type in ["transcript", "mRNA"]:
+            transcript_count += 1
+
+        attrs = v[8].split(";")
+        id_pos = -1
+        for i in range(len(attrs)):
+            if attrs[i].startswith('ID'):
+                id_pos = i
+        if id_pos == -1:
+            if feature_type in ["gene", "transcript", "mRNA"]:
+                logger.warning("Malformed GTF line %d (ID attribute value cannot be found)" % line_count)
+                logger.warning(l.strip())
+                gtf_correct = False
+            continue
+
+        id_str = attrs[id_pos]
+        id_value = id_str.split("=")[1]
+        if id_value in feature_ids:
+            logger.warning("Duplicated ID %s on line %d" % (id_value, line_count))
+            gtf_correct = False
+            feature_ids[id_value] += 1
+
+    complete_genedb = 1
+    if transcript_count == 0 or gene_count == 0:
+        complete_genedb = -1
+
+    return gtf_correct, None, None, complete_genedb
+
+
 def find_converted_db(converted_gtfs, gtf_filename, complete_genedb):
     gtf_mtime = converted_gtfs.get(gtf_filename, {}).get('gtf_mtime')
     db_mtime = converted_gtfs.get(gtf_filename, {}).get('db_mtime')