0.1.Prepare_Annotation_Files.Rmd

---
title: "RNAseq AAA Prepare Annotations Files"
author: "Gerard Temprano Sagrera"
date: "2022-12-07"
output: html_document
---

```{r, warning=FALSE, message=FALSE, echo=FALSE}
library(openxlsx);library(data.table);library(stringr);library(dplyr)
```

## Filtering: 

* 1. Select "protein coding" and "lincRNA" (long intergenic non-coding RNA) genes.

* 2. Select annotation levels "1" and "2".


### Load, prepare and save v26 annotation file for GENES (used in our quantification):

```{r}
# Load the GENCODE v26 annotation file. It is read without a header row, so
# fread names the columns V1..V9; the chunks below use V3 (feature type),
# V4/V5 (start/end coordinates) and V9 (attribute string).
ann <- fread(
  "C://Users/Gerard/Desktop/AAA/RNAseq/Annotation/gencode.v26.annotation.gtf.gz",
  header = FALSE, # spelled out: `F` is an ordinary variable that can be reassigned
  nThread = 6
)
```

### Select only genes:

```{r}
# Restrict the annotation to gene-level records (feature type is in column V3).
ann <- ann[V3 == "gene"]
```

```{r}
# Report how many gene records are present before any filtering.
n_genes <- nrow(ann)
paste("We have", n_genes, "genes before filtering")
```

```{r}
# Keep only protein-coding and lincRNA genes.
# The gene_type attribute is extracted from the attribute string (V9) by name
# instead of by its position after splitting on ";", and is compared exactly.
# The previous patterns ("*protein_coding*", "*lincRNA*") were shell globs used
# as regular expressions: the leading "*" has nothing to repeat and the
# trailing "*" only makes the last letter optional.
geneType <- str_match(ann$V9, 'gene_type "([^"]+)"')[, 2]
keepGeneType <- which(geneType %in% c("protein_coding", "lincRNA"))
ann <- ann[keepGeneType, ]
paste("We have", nrow(ann), "genes keeping only protein_coding and lincRNA annotations.")
```

```{r}
# Restrict the gene table to annotation levels 1 and 2.
# The level is read from the 4th ";"-separated field of the attribute string.
attr_fields <- strsplit(ann$V9, ";")
geneLevel <- unlist(lapply(attr_fields, function(fields) fields[4]))
is_level_1 <- grepl("level 1", geneLevel)
is_level_2 <- grepl("level 2", geneLevel)
keepGeneLevel <- which(is_level_1 | is_level_2)
ann <- ann[keepGeneLevel, ]
paste("We have", nrow(ann), "genes keeping only annotation levels 1 and 2.")
```

```{r}
# Build the per-gene table with columns gene_id, gene_name and length,
# replacing the raw GTF columns (V4, V5, V9).
#
# gene_id and gene_name are taken verbatim from between the double quotes of
# the corresponding attribute in V9. The previous character-class clean-up
# removed every non-alphanumeric character, which altered real identifiers:
# hyphens and dots were stripped from gene names (e.g. "HLA-A" -> "HLAA") and
# underscores from gene ids (e.g. the "_PAR_Y" suffix).
#
# length (will be used for TPM calculation): GTF coordinates are 1-based and
# inclusive at both ends, so the span is end - start + 1.
# NOTE(review): this is the genomic span between start and end; confirm that a
# span (rather than an exon-based length) is what the TPM step expects.
ann <- data.table(
  gene_id = str_match(ann$V9, 'gene_id "([^"]+)"')[, 2],
  gene_name = str_match(ann$V9, 'gene_name "([^"]+)"')[, 2],
  length = ann$V5 - ann$V4 + 1L
)
```

```{r}
#fwrite(ann, "C://Users/Gerard/Desktop/AAA/RNAseq/Annotation/gencode.v26.annotation.fixed.gtf.gz", sep = "\t")
```


```{r}
# Sanity check: tabulate the gene types of the genes present in the count matrix.

# Load the counts and move the identifier column (V1) into the row names.
# This is done with base R: remove_rownames()/column_to_rownames() belong to
# tibble, which this document never attaches, so the piped version stopped
# with "could not find function".
cc <- fread("C://Users/Gerard/Desktop/AAA/RNAseq/Gene_counts/Counts.RSEM.All.txt")
stopifnot("V1" %in% names(cc))
cc <- as.data.frame(cc)
rownames(cc) <- cc$V1
cc$V1 <- NULL

# Reload the full, unfiltered annotation.
ann <- fread(
  "C://Users/Gerard/Desktop/AAA/RNAseq/Annotation/gencode.v26.annotation.gtf.gz",
  header = FALSE,
  nThread = 6
)
# Show a few attribute strings instead of printing the whole column.
head(ann$V9)

# str_match() yields NA where an attribute is missing, so each result always
# has one value per row. regmatches() drops non-matching rows, which would
# make the column assignment fail on a length mismatch.
ann$gene_id <- str_match(ann$V9, 'gene_id "([^"]+)"')[, 2]
ann$gene_type <- str_match(ann$V9, 'gene_type "([^"]+)"')[, 2]

# Tabulate over gene records only. The annotation holds one row per feature,
# so counting every row would weight each gene by its number of records.
counted_genes <- ann[ann$V3 == "gene" & ann$gene_id %in% rownames(cc), ]
table(counted_genes$gene_type)
```


### Load, prepare and save v26 annotation file for TRANSCRIPTS (used in our quantification):

```{r}
# Load the GENCODE v26 annotation file. It is read without a header row, so
# fread names the columns V1..V9; the chunks below use V3 (feature type),
# V4/V5 (start/end coordinates) and V9 (attribute string).
ann <- fread(
  "C://Users/Gerard/Desktop/AAA/RNAseq/Annotation/gencode.v26.annotation.gtf.gz",
  header = FALSE, # spelled out: `F` is an ordinary variable that can be reassigned
  nThread = 6
)
```

### Select only transcripts:

```{r}
# Restrict the annotation to transcript-level records (feature type is in column V3).
ann <- ann[V3 == "transcript"]
```

```{r}
# Report how many transcript records are present before any filtering.
n_transcripts <- nrow(ann)
paste("We have", n_transcripts, "transcripts before filtering")
```

```{r}
# Keep only transcripts whose parent gene is protein-coding or lincRNA.
# The gene_type attribute is extracted from the attribute string (V9) by name
# instead of by its position after splitting on ";", and is compared exactly.
# The previous patterns ("*protein_coding*", "*lincRNA*") were shell globs used
# as regular expressions: the leading "*" has nothing to repeat and the
# trailing "*" only makes the last letter optional.
geneType <- str_match(ann$V9, 'gene_type "([^"]+)"')[, 2]
keepGeneType <- which(geneType %in% c("protein_coding", "lincRNA"))
ann <- ann[keepGeneType, ]
paste("We have", nrow(ann), "transcripts keeping only protein_coding and lincRNA annotations.")
```


```{r}
# Keep only transcripts with annotation level 1 or 2.
# The level is extracted by attribute name rather than taken from the 7th
# ";"-separated field, so the filter does not depend on attribute order.
# The pattern requires a bare number directly after "level ", so quoted
# attributes whose key merely ends in "level" are not picked up.
geneLevel <- str_match(ann$V9, "(?:^|;\\s*)level ([0-9]+)")[, 2]
keepGeneLevel <- which(geneLevel %in% c("1", "2"))
ann <- ann[keepGeneLevel, ]
# The rows are transcripts at this point; the message previously said "genes".
paste("We have", nrow(ann), "transcripts keeping only annotation levels 1 and 2.")
```

```{r}
# Build the per-transcript table with columns transcript_id, gene_id,
# gene_name and length, replacing the raw GTF columns (V4, V5, V9).
#
# The identifiers and the gene name are taken verbatim from between the double
# quotes of the corresponding attribute in V9. The previous character-class
# clean-up removed every non-alphanumeric character, which altered real
# values: hyphens and dots were stripped from gene names (e.g. "HLA-A" ->
# "HLAA") and underscores from ids (e.g. the "_PAR_Y" suffix).
#
# length (will be used for TPM calculation): GTF coordinates are 1-based and
# inclusive at both ends, so the span is end - start + 1.
# NOTE(review): this is the genomic span between start and end; confirm that a
# span (rather than an exon-based length) is what the TPM step expects.
ann <- data.table(
  transcript_id = str_match(ann$V9, 'transcript_id "([^"]+)"')[, 2],
  gene_id = str_match(ann$V9, 'gene_id "([^"]+)"')[, 2],
  gene_name = str_match(ann$V9, 'gene_name "([^"]+)"')[, 2],
  length = ann$V5 - ann$V4 + 1L
)
```

```{r}
# Save the filtered transcript table as a tab-separated file.
transcripts_out <- "C://Users/Gerard/Desktop/AAA/RNAseq/Annotation/gencode.v26.annotation.fixed.transcripts.gtf.gz"
fwrite(ann, file = transcripts_out, sep = "\t")
```


```{r}
# Sanity check: tabulate the gene types of the genes present in the count matrix.

# Load the counts and move the identifier column (V1) into the row names.
# This is done with base R: remove_rownames()/column_to_rownames() belong to
# tibble, which this document never attaches, so the piped version stopped
# with "could not find function".
cc <- fread("C://Users/Gerard/Desktop/AAA/RNAseq/Gene_counts/Counts.RSEM.All.txt")
stopifnot("V1" %in% names(cc))
cc <- as.data.frame(cc)
rownames(cc) <- cc$V1
cc$V1 <- NULL

# Reload the full, unfiltered annotation.
ann <- fread(
  "C://Users/Gerard/Desktop/AAA/RNAseq/Annotation/gencode.v26.annotation.gtf.gz",
  header = FALSE,
  nThread = 6
)
# Show a few attribute strings instead of printing the whole column.
head(ann$V9)

# str_match() yields NA where an attribute is missing, so each result always
# has one value per row. regmatches() drops non-matching rows, which would
# make the column assignment fail on a length mismatch.
ann$gene_id <- str_match(ann$V9, 'gene_id "([^"]+)"')[, 2]
ann$gene_type <- str_match(ann$V9, 'gene_type "([^"]+)"')[, 2]

# Tabulate over gene records only. The annotation holds one row per feature,
# so counting every row would weight each gene by its number of records.
counted_genes <- ann[ann$V3 == "gene" & ann$gene_id %in% rownames(cc), ]
table(counted_genes$gene_type)
```