Source: https://github.com/markziemann/dee2_gene_signatures
In this analysis we will be characterising the new gene sets.
suppressPackageStartupMessages({
library("getDEE2")
library("mitch")
library("triwise")
library("dplyr")
library("gplots")
library("reshape2")
library("network")
library("eulerr")
})
This is the background - all genes in the "universe" according to Ensembl version 90, which is the same annotation set used by DEE2. You can see that there are a large number of non-protein coding genes.
# get universe of gene names and biotypes
if (! file.exists("hs.gtf.gz") ) {
download.file("ftp://ftp.ensembl.org/pub/release-90/gtf/homo_sapiens/Homo_sapiens.GRCh38.90.gtf.gz",destfile= "hs.gtf.gz")
}
g<-read.table("hs.gtf.gz",sep="\t")
g <- g[grep("gene",g$V3),9]
universe <- sapply(strsplit(g," "),"[[",6)
universe <- gsub(";","",universe)
biotypes <- sapply(strsplit(g," "),"[[",10)
biotypes <- gsub("_"," ",biotypes)
biotypes <- gsub(";","",biotypes)
biotypes_df <- data.frame(universe,biotypes)
universe <- unique(universe)
mytable <- table(biotypes)
mytable <- mytable[order(mytable)]
mytable <- mytable[which(mytable>100)]
par(mar=c(3,13,1,1)); barplot(mytable,horiz=TRUE,las=2,cex.names = 0.6,cex.axis = 0.6,
main="number of genes in each biotype class",xlim = c(0,20000)) ;grid()
mytable <- mytable/sum(mytable)*100
par(mar=c(3,13,1,1)); barplot(mytable,horiz=TRUE,las=2,cex.names = 0.6,cex.axis = 0.6,
main="proportion (%) of genes in each biotype class" ,xlim=c(0,40)) ;grid()
# Take note how many transcripts are protein coding versus non-protein coding
paste("protein coding:",length(which(biotypes=="protein coding")))
## [1] "protein coding: 19847"
paste("non-protein coding:",length(which(biotypes!="protein coding")))
## [1] "non-protein coding: 38455"
Here I download the current Reactome gene set library. I would like to see how many Ensembl genes have some sort of annotated function. Breaking it down into protein coding and non-protein coding, we can see that non-protein coding genes are severely underrepresented in Reactome. Reactome has 2400 sets and a total of 11193 genes as of 29/Sep/2020.
#o
download.file("https://reactome.org/download/current/ReactomePathways.gmt.zip", destfile="ReactomePathways.gmt.zip")
#unzip("ReactomePathways.gmt.zip")
reactome <- gmt_import("ReactomePathways.gmt")
reactome_genes <- unique(unname(unlist(reactome)))
length(reactome)
## [1] 2408
length(reactome_genes)
## [1] 11194
v1 <- list("Reactome"=reactome_genes, "Ensembl universe"=universe)
library("eulerr")
plot(euler(v1),quantities = TRUE)
length(reactome_genes)/length(universe)*100
## [1] 19.76063
prot <- biotypes_df[which(biotypes_df$biotypes=="protein coding"),1]
prot <- unique(prot)
nprot <- biotypes_df[which(biotypes_df$biotypes!="protein coding"),1]
nprot <- unique(nprot)
intersect(prot,nprot)
## [1] "RGS5" "TMEM247" "CYB561D2" "CDR1" "KBTBD11-OT1"
## [6] "MAL2" "SPATA13" "SFTA3" "GOLGA8M" "ELFN2"
nprot <- setdiff(nprot,prot)
v1 <- list("Reactome"=reactome_genes, "protein coding"=prot, "non-protein coding"=nprot)
plot(euler(v1),quantities = TRUE)
Now let's do the same with GO sets downloaded from Ensembl biomart. GO has 12264 sets and 8412 genes as of 26/Sep/2020.
go <- read.table("biomart_2020-09-26.txt.gz",sep="\t", fill = TRUE)
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
## EOF within quoted string
colnames(go) <- go[1,]
go <- go[2:nrow(go),]
go <- go[which(go$`GO term accession`!=""),]
go_genes <- unique(go$`Gene name`)
length(unique(go$`GO term accession`))
## [1] 12264
length(go_genes)
## [1] 8412
v1 <- list("GO"=go_genes, "protein coding"=prot, "non-protein coding"=nprot)
plot(euler(v1),quantities = TRUE)