Collecting the total number of articles for each human gene (19504 human genes)
Load the following packages
library(rvest)
library(dplyr)
library(readr)
Read csv file
humangenes <- read_csv("/Users/Desktop/human_proteins.csv")
View(humangenes)
colnames(humangenes)
Select the genes and make them unique
humangenes1 <- humangenes %>%
dplyr::select(Locus)%>%
unique()
allgenes <- humangenes1$Locus
pubmed_data <- data.frame()
Start downloading the number of articles per human gene. This process depends on the speed of the internet.
for(i in allgenes){
link <- paste0("https://pubmed.ncbi.nlm.nih.gov/?term=", i,"&sort=date")
page <- read_html(link)
gene_name <- i
number <- page %>% html_nodes("span.value") %>% html_text() %>%
replace(!nzchar(number), NA)
pubmed_data<- rbind(pubmed_data, data.frame(gene_name, number, stringsAsFactors = FALSE))
}
Save the data
write.csv(pubmed_data1, "/Users/Desktop/pubmed_data06.04.2022", row.names = FALSE)
Load the following packages
library(cowplot)
library(tidyverse)
library(ggrepel)
Generate the plot
p <- ggplot(pubmed_genes, aes(x= fct_reorder(gene_name, number), y =number),
alpha = 0.02 )+
geom_point() +
ylim(0, 30000)+
geom_point(color = ifelse(pubmed_genes$number> 25000, "red", "black"))+
theme_cowplot(font_size = 12)
Label the X and Y axis
p1 <- p +
theme(axis.text.x=element_blank(),
axis.ticks.x = element_blank())+
labs(x = "Human Genes", y = "Number of Articles",
title = "Number of Articles for each human gene")
Mark gene names with different color and label them
p2 <- p1 + geom_text_repel(aes(label=ifelse((pubmed_genes$number> 25000), gene_name, "")))
Create the interactive plot
library(plotly)
library(htmlwidgets)
p3 <- p2 %>%
ggplotly(tooltip = c("x", "y"))
#save the plot
htmlwidgets::saveWidget(as_widget(p3), "index.html")