Improvement of probe annotation using (i) custom annotation files, (ii) Better Bunny website tool and (iii) manual BLAST of probe sequence
pacman::p_load(dplyr,
limma,
ggplot2,
tibble,
flextable)
## V. Duranthon annotation
anno_conf <- read.csv(file = "data/150602_annotation_microarray_lapin_V2.csv",
sep = ";",
stringsAsFactors = F)
## rename colname of the fist column
colnames(anno_conf)[1] <- "ProbeName"
## describe TargetID colnames
colnames(anno_conf)[colnames(anno_conf) == "TargetID"] <- "Target.ID.Human"
colnames(anno_conf)[colnames(anno_conf) == "TargetID.1"] <- "Target.ID.Rabbit"
Remove oligos cleaned
anno_conf_nett <- anno_conf[anno_conf$ProbeName %in% anno_array$ProbeName,]
Merge data.frame
## merge Agilent and VD annotation
anno_merge <- dplyr::left_join(anno_array, anno_conf_nett, by = "ProbeName")
## Rename gene description colnames
colnames(anno_merge)[colnames(anno_merge) == "Description.x"] <- "Description_Agilent"
colnames(anno_merge)[colnames(anno_merge) == "Description.y"] <- "Description_VD"
## remove "FASTA" column (same as "Sequence" column)
anno_merge %>% dplyr::select(-FASTA) -> anno_merge
## remove column "ControlType" (non informative)
anno_merge %>% dplyr::select(-ControlType) -> anno_merge
## remove '-201', '-202', '-203' from the end of "Associated Transcript Name"
anno_merge$Associated.Transcript.Name <- gsub("-201", "", anno_merge$Associated.Transcript.Name)
anno_merge$Associated.Transcript.Name <- gsub("-202", "", anno_merge$Associated.Transcript.Name)
anno_merge$Associated.Transcript.Name <- gsub("-203", "", anno_merge$Associated.Transcript.Name)
## add "Immun" column identifying VJ genes
anno_merge$Add_annot_V_Jacquier_2015 <- "FALSE"
## annote immun sur la base du fichier de VJ
anno_merge$Add_annot_V_Jacquier_2015[anno_merge$Ensembl.Gene.ID %in% anno_immun_VJ$ENSEMBL_ID] <- "TRUE"
## count of "immun" oligo still on the array
sum(anno_merge$Add_annot_V_Jacquier_2015 == 'TRUE') ## 436
[1] 436
Clean and format anno_merge dataframe
## replace empty "Associated.Transcript.Name" annotation by 0
anno_merge$Associated.Transcript.Name[grepl("^$", anno_merge$Associated.Transcript.Name)] <- 0
## replace empty ENSOCUG annotation by 0
anno_merge$Ensembl.Gene.ID[grepl("^$", anno_merge$Ensembl.Gene.ID)] <- 0
## replace "Associated.Transcript.Name" with LOC by 0
anno_merge$Associated.Transcript.Name[grepl("^LOC", anno_merge$Associated.Transcript.Name)] <- 0
Statistics
## count of oligos with Associated.Transcript.Name = 0
grepl("^0$", anno_merge$Associated.Transcript.Name) %>% sum ## 8455
[1] 8455
## count of oligos with Ensembl.Gene.ID = 0
grepl("^0$", anno_merge$Ensembl.Gene.ID) %>% sum ## 6635
[1] 6635
## count of oligo without annotation
anno_merge$oligo[grepl("^0$", anno_merge$Associated.Transcript.Name) & grepl("^0$", anno_merge$Ensembl.Gene.ID)] %>% length() ## 6526
[1] 6526
### A_04_P Agilent probes annotation
anno_merge %>%
filter(grepl("A_04_", ProbeName)) %>%
dim() ## 22599 28
[1] 22599 28
Glance at anno_merge
anno_merge %>%
filter(grepl("A_04_", ProbeName)) %>%
select(ProbeName, GeneName, Ensembl.Gene.ID, Associated.Transcript.Name) %>%
rmarkdown::paged_table(options = list(rows.print = 10))
(http://cptweb.cpt.wayne.edu/BB/about.php) v3.0 08/2020
## import output file in "maj_bb_v3" object
maj_bb_v3 <- read.csv(file = "data/annot_bb_v3.0.csv",
sep = ",",
header = T,
stringsAsFactors = F)
## maj_bb_v3 don't have the same colnames as anno_merge
## colnames correction to allow merging
colnames(maj_bb_v3)[colnames(maj_bb_v3) == "Probe.ID"] <- "ProbeName"
Merge the 2 annotation objects by ProbeName
for A_04_ oligos
## join matching values from maj_bb_v3 to anno_merge
fusion <- left_join(x = anno_merge[grepl("A_04_", anno_merge$ProbeName),],
y = maj_bb_v3,
by = "ProbeName")
dim(fusion) ## 22599 32
[1] 22599 32
Updating Clean annotation (“LOC”, ““) Update gene name and ENSOCUG by BB annotation when they are different
Clean “LOC” annotations in Rabbit.Gene.Name
## count of probes with Rabbit.Gene.Name == "LOC" in bb
length(grep("^LOC", fusion$Rabbit.Gene.Name)) ## 526
[1] 526
## replace by 0
fusion$Rabbit.Gene.Name[grep(pattern = "^LOC", fusion$Rabbit.Gene.Name)] <- "0"
## check count of probes with gene name == "LOC" in original table
length(grep("^LOC", fusion$Associated.Transcript.Name)) ## 0
[1] 0
Clean ” ” annotations in Rabbit.Gene.Name
## count of probes with Rabbit.Gene.Name == "" in bb
sum(nchar(fusion$Rabbit.Gene.Name) == 0) ## 2526
[1] 2526
## replace by 0
fusion$Rabbit.Gene.Name[nchar(fusion$Rabbit.Gene.Name) == 0] <- "0"
## count of probes with gene name == "" in original table
sum(nchar(fusion$Associated.Transcript.Name) == 0) ## 0
[1] 0
Clean ” ” annotations in Rabbit.Ensembl.Gene.ID
(ENSOCUG)
## count of probes with Rabbit.Ensembl.Gene.ID (ENSOCUG) == "" in bb
sum(nchar(fusion$Rabbit.Ensembl.Gene.ID) == 0) ## 2537
[1] 2537
## replace by 0
fusion$Rabbit.Ensembl.Gene.ID[nchar(fusion$Rabbit.Ensembl.Gene.ID) == 0] <- "0"
## count of probes with ENSOCUG == "" in original table
sum(nchar(fusion$Associated.Transcript.Name) == 0) ## 0
[1] 0
fusion
objectAssociated.Transcript.Name
!= 0
tmp_HUGO_VD_1 <- dplyr::filter(fusion, Associated.Transcript.Name != 0) ##
dim(tmp_HUGO_VD_1) ## 20537 32
[1] 20537 32
Associated.Transcript.Name
== 0
tmp_HUGO_VD_0 <- dplyr::filter(fusion, Associated.Transcript.Name == 0) ##
dim(tmp_HUGO_VD_0) ## 2062 32
[1] 2062 32
Rabbit.Gene.Name
!= 0
[1] 19547 32
Associated.Transcript.Name
unchanged
tmp_HUGO_unchanged <- length(intersect(paste0(tmp_HUGO_bb$ProbeName, "_", tmp_HUGO_bb$Rabbit.Gene.Name), paste0(tmp_HUGO_VD_1$ProbeName, "_", tmp_HUGO_VD_1$Associated.Transcript.Name)))
tmp_HUGO_unchanged ## 16883
[1] 16883
Rabbit.Gene.Name
== 0
Check updated annotations
## Associated.Transcript.Name improved
dim(fusion)[1] - tmp_HUGO_unchanged - dim(tmp_0_bb)[1] ## 2664
[1] 2664
Custom probes with annotation (4665)
cust_ENSOCUG_OR_HUGO <- filter(anno_merge, grepl("CUST_", ProbeName), Ensembl.Gene.ID != 0 | Associated.Transcript.Name != 0)
dim(cust_ENSOCUG_OR_HUGO) ## 4665 28
[1] 4665 28
cust_ENSOCUG_OR_HUGO %>%
select(ProbeName, GeneName, Ensembl.Gene.ID, Associated.Transcript.Name) %>%
rmarkdown::paged_table(options = list(rows.print = 10))