Probe annotation and DEG

Improvement of probe annotation using (i) custom annotation files, (ii) Better Bunny website tool and (iii) manual BLAST of probe sequence

pacman::p_load(dplyr,
               limma,
               ggplot2,
               tibble,
               flextable)
load(file = "RData/data.RData")
source("R/functions.R")

Import annotations

Agilent microarray annotation

anno_array <- read.csv(file = "data/GPS_NED_MA0199_All_fData.csv", 
                       sep = ";", 
                       stringsAsFactors = F)
## rename colname of the fist column
colnames(anno_array)[1] <- "oligo"
## pick up oligo's name frm working_set
anno_array$oligo <- colnames(working_set)

custom INRAE BDR lab annotation

## V. Duranthon annotation
anno_conf <- read.csv(file = "data/150602_annotation_microarray_lapin_V2.csv", 
                      sep = ";", 
                      stringsAsFactors = F)
## rename colname of the fist column
colnames(anno_conf)[1] <- "ProbeName"
## describe TargetID colnames
colnames(anno_conf)[colnames(anno_conf) == "TargetID"] <- "Target.ID.Human"
colnames(anno_conf)[colnames(anno_conf) == "TargetID.1"] <- "Target.ID.Rabbit"

Remove oligos cleaned

anno_conf_nett <- anno_conf[anno_conf$ProbeName %in% anno_array$ProbeName,]

V. Jacquier annotation DOI 10.1186/s12864-015-1218-9

anno_immun_VJ <- xlsx::read.xlsx(file = "data/Liste453geneImmunityBMCgenocmicsJacquier2015.xlsx", 
                                 sheetIndex = 1, 
                                 colIndex = c(1:3), 
                                 as.data.frame = TRUE)

Merge data.frame

## merge Agilent and VD annotation
anno_merge <- dplyr::left_join(anno_array, anno_conf_nett, by = "ProbeName")
## Rename gene description colnames
colnames(anno_merge)[colnames(anno_merge) == "Description.x"] <- "Description_Agilent"
colnames(anno_merge)[colnames(anno_merge) == "Description.y"] <- "Description_VD"
## remove "FASTA" column (same as "Sequence" column)
anno_merge %>% dplyr::select(-FASTA) -> anno_merge
## remove column "ControlType" (non informative)
anno_merge %>% dplyr::select(-ControlType) -> anno_merge
## remove '-201', '-202', '-203' from the end of "Associated Transcript Name"
anno_merge$Associated.Transcript.Name <- gsub("-201", "", anno_merge$Associated.Transcript.Name)
anno_merge$Associated.Transcript.Name <- gsub("-202", "", anno_merge$Associated.Transcript.Name)
anno_merge$Associated.Transcript.Name <- gsub("-203", "", anno_merge$Associated.Transcript.Name)
## add "Immun" column identifying VJ genes
anno_merge$Add_annot_V_Jacquier_2015 <- "FALSE"
## annote immun sur la base du fichier de VJ
anno_merge$Add_annot_V_Jacquier_2015[anno_merge$Ensembl.Gene.ID %in% anno_immun_VJ$ENSEMBL_ID] <- "TRUE"
## count of "immun" oligo still on the array
sum(anno_merge$Add_annot_V_Jacquier_2015 == 'TRUE')  ## 436
[1] 436

Clean and format anno_merge dataframe

## replace empty "Associated.Transcript.Name" annotation by 0
anno_merge$Associated.Transcript.Name[grepl("^$", anno_merge$Associated.Transcript.Name)] <- 0
## replace empty ENSOCUG annotation by 0
anno_merge$Ensembl.Gene.ID[grepl("^$", anno_merge$Ensembl.Gene.ID)] <- 0
## replace "Associated.Transcript.Name" with LOC by 0
anno_merge$Associated.Transcript.Name[grepl("^LOC", anno_merge$Associated.Transcript.Name)] <- 0

Statistics

## count of oligos with Associated.Transcript.Name = 0
grepl("^0$", anno_merge$Associated.Transcript.Name) %>% sum  ## 8455
[1] 8455
## count of oligos with Ensembl.Gene.ID = 0
grepl("^0$", anno_merge$Ensembl.Gene.ID) %>% sum  ## 6635
[1] 6635
## count of oligo without annotation
anno_merge$oligo[grepl("^0$", anno_merge$Associated.Transcript.Name) & grepl("^0$", anno_merge$Ensembl.Gene.ID)] %>% length()  ## 6526
[1] 6526

Agilent annotation description

### A_04_P Agilent probes annotation
anno_merge %>% 
  filter(grepl("A_04_", ProbeName)) %>% 
  dim()  ## 22599 28
[1] 22599    28

Glance at anno_merge

anno_merge %>% 
  filter(grepl("A_04_", ProbeName)) %>% 
  select(ProbeName, GeneName, Ensembl.Gene.ID, Associated.Transcript.Name) %>%
  rmarkdown::paged_table(options = list(rows.print = 10))

Updating annotations

Update Agilent annotation (A_04_P) using Better Bunny tool

(http://cptweb.cpt.wayne.edu/BB/about.php) v3.0 08/2020

## import output file in "maj_bb_v3" object
maj_bb_v3 <- read.csv(file = "data/annot_bb_v3.0.csv", 
                      sep = ",", 
                      header = T, 
                      stringsAsFactors = F)
## maj_bb_v3 don't have the same colnames as anno_merge
## colnames correction  to allow merging
colnames(maj_bb_v3)[colnames(maj_bb_v3) == "Probe.ID"] <- "ProbeName"

Merge the 2 annotation objects by ProbeName for A_04_ oligos

## join matching values from maj_bb_v3 to anno_merge
fusion <- left_join(x = anno_merge[grepl("A_04_", anno_merge$ProbeName),],
                    y = maj_bb_v3,
                    by = "ProbeName")
dim(fusion)  ## 22599 32
[1] 22599    32

Updating Clean annotation (“LOC”, ““) Update gene name and ENSOCUG by BB annotation when they are different

Clean “LOC” annotations in Rabbit.Gene.Name

## count of probes with Rabbit.Gene.Name == "LOC" in bb
length(grep("^LOC", fusion$Rabbit.Gene.Name))  ## 526
[1] 526
## replace by 0
fusion$Rabbit.Gene.Name[grep(pattern = "^LOC", fusion$Rabbit.Gene.Name)] <- "0"
## check count of probes with gene name == "LOC" in original table
length(grep("^LOC", fusion$Associated.Transcript.Name))  ## 0
[1] 0

Clean ” ” annotations in Rabbit.Gene.Name

## count of probes with Rabbit.Gene.Name == "" in bb
sum(nchar(fusion$Rabbit.Gene.Name) == 0) ## 2526
[1] 2526
## replace by 0
fusion$Rabbit.Gene.Name[nchar(fusion$Rabbit.Gene.Name) == 0] <- "0"
## count of probes with gene name == "" in original table
sum(nchar(fusion$Associated.Transcript.Name) == 0)  ## 0
[1] 0

Clean ” ” annotations in Rabbit.Ensembl.Gene.ID (ENSOCUG)

## count of probes with Rabbit.Ensembl.Gene.ID (ENSOCUG) == "" in bb
sum(nchar(fusion$Rabbit.Ensembl.Gene.ID) == 0) ## 2537
[1] 2537
## replace by 0
fusion$Rabbit.Ensembl.Gene.ID[nchar(fusion$Rabbit.Ensembl.Gene.ID) == 0] <- "0"
## count of probes with ENSOCUG == "" in original table
sum(nchar(fusion$Associated.Transcript.Name) == 0)  ## 0
[1] 0

Statistics of fusion object

Associated.Transcript.Name != 0

tmp_HUGO_VD_1 <-  dplyr::filter(fusion, Associated.Transcript.Name != 0) ##
dim(tmp_HUGO_VD_1)  ## 20537  32
[1] 20537    32

Associated.Transcript.Name == 0

tmp_HUGO_VD_0 <- dplyr::filter(fusion, Associated.Transcript.Name == 0) ##
dim(tmp_HUGO_VD_0)  ## 2062  32
[1] 2062   32

Rabbit.Gene.Name != 0

tmp_HUGO_bb <- dplyr::filter(fusion, Rabbit.Gene.Name != 0)  ## 
dim(tmp_HUGO_bb)  ## 19547  32
[1] 19547    32

Associated.Transcript.Name unchanged

tmp_HUGO_unchanged <- length(intersect(paste0(tmp_HUGO_bb$ProbeName, "_", tmp_HUGO_bb$Rabbit.Gene.Name), paste0(tmp_HUGO_VD_1$ProbeName, "_", tmp_HUGO_VD_1$Associated.Transcript.Name)))
tmp_HUGO_unchanged  ## 16883
[1] 16883

Rabbit.Gene.Name == 0

tmp_0_bb <- dplyr::filter(fusion, Rabbit.Gene.Name == 0)
dim(tmp_0_bb)  ## 3052 32
[1] 3052   32

Check updated annotations

## Associated.Transcript.Name improved
dim(fusion)[1] - tmp_HUGO_unchanged - dim(tmp_0_bb)[1]  ## 2664
[1] 2664

Custom probes with annotation (4665)

cust_ENSOCUG_OR_HUGO <- filter(anno_merge, grepl("CUST_", ProbeName), Ensembl.Gene.ID != 0 |  Associated.Transcript.Name != 0)
dim(cust_ENSOCUG_OR_HUGO)  ## 4665 28
[1] 4665   28
cust_ENSOCUG_OR_HUGO %>% 
  select(ProbeName, GeneName, Ensembl.Gene.ID, Associated.Transcript.Name) %>% 
  rmarkdown::paged_table(options = list(rows.print = 10))