Import and format data
pacman::p_load(dplyr,
xlsx,
tibble,
ggplot2,
dendextend,
settings,
ggpubr,
flextable)
Import raw data of the microarray. The microarray data have been submitted to NCBI GEO (Gene Expression Omnibus; accession number: GSE104838 https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE104838 ).
working_set_raw <- read.csv("data/GPS_NED_MA0199_All_WorkingSet.csv",
sep = ";",
dec = ",",
row.names = 1) %>%
as.matrix
dim(working_set_raw)
[1] 33284 72
Format working_set_raw
Glance at working_set_raw after formatting
working_set_raw[1:10,1:5] %>%
data.frame %>%
rownames_to_column(var = "Oligo") %>%
flextable %>%
autofit
Oligo | oligo_4 | oligo_5 | oligo_7 | oligo_8 | oligo_9 |
---|---|---|---|---|---|
GPS24871 | 4.768867 | 10.56514 | 9.966930 | 7.712398 | 10.65519 |
GPS24872 | 5.123744 | 11.05010 | 10.175272 | 7.772364 | 10.28265 |
GPS24873 | 4.869991 | 10.68362 | 9.930170 | 7.962007 | 10.31245 |
GPS24874 | 5.424320 | 11.08182 | 9.761626 | 7.852183 | 10.55690 |
GPS24875 | 5.385345 | 10.97871 | 10.455120 | 8.092517 | 10.50853 |
GPS24876 | 4.721992 | 10.80676 | 9.905375 | 8.607926 | 10.23137 |
GPS24877 | 5.096335 | 10.77506 | 9.727906 | 8.346339 | 10.55745 |
GPS24878 | 5.021907 | 10.93837 | 9.878364 | 7.966467 | 10.56664 |
GPS24879 | 4.976638 | 10.72094 | 9.567397 | 8.185599 | 10.48794 |
GPS24880 | 6.157873 | 11.01759 | 9.741641 | 8.191910 | 10.16702 |
Import table with environment variables
Format metadata
## remove fist column and use it as rownames
xp_design <- data.frame(xp_design[,-1], row.names = xp_design[,1])
## Rename colnames
colnames(xp_design) <- c("id", "treatment", "age")
## sort by id
xp_design <- xp_design[order(xp_design$id),]
## Convert id in character
xp_design$id <- as.character(xp_design$id)
## Rename treatment variable
xp_design$treatment[xp_design$treatment == "CAB"] <- "FFab"
xp_design$treatment[xp_design$treatment == "CC"] <- "FF"
xp_design$treatment[xp_design$treatment == "PP"] <- "NF"
levels(xp_design$treatment) <- list(FFab="FFab", FF="FF", NF="NF", TT = "TT")
## add "J" to each level of variable day and convert to factor
xp_design$age <- factor(paste0("d", xp_design$age))
## Create variable `age_treatment`, convert to factor
xp_design$contrast <- factor(paste0(xp_design$age, "_", xp_design$treatment))
Glance at metadata after formatting
xp_design[1:10,] %>%
data.frame %>%
rownames_to_column(var = "Sample") %>%
flextable %>%
autofit
Sample | id | treatment | age | contrast |
---|---|---|---|---|
GPS24894 | 77 | NF | d35 | d35_NF |
GPS24914 | 78 | NF | d35 | d35_NF |
GPS24932 | 79 | NF | d35 | d35_NF |
GPS24913 | 80 | NF | d35 | d35_NF |
GPS24911 | 81 | FF | d35 | d35_FF |
GPS24930 | 82 | FF | d35 | d35_FF |
GPS24877 | 83 | NF | d35 | d35_NF |
GPS24895 | 84 | NF | d35 | d35_NF |
GPS24879 | 85 | TT | d35 | d35_TT |
GPS24897 | 86 | TT | d35 | d35_TT |
Import qualitative measures
Format param
## Rename first column with "id"
colnames(param)[1] <- "id"
## Convert `id` to character
param[,1] <- as.character(param[,1])
## Convert cage number to factor
param[,2] <- as.factor(param[,2])
Create table with all data describing samples
Rename colnames
meta_data_raw %>%
rename(day_2_w = pds_j2,
day_35_w = pds_j35,
slaughter_w = pds_abat) -> meta_data_raw
Select variables
meta_data_raw %>%
select(treatment,
age,
contrast,
day_2_w,
day_35_w,
slaughter_w) -> meta_data_raw
Remove samples from TT treatment in working_set and metadata objects
## Remove samples from TT in working_set
working_set_TT <- working_set_raw[meta_data_raw$treatment != "TT", ]
Glance at meta_data_raw
meta_data_TT %>%
head(20) %>%
flextable %>%
autofit
treatment | age | contrast | day_2_w | day_35_w | slaughter_w |
---|---|---|---|---|---|
FFab | d35 | d35_FFab | 73 | 721 | 721 |
FFab | d35 | d35_FFab | 88 | 766 | 766 |
FF | d35 | d35_FF | 86 | 934 | 934 |
FF | d35 | d35_FF | 99 | 1,073 | 1,073 |
FF | d35 | d35_FF | 99 | 830 | 830 |
NF | d35 | d35_NF | 75 | 919 | 919 |
NF | d35 | d35_NF | 88 | 721 | 721 |
FFab | d49 | d49_FFab | 77 | 883 | 1,800 |
FFab | d49 | d49_FFab | 92 | 996 | 1,836 |
FFab | d49 | d49_FFab | 117 | 1,164 | 2,184 |
FF | d49 | d49_FF | 77 | 753 | 1,350 |
FF | d49 | d49_FF | 82 | 796 | 1,776 |
NF | d49 | d49_NF | 60 | 868 | 1,775 |
NF | d49 | d49_NF | 91 | 886 | 1,722 |
FFab | d35 | d35_FFab | 74 | 827 | 827 |
FFab | d35 | d35_FFab | 84 | 913 | 913 |
FF | d35 | d35_FF | 78 | 906 | 906 |
FF | d35 | d35_FF | 80 | 863 | 863 |
NF | d35 | d35_NF | 66 | 813 | 813 |
NF | d35 | d35_NF | 64 | 888 | 888 |
Hierarchical classification with 1-corr distance matrix
L <- "GPS24914"
hc <- hclust(as.dist(1 - cor(t(working_set_TT))), method = "ward.D2")
clust_corr <- as.dendrogram(hc)
par(mar = c(10,4,3,1))
clust_corr %>%
set("by_lists_branches_col", L, TF_value = "red") %>%
set("leaves_cex", 2) %>%
color_labels(col = c("red"), labels = labels(.)[labels(.)=="GPS24914"]) %>%
ladderize(TRUE) %>%
plot(axes = F,
cex = 2,
main = "Hierarchical clustering with (1-corr) distance and Ward algorithm")
## reset par options
reset_par()
## nMDS with "1-cor" distance
set.seed(123)
tmp <- vegan::metaMDS(as.dist(1-cor(t(working_set_TT))), k = 2, maxit = 100)
Run 0 stress 0.1030165
Run 1 stress 0.106524
Run 2 stress 0.1074274
Run 3 stress 0.1074881
Run 4 stress 0.1089342
Run 5 stress 0.1106128
Run 6 stress 0.104248
Run 7 stress 0.103058
... Procrustes: rmse 0.006226324 max resid 0.02790997
Run 8 stress 0.1085403
Run 9 stress 0.1038872
Run 10 stress 0.1047714
Run 11 stress 0.1029835
... New best solution
... Procrustes: rmse 0.006748053 max resid 0.02406572
Run 12 stress 0.1036047
Run 13 stress 0.1033931
... Procrustes: rmse 0.02963116 max resid 0.08950939
Run 14 stress 0.1032798
... Procrustes: rmse 0.02395133 max resid 0.07011736
Run 15 stress 0.103278
... Procrustes: rmse 0.02433447 max resid 0.07724788
Run 16 stress 0.1074629
Run 17 stress 0.1085151
Run 18 stress 0.1030281
... Procrustes: rmse 0.0132038 max resid 0.0468257
Run 19 stress 0.1029785
... New best solution
... Procrustes: rmse 0.002907277 max resid 0.00843367
... Similar to previous best
Run 20 stress 0.1033395
... Procrustes: rmse 0.03002822 max resid 0.08943733
*** Best solution repeated 1 times
tmp2 <- rownames_to_column(data.frame(tmp$points[,1:2]))
ggplot(data.frame(tmp2), aes(x = MDS1, y = MDS2)) +
geom_point(aes(col = meta_data_TT$age), size = 4.5) +
geom_text(aes(label = ifelse(rownames(meta_data_TT) == "GPS24914", "GPS24914", '')),
hjust = -.3,
vjust = .6,
colour = "black",
cex = 3) +
labs(title = "nMDS with (1-corr) distance", subtitle = paste0("Stress = ", round(tmp$stress, 3))) +
labs(colour = "Age") +
theme_bw() +
theme(axis.ticks = element_blank(),
axis.text = element_blank(),
axis.title = element_text(size = 12, color = "black"),
axis.title.x = element_blank(),
axis.title.y = element_blank()) +
# scale_color_grey() +
theme(aspect.ratio = 1) + ## square shape
ggpubr::rremove("grid") -> p_nMDS_corr
p_nMDS_corr
## ggsave(filename = fig/nMDS_corr.png", plot = p_nMDS_corr, dpi=600, width = 6, height = 5)
Remove sample GPS24914 in working_set and metadata
Summary of qualitatives variables
meta_data %>%
select(where(is.factor)) %>%
summary
treatment age contrast
FF :19 d35:29 d35_FF :10
FFab:19 d49:27 d35_FFab:10
NF :18 d35_NF : 9
d49_FF : 9
d49_FFab: 9
d49_NF : 9