Dendrograms
Metabolites and lipids multivariate statistical analysis in R
Practical applications of dendrograms (examples)
Dendrograms in R via ggtree
# Building ggtree dendrograms in R.
# Calling libraries:
library(tidyverse)
library(ggtree)
library(rstatix)
# Label is stored as a factor (grouping variable):
data[["Label"]] <- as.factor(data[["Label"]])
# Samples labelled "PAN" (patients with pancreatitis) are excluded:
data.no.PAN <- filter(data, Label != "PAN")
# Creating a long matrix.
# The long table is built from data.no.PAN (not from the full data set), so
# that the Mann-Whitney U test computed from it compares two groups only.
# Using the unfiltered data would make wilcox_test() run pairwise comparisons
# and return several rows per lipid, which breaks the top-12 selection.
# droplevels() removes the now-empty "PAN" level from the Label factor.
data.long <-
  data.no.PAN %>%
  droplevels() %>%
  select(-`Sample Name`) %>%
  pivot_longer(
    cols = `CE 16:1`:`SM 42:1;O2`,
    names_to = "Lipids",
    values_to = "Concentrations"
  )
# Clustering is restricted to the 12 lipids with the lowest M-W U test p-values.
# The test is run separately for every lipid, without p-value adjustment:
Mann.Whitney.test <-
  wilcox_test(
    group_by(data.long, Lipids),
    Concentrations ~ Label,
    p.adjust.method = 'none'
  )
# Sorting by p-value (ascending) and keeping the first 12 rows:
Mann.Whitney.test.head <- slice_head(arrange(Mann.Whitney.test, p), n = 12)
# Names of the selected lipids:
Lipids <- pull(Mann.Whitney.test.head, Lipids)
# Input for hierarchical clustering: sample names, labels, and the
# selected lipids (in this column order):
data.selected <-
  select(data.no.PAN, all_of(c("Sample Name", "Label", Lipids)))
# Data log10-transformation and Pareto-scaling:
# Every numeric column is log10-transformed; non-numeric columns are left
# untouched. across(where(...)) replaces the superseded mutate_if().
data.log10 <-
  data.selected %>%
  mutate(across(where(is.numeric), log10))
# Pareto scaling of a numeric vector x: the values are mean-centered and
# divided by the square root of their standard deviation.
Pareto.scaling <- function(x) {
  centered <- x - mean(x)
  scale.factor <- sqrt(sd(x))
  centered / scale.factor
}
# Pareto.scaling() is applied to every numeric column of the log10-transformed
# data. across(where(...)) replaces the superseded mutate_if(), and the
# function is passed directly instead of being wrapped in a lambda.
data.Pareto.scaled <-
  data.log10 %>%
  mutate(across(where(is.numeric), Pareto.scaling))
# Rows MUST carry the sample names before distances are computed;
# the names are needed later to identify the tree branches.
# Existing row names are cleared first, then the `Sample Name` column
# is moved into the row names (tidyverse/tibble helpers):
data.Pareto.scaled <-
  column_to_rownames(
    remove_rownames(data.Pareto.scaled),
    var = "Sample Name"
  )
# Now, we can compute matrix of Euclidean distances between samples (base R functions).
# The Label column is dropped first, so only the scaled lipid columns enter
# the distance computation. `diag` is spelled out as TRUE rather than T,
# because T is an ordinary variable that can be reassigned.
distance <-
  data.Pareto.scaled %>%
  select(-Label) %>%
  dist(
    method = "euclidean",
    diag = TRUE
  )
# Hierarchical clustering of the distance matrix with the Ward.D2
# method (base R hclust()):
clustering <- hclust(
  d = distance,
  method = "ward.D2"
)
# Annotation table for the ggtree branches: one row per sample,
# holding the sample name and its label.
tip_data <- select(data.selected, `Sample Name`, Label)
# Colors of the ggtree tips, named by label value:
colors <- c(
  N = "blue",
  T = "red2"
)
# Drawing the ggtree dendrogram from the hierarchical clustering result.
# A circular layout is used to save space (206 samples).
# For data sets with fewer observations, a classic rectangular tree
# can be used instead: set layout to 'rectangular'.
ggtree(
  tr = clustering,
  layout = "circular",
  size = 0.8
)


Last updated