Published

30 Mar 2026

Overview:

DIABLO Multi-Omics Integration

  1. Load already QC and normalized data
    Clean, normalize, and scale each omics dataset independently.

  2. Select taxa level and combine X
    Use PLS-DA to confirm that class separation exists before applying sparsity.

  3. Identify sample sets
    Identify the optimal number of components (ncomp) and variables to select per component (keepX) via cross-validation.

  4. Select response variable
    Build the sPLS-DA model using the tuned parameters.

  5. Specify design matrix
    Evaluate classification accuracy, balanced error rate (BER), and stability of selected variables.

  6. Model
    Retrieve the most discriminant variables identified by the sPLS-DA model.

  7. Visualize models
    Use the selected features as input for integration frameworks such as DIABLO (block.splsda()).

parameter tuning

There are several variables that wee kan tweak in order to optimise our integration model these include - weights - number of blocks - species or strain level of MG - number of features to select for tuning

I want to test a couple of combinations to see if there are any signifcant changes

1. Load already QC and normalized data

Code
##################
# LOAD LIBRARIES #
##################
library(tidyverse)
library(mixOmics)
library(readxl)
library(cowplot)
library(RColorBrewer)
library(scales)

library(VennDiagram)
library(gridExtra)
library(ComplexHeatmap)
library(circlize)

library(tidygraph)
library(igraph)
library(ggraph)
select <- dplyr::select
map <- purrr::map

# setwd("/Users/vilkal/work/Brolidens_work/Projects/Candida-omics/src")
# pal <- c("#ffa998", "#f588af", "#902267") # #e05e85
# feat <- c("#bdd3c5", "#48919d", "#3d4b6eff") # "#1f2e55", "#c6cdf7"
# pal <- c("#d18ac8", "#3db8ac", "#8861d2", "#6095d7", "#755fa3", "#c44999")
feat <- c("#e39ddaff", "#6dded3ff", "#a68cd5ff") #,"#6095d7","#755fa3","#c44999")

# pal <- c("#9e8dec", "#902267", "#cc93cf", "#a5c97b", "#de9548", "#63d3b4") #"#abcb4b"

pal <- list(
  CST = c(
    "I" = "#148F77",
    "II" = "#6d9f06",
    "III" = "#A3E4D7",
    "IV-B" = "#E794C1",
    "IV-C" = "#b5cb8c",
    "V" = "#43BA8F"
  ),
  subCST = c(
    "#a5c97b",
    "#cc93cf",
    "#de9548",
    "#63d3b4",
    "#9e8dec",
    "#902267",
    '#FB5273',
    "#91c5de"
  ),
  RVVC = c("#EAEAEA", "#fd814e"),
  BV = c("#EAEAEA", "#fd814e"),
  NAC = c("#EAEAEA", "#fd814e"),
  ST = c("#EAEAEA", "#fd814e"),
  missing_page = c("#EAEAEA", "#fd814e"),
  Clin = c("#a5c97b", "#cc93cf", "#de9548", "#63d3b4", "#9e8dec", "#902267"),
  Clin_gr = c("#ffa998", '#FB5273', "#902267"),
  RVVC_AS = c("#ffa998", '#FB5273', "#902267"),
  RVVC_pos = c(
    "#A6CEE3",
    "#1F78B4",
    #"#EAEAEA",
    #"gray",
    "#e490c4ff",
    "#c12d88ff"
  ), # "#FB9A99", "#E31A1C", "#B2DF8A", "#609c5bff", "#FDBF6F", "#FF7F00",, "#CAB2D6", "#6a3d9a" "#e490c4ff", "#c12d88ff"
  trx_louvain_7 = c(
    "#de9548",
    "#cc93cf",
    "#a5c97b",
    "#9e8dec",
    "#902267",
    "#63d3b4",
    '#FB5273'
  ),
  metab_louvain = c(
    "#cc93cf",
    "#a5c97b",
    "#de9548",
    "#902267",
    "#9e8dec",
    "#63d3b4"
    #'#FB5273'
  ),
  microb_louvain = c(
    "#cc93cf",
    "#de9548",
    "#a5c97b",
    "#9e8dec",
    "#902267",
    "#63d3b4",
    '#FB5273'
  ),
  fun_louvain = c(
    "#a5c97b",
    "#cc93cf",
    "#de9548",
    "#63d3b4",
    "#9e8dec",
    "#902267",
    '#FB5273'
    #"#984EA3"
    #"#B3CDE3"
  ),
  integ_louvain = c(
    "#a5c97b",
    "#cc93cf",
    "#de9548",
    "#63d3b4",
    "#9e8dec",
    "#902267",
    '#FB5273',
    "#984EA3"
    #"#B3CDE3"
  ),
  "Wet_smear:_clue_cells_(y/n)" = c("#2266ac", "#91c5de"),
  hyphae = c("#f26386", "#fd814e", "#a4d984", "#fbbc52"),
  pos = c("#91c5de", "#2266ac"),
  "Clinical_score_(0-5)" = brewer.pal(6, name = "Reds")
)
data_path <- "/Users/vilkal/work/Brolidens_work/Projects/Candida-omics/Tidy_data/Metagenomics/Metageonome_all_vagswab_metaphlan.csv"
# meta_path <- "/Users/vilkal/Downloads/Metadata_svamp.xlsx"

#############
# LOAD DATA #
#############
TRX_path <- "../Results/MixOmic/Transcriptomics_batch_filt.csv"
MG_path <- "../Results/MixOmic/Metageonome_vagswab_CLR_transformed.RDS"
MB_path <- "../Results/MixOmic/Metabolom_Normalized.csv"
FM_path <- "../Tidy_data/Metagenomics/Functional_annotation_CLR_filtered.RDS"

# expression data
TRX_batch <- read.csv(TRX_path, row.names = 1)
MG_matrix <- readRDS(MG_path)
MB_matrix <- read.csv(MB_path, row.names = 1)
FM_matrix <- readRDS(FM_path)

top_hvg <- read_csv("../Results/Tissue_QC/HVG.csv")

#############
# LOAD META #
#############
Clin_path <- "/Users/vilkal/work/Brolidens_work/Projects/Candida-omics/data/Metadata_svamp_FINAL.xlsx"
comb_meta_path <- "../Results/Combined_gr_meta_data.csv"

# meta data
Clin_data <- read_xlsx(Clin_path, sheet = "Metadata", na = "na")
meta <- read_csv(comb_meta_path) %>%
  rename(svamp_ID = "sample") %>%
  mutate(across(contains("louvain"), ~ factor(.x))) %>%
  mutate(across(2:25, ~ factor(.x)))

m <- read_csv("../Results/Sample_omic_table.csv")

2. Preparing multi-omics matrices

Steps for preparing multi-omics matrices (TR, MG, MB)

  1. Remove unwanted taxa
    • For MG data, drop columns whose taxa names match a set of regular expressions
      (e.g., remove Candida/fungi using regex patterns).
  2. Select a taxonomic level for MG
    • Choose one level from the clr_data list (species, genus, family, etc.).
  3. Orient all matrices so that samples are rows
    • TR and MB may need to be transposed and converted into matrices.
    • MG is already sample × feature.
  4. Compute the intersection of shared samples
    • Extract row names from TR, MG, and MB and identify samples common to all layers.
  5. Subset all matrices to include only overlapping samples
    • Ensures all omics blocks have the same sample set and can be correlated.
  6. Return the cleaned and aligned omics list
    • The final list contains X$TR, X$MG, and X$MB, all with identical sample ordering.

Note, S15 and S10 was filtered out from the luminal metagenomics data, One due to low total count and one had less than 60% of taxa left after filtering.

# ---- 2. Remove unwanted taxa (candida) from MG ----
# k__Eukaryota|p__Ascomycota|c__Saccharomycetes|o__Saccharomycetales|f__Debaryomycetaceae|g__Candida|s__Candida_albicans
reg <- c(
  "kingdom" = "",
  "phylum" = "Ascomycota",
  "class" = "Saccharomycetes",
  "order" = "Saccharomycetales",
  "family" = "Debaryomycetaceae",
  "genus" = "Candida",
  "species" = "Candida_albicans|Candida_parapsilosis|Candida_dubliniensis",
  "strain" = "Candida_albicans EUK5476|Candida_parapsilosis EUK5480|Candida_dubliniensis EUK42374|Candida_glabrata EUK5478"
)

prep_multi_omics <- function(
  taxa_level = "species",
  remove_regex_list = reg
) {
  if (!is.null(remove_regex_list)) {
    MG_matrix <- MG_matrix %>%
      mutate(across(
        clr_data,
        ~ imap(
          .,
          ~ .x[, !grepl(remove_regex_list[[.y]], colnames(.x)), drop = FALSE]
        )
      ))
  }
  # ---- 1. select taxa level and remove long strain id, replace with int
  MG_matrix <- MG_matrix[["clr_data"]][[taxa_level]]
  new_names <- sub("\\s+\\S+$", "", colnames(MG_matrix)) # remove strain id
  new_names <- make.unique(new_names, sep = " ") # Make names unique
  colnames(MG_matrix) <- new_names # Assign new names

  # ---- 3. select 5000 most higly variable genes
  TRX_HVG <- TRX_batch[top_hvg$gene[1:5000], ]

  # ---- 4. Ensure correct orientation (samples = rows) ----
  views <- list(
    Transcriptomics = as.matrix(t(TRX_HVG)),
    Metabolomics = as.matrix(t(MB_matrix)),
    Metagenomics = unclass(MG_matrix),
    Functional = unclass(t(FM_matrix$path_lvl))
  )

  # ---- 5. filter to include selected samples for MOFA
  m_ <- m %>%
    filter(sample %in% .$sample[m$at_least_two & m$all_BL])

  X <- views %>%
    imap(., ~ .x[m_$sample[m_[[.y]]], ])

  # ---- 6. Subset each matrix to baseline only and overlapping samples ----
  ids <<- map(X, rownames)
  ids_uniq <<- Reduce(intersect, map(X, rownames))

  X <- X %>%
    map(~ .x[ids_uniq, , drop = FALSE]) %>%
    set_names(., c("TR", "MB", "MG", "FU"))
  return(X)
}

X <- c("species", "strain") %>%
  set_names(.) %>%
  map(., ~ prep_multi_omics(taxa_level = .x))
gr <- meta %>%
  filter(svamp_ID %in% ids_uniq)

# group distribution
vars <- c(
  "Clinical_score_(0-5)",
  "Clin_gr",
  "RVVC",
  "hyphae",
  "RVVC_pos",
  "BV"
) %>%
  set_names()
map(vars, ~ table(gr[[.x]]))
$`Clinical_score_(0-5)`

 0  1  2  3  4  5 
42  6  9  5  5  3 

$Clin_gr

  0 1-3 4-5 
 42  20   8 

$RVVC

Control    RVVC 
     39      31 

$hyphae

  Control Control_H      RVVC    RVVC_H 
       36         3        15        16 

$RVVC_pos

Control_neg Control_pos    RVVC_neg    RVVC_pos 
         29          10           8          23 

$BV

 no yes 
 66   4 
# specify Y
Y <- c("Clinical_score_(0-5)", "Clin_gr", "RVVC_pos", "pos") %>%
  set_names() %>%
  #set_names(., c("Clin", "Clin_bin", "RVVC_pos", "pos")) %>%
  map(., ~ factor(as.character(gr[[.x]])))

saveRDS(Y, "../Results/MixOmic/Y_object.RDS")
saveRDS(X, "../Results/MixOmic/X_object.RDS")

# write_csv(meta, "../Results/MixOmic/meta_int.csv")
X <- readRDS("../Results/MixOmic/X_object.RDS")

plot sample overlap

make_venn <- function(i) {
  n12 <- length(intersect(i[["TR"]], i[["MG"]]))
  n23 <- length(intersect(i[["MG"]], i[["MB"]]))
  n13 <- length(intersect(i[["TR"]], i[["MB"]]))
  n123 <- length(Reduce(intersect, list(i[["TR"]], i[["MG"]], i[["MB"]])))

  venn.plot <- draw.triple.venn(
    area1 = length(i[["TR"]]),
    area2 = length(i[["MG"]]),
    area3 = length(i[["MB"]]),
    n12 = n12,
    n23 = n23,
    n13 = n13,
    n123 = n123,
    category = c("TR", "MG", "MB"),
    fill = feat,
    col = feat,
    alpha = 0.5,
    lty = rep("solid", 3),
    lwd = rep(2, 3),
    cat.fontfamily = rep("sans", 3),
    cex = 2,
    cat.cex = 2
  )
}

# all samples and intersecting samples
# Variables are created when running prep_multi_omics() function
l <- list(ids_all, ids)

plots <- map(l, ~ make_venn(.x)) %>%
  map(., ~ grobTree(.x))
grid.newpage()

grid.arrange(
  arrangeGrob(plots[[1]], top = "With follow up samples"),
  arrangeGrob(plots[[2]], top = "Baseline only samples"),
  ncol = 2
)

4. Select response variable

### ------------------------------------------------------------
### 1. Select symptom columns and prepare binary clinical data
### ------------------------------------------------------------

# Extract only samples of interest and symptom score columns
df <- meta %>%
  filter(svamp_ID %in% ids_uniq) %>% # keep only overlapping samples
  select(svamp_ID, starts_with("Clinical score:")) # symptoms (y/n)

# Convert symptom columns to numeric 0/1 and clean column names
df_bin <- df %>%
  filter(!is.na(svamp_ID)) %>% # remove missing IDs
  mutate(across(starts_with("Clinical score:"), ~ as.numeric(.x))) %>% # convert factors/characters to numeric
  rename_with(
    ~ gsub("^(Clinical score: )| \\(y/n\\)$", "", .x),
    contains("Clinical score:") # clean names: remove prefix + "(y/n)"
  ) %>%
  #filter(!is.na(discharge)) %>% # ensure 'discharge' column is valid
  column_to_rownames("svamp_ID") # convert sample ID to rownames

# Convert to numeric matrix for ComplexHeatmap
mat <- as.matrix(df_bin)

# Determine row annotation height scaling factor
my_h <- 0.1 * nrow(mat)

### ------------------------------------------------------------
### 2. Prepare annotation dataframe (metadata)
### ------------------------------------------------------------
vars <- c(
  "Clinical_score_(0-5)",
  "Clin_gr",
  "RVVC",
  "hyphae",
  "RVVC_pos",
  "BV"
)
annot_col <- meta %>%
  filter(svamp_ID %in% rownames(df_bin)) %>% # keep same samples as heatmap matrix
  select(svamp_ID, any_of(vars)) %>% # select annotation variables
  mutate(across(1:length(vars) + 1, ~ factor(.x))) # ensure annotations are factors

# Quick check: ensure annotation rows match heatmap rows
dim(mat)
[1] 70  5
dim(annot_col)
[1] 70  7
### ------------------------------------------------------------
### 3. Define color palettes
### ------------------------------------------------------------

Red <- brewer.pal(6, name = "Reds")
pal5 <- c("#f26386", "#fd814e", "#a4d984", "#fbbc52")
pal1 <- c("#ffa998", "#f588af", "#902267") # colors for binned clinical score
pal4 <- c("#902267", '#FB5273', '#4FCEEF')
pal2 <- c(
  "#9e8dec",
  #"#902267",
  "#cc93cf", # colors for Groups
  "#a5c97b",
  "#de9548"
)
pal3 <- c("#2266ac", "#91c5de")

### ------------------------------------------------------------
### 4. Define row annotations for heatmap (ComplexHeatmap)
### ------------------------------------------------------------

row_annot <- rowAnnotation(
  #df = annot_col[, c("sample", vars)],
  Clin_gr = annot_col$Clin_gr, # binned clinical grade
  "Clinical_score_(0-5)" = annot_col$`Clinical_score_(0-5)`, # raw clinical score
  BV = annot_col$BV, # sample group
  RVVC = annot_col$RVVC, # sample group
  hyphae = annot_col$hyphae, # sample group
  RVVC_pos = annot_col$RVVC_pos, # sample group

  show_annotation_name = TRUE, # show labels
  annotation_name_gp = gpar(fontsize = 8), # label font size

  # Legend formatting
  annotation_legend_param = list(
    grid_height = unit(0.1, "mm"),
    grid_width = unit(2, "mm"),
    title = "",
    labels_gp = gpar(fontsize = 7),
    title_gp = gpar(fontsize = 8)
  ),

  simple_anno_size = unit(0.3, "cm"), # annotation row height

  # Colors for each annotation variable
  col = pal[vars] %>% imap(., ~ set_names(.x, levels(annot_col[[.y]])))
)

### ------------------------------------------------------------
### 5. Color function for the symptom heatmap
### ------------------------------------------------------------

# 0 = no symptom (white), 1 = symptom (red)
col_fun <- colorRamp2(
  c(0, 1),
  c("white", "#e05e85")
)
# Create heatmap
H <- Heatmap(
  mat,
  width = unit(ncol(mat) * 5, "mm"),
  height = unit(nrow(mat) * 2, "mm"),
  name = "Symptom Present",
  col = col_fun,
  right_annotation = row_annot,
  #cluster_rows = FALSE,
  #cluster_columns = FALSE,
  row_names_side = "right",
  column_names_rot = 45,
  row_names_gp = gpar(fontsize = 7),
  heatmap_legend_param = list(
    at = c(0, 1),
    labels = c("No", "Yes")
  ),
  #   cell_fun = function(j, i, x, y, width, height, fill) {
  #   grid.rect(x, y, width, height, gp = gpar(col = "grey70", lwd = 0.5, fill = NA))
  # }
  layer_fun = function(j, i, x, y, width, height, fill) {
    # Draw cell borders without overwriting fill
    grid.rect(
      x = x,
      y = y,
      width = width,
      height = height,
      gp = gpar(col = "grey70", lwd = 0.4, fill = NA)
    )
  }
)
draw(H)

Step 9: UMAP on signifcant genes

# group information
annot_col <- meta %>%
  filter(svamp_ID %in% rownames(X$species$TR)) %>%
  # select(-`Age (years)`) %>%
  select(
    1:12,
    metab_louvain,
    microb_louvain,
    fun_louvain,
    CST,
    `Clinical_score_(0-5)`
  ) %>%
  mutate(across(2:8, ~ factor(.x))) %>%
  mutate(
    txt_clin = ifelse(
      grepl("0", .$`Clinical_score_(0-5)`),
      NA,
      as.character(.$`Clinical_score_(0-5)`)
    )
  ) %>%
  mutate(txt_id = ifelse(RVVC == "RVVC", .$svamp_ID, NA))

#### UMAP analysis ####
set.seed(1)
umap_dfs <- map(
  X$strain,
  ~ uwot::umap(
    .x,
    n_neighbors = 10,
    init = "spectral",
    scale = T
  )
) %>%
  map(
    .,
    ~ tibble(
      "UMAP 1" = .x[, 1],
      "UMAP 2" = .x[, 2],
      "ID" = rownames(.x)
    )
  ) %>%
  map(., ~ left_join(., annot_col, by = c("ID" = "svamp_ID")))

#### UMAP plotting by groups ####

# save plot to file
#ggsave(paste0("./Figures/09/", "Bulk_UMAP.png"), p)

UMAP_fun <- function(umap_df, group_var, txt_var, title = "") {
  p <- ggplot(
    umap_df,
    aes(
      x = `UMAP 1`,
      y = `UMAP 2`,
      fill = .data[[group_var]],
      shape = RVVC_pos,
      color = .data[[group_var]]
    )
  ) +
    #geom_jitter( shape=21, size=3, color="white", width=.5, height=.5) +  # Tassos used jitter
    geom_point(size = 3, alpha = 1) +
    scale_colour_manual(
      values = pal[[group_var]],
      name = "",
      aesthetics = c("colour", "fill")
    ) +
    scale_shape_manual(values = c(4, 21, 17, 15), name = "", ) + #specify individual shapes+
    geom_text(
      data = umap_df,
      aes(x = `UMAP 1`, y = `UMAP 2`, label = .data[[txt_var]]),
      size = 3,
      hjust = 0,
      nudge_x = 0.07,
      color = "gray51"
    ) +
    ggtitle(title) +
    theme_bw(base_size = 14) +
    theme(
      line = element_blank(),
      axis.ticks = element_line(color = "black"),
      aspect.ratio = 1,
      axis.title = element_blank()
    )
  return(p)
}
Code
meta_clus <- colnames(annot_col)

# quartz(width = 14, height = 5)
p <- imap(umap_dfs, ~ UMAP_fun(.x, "RVVC_pos", "txt_clin", title = .y))
patchwork::wrap_plots(p, nrow = 1) + patchwork::plot_layout(guides = "collect")

Code
p <- imap(umap_dfs, ~ UMAP_fun(.x, "metab_louvain", "txt_clin", title = .y))
patchwork::wrap_plots(p, nrow = 1) + patchwork::plot_layout(guides = "collect")

Code
p <- imap(umap_dfs, ~ UMAP_fun(.x, "microb_louvain", "txt_clin", title = .y))
patchwork::wrap_plots(p, nrow = 1) + patchwork::plot_layout(guides = "collect")

Code
p <- imap(umap_dfs, ~ UMAP_fun(.x, "fun_louvain", "txt_clin", title = .y))
patchwork::wrap_plots(p, nrow = 1) + patchwork::plot_layout(guides = "collect")

Code
p <- imap(
  umap_dfs,
  ~ UMAP_fun(.x, "Clinical_score_(0-5)", "txt_clin", title = .y)
)
patchwork::wrap_plots(p, nrow = 1) + patchwork::plot_layout(guides = "collect")