Metabolomics QC and Normalization

Published

20 Jan 2026

Load Libraries

Code
##################
# LOAD LIBRARIES #
##################
library(tidyverse)
library(readxl)

# BiocManager::install("limma")
library(limma)
library(mixOmics)
library(edgeR)

library(cowplot)
library(RColorBrewer)
library(scales)
select <- dplyr::select
map <- purrr::map

# setwd("/Users/vilkal/work/Brolidens_work/Projects/Candida-omics/src")
#################
# COLOUR PALLET #
#################
pal <- c("#ffa998", "#e05e85", "#902267") # #e05e85, #f588af

This is a good ressource to read about PLS-DA https://divingintogeneticsandgenomics.com/post/partial-least-square-regression-for-marker-gene-identification-in-scrnaseq-data/

Overview:

Feature Selection Workflow for Multi-Omics Integration

  1. Clean up data
    Compile the data into appropriate format.

  2. Remove QC samples
    Filter QC samples.

  3. Normalize
    Use quntile normalization from limma package.

  4. Plot unsupervised UMAPs
    Plot all metabolomics samples by UMAP across various meta data groups.

  5. Plot shankey plot
    Plot the relationship between samples for different meta data groups.

Code
pal <- list(
  subCST = c(
    "#a5c97b",
    "#cc93cf",
    "#de9548",
    "#63d3b4",
    "#9e8dec",
    "#902267",
    '#FB5273',
    "#91c5de"
  ),
  RVVC = c("#EAEAEA", "#fd814e"),
  BV = c("#EAEAEA", "#fd814e"),
  NAC = c("#EAEAEA", "#fd814e"),
  ST = c("#EAEAEA", "#fd814e"),
  missing_RVVC = c("#EAEAEA", "#fd814e"),
  Clin = c("#a5c97b", "#cc93cf", "#de9548", "#63d3b4", "#9e8dec", "#902267"),
  Clin_gr = c("#ffa998", '#FB5273', "#902267"),
  RVVC_AS = c("#ffa998", '#FB5273', "#902267"),
  RVVC_pos = c(
    "#A6CEE3",
    "#1F78B4",
    "#EAEAEA",
    "gray",
    "#e490c4ff",
    "#c12d88ff"
  ), # "#FB9A99", "#E31A1C", "#B2DF8A", "#609c5bff", "#FDBF6F", "#FF7F00",, "#CAB2D6", "#6a3d9a" "#e490c4ff", "#c12d88ff"
  metab_louvain = c(
    "#a5c97b",
    "#cc93cf",
    "#de9548",
    "#63d3b4",
    "#9e8dec",
    "#902267",
    '#FB5273'
  ),
  microb_louvain = c(
    "#de9548",
    "#cc93cf",
    "#a5c97b",
    "#9e8dec",
    "#902267",
    "#63d3b4"
  ),
  integ_louvain = c(
    "#a5c97b",
    "#cc93cf",
    "#de9548",
    "#63d3b4",
    "#9e8dec",
    "#902267",
    '#FB5273',
    "#984EA3"
    #"#B3CDE3"
  ),
  "Wet_smear:_clue_cells_(y/n)" = c("#2266ac", "#91c5de"),
  hyphae = c("#f26386", "#fd814e", "#a4d984", "#fbbc52"),
  pos = c("#91c5de", "#2266ac"),
  "Clinical_score_(0-5)" = brewer.pal(6, name = "Reds")
)
Code
#############
# LOAD DATA #
#############
meta_path <- "/Users/vilkal/Downloads/Metadata_svamp.xlsx"
meta_path <- "/Users/vilkal/work/Brolidens_work/Projects/Candida-omics/data/Metadata_svamp_FINAL.xlsx"
CST_path <- "/Users/vilkal/work/Brolidens_work/Projects/Candida-omics/Tidy_data/Metagenomics/Read_count_info_vaginalswab.csv"
metab_path <- "/Users/vilkal/work/Brolidens_work/Projects/Candida-omics/src/Paulo/results/microb_metab_clustering.csv"
meta_data <- read_xlsx(meta_path, sheet = "Metadata")
meta_groups <- read_csv("../Results/meta_int_groups.csv")

metab_clus <- read.csv(metab_path) %>% rename(svamp_ID = 1) %>% as_tibble()
CST_info <- read_csv(CST_path) %>% dplyr::select(svamp_ID, CST, subCST)

matrix_data <- read_csv("../Tidy_data/Metabolomics/Metabolom_matrix.csv")
metadata <- read_csv("../Tidy_data/Metabolomics/metabolomics_metadata.csv")
annotation_table <- read_csv(
  "../Tidy_data/Metabolomics/metabolomics_annotation.csv"
)

Step 1. Preprocessing

Cleaning up data

The metabolomic data was provided in a unformatted table, so compiling the data into appropriate format is required. Differences in sample number, metabolite nomenclature, etc.

Code
#############
# LOAD DATA #
#############
#---  Load sample info ----
smpl_IDs <- read_xlsx("../data/svamp_ID_list.xlsx") %>%
  select(svamp_ID, barcode_metabolomics)

#---  Load raw file from metabolomics sequencing facility ----
mbol_path <- "../data/Metabolomics/UTF-8SMC_P0800_GCMS_Data.xlsx"
RAW <- read_xlsx(mbol_path)


##############
# CLEAN DATA #
##############
# this code is based on Paulos analysis of the metabolomics data
# View first few column names and first column values
head(colnames(RAW), 30)
head(RAW[[1]], 30)

#--- 1. Identify structure of the table ----
# Find where annotation columns end (contains "order" keyword)
col_boundary <- which(str_detect(
  colnames(RAW),
  regex("order", ignore_case = TRUE)
))
# Find where sample data begins (first row that starts with "1" in first column)
row_boundary <- which(RAW[[1]] == "1")[1] - 1

#--- 2. Split into three logical components ----
# Sample metadata (left block, below header)
metadata <- RAW %>%
  slice((row_boundary + 1):n()) %>%
  select(all_of(1:col_boundary))

# Compound annotations (top block, right side)
annotations_raw <- RAW %>%
  slice(1:row_boundary) %>%
  select(1, all_of((col_boundary + 1):ncol(RAW)))
#select(1, all_of((col_boundary + 1):ncol(RAW)))

# Measurement matrix (sample × metabolite intensities)
matrix_data <- RAW %>%
  slice((row_boundary + 1):n()) %>%
  select(
    `Compound Name`,
    "barcode_metabolomics" = `customer sample Identifier`,
    (col_boundary + 1):ncol(RAW)
  ) %>%
  left_join(., smpl_IDs, by = "barcode_metabolomics") %>%
  relocate(svamp_ID, .before = "barcode_metabolomics") %>%
  mutate(
    svamp_ID = ifelse(
      grepl("^QC\\d", .$`Compound Name`),
      .$`Compound Name`,
      .$svamp_ID
    )
  ) %>%
  select(-`Compound Name`, -barcode_metabolomics) %>%
  pivot_longer(cols = -svamp_ID, names_to = "Compund") %>%
  pivot_wider(names_from = svamp_ID) %>%
  select(Compund, starts_with("QC"), sort(colnames(.)))


#--- 3. Build tidy annotation table ----
# Extract selected annotation rows and pivot them into a tidy format
annotation_table <- annotations_raw %>%
  #rename(row_label=`GCMS-NA@NA`) %>%
  slice(., c(2:6)) %>% # rows with HMDB, KEGG, PubChem, Mz, Rt
  pivot_longer(-1, names_to = "names", values_to = "value") %>%
  pivot_wider(names_from = "Compound Name", values_from = value) %>%
  # Create uniqe compound IDs by using M/z and Rt
  mutate(
    Compound_ID = paste0(
      "GCMS-",
      round(as.numeric(`M/z (unique)`)),
      "@",
      round(as.numeric(`RI (resolved)`))
    )
  )

#--- 4. Remove quality control compunds ----
annotation_table <- annotation_table %>%
  filter(!grepl("Standard", .$names))

# Results:
# metadata         → sample metadata (e.g. sample ID, run order)
# matrix_data      → numeric matrix (samples × compounds)
# annotation_table → clean annotation metadata per compound

#############
# SAVE DATA #
#############
write_csv(matrix_data, "../Tidy_data/Metabolomics/Metabolom_matrix.csv")
write_csv(metadata, "../Tidy_data/Metabolomics/metabolomics_metadata.csv")
write_csv(
  annotation_table,
  "../Tidy_data/Metabolomics/metabolomics_annotation.csv"
)
metab_clus <- metab_clus %>%
  select(svamp_ID, ends_with("_louvain")) %>%
  mutate_at(., 2:ncol(.), factor)

meta_data <- meta_data %>%
  select(
    svamp_ID,
    38:42,
    22,
    24:25,
    54,
    starts_with("Clinical score")
  )

meta <- meta_groups %>%
  #mutate(svamp_ID = str_replace(svamp_ID, "S75_Bl", "S75_BL")) %>%
  #dplyr::rename(sample = "svamp_ID") %>%
  left_join(CST_info, by = "svamp_ID") %>%
  left_join(metab_clus, by = "svamp_ID") %>%
  #left_join(trx_mapping, by = "sample") %>%
  mutate(
    Lacto = ifelse(
      CST == "I",
      "Lacto_c",
      ifelse(CST == "III", "Lacto_i", "other")
    )
  ) %>%

  left_join(meta_data, by = "svamp_ID") %>%
  rename_with(~ gsub(" ", "_", .x)) %>%
  mutate("Clinical_score_(0-5)" = as.character(`Clinical_score_(0-5)`))

Step 2: Remove QC samples

Filter samples

###############
# FILTER DATA #
###############
data <- matrix_data %>%
  # remove reference compunds
  filter(!grepl("Standard", .$Compund)) %>%
  # remove controll samples
  select(-starts_with("QC"))

# transforme to matrix
matrix <- data %>%
  column_to_rownames(var = "Compund") %>%
  as.matrix(.)

Step 3: Normalize data

# Normalize data
##################
# NORMALIZE DATA #
##################
# samples should be columns and features rows
norm <- normalizeQuantiles(log1p(matrix))

# Transpose for mixOmic analysis
matrix.t <- matrix %>%
  t()

matrix.t[1:5, 1:4]
    Lactic acid Glycolic acid Alanine alpha-Hydroxybutyric acid
S01     78.1233       28.3764 37.4065                    0.1534
S02     79.0192       17.6899 55.2835                    2.4247
S03     72.6377       21.7367 31.1321                    0.0474
S04     52.0407       33.4765 27.9321                    0.6472
S05     53.3506       40.8274 29.9669                    0.2101

NB! the majority(?) of mixOmics functions expect a format of samples as rows and features as columns, thus the transformation is VERY important!

########################
# SAVE NORMALIZED DATA #
########################
norm %>%
  as_tibble(., rownames = "svamp_ID") %>%
  write_csv(., "../Results/MixOmic/Metabolom_Normalized.csv")
# norm <- read_csv("../Results/MixOmic/Metabolom_Normalized.csv")

Step 4: UMAP

#### UMAP analysis ####
set.seed(1)
umap <- uwot::umap(
  t(norm),
  n_neighbors = 15,
  init = "spectral",
  scale = T,
  seed = 1234
)

# group information
annot_col <- meta %>%
  filter(svamp_ID %in% colnames(norm)) %>%
  # select(-`Age (years)`) %>%
  mutate(across(2:7, ~ factor(.x)))

umap_df <- tibble(
  "UMAP 1" = umap[, 1],
  "UMAP 2" = umap[, 2],
  "ID" = rownames(umap)
) %>%
  left_join(., annot_col, by = c("ID" = "svamp_ID"))

#### UMAP plotting by groups ####
txt_df <- umap_df %>%
  select(1:21, `Clinical_score_(0-5)`) %>%
  mutate(
    txt_clin = ifelse(
      grepl("0", umap_df$`Clinical_score_(0-5)`),
      NA,
      as.character(.$`Clinical_score_(0-5)`)
    )
  ) %>%
  mutate(txt_id = ifelse(RVVC == "RVVC", .$ID, NA))

# save plot to file
#ggsave(paste0("./Figures/09/", "Bulk_UMAP.png"), p)

UMAP_fun <- function(group_var, txt_var, shape_gr = NULL, shape = NULL) {
  p <- ggplot(
    umap_df,
    aes(
      x = `UMAP 1`,
      y = `UMAP 2`,
      fill = .data[[group_var]],
      color = .data[[group_var]],
      #shape = .data[[shape_gr]]
    )
  ) +
    #geom_jitter( shape=21, size=3, color="white", width=.5, height=.5) +  # Tassos used jitter
    geom_point(
      size = 3,
      alpha = 1,
      aes(shape = if (!is.null(shape_gr)) .data[[shape_gr]] else NULL)
    ) +
    scale_colour_manual(
      values = pal[[group_var]],
      name = "",
      aesthetics = c("colour", "fill")
    ) +
    {
      if (!is.null(shape_gr) && !is.null(shape)) {
        scale_shape_manual(values = shape, name = "")
      }
    } +
    # scale_shape_manual(values = c(21, 5, 4), name = "" ) + #specify individual shapes+
    geom_text(
      data = txt_df,
      aes(x = `UMAP 1`, y = `UMAP 2`, label = .data[[txt_var]]),
      size = 3,
      hjust = 0,
      nudge_x = 0.07,
      color = "gray51"
    ) +
    theme_bw(base_size = 14) +
    theme(
      line = element_blank(),
      axis.ticks = element_line(color = "black"),
      aspect.ratio = 1,
      axis.title = element_blank()
    )
  return(p)
}
# shapes: c( "x", "dimond", "circle")
s <- c(4, 5, 21)

UMAP_fun("RVVC_AS", "txt_id", shape = NULL)
Warning: Removed 46 rows containing missing values or values outside the scale range
(`geom_text()`).

UMAP_fun("RVVC_AS", "txt_clin", shape_gr = "RVVC_AS", shape = s)
Warning: Removed 7 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 83 rows containing missing values or values outside the scale range
(`geom_text()`).

UMAP_fun("microb_louvain", "txt_clin")
Warning: Removed 83 rows containing missing values or values outside the scale range
(`geom_text()`).

UMAP_fun("integ_louvain", "txt_clin")
Warning: Removed 83 rows containing missing values or values outside the scale range
(`geom_text()`).

UMAP_fun("metab_louvain", "txt_clin")
Warning: Removed 83 rows containing missing values or values outside the scale range
(`geom_text()`).

Step 5: Shankey plot

library(niceRplots)

plot_meta_sankey <- function(
  vars = c("integ_louvain", "RVVC_AS"),
  palette = pal1,
  drop_na = TRUE,
  return_pdf = TRUE,
  pdf_name = "sankey"
) {
  stopifnot(length(vars) == 2)

  ## Select and clean metadata
  m <- meta %>%
    dplyr::select(dplyr::all_of(vars))

  if (drop_na) {
    m <- tidyr::drop_na(m)
  }

  ## Extract labels
  tmp <- m[, vars, drop = FALSE]
  tmp <- as.data.frame(tmp)

  ## Ensure factors
  tmp[[1]] <- factor(tmp[[1]])
  tmp[[2]] <- factor(tmp[[2]])

  ## Plot sankey
  #pdf( paste0("../results/integ_louvain_sankey.pdf"),width = 1.5*3.5,height = 1.5*3.5)
  #par(mar=c(4,4,4,4))
  #plot_sankey(df = tmp, pal = pal1)
  #dev.off()

  plot_expr <- {
    plot_sankey(df = tmp, pal = palette)
  }

  if (return_pdf) {
    ## Plot sankey
    pdf(
      paste0("../Results/sankey/", pdf_name, ".pdf"),
      width = 1.5 * 3.5,
      height = 1.5 * 3.5
    )
    par(mar = c(4, 4, 4, 4))
    plot_sankey(df = tmp, pal = palette)
    dev.off()
  }
  #plot_sankey(df = tmp, pal = palette)
}

# quartz(width = 6, height = 6, dpi = 150)
v <- c("metab_louvain", "CST")
plot_meta_sankey(v, pdf_name = "sankey_metab_CST", palette = pal[[v[1]]])

quartz_off_screen 
                2 
v <- c("microb_louvain", "CST")
plot_meta_sankey(v, pdf_name = "sankey_microb_CST", palette = pal[[v[1]]])

quartz_off_screen 
                2 
v <- c("integ_louvain", "RVVC_AS")
plot_meta_sankey(v, pdf_name = "sankey_integ_RVVC", palette = pal[[v[1]]])

quartz_off_screen 
                2 
v <- c("metab_louvain", "RVVC_pos")
plot_meta_sankey(v, pdf_name = "sankey_metab_RVVC_pos", palette = pal[[v[1]]])

quartz_off_screen 
                2 
v <- c("RVVC_pos", "metab_louvain")
plot_meta_sankey(v, pdf_name = "sankey_RVVC_pos__metab", palette = pal[[v[1]]])

quartz_off_screen 
                2 
v <- c("metab_louvain", "BV")
plot_meta_sankey(v, pdf_name = "sankey_metab_BV", palette = pal[[v[1]]])

quartz_off_screen 
                2 
#pdf( paste0("../results/integ_louvain_sankey.pdf"),width = 1.5*3.5,height = 1.5*3.5)
#par(mar=c(4,4,4,4))
#plot_sankey(df = tmp, pal = pal1)
#dev.off()