Functional enrichment

Published

7 May 2026

Overview

This report presents a structured MOFA2 analysis integrating transcriptomics, metagenomics, and metabolomics data. The goal is to identify latent factors capturing shared and view-specific sources of variation, and to generate a set of publication-ready figures that can be combined into a multi‑panel figure.

Main outcomes: - Latent factor structure across multi‑omics layers - Variance explained per factor and per view - Associations between factors and clinical / technical covariates - Feature loadings driving key factors


Analysis workflow (step‑by‑step)

  1. Load required libraries and define plotting palettes
  2. Load omics data and sample metadata
  3. Format MOFA input
  4. Select samples for inclusion
  5. Unsupervised UMAPs
  6. Configure and run the MOFA model
  7. Load trained MOFA model
  8. Perform model diagnostics and quality checks
  9. Covariates- Factor correlation
  10. Annotate latent factors using group information
  11. Feature-level figures
  12. Unified UMAP (all views)

1. Libraries and global settings

Code
library(MOFA2)
library(tidyverse)
library(jsonlite)
library(patchwork)
library(openxlsx)
library(ggrepel)
library(RColorBrewer)

# remotes::install_github("ncats/RAMP-DB")
library(RaMP)

# setwd("/Users/vilkal/work/Brolidens_work/Projects/Candida-omics/src")

select <- dplyr::select
map <- purrr::map
rename <- dplyr::rename

# default ggplot2 pallet:
gg_color_hue <- function(n) {
  hues = seq(15, 375, length = n + 1)
  hcl(h = hues, l = 65, c = 100)[1:n]
}
gg_color_hue(10)
library(paletteer)

gene_col <- c(
  "#4F6980",
  "#849DB1",
  "#A2CEAA",
  "#638B66",
  "#BFBB60",
  "#9F8F12",
  "#F47942",
  "#FBB04E",
  "#B66353",
  "#D7CE9F"
)
gene_col <- rev(c(
  "#6388B4FF",
  "#8CC2CAFF",
  "#EF6F6AFF",
  "#FFAE34FF",
  "#55AD89FF",
  "#C3BC3FFF",
  "#BB7693FF",
  "#94A323",
  "#A9B5AEFF",
  "#E76BF3"
))
gene_col <- c(
  "#B71E42FF",
  "#DE478E",
  "#BC72F0FF",
  "#795FAFFF",
  "#D53DD0",
  "#EA6312",
  "#FFAE34FF",
  "#EF6F6AFF",
  "#8CC2CAFF",
  "#6388B4FF"
)
pal2 <- list(
  gene_clus = set_names(gene_col, c(paste0("pos_", 1:6), paste0("neg_", 1:4))),
  pos = set_names(gene_col[1:6], paste0(1:6)),
  neg = set_names(gene_col[7:10], paste0(1:4))
)

Define consistent colour palettes to ensure visual coherence across all figures.

Code
select <- dplyr::select
map <- purrr::map
rename <- dplyr::rename

pal <- list(
  CST = c(
    "I" = "#148F77",
    "II" = "#6d9f06",
    "III" = "#A3E4D7",
    "IV-B" = "#E794C1",
    "IV-C" = "#b5cb8c",
    "V" = "#43BA8F"
  ),
  subCST = c(
    "#a5c97b",
    "#cc93cf",
    "#de9548",
    "#63d3b4",
    "#9e8dec",
    "#902267",
    '#FB5273',
    "#91c5de"
  ),
  RVVC = c("#EAEAEA", "#fd814e"),
  BV = c("#EAEAEA", "#fd814e"),
  NAC = c("#EAEAEA", "#fd814e"),
  ST = c("#EAEAEA", "#fd814e"),
  missing_page = c("#EAEAEA", "#fd814e"),
  Clin = c("#a5c97b", "#cc93cf", "#de9548", "#63d3b4", "#9e8dec", "#902267"),
  Clin_gr = c("#ffa998", '#FB5273', "#902267"),
  RVVC_AS = c("#ffa998", '#FB5273', "#902267"),
  RVVC_pos = c(
    "#A6CEE3",
    "#1F78B4",
    #"#EAEAEA",
    #"gray",
    "#e490c4ff",
    "#c12d88ff"
  ), # "#FB9A99", "#E31A1C", "#B2DF8A", "#609c5bff", "#FDBF6F", "#FF7F00",, "#CAB2D6", "#6a3d9a" "#e490c4ff", "#c12d88ff"
  trx_louvain_7 = c(
    "#de9548",
    "#cc93cf",
    "#a5c97b",
    "#9e8dec",
    "#902267",
    "#63d3b4",
    '#FB5273'
  ),
  metab_louvain = c(
    "#cc93cf",
    "#a5c97b",
    "#de9548",
    "#902267",
    "#9e8dec",
    "#63d3b4"
    #'#FB5273'
  ),
  microb_louvain = c(
    "#cc93cf",
    "#de9548",
    "#a5c97b",
    "#9e8dec",
    "#902267",
    "#63d3b4",
    '#FB5273'
  ),
  fun_louvain = c(
    "#a5c97b",
    "#cc93cf",
    "#de9548",
    "#63d3b4",
    "#9e8dec",
    "#902267",
    '#FB5273'
    #"#984EA3"
    #"#B3CDE3"
  ),
  integ_louvain = c(
    "#a5c97b",
    "#cc93cf",
    "#de9548",
    "#63d3b4",
    "#9e8dec",
    "#902267",
    '#FB5273',
    "#984EA3"
    #"#B3CDE3"
  ),
  "Wet_smear:_clue_cells_(y/n)" = c("#2266ac", "#91c5de"),
  hyphae = c("#f26386", "#fd814e", "#a4d984", "#fbbc52"),
  pos = c("#91c5de", "#2266ac"),
  fct1_clus = c("#33A02C", "#B2DF8A", "#FF7F00", "#FDBF6F"), #"#FDBF6F" "#FF7F00", "#CAB2D6", "#6A3D9A"
  "Clinical_score_(0-5)" = brewer.pal(6, name = "Reds")
)
# default ggplot2 pallet:
gg_color_hue <- function(n) {
  hues = seq(15, 375, length = n + 1)
  hcl(h = hues, l = 65, c = 100)[1:n]
}
gg_color_hue(4)
[1] "#F8766D" "#7CAE00" "#00BFC4" "#C77CFF"
clean_GO.fun <- function(path) {
  str_replace_all(path, "_", " ") %>%
    str_remove(., "^GOBP ") %>%
    str_to_lower(.)
}

2. Data loading

Multi‑omics matrices and sample metadata are loaded from pre‑processed RDS/CSV files. These data have already undergone quality control and normalisation upstream.

comb_meta_path <- "../Results/Combined_gr_meta_data.csv"

# meta data
meta <- read_csv(comb_meta_path) %>%
  #rename(sample = "svamp_ID") %>%
  mutate(across(2:27, ~ factor(.x)))

KEGG_path <- "../../Spatial_DMPA/resources/c2.cp.kegg_legacy.v2023.2.Hs.json"
GO_path <- "../../Spatial_DMPA/resources/c5.all.v2023.2.Hs.json"
Wiki_path <- "../resources/wikipathways-20260310-gmt-Homo_sapiens.gmt"
# TFT_path <- "../../Spatial_DMPA/resources/c3.tft.gtrd.v2023.2.Hs.symbols.gmt"

3. Load trained MOFA model

We load the model with the highest elbo score

Code
elbo_table <- read_csv("../Results/MOFA/elbo_table_noS84.csv")
knitr::kable(elbo_table, )
seed elbo
3 -177584.7
100 -177577.2
50 -177588.7
Code
best_seed <- elbo_table$seed[which.max(abs(elbo_table$elbo))]

MOFAmodel <- load_model(
  #"../Results/MOFA/MOFA_matched_seed_50.hdf5"
  # "../Results/MOFA/MOFA_allBL_seed_"
  paste0("../Results/MOFA/MOFA_allBL_noS84_seed_", best_seed, ".hdf5")
)

4. Cluster genes

# Extract factors, and join metadata
clusters <- list()
clusters$fct1_clus <- cluster_samples(MOFAmodel, k = 4, factors = 1)$cluster
clusters$fct2_clus <- cluster_samples(MOFAmodel, k = 4, factors = 2)$cluster
clusters$fct4_clus <- cluster_samples(MOFAmodel, k = 4, factors = 4)$cluster

clust <- bind_cols(sample = names(clusters[[1]]), clusters)

# meta <- meta %>%
#   dplyr::left_join(., clust, by = "sample") %>%
#   dplyr::mutate(across(all_of(names(clusters)), ~ factor(.x)))

# # add meta data to model
# samples_metadata(MOFAmodel) <- samples_metadata(MOFAmodel)[, c(
#   "sample",
#   "group"
# )] %>%
#   left_join(meta, by = c("sample"))

5. Get enrichment DB

Code
read_json.fun <- function(db) {
  db <- db %>%
    map(., ~ as_tibble(.x[4])) %>%
    bind_rows(., .id = "pathway") %>%
    mutate(genes = map(db, ~ pluck(.x, "geneSymbols")))
  return(db)
}

read_gmt.fun <- function(file) {
  read_lines("../resources/wikipathways-20260310-gmt-Homo_sapiens.gmt") %>%
    strsplit("\t") %>%
    map(
      ~ tibble(
        pathway = .x[1],
        description = .x[2],
        feature = .x[-c(1, 2)]
      )
    ) %>%
    bind_rows() %>%
    group_by(pathway) %>%
    summarise(
      genes = list(feature),
      .groups = "drop"
    ) %>%
    separate(
      .,
      pathway,
      into = c("pathway", "source", "exactSource", "species"),
      sep = "%",
      remove = T
    )
}

GO_database <- read_json.fun(fromJSON(GO_path))
# KEGG_database <- read_json.fun(fromJSON(KEGG_path))
# Wiki_database <- read_gmt.fun(Wiki_path)

In WikiPathways (and therefore RaMP):

A gene can participate multiple times in a pathway Each occurrence may correspond to:

a different node a different interaction a different chromosomal region a different sub-pathway context

RaMP flattens these representations into a long table, which preserves biological structure rather than forcing uniqueness. So you’re not seeing duplicates because of a mistake — you’re seeing duplicates because the same gene is referenced multiple times within the same pathway model.

make_binary_matrix <- function(database) {
  databases$wiki %>%
    select(pathway, exactSource, genes) %>%
    unnest(genes) %>%
    distinct(pathway, genes) %>%
    mutate(value = 1) %>%
    pivot_wider(
      names_from = genes,
      values_from = value,
      values_fill = 0
    ) %>%
    select(pathway, genes)
  column_to_rownames("pathway") %>%
    as.matrix()
}

run_single_enrichment <- function(
  MOFAmodel,
  binary_matrix,
  view,
  factors,
  sign,
  statistical.test = "parametric"
) {
  run_enrichment(
    MOFAmodel,
    view = view,
    factors = factors,
    feature.sets = binary_matrix,
    sign = sign,
    statistical.test = statistical.test
  )
}

run_enrichment_grid <- function(
  MOFAmodel,
  databases,
  views,
  signs,
  factors = 1:3,
  statistical.test = "parametric"
) {
  # Precompute binary matrices
  binary_matrices <- purrr::map(databases, make_binary_matrix)

  # Parameter grid
  param_grid <- expand.grid(
    db = names(binary_matrices),
    view = views,
    sign = signs,
    stringsAsFactors = FALSE
  )

  # Run all combinations
  results <- purrr::pmap(
    param_grid,
    function(db, view, sign) {
      run_single_enrichment(
        MOFAmodel = MOFAmodel,
        binary_matrix = binary_matrices[[db]],
        view = view,
        factors = factors,
        sign = sign,
        statistical.test = statistical.test
      )
    }
  )

  # Name results clearly
  names(results) <- paste(
    param_grid$db,
    param_grid$view,
    param_grid$sign,
    sep = "__"
  )

  return(results)
}

databases <- list(
  KEGG = KEGG_database,
  GO = GO_database,
  wiki = Wiki_database
)

signs <- c("negative", "positive", "all")
views <- c("Transcriptomics")

enrichment_results <- run_enrichment_grid(
  MOFAmodel = MOFAmodel,
  databases = databases,
  views = views,
  signs = signs,
  factors = 1:3
)
# initializes the RaMP database object, downloading and caching the latest SQLite database
# if no version already exists in local cache.
rampDB <- RaMP()
Loading RaMP-DB version 3.0.7 from cache.
# note that you can use the following method to check database versions hosted in your
# computer's local cache and databases that are available to download in our remote repository.
RaMP::listAvailableRaMPDbVersions()
[1] "Locally available versions of RaMP SQLite DB, currently on your computer:"
[1] "3.0.7"
[1] "Available remote RaMP SQLite DB versions for download:"
[1] "3.0.7" "3.0.6" "2.5.4" "2.5.0" "2.4.3" "2.4.2" "2.4.0" "2.3.2" "2.3.1"
[1] "The following RaMP Database versions are available for download:"
[1] "3.0.6" "2.5.4" "2.5.0" "2.4.3" "2.4.2" "2.4.0" "2.3.2" "2.3.1"
[1] "Use the command db <- RaMP(<new_version_number>) to download the specified version."
# RaMP::getPrefixesFromAnalytes(db = rampDB, analyteType = 'metabolite')
RaMP::getPrefixesFromAnalytes(db = rampDB, analyteType = 'gene')
     analyteType
1 Genes/Proteins
                                                                                idTypes
1 hmdb, gene_symbol, uniprot, entrez, ensembl, EN, ncbiprotein, wikidata, brenda, chebi
# ============================================================
# STEP 1: Get all pathways available in RaMP
# (this function is explicitly shown / implied in the vignette)
# ============================================================

pathways <- RaMP::getPathwayNameList()
Loading RaMP-DB version 3.0.7 from cache.
# expected columns typically include:
# pathway_name, pathway_source, pathway_id (depending on version)

# ============================================================
# STEP 2: For each pathway, retrieve its analytes
# IMPORTANT:
# - RaMP has NO global "pathway x analyte" table
# - The vignette shows that getAnalyteFromPathway() must be
#   called per pathway
# ============================================================

# RaMP_long <- getAnalyteFromPathway(pathways, analyteType = "metabolite")
RaMP_long <- getAnalyteFromPathway(pathways, analyteType = "gene", )
Loading RaMP-DB version 3.0.7 from cache.
[1] "fired!"
[1] "gene return..."
[1] "Timing .."
   user  system elapsed 
  2.704   1.673   6.114 
RaMP_long <- RaMP_long %>% as_tibble()

table(RaMP_long$pathwayType)

    kegg    pfocr reactome     wiki 
    8466     2106     2388     2498 
table(RaMP_long$geneOrCompound)

 gene 
15458 
RaMP_long <- RaMP_long %>%
  select(
    pathway = "pathwayName",
    exactSource = "pathwayId",
    db = "pathwayType",
    genes = "analyteName"
  )

# ============================================================
# STEP 3: Add GO db to RaMP_long
# ============================================================

RaMP_long <- GO_database %>%
  unnest(genes) %>%
  mutate(db = "GO") %>%
  filter(grepl("^GOBP_", pathway)) %>%
  bind_rows(RaMP_long)

# databases:
table(RaMP_long$db)

      GO     kegg    pfocr reactome     wiki 
  627225     8466     2106     2388     2498 
# number of pathways
table(unique(RaMP_long[, c("pathway", "db")])$db)

      GO     kegg    pfocr reactome     wiki 
    7647      363       66       17       36 
# ============================================================
# STEP 4: Split into individual DB's
# ============================================================

RaMP_db <- RaMP_long %>%
  nest(.by = "db") %>%
  mutate(data = set_names(data, .data[["db"]]))

6. Run MOFA GSEA

Enrichment step (run_enrichment)

  1. Takes all gene weights
  2. Subsets genes by each pathway
  3. Performs a statistical test (e.g. parametric test) on:

weights inside the pathway vs weights outside the pathway

These are the steps for doing Gene Set Enrichment Analysis (GSEA) with MOFA:

(1) **Define your gene set matrix:** this can be specified as a binary matrix where rows are gene sets and columns are genes. A value of 1 indicates that gene j belongs to pathway i. A value of 0 indicates elsewise.
(2) **Select a gene set statistic:** the statistic used to quantify the scores at the pathway level. Must be one of the following: mean.diff (difference in the average weight between foreground and background genes) or rank.sum (difference in the sum of ranks between foreground and background genes).
(3) **Select a statistical test:** the statistical test used to compute the significance of the gene set statistics under a competitive null hypothesis. Must be one of the following: parametric (a simple and very liberal parametric t-test), cor.adj.parametric (parametric t-test adjusted by the correlation between features), permutation (unparametric, the null distribution is created by permuting the weights. This option is computationally expensive, but it preserves the correlation structure between features in the data.).

An important consideration when running GSEA is that MOFA contains positive and negative weights. There will be cases where the genes with negative weights all belong to a specific pathway but genes with positive weights belong to other pathways. If this is true, doing GSEA with all of them together could dilute the signal. Hence, we recommend the user to do GSEA separately for (+) and (-) weights, and possibly also jointly with all weights.

Code
# ============================================================
# STEP 4: Build the binary membership matrix
# rows   = pathways
# cols   = metabolites (HMDB IDs)
# values = 0 / 1
# ============================================================

make_binary_matrix <- function(database) {
  database %>%
    # RaMP_db$data$kegg %>%
    distinct(exactSource, pathway, genes) %>%
    mutate(value = 1) %>%
    pivot_wider(
      id_cols = c(exactSource, pathway),
      names_from = genes,
      values_from = value,
      values_fill = 0
    ) %>%
    select(-exactSource) %>%
    column_to_rownames("pathway") %>%
    as.matrix()
}

# Precompute binary matrices
binary_matrices <- RaMP_db %>%
  filter(db != "pfocr") %>%
  mutate(matrix = imap(data, ~ make_binary_matrix(.x)))

# ============================================================
# STEP 6: Sanity check overlap with MOFA features
# (this step is CRITICAL)
# ============================================================

imap(
  binary_matrices$matrix,
  ~ length(
    intersect(
      colnames(.x),
      features_names(MOFAmodel)[["Transcriptomics"]]
    )
  )
)
$GO
[1] 3590

$wiki
[1] 546

$reactome
[1] 415

$kegg
[1] 500
Code
# ============================================================
# STEP 7: Run MOFA enrichment (UNCHANGED from your pipeline)
# ============================================================

enrich.pos <-
  imap(
    binary_matrices$matrix,
    ~ run_enrichment(
      MOFAmodel,
      view = "Transcriptomics",
      factors = 1,
      feature.sets = .x,
      sign = "positive",
      alpha = 0.1, # FDR treshold
      statistical.test = "parametric"
    )
  )
Intersecting features names in the model and the gene set annotation results in a total of 3590 features.

Running feature set Enrichment Analysis with the following options...
View: Transcriptomics 
Number of feature sets: 2805 
Set statistic: mean.diff 
Statistical test: parametric 
Subsetting weights with positive sign
Intersecting features names in the model and the gene set annotation results in a total of 546 features.

Running feature set Enrichment Analysis with the following options...
View: Transcriptomics 
Number of feature sets: 23 
Set statistic: mean.diff 
Statistical test: parametric 
Subsetting weights with positive sign
Intersecting features names in the model and the gene set annotation results in a total of 415 features.

Running feature set Enrichment Analysis with the following options...
View: Transcriptomics 
Number of feature sets: 9 
Set statistic: mean.diff 
Statistical test: parametric 
Subsetting weights with positive sign
Intersecting features names in the model and the gene set annotation results in a total of 500 features.

Running feature set Enrichment Analysis with the following options...
View: Transcriptomics 
Number of feature sets: 57 
Set statistic: mean.diff 
Statistical test: parametric 
Subsetting weights with positive sign
Code
enrich.neg <-
  imap(
    binary_matrices$matrix,
    ~ run_enrichment(
      MOFAmodel,
      view = "Transcriptomics",
      factors = c(1, 2),
      feature.sets = .x,
      sign = "negative",
      alpha = 0.1, # FDR treshold
      statistical.test = "parametric"
    )
  )
Intersecting features names in the model and the gene set annotation results in a total of 3590 features.

Running feature set Enrichment Analysis with the following options...
View: Transcriptomics 
Number of feature sets: 2805 
Set statistic: mean.diff 
Statistical test: parametric 
Subsetting weights with negative sign
Intersecting features names in the model and the gene set annotation results in a total of 546 features.

Running feature set Enrichment Analysis with the following options...
View: Transcriptomics 
Number of feature sets: 23 
Set statistic: mean.diff 
Statistical test: parametric 
Subsetting weights with negative sign
Intersecting features names in the model and the gene set annotation results in a total of 415 features.

Running feature set Enrichment Analysis with the following options...
View: Transcriptomics 
Number of feature sets: 9 
Set statistic: mean.diff 
Statistical test: parametric 
Subsetting weights with negative sign
Intersecting features names in the model and the gene set annotation results in a total of 500 features.

Running feature set Enrichment Analysis with the following options...
View: Transcriptomics 
Number of feature sets: 57 
Set statistic: mean.diff 
Statistical test: parametric 
Subsetting weights with negative sign
Code
enrich.both <-
  imap(
    binary_matrices$matrix,
    ~ run_enrichment(
      MOFAmodel,
      view = "Transcriptomics",
      factors = 1,
      feature.sets = .x,
      sign = "all",
      alpha = 0.1, # FDR treshold
      statistical.test = "parametric"
    )
  )
Intersecting features names in the model and the gene set annotation results in a total of 3590 features.

Running feature set Enrichment Analysis with the following options...
View: Transcriptomics 
Number of feature sets: 2805 
Set statistic: mean.diff 
Statistical test: parametric 
Intersecting features names in the model and the gene set annotation results in a total of 546 features.

Running feature set Enrichment Analysis with the following options...
View: Transcriptomics 
Number of feature sets: 23 
Set statistic: mean.diff 
Statistical test: parametric 
Intersecting features names in the model and the gene set annotation results in a total of 415 features.

Running feature set Enrichment Analysis with the following options...
View: Transcriptomics 
Number of feature sets: 9 
Set statistic: mean.diff 
Statistical test: parametric 
Intersecting features names in the model and the gene set annotation results in a total of 500 features.

Running feature set Enrichment Analysis with the following options...
View: Transcriptomics 
Number of feature sets: 57 
Set statistic: mean.diff 
Statistical test: parametric 
# add gene information to suppl tables
get_genes.fun <- function(enrich_object) {
  phtw_genes <- enrich_object[["feature.sets"]] %>%
    # enrichment_list$neg$GO[["feature.sets"]] %>%
    as_tibble(rownames = "pathway") %>%
    pivot_longer(-pathway, names_to = "genes") %>%
    filter(value == 1) %>%
    summarise(
      n_genes = n(),
      gene_list = list(genes),
      .by = c("pathway")
    )
}

# assemble suppl tables
enrichment_to_tibble <- function(
  enrichment_obj,
  keep_slots = c("pval", "pval.adj", "set.statistics")
) {
  temp <- enrichment_obj %>%
    imap(
      .,
      ~ .x %>% # enrichment_list$neg$GO %>%
        enframe(.) %>%
        filter(name %in% keep_slots) %>%
        mutate(data = map(value, ~ as_tibble(.x, rownames = "pathway"))) %>%
        select(name, data) %>%
        mutate(
          data = map2(data, name, function(df, nm) {
            rename_with(
              df,
              function(col) paste0(nm, "_", col),
              contains("Factor")
            )
          })
        ) %>%
        pull(data) %>%
        reduce(full_join, by = "pathway") %>%

        arrange(across(starts_with("pval_"), ~.x)) %>%
        mutate(source = .y)
    )
  if ("pval" %in% keep_slots) {
    temp <- imap(
      temp,
      ~ left_join(.x, get_genes.fun(enrichment_obj[[.y]]), by = "pathway")
    )
  }
  return(temp)
}
Code
# ============================================================
# Save enrichment RDS objects
# ============================================================
enrichment_list <- list(pos = enrich.pos, neg = enrich.neg, both = enrich.both)
imap(
  enrichment_list,
  ~ saveRDS(.x, paste0("../Results/Enrichment/", "GSEA_", .y, ".RDS"))
)
# enrichment_list <- map(c("pos", "neg", "both") %>% set_names(),
#   ~ readRDS(paste0("../Results/Enrichment/", "GSEA_", .x, ".RDS")))

# ============================================================
# Save Pathway level information
# ============================================================
enrichment_list <- list(pos = enrich.pos, neg = enrich.neg, both = enrich.both)

result <- map(enrichment_list, enrichment_to_tibble)

# Suppl. Tables per comparison
imap(
  result,
  ~ write.xlsx(
    .x,
    keepNA = TRUE,
    na.string = "",
    overwrite = TRUE,
    file = paste0("../Results/Tables/", "GSEA_", .y, ".xlsx")
  )
)
# c("pos", "neg", "both") %>%
# map(., ~read.xlsx(paste0("../Results/Tables/", "GSEA_", .y, ".xlsx")))

# ============================================================
# Save feature level enrichment weights
# ============================================================
result_ <- map(
  enrichment_list,
  ~ enrichment_to_tibble(.x, keep_slots = c("feature.statistics"))
)

imap(
  result_,
  ~ write.xlsx(
    .x,
    keepNA = TRUE,
    na.string = "",
    overwrite = TRUE,
    file = paste0("../Results/Tables/", "GSEA_genes_", .y, ".xlsx")
  )
)
# c("pos_genes", "neg_genes", "both_genes") %>%
# map(., ~read.xlsx(paste0("../Results/Tables/", "GSEA_genes_", .y, ".xlsx")))

The enrichment analysis returns a list of 5 elements:

**feature.sets:** the feature set matrix filtered by the genes that overlap with the MOFA model.
**pval:** the nominal p-values.
**pval.adj:** the FDR-adjusted p-values.
**feature.statistics:** the feature statistics (i.e. weights standardized by factor).
**set.statistics:** matrices with the gene set statistics.
**sigPathways:** list with significant pathways per factor at a specified FDR threshold
  • Important:
    Pathway-level metrics (set.statistics, pval, pval.adj) are pathway-specific. Gene-level values (feature.sets, feature.statistics) remain the same regardless of pathway membership (calculated before enrichment).

  • set.statistics: Possible values are set.statistic = c(“mean.diff”, “rank.sum”). For the default “mean.diff”, the reported values can be interpreted as “On average, how strong are the negative/positive scaled MOFA weights of the features belonging to each pathway for the spesific factor”

plot_enrichment_heatmap(enrich.neg$GO)

plot_enrichment(enrich.neg$GO, factor = 1, max.pathways = 15)

MOFA has a function (plot_enrichment_detailed) designed to evaluate what genes are driving the enrichment result for each individual pathway It is advised to not rely only on the p-values and to visualise which genes are driving the enrichment within each pathways. Because ther are problematic cases where a single gene is driving the enrichment results in very small pathways.

The resulting plot shows the genes ploted aginst scaled Weight on the x-axis. In run_enrichment(), the loadings are STANDARDIZED (z-scored) per factor before the GSEA is performed. The satndardized values is returned as “feature.statistic” by run_enrichment().

plot_enrichment_detailed(
  enrichment.results = enrichment_list$neg$GO,
  factor = 1,
  max.pathways = 3
)

plot_enrichment_detailed(
  enrichment.results = enrichment_list$pos$GO,
  factor = 1,
  max.pathways = 10
)

png(height = 25, filename = "./test_GSEA.png")
p <- plot_enrichment_detailed(
  enrichment.results = enrichment_list$both$GO,
  factor = 1,
  max.pathways = 34,
)
dev.off()
quartz_off_screen 
                2 

7. Heatmap of GSEA results

Code
# 1. Extracts GO enrichment results from the input object.
# 2. Selects pathways based on a score threshold (positive, negative, or both).
# 3. Builds a gene–pathway matrix and removes empty genes (columns).
# 4. Filters genes based on a minimum gene score threshold (weights).
# 5. Automatically adjusts plot height based on the number of pathways.
# 6. Creates row annotations (pathway scores and adjusted p-values).
# 7. Clusters genes (columns) and assigns them to groups.
# 8. Creates column annotations (gene weights and cluster groups).
# 9. Generates a heatmap with clustering and annotations.
# 10. Draws and saves the heatmap as a PNG file.
# 11. Returns the heatmap object, matrix, clustering results, and groups.

library(ComplexHeatmap)
library(circlize)
library(paletteer)

# Color function
col_fun <- colorRamp2(
  c(0, 1),
  c("#ecc4d0", "#e05e85")
)

# enrich_obj = enrichment_list$neg
# score_threshold = 3
# file = "../Results/GSEA_heatmap_neg.png"
# width = 10
# height = 5
# k_clusters = 4

make_gsea_heatmap <- function(
  enrich_obj,
  gene_col = "pos",
  cell_width = 0.1,
  score_threshold = 3,
  gene_treshold = 0,
  file,
  width = 10,
  height = 6,
  k_clusters = 4
) {
  temp <- enrich_obj$GO

  # Select pathways
  if (length(score_threshold) == 1) {
    sets <- rownames(temp$set.statistics)[
      abs(temp$set.statistics[, 1]) > score_threshold
    ]
  } else {
    set <- tibble(
      name = rownames(temp$set.statistics),
      value = as.vector(temp$set.statistics[, 1])
    )
    sets_pos <- set %>% filter(value > 0 & value > score_threshold[1])
    sets_neg <- set %>% filter(value < 0 & abs(value) > score_threshold[2])
    sets <- c(sets_pos$name, sets_neg$name)
  }

  # Matrix prep
  mat <- temp$feature.sets[sets, , drop = FALSE]
  mat <- mat[, colSums(mat) != 0] # remove cols sum to zero
  #rownames(mat) <- clean_GO.fun(rownames(mat))

  # filter genes
  keep_genes <- temp$feature.statistics[colnames(mat), "Factor1"] >
    gene_treshold
  mat <- mat[, keep_genes]

  if (nrow(mat) == 0 || ncol(mat) == 0) {
    warning("Matrix is empty after filtering")
    return(NULL)
  }

  # Auto height calculation
  base_height = 2 # border height
  per_pathway = 0.1 # height per row
  max_height = 35 # cap height
  height <<- min(max_height, base_height + nrow(mat) * per_pathway)

  # Annotations
  row_ha <- rowAnnotation(
    set = temp$set.statistics[sets, "Factor1"],
    p_val = temp$pval.adj[sets, "Factor1"],
    annotation_legend_param = list(set = list(direction = "vertical"))
  )

  # Extract column dendrogram
  hc <- hclust(dist(t(mat)))
  dend <- as.dendrogram(hc)
  # dend <- column_dend(ht)
  # hc <- as.hclust(dend)
  groups <- cutree(hc, k = k_clusters)

  weight_col <- colorRamp2(
    seq(from = 0, to = 1, length.out = 7),
    c("#EAEBEB", paletteer_c("ggthemes::Sunset-Sunrise Diverging", 7)[-1])
  )
  column_ha <- HeatmapAnnotation(
    weight = temp$feature.statistics[colnames(mat), "Factor1"],
    gr = as.character(groups),
    col = list(gr = pal2[[gene_col]], weight = weight_col),
    annotation_legend_param = list(weight = list(direction = "vertical"))
  )

  # Heatmap
  H <- Heatmap(
    mat,
    row_labels = clean_GO.fun(rownames(mat)),
    width = unit(ncol(mat) * cell_width, "mm"),
    name = "Present",
    col = col_fun,
    column_split = groups, # hierarchical static
    cluster_rows = TRUE,
    top_annotation = column_ha,
    right_annotation = row_ha,
    row_names_side = "right",
    show_column_names = FALSE,
    column_names_rot = 90,
    row_names_gp = gpar(fontsize = 7),
    heatmap_legend_param = list(
      at = c(0, 1),
      labels = c("No", "Yes"),
      direction = "vertical"
    )
  )

  # Draw once (needed for dendrogram extraction)
  ht <- draw(
    H,
    heatmap_legend_side = "left",
    annotation_legend_side = "left",
    merge_legend = TRUE,
    padding = unit(c(2, 5, 2, 30), "mm")
  )

  # Save plot
  png(file = file, units = "in", res = 600, width = width, height = height)
  draw(ht)
  dev.off()

  # Return everything useful
  list(
    heatmap = ht,
    matrix = mat,
    dendrogram = dend,
    hclust = hc,
    column_groups = groups
  )
}

gene_treshold:

the default of 0 makes sure to remove genes that have the oposite weight to the specified these are simply set to weight = 0 in the MOFA enrichment function, but retained in the matrix

score_threshold:

the default mean.diff represents the difference in the average weight between foreground and background genes

ht_list_neg <- make_gsea_heatmap(
  enrich.neg,
  gene_col = "neg",
  score_threshold = 3,
  gene_treshold = 0,
  cell_width = 0.1,
  file = "../Results/Enrichment/GSEA_heatmap_neg_3.png",
  width = 10,
  k_clusters = 4
)

ht_list_pos <- make_gsea_heatmap(
  enrich.pos,
  score_threshold = 9,
  gene_treshold = 0,
  file = "../Results/Enrichment/GSEA_heatmap_pos.png",
  width = 12,
  k_clusters = 6
)

ht_list_both <- make_gsea_heatmap(
  enrich.both,
  score_threshold = c(9, 3),
  gene_treshold = 0,
  file = "../Results/Enrichment/GSEA_heatmap_both.png",
  width = 12,
  k_clusters = 6
)
`use_raster` is automatically set to TRUE for a matrix with more than
2000 columns You can control `use_raster` argument by explicitly
setting TRUE/FALSE to it.

Set `ht_opt$message = FALSE` to turn off this message.
'magick' package is suggested to install to give better rasterization.

Set `ht_opt$message = FALSE` to turn off this message.

# intersect(names(ht_list_neg$column_groups), names(ht_list_pos$column_groups))
groups <- list(
  neg = ht_list_neg$column_groups,
  pos = ht_list_pos$column_groups
)

groups <- groups %>%
  imap(~ setNames(paste0(.y, "_", .x), names(.x))) %>%
  flatten_chr()

enframe(groups) %>%
  unnest(cols = c("value")) %>%
  rename(genes = "name", clus = "value") %>%
  write_csv(., "../Results/Enrichment/Genes_clust.csv")
# group_df <- read_csv("../Results/Enrichment/Genes_clust.csv")

8. Load saved enrich results

Code
# ============================================================
# Get enrichment RDS objects
# ============================================================
enrichment_list <- map(
  c("pos", "neg", "both") %>% set_names(),
  ~ readRDS(paste0("../Results/Enrichment/", "GSEA_", .x, ".RDS"))
)

# ============================================================
# Get enrichment results tables (pathway level info)
# ============================================================
results <- c("pos", "neg", "both") %>%
  set_names() %>%
  map(
    .,
    ~ read.xlsx(
      paste0("../Results/Tables/", "GSEA_", .x, ".xlsx"),
      sheet = "GO"
    )
  ) %>%
  map(., ~ as_tibble(.x))

# ============================================================
# Get enrichment results tables (gene level info)
# ============================================================
gene_enrich <- c("pos", "neg", "both") %>%
  set_names() %>%
  map(
    .,
    ~ read.xlsx(
      paste0("../Results/Tables/", "GSEA_genes_", .x, ".xlsx"),
      sheet = "GO"
    )
  )
# ============================================================
# Get gene groups (clus)
# ============================================================
group_df <- read_csv("../Results/Enrichment/Genes_clust.csv")
Rows: 1702 Columns: 2
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): genes, clus

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
groups <- set_names(group_df$clus, group_df$genes)
# ============================================================
# Get gene weights
# ============================================================

gene_df <- bind_rows(gene_enrich[1:2], .id = "sign") %>%
  as_tibble() %>%
  select(genes = "pathway", sign, scaled_weight = feature.statistics_Factor1)

# I think code above is unnessecary to save from run_enrichment() and join to other data
# instead it would be simply sufficient to: group_by(.data[[facet]]) %>% mutate(scaled_weight = weight / max(abs(weight)))
# its important that its grouped by "factor" and/or "view" to be correct
# keep for now, should replace later to simplify

weights_df <- get_weights(
  MOFAmodel,
  as.data.frame = TRUE,
  views = "Transcriptomics"
) %>%
  as_tibble() %>%
  rename(genes = "feature", weight = "value") %>%
  filter(factor == "Factor1") %>%
  #filter(factor == "Factor1") %>% pull(feature) %>% unique()

  arrange(factor, desc(abs(weight))) %>%
  left_join(., gene_df, by = "genes") %>%
  left_join(., group_df, by = "genes") %>%
  filter(scaled_weight != 0)

9. Plot raw weights per pathway

Code
# 1. Filters weights_df to keep only genes from Factor1
# 2. Extracts gene-level enrichment values from enrichment_object
# 3. Expands pathway–gene relationships from feature.sets
# 4. Keeps only rows where the gene is part of a pathway (value == 1)
# 5. Joins gene weights, enrichment scores, and pathway statistics together
# 6. Filters to only the pathways listed in phtw
# 7. Scales gene weights within each factor
# 8. Ranks genes within each pathway by absolute scaled weight
# 9. Marks the top n_label genes per pathway
# 10. Builds formatted pathway labels with gene counts and adjusted p-values
# 11. Selects top genes per pathway for labeling
# 12. Creates a violin plot of gene weights per pathway
# 13. Adds points for non-top genes
# 14. Adds jittered highlighted points and labels for top genes
# 15. Facets plot by pathway and applies theme/axis styling

p.val_round <- function(p.val) {
  case_when(
    p.val >= 0.05 ~ sprintf("%.2f", p.val),
    p.val >= 0.001 ~ sprintf("%.3f", p.val),
    TRUE ~ sprintf("%.1e", p.val)
  )
}

plot_weights_pathways <- function(
  enrichment_object,
  weights_df,
  enrich_results = results$neg,
  phtw,
  n_label = 10,
  x_axis = "weight" # or use "scaled_weight" for scaled MOFA loadings values
) {
  temp_df <- enrichment_object$feature.sets %>%
    # enrichment_list$neg$GO$feature.sets %>%
    as_tibble(rownames = "pathway") %>%
    pivot_longer(-pathway, names_to = "genes") %>%
    filter(value == 1) %>%
    mutate(Ngenes = n(), .by = "pathway") %>%
    left_join(., weights_df, by = "genes") %>%
    left_join(., enrich_results, by = "pathway") %>%
    #left_join(., dplyr::select(enrich_results, pathway, contains(c("pval","set.statistics"))), by = "pathway") %>%
    arrange(pval.adj_Factor1)

  temp_df <<- temp_df
  # phtw <- "GOBP_LONG_CHAIN_FATTY_ACID_METABOLIC_PROCESS"

  df <- temp_df %>%
    filter(pathway %in% phtw) %>%
    mutate(
      rank = rank(desc(scaled_weight)),
      .by = "pathway",
      .after = "genes"
    ) %>%
    mutate(top = rank <= n_label) %>%
    mutate(
      pathway = paste0(
        pathway,
        "\n",
        "n=",
        Ngenes,
        " (p-adj ",
        #.$pval.adj_Factor1,
        p.val_round(.$pval.adj_Factor1),
        ")"
      )
    )

  # top genes to label
  df_labels <- df %>%
    group_by(pathway) %>%
    slice_max(order_by = abs(scaled_weight), n = n_label) %>%
    ungroup()

  position = position_jitter(width = 0, height = 0.4, seed = 123)

  ggplot(df, aes(x = .data[[x_axis]], y = pathway)) +
    geom_violin(
      # bounds = c(-0.7, 0.7),
      colour = alpha("gray", alpha = 0.6),
      scale = "count",
      adjust = 0.5
    ) +
    #geom_point(data = df %>% filter(!top), color = "grey70", size = 0.5 ) +
    geom_point(
      data = df %>% filter(!top),
      aes(fill = clus, color = clus),
      shape = 21,
      show.legend = F,
      size = 0.5
    ) +

    geom_jitter(
      aes(fill = clus),
      data = df_labels,
      size = 1,
      shape = 21,
      position = position
    ) +
    geom_text_repel(
      data = df_labels,
      aes(label = genes),
      segment.size = 0.2,
      segment.color = "gray",
      segment.alpha = 0.5,
      direction = "both",
      point.size = 0.1,
      hjust = 0.2,
      vjust = 0,
      size = 1.5,
      position = position
    ) +
    scale_fill_manual(
      values = pal2$gene_clus,
      na.value = "grey70",
      aesthetics = c("colour", "fill")
    ) +
    labs(x = "Weight", y = NULL) +

    theme_bw(base_size = 11 / .pt) +
    theme(
      panel.grid.major.y = element_blank(),
      panel.grid.minor = element_blank(),
      axis.text.y = element_blank() #element_text(size = 10)
    ) +
    {
      if (grepl("scaled_weight", x_axis)) {
        xlim(c(0, 1.1))
      } else {
        xlim(c(-max(df$weight), max(df$weight) + 0.1))
      }
    } +
    facet_wrap(~pathway, ncol = 1, scales = "free")
}
# ============================================================
# Get pathways to plot
# ============================================================
phtw <- results$neg %>%
  filter(pval.adj_Factor1 <= 0.05) %>%
  pull(., "pathway") %>%
  split(., ceiling(seq_along(.) / 5))

# ============================================================
# plot neg weights
# ============================================================
plt_list <- map(
  phtw,
  ~ plot_weights_pathways(
    enrichment_list$neg$GO,
    weights_df = weights_df,
    x_axis = "weight",
    enrich_results = results$neg,
    phtw = .x
  )
)
# quartz(width = 4, height = 1 )
plt_list[1]
$`1`
Warning: Removed 13 rows containing non-finite outside the scale range
(`stat_ydensity()`).
Warning: Removed 13 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 13 rows containing missing values or values outside the scale range
(`geom_text_repel()`).

ggsave(
  filename = paste0("../Results/Enrichment/GO_MOFA_weights_neg.pdf"),
  gridExtra::marrangeGrob(
    grobs = plt_list,
    ncol = 1,
    top = NULL,
    nrow = 1,
    height = 11.69,
    width = 4
  )
)
Saving 4 x 5 in image
Warning: Removed 13 rows containing non-finite outside the scale range
(`stat_ydensity()`).
Warning: Removed 13 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 13 rows containing missing values or values outside the scale range
(`geom_text_repel()`).
Warning: Removed 9 rows containing non-finite outside the scale range
(`stat_ydensity()`).
Warning: Removed 9 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 9 rows containing missing values or values outside the scale range
(`geom_text_repel()`).
# ============================================================
# plot pos weights
# ============================================================
phtw <- results$pos %>%
  filter(set.statistics_Factor1 >= 9) %>%
  pull(., "pathway") %>%
  split(., ceiling(seq_along(.) / 5))
# phtw <- ht_list_pos$hclust$labels[hclust$order] %>%
#   split(., ceiling(seq_along(.) / 5))

plt_list <- map(
  phtw,
  ~ plot_weights_pathways(
    enrichment_list$pos$GO,
    weights_df = weights_df,
    x_axis = "weight",
    enrich_results = results$pos,
    phtw = .x
  )
)
plt_list[1]
$`1`

ggsave(
  filename = paste0("../Results/Enrichment/GO_MOFA_weights_pos.pdf"),
  gridExtra::marrangeGrob(
    grobs = plt_list,
    ncol = 1,
    top = NULL,
    nrow = 1,
    height = 11.69,
    width = 4
  )
)
Saving 4 x 5 in image
phtw <- c(
  "GOBP_LIPID_METABOLIC_PROCESS",
  "GOBP_FATTY_ACID_METABOLIC_PROCESS",
  "GOBP_KERATINOCYTE_DIFFERENTIATION",
  "GOBP_ICOSANOID_METABOLIC_PROCESS"
)

# quartz(width = 4, height = 1*length(phtw) )
plot <- plot_weights_pathways(
  enrichment_list$neg$GO,
  weights_df = weights_df,
  x_axis = "weight",
  enrich_results = results$neg,
  phtw = phtw,
  n_label = 10
)

ggsave(
  filename = paste0("../Results/Enrichment/Figure_x_neg.pdf"),
  height = 1 * length(phtw),
  width = 4
)
plot

phtw <- c(
  "GOBP_NEUTROPHIL_MIGRATION",
  "GOBP_NEUTROPHIL_CHEMOTAXIS",
  "GOBP_GRANULOCYTE_CHEMOTAXIS",
  "GOBP_DEFENSE_RESPONSE_TO_OTHER_ORGANISM",
  "GOBP_IMMUNE_RESPONSE_REGULATING_CELL_SURFACE_RECEPTOR_SIGNALING_PATHWAY",
  "GOBP_CYTOKINE_MEDIATED_SIGNALING_PATHWAY",
  "GOBP_B_CELL_MEDIATED_IMMUNITY",
  "GOBP_REGULATION_OF_RESPONSE_TO_EXTERNAL_STIMULUS"
)

plot <- plot_weights_pathways(
  enrich.pos$GO,
  weights_df = weights_df,
  x_axis = "weight",
  enrich_results = results$pos,
  phtw = phtw,
  n_label = 10
)

ggsave(
  filename = paste0("../Results/Enrichment/Figure_x_pos.pdf"),
  height = 1 * length(phtw),
  width = 4
)

plot

8. Plot top weights

Code
plot_top_weights_tidy <- function(
  weights_df,
  facet = "view", # "factor" or "view"
  nfeatures = 10,
  fact = 1,
  scale = TRUE,
  abs = TRUE,
  sign = "all"
) {
  # Remove zeros
  W <- weights_df %>% filter(weight != 0)

  W <- W %>%
    filter(grepl(paste0("^Factor", fact, "$", collapse = "|"), .$factor))

  if (scale) {
    W <- W %>%
      group_by(.data[[facet]]) %>%
      mutate(weight = weight / max(abs(weight)))
  }

  # Sign
  W <- W %>%
    mutate(sign_label = if_else(weight > 0, "+", "-"))

  if (sign == "positive") {
    W <- W %>% filter(weight > 0)
  } else if (sign == "negative") {
    W <- W %>% filter(weight < 0)
  }

  if (abs) {
    W <- W %>% mutate(weight = abs(weight))
  }

  # Select top genes per factor
  W <- W %>%
    group_by(.data[[facet]]) %>%
    slice_max(order_by = weight, n = nfeatures) %>%
    #arrange(weight) %>%
    #fct_reorder(genes, weight) %>%
    #mutate(genes_id = factor(genes)) %>%
    ungroup()

  # Factor ordering
  W <- W %>%
    arrange(factor, weight) %>%
    mutate(genes_id = factor(genes, levels = unique(genes)))
  #mutate(genes_id = factor(genes, levels = rev(unique(genes))))

  # Plot
  p <- ggplot(W, aes(x = genes_id, y = weight, color = sign_label)) + # , color = clus
    geom_point(size = 2) +
    geom_segment(
      aes(xend = genes_id, yend = 0),
      linewidth = 0.75,
      show.legend = F
    ) +
    #scale_colour_manual(values = pal2[["gene_clus"]], na.value = "gray") +
    scale_colour_manual(values = c("#6388B4FF", "#EF6F6AFF")) +
    coord_flip() +
    labs(y = "Weight", x = NULL) +
    theme_bw() +
    theme(
      axis.text.y = element_text(size = rel(1.1)),
      panel.grid.major.y = element_blank(),
      legend.position = "none"
    ) +
    facet_wrap(as.formula(paste("~", facet)), ncol = 2, scales = "free")

  if (abs) {
    p <- p +
      ylim(0, max(W$weight) + 0.1) +
      geom_text(
        aes(label = sign_label),
        y = max(W$weight) + 0.1,
        size = 4,
        show.legend = F
      )
  }

  return(p)
}
# MOFA functions:
plot_enrichment(enrich.neg$GO, factor = 1, max.pathways = 35)

plot_top_weights(
  MOFAmodel,
  fact = c(1, 2),
  view = "Transcriptomics",
  nfeatures = 10
)

plot_enrichment_detailed(
  enrichment.results = enrich.neg$GO,
  factor = 1,
  max.genes = 7,
  max.pathways = 3
)

# quartz(width = 8, height = 8)
lvl <- c("Transcriptomics", "Functional", "Metabolomics", "Metagenomics")
# my function
get_weights(
  MOFAmodel,
  as.data.frame = TRUE,
) %>%
  as_tibble() %>%
  rename(genes = "feature", weight = "value") %>%
  filter(factor == "Factor1") %>%
  mutate(view = factor(.$view, levels = lvl)) %>%
  plot_top_weights_tidy(., facet = "view", nfeatures = 15)

10. Plot waffle

Code
# ============================================================
# FUNCTION: make_gene_waffle_plot()
# ============================================================
# Purpose:
# - Extract genes per pathway from a MOFA enrichment result
# - Join clustering information
# - Filter to selected pathways (phtw)
# - Create and save a waffle plot
#
# Arguments:
# - enrich_obj : e.g. enrichment_list$neg or enrichment_list$pos
# - ontology   : character, e.g. "GO" or "KEGG"
# - phtw       : data frame with column `pathway`
# - group_df   : data frame with columns `genes` and `clus`
# - pal2       : named list with palette pal2[["gene_clus"]]
# - out_file   : file path (without extension)
# - n_rows     : number of waffle rows (default 5)
# ============================================================
clean_GO.fun <- function(path) {
  str_replace_all(path, "_", " ") %>%
    str_remove(., "^GOBP ") %>%
    str_to_lower(.)
}

make_gene_waffle_plot <- function(
  enrich_obj,
  enrich_result,
  ontology = "GO",
  phtw,
  group_df,
  pal = pal2,
  out_file,
  n_rows = 5,
  width = 4,
  height = 35
) {
  # ------------------------------------------------------------
  # STEP 1: Build long data frame of genes per pathway
  # ------------------------------------------------------------

  df <- enrich_obj[[ontology]]$feature.sets %>%
    #df<- enrichment_list$neg$GO$feature.sets %>%
    as_tibble(rownames = "pathway") %>%
    pivot_longer(
      -pathway,
      names_to = "genes",
      values_to = "value"
    ) %>%
    filter(value == 1) %>%
    mutate(
      Ngenes = n(),
      .by = "pathway"
    ) %>%
    left_join(group_df, by = "genes") %>%
    left_join(
      select(enrich_result, pathway, pval.adj_Factor1),
      by = "pathway"
    ) %>%
    filter(pathway %in% phtw) %>%
    #arrange(match(pathway, phtw)) %>%

    mutate(pathway_id = factor(pathway, levels = phtw)) %>%
    mutate(pathway = clean_GO.fun(pathway)) %>%
    mutate(
      pathway = paste0(
        pathway,
        "\n",
        "n=",
        Ngenes,
        " (p-adj ",
        #.$pval.adj_Factor1,
        p.val_round(.$pval.adj_Factor1),
        ")"
      )
    ) %>%
    arrange(clus)

  label_map <- df %>%
    distinct(pathway_id, pathway) %>%
    tibble::deframe()

  temp <<- df
  # ------------------------------------------------------------
  # STEP 2: Create waffle plot
  # ------------------------------------------------------------

  p <- ggplot(df, aes(fill = clus, values = value)) +
    geom_waffle(n_rows = n_rows) +
    theme_void() +
    scale_fill_manual(
      values = pal[["gene_clus"]],
      na.value = "gray"
    ) +
    facet_wrap(
      ~pathway_id,
      ncol = 1,
      labeller = labeller(pathway_id = label_map)
    ) +
    theme(
      #plot.title.position = "plot", strip.text =
      strip.text = element_text(hjust = 0.1)
    )

  # ------------------------------------------------------------
  # STEP 3: Save plot
  # ------------------------------------------------------------
  ggsave(
    filename = paste0(out_file, ".png"),
    plot = p,
    bg = "white",
    limitsize = FALSE,
    width = width,
    height = height
  )
  # ------------------------------------------------------------
  # STEP 4: Return ggplot object (useful for interactive use)
  # ------------------------------------------------------------

  return(p)
}
# remotes::install_github("hrbrmstr/waffle")
library(waffle)

enrichment_list <- c("pos", "neg") %>%
  set_names() %>%
  map(., ~ readRDS(paste0("../Results/Enrichment/", "GSEA_", .x, ".RDS")))

phtw_n <- results$neg %>%
  filter(pval.adj_Factor1 < 0.01) %>%
  filter(set.statistics_Factor1 > 2)

phtw_p <- results$pos %>%
  filter(pval.adj_Factor1 < 0.01) %>%
  filter(set.statistics_Factor1 > 9)

# the input matrix for enrichment "pos" or "neg" is the same
# so it dosent matter which we use here to get the gene sets for each pathway
identical(
  enrichment_list$pos$GO$feature.sets,
  enrichment_list$neg$GO$feature.sets
)
[1] TRUE
# ============================================================
# Plot Waffles
# ============================================================

# NEGATIVE enrichment, GO, first pathway set
p_neg_go <- make_gene_waffle_plot(
  enrich_obj = enrichment_list$neg,
  enrich_result = results$neg,
  phtw = phtw_n$pathway,
  group_df = group_df,
  height = ((0.05 * 5) + 0.9) * length(phtw_n$pathway), # 35,
  out_file = "../Results/Enrichment/Genes_per_pathway_waffle_GO_neg"
)

# POSITIVE enrichment, GO, different pathway set
p_pos_go <- make_gene_waffle_plot(
  enrich_obj = enrichment_list$pos,
  enrich_result = results$pos,
  ontology = "GO",
  phtw = phtw_p$pathway,
  group_df = group_df,
  width = 4,
  height = ((0.05 * 5) + 0.9) * length(phtw_p$pathway), # 35,
  out_file = "../Results/Enrichment/Genes_per_pathway_waffle_GO_pos"
)

# quartz(width = 2.5, height = ((0.05 * 5) + 0.9) * 1, dpi = 150) # 1.5
make_gene_waffle_plot(
  enrich_obj = enrichment_list$pos,
  enrich_result = results$pos,
  ontology = "GO",
  phtw = phtw_p$pathway[51],
  group_df = group_df,
  width = 4,
  height = ((0.05 * 5) + 0.9) * 1,
  out_file = "../Results/Enrichment/Waffle_test.png"
)

11. Plot genes per clusters

Code
plot_genes_by_clus <- function(
  weights_df,
  nfeatures = NULL,
  fact = 1,
  facet_type = "wrap",
  abs = TRUE,
  order = NULL,
  sign = "all"
) {
  W <- weights_df %>% mutate(value = weight)
  W <- W %>% filter(value != 0) # Remove zeros

  W <- W %>%
    filter(grepl(paste0("^Factor", fact, "$", collapse = "|"), .$factor))

  # Sign
  W <- W %>%
    mutate(sign_label = if_else(value > 0, "+", "-"))

  if (sign == "positive") {
    W <- W %>% filter(value > 0)
  } else if (sign == "negative") {
    W <- W %>% filter(value < 0)
  }

  if (abs) {
    W <- W %>% mutate(value = abs(value))
  }

  # Select top features per cluster
  if (!(is.null(nfeatures))) {
    W <- W %>%
      slice_max(order_by = value, n = nfeatures, .by = "clus")
  }

  # Factor ordering
  if (!(is.null(order))) {
    W <- W %>%
      arrange(scaled_weight) %>%
      mutate(genes = factor(genes, levels = unique(genes)))
  }
  W <<- W

  facet_layer <- if (facet_type == "grid") {
    # One shared x-axis across clusters
    facet_grid(rows = vars(clus), scales = "free_y", space = "free_y")
  } else {
    # Separate x-axis per cluster
    facet_wrap(~clus, ncol = 1, scales = "free", space = "free_y")
  }

  # Plot
  p <- ggplot(W, aes(x = scaled_weight, y = genes, color = clus)) +
    geom_point(aes(size = value), show.legend = F) + # size = 2
    geom_segment(
      aes(xend = 0, yend = genes),
      linewidth = 0.75,
      show.legend = F
    ) +
    scale_colour_manual(values = pal2[["gene_clus"]], na.value = "gray") +
    coord_cartesian(clip = "off") +
    labs(y = "Weight", x = NULL) +
    theme_minimal(base_size = 8) +
    theme(
      plot.margin = margin(0.5, 0.5, 0.5, 0.5, unit = "cm"),
      plot.title.position = "plot",
      plot.title = element_text(hjust = 0, size = 8),
      strip.text.y = element_text(angle = 270, face = "bold"),
      strip.placement = "outside",
      axis.title.x = element_text(
        margin = margin(t = 0.5, b = 0.5, unit = "cm")
      ),
      axis.title.y = element_blank(),
      #axis.text = element_text(size = 10),
      #axis.text.y = element_text(size = rel(1.1)),
      legend.position = "top",
      panel.grid.major.y = element_blank(),
    ) +
    facet_layer +
    #facet_grid(rows = vars(clus), scales = "free", space = "free_y") + # one x-axis only
    #facet_wrap(~clus, ncol = 1, scales = "free", space = "free_y") + # x-axis per facet
    xlim(0, 1 + 0.1) +
    geom_text(
      aes(label = sign_label),
      x = 1 + 0.1,
      size = 4,
      show.legend = F
    )

  return(p)
}
# quartz(width = 4, height = (0.1 * nrow(weights_df)) + 2, dpi = 150) # 1.5
p_df <- weights_df %>%
  filter(scaled_weight > 0.1) %>%
  split(~sign)
p_list <- map(p_df, ~ plot_genes_by_clus(.x))


# ============================================================
# SAVE plots
# ============================================================
iwalk(
  p_list,
  ~ ggsave(
    filename = paste0(
      "../Results/Enrichment/Genes_per_enrichment_clus_",
      .y,
      ".png"
    ),
    plot = .x,
    width = 4,
    height = (0.1 * nrow(p_df[[.y]])) + 2,
    bg = "white",
    limitsize = FALSE
  )
)
p_list$neg

11. Plot genes per pathway

I think I should set a threshold for scaled weight to include and specify that in the legend of the figure

sig_pthw <- results$neg %>%
  #bind_rows(results[1:2], .id = "sign") %>%
  filter(pval.adj_Factor1 < 0.01) %>%
  filter(set.statistics_Factor1 > 2)

# enrichment_list$neg$GO$feature.sets and enrichment_list$pos$GO$feature.sets
# are identical
split_pthw.fun <- function(df) {
  df %>%
    mutate(., n_genes = n(), .by = "pathway") %>%
    mutate(
      .,
      split_label = case_when(
        n_genes > 130 & str_detect(clus, "pos") ~ "pos",
        n_genes > 130 & str_detect(clus, "neg") ~ "neg",
        n_genes > 130 & is.na(clus) ~ "neg",
        n_genes <= 130 ~ "all",
        TRUE ~ NA_character_
      )
    ) %>%
    select(-n_genes)
}

phtw_genes <- enrichment_list$neg$GO$feature.sets %>%
  as_tibble(rownames = "pathway") %>%
  pivot_longer(-pathway, names_to = "genes") %>%
  filter(value == 1) %>%
  mutate(clus = groups[.$genes]) %>%
  split_pthw.fun() %>%
  summarise(
    genes = list(genes),
    n_clus = n_distinct(clus, na.rm = F) - 1,
    .by = c("pathway", "split_label")
  ) %>%
  left_join(., results$neg, by = "pathway") %>%
  filter(pathway %in% sig_pthw$pathway)

# filter out genes based on scaled weight treshold
summary(weights_df$scaled_weight)
filt_wght <- weights_df %>%
  filter(scaled_weight >= 0.1)

summary(filt_wght$weight)

plot_lst <- phtw_genes %>%
  mutate(
    match = pmap(., ~ intersect(filt_wght$genes, ..3)),
    plots = pmap(
      .,
      ~ plot_genes_by_clus(
        filter(filt_wght, genes %in% ..3),
        order = T,
        facet_type = "grid"
      )
    ),
    plots = map2(pathway, plots, ~ .y + ggtitle(.x))
  ) %>%
  mutate(height = pmap_chr(., ~ (0.09 * length(..5)) + 0.1 * ..4 + 0.6))
plot_lst

# quartz(width = 4, height = (0.09 * 110) + 0.1*10+0.6, dpi = 150) # 1.5
# plot_lst$plots[[2]]
# quartz(width = 4, height = (0.09 * 26) + 0.1*6+0.6, dpi = 150) # 1.5
# plot_lst$plots[[5]]
# quartz(width = 4, height = (0.09 * 10) + 0.1*3+0.6, dpi = 150) # 1.5
# plot_lst$plots[[27]]
# ============================================================
# Manually write a multi-page PDF
# ============================================================
library(grid)
pdf(
  file = "../Results/Enrichment/GO_MOFA_weights_scaled.pdf",
  width = 4, # fixed width
  onefile = TRUE # multiple pages
)
# Loop over plots and print each on its own page
walk2(
  plot_lst$plots,
  plot_lst$height,
  ~ {
    grid.newpage() # start a new page
    pushViewport(
      viewport(
        width = unit(1, "npc"),
        height = unit(.y, "in")
      )
    )
    print(.x, newpage = FALSE)
  }
)
dev.off()

# quartz(width = 4, height = (0.09 * 10) + 0.1*3+0.6, dpi = 150) # 1.5
plot_genes_by_clus(
  filter(weights_df, genes %in% phtw_genes$genes[[28]]),
  order = TRUE
)
sig_pthw <- list()
sig_pthw$neg <- results$neg %>%
  #bind_rows(results[1:2], .id = "sign") %>%
  filter(pval.adj_Factor1 < 0.01) %>%
  filter(set.statistics_Factor1 > 2) %>%
  pull(., "pathway")

sig_pthw$pos <- c(
  "GOBP_GRANULOCYTE_CHEMOTAXIS",
  "GOBP_RESPONSE_TO_BACTERIUM",
  "GOBP_ANTIGEN_RECEPTOR_MEDIATED_SIGNALING_PATHWAY",
  "GOBP_POSITIVE_REGULATION_OF_RESPONSE_TO_BIOTIC_STIMULUS",
  "GOBP_POSITIVE_REGULATION_OF_LEUKOCYTE_CELL_CELL_ADHESION",
  "GOBP_CELL_ACTIVATION_INVOLVED_IN_IMMUNE_RESPONSE",
  "GOBP_CYTOKINE_MEDIATED_SIGNALING_PATHWAY"
)

# enrichment_list$neg$GO$feature.sets and enrichment_list$pos$GO$feature.sets
# are identical
split_pthw.fun <- function(df) {
  df %>%
    mutate(., n_genes = n(), .by = "pathway") %>%
    mutate(
      .,
      split_label = case_when(
        n_genes > 130 & str_detect(clus, "pos") ~ "pos",
        n_genes > 130 & str_detect(clus, "neg") ~ "neg",
        n_genes > 130 & is.na(clus) ~ "neg",
        n_genes <= 130 ~ "all",
        TRUE ~ NA_character_
      )
    ) %>%
    select(-n_genes)
}

phtw_genes <- enrichment_list$neg$GO$feature.sets %>%
  as_tibble(rownames = "pathway") %>%
  pivot_longer(-pathway, names_to = "genes") %>%
  filter(value == 1) %>%
  mutate(clus = groups[.$genes]) %>%
  split_pthw.fun() %>%
  summarise(
    genes = list(genes),
    n_clus = n_distinct(clus, na.rm = F) - 1,
    .by = c("pathway", "split_label")
  )

phtw_genes <- results[1:2] %>%
  map(., ~ select(.x, pathway, pval.adj_Factor1)) %>%
  map(., ~ left_join(phtw_genes, .x, by = "pathway")) %>%
  imap(., ~ filter(.x, pathway %in% sig_pthw[[.y]]))

# filter out genes based on scaled weight treshold
summary(weights_df$scaled_weight)
     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
4.983e-05 4.501e-02 9.385e-02 1.279e-01 1.753e-01 1.000e+00 
filt_wght <- weights_df %>%
  filter(scaled_weight >= 0.1)

summary(filt_wght$weight)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
-1.4845 -0.2378  0.2448  0.1522  0.4379  1.8412 
plot_lst <- phtw_genes %>%
  map(
    .,
    ~ .x %>% # phtw_genes$pos %>%
      mutate(
        match = pmap(., ~ intersect(filt_wght$genes, ..3)),
        .after = "n_clus"
      ) %>%
      mutate(
        plots = pmap(
          .,
          ~ plot_genes_by_clus(
            filter(filt_wght, genes %in% ..3),
            order = T,
            facet_type = "grid"
          )
        ),
        plots = map2(pathway, plots, ~ .y + ggtitle(.x))
      ) %>%
      mutate(
        height = pmap_dbl(
          .,
          ~ (0.09 * length(..5)) + 0.1 * ..4 + 0.6
        )
      )
  )
plot_lst$neg
# A tibble: 38 × 8
   pathway     split_label genes n_clus match pval.adj_Factor1 plots      height
   <chr>       <chr>       <lis>  <dbl> <lis>            <dbl> <list>      <dbl>
 1 GOBP_LONG_… all         <chr>      2 <chr>         1.32e- 4 <ggplt2::>   2.96
 2 GOBP_ORGAN… neg         <chr>      4 <chr>         3.61e- 4 <ggplt2::>   8.47
 3 GOBP_ORGAN… pos         <chr>      4 <chr>         3.61e- 4 <ggplt2::>   3.43
 4 GOBP_LIPID… neg         <chr>      4 <chr>         2.33e- 5 <ggplt2::>  13.6 
 5 GOBP_LIPID… pos         <chr>      5 <chr>         2.33e- 5 <ggplt2::>   5.87
 6 GOBP_FATTY… all         <chr>      6 <chr>         2.91e- 6 <ggplt2::>   6.6 
 7 GOBP_ICOSA… all         <chr>      5 <chr>         1.47e- 4 <ggplt2::>   3.44
 8 GOBP_XENOB… all         <chr>      6 <chr>         7.13e- 5 <ggplt2::>   2.91
 9 GOBP_EPIDE… pos         <chr>      5 <chr>         6.91e-13 <ggplt2::>   2.54
10 GOBP_EPIDE… neg         <chr>      4 <chr>         6.91e-13 <ggplt2::>   6.85
# ℹ 28 more rows
# quartz(width = 4, height = (0.09 * 110) + 0.1*10+0.6, dpi = 150) # 1.5
# plot_lst$plots[[2]]
# quartz(width = 4, height = (0.09 * 26) + 0.1*6+0.6, dpi = 150) # 1.5
# plot_lst$plots[[5]]
# quartz(width = 4, height = (0.09 * 10) + 0.1*3+0.6, dpi = 150) # 1.5
# plot_lst$plots[[27]]

# quartz(width = 4, height = (0.09 * 10) + 0.1*3+0.6, dpi = 150) # 1.5
plot_genes_by_clus(
  filter(weights_df, genes %in% phtw_genes$neg$genes[[28]]),
  order = TRUE
)

# ============================================================
# Manually write a multi-page PDF
# ============================================================
library(grid)
pdf(
  file = "../Results/Enrichment/GO_MOFA_weights_scaled_neg.pdf",
  width = 4, # fixed width
  height = max(plot_lst$neg$height),
  onefile = TRUE # multiple pages
)
# Loop over plots and print each on its own page
walk2(
  plot_lst$neg$plots,
  plot_lst$neg$height,
  ~ {
    grid.newpage() # start a new page
    pushViewport(
      viewport(
        width = unit(1, "npc"),
        height = unit(.y, "in")
      )
    )
    print(.x, newpage = FALSE)
  }
)
dev.off()
quartz_off_screen 
                2 
pdf(
  file = "../Results/Enrichment/GO_MOFA_weights_scaled_pos.pdf",
  width = 4, # fixed width
  height = max(plot_lst$pos$height),
  onefile = TRUE # multiple pages
)
# Loop over plots and print each on its own page
walk2(
  plot_lst$pos$plots,
  plot_lst$pos$height,
  ~ {
    grid.newpage() # start a new page
    pushViewport(
      viewport(
        width = unit(1, "npc"),
        height = unit(.y, "in")
      )
    )
    print(.x, newpage = FALSE)
  }
)
dev.off()
quartz_off_screen 
                2