Transcriptomics Normalization and Exploration

Published

3 Jan 2026

Overview:

Quality Control and Normalization

  1. Sample Information

  2. Metadata and group assignment

  3. Normalize counts with TMM

  4. Gene-filtering
    Identify treshold for low abundant genes to filter

  5. Compare re-sequenced samples

  6. Differential Gene Expression analysis

  7. Exploratory Heatmap

  8. UMAP on signifcant genes

Load Libraries

Code
##################
# LOAD LIBRARIES #
##################
library(tidyverse)
# BiocManager::install("limma")
# BiocManager::install("edgeR")
# BiocManager::install("ComplexHeatmap")
library(limma)
library(edgeR)

library(scales)
library(ComplexHeatmap)
library(circlize)
library(mixOmics)
library(readxl)
library(cowplot)
library(RColorBrewer)

select <- dplyr::select
map <- purrr::map

# setwd("/Users/vilkal/work/Brolidens_work/Projects/Candida-omics/src")

Sample Information

There are only two biopsies collected in the longitudinal study —
at visit 02 and visit 04, corresponding to baseline and the 3-month follow-up.

Longitudinal Study Setup

Visits:

  • 01 – Inclusion
  • 02 – Secretion + biopsy (baseline)
  • 03 – Secretion only (1 week post-treatment)
  • 04 – Secretion + biopsy (3 months post-treatment)
  • 05 – Secretion only (6 months post-treatment)

Sequencing Runs

The samples have been run in two sequencing batches:

  • Run 1: BEA23P074_23
  • Run 2: BEA24P003_28

Sample Summary

Based on the files delivered from BEA, I believe we have sequenced:
81 unique samples, of which 73 are baseline and 8 are 3-month follow-up.

NB! The baseline sample of (S69) has 12:01 (DS_ID) listed as baseline instead of 12:02. This was a mistake, but we decided to keep it.

Potential Control Samples

There are six samples that might be controls, but this is uncertain.
I could not find any documentation explaining their naming:

FF060322243, FF060322244, 231025:1, 230918., 231108:1, 231108:1

Code
#############
# LOAD DATA #
#############
# meta_path <- "/Users/vilkal/Downloads/Metadata_svamp.xlsx"
# meta_data <- read_xlsx(meta_path, sheet = "Metadata", skip = 1)
meta_path <- "/Users/vilkal/work/Brolidens_work/Projects/Candida-omics/data/Metadata_svamp_FINAL.xlsx"
meta_data <- read_xlsx(meta_path, sheet = "Metadata", na = "na")

trx_path <- "../Tidy_data/Transcriptomics/Counts_all_trx_samples.csv"
raw_matrix <- read_csv(trx_path)

mapping_path <- "/Users/vilkal/work/Brolidens_work/Projects/Candida-omics/Tidy_data/Transcriptomics/Mapping_all_trx_samples.csv"
trx_mapping <- read_csv(mapping_path)

Step 1: Metadata and group assignment

g <- c(
  "0" = "0",
  "1" = "1-3",
  "2" = "1-3",
  "3" = "1-3",
  "4" = "4-5",
  "5" = "4-5"
)

trx_mapping <- trx_mapping %>%
  select(svamp_ID, batch = BEA_ID) %>%
  mutate(batch = str_replace(batch, "_.+", ""))

gr_data <- meta_data %>%
  rename(
    `Symptom score (0-5)` = "Symptom score (0-5) - ibland har de svarat olika skriftligt i frågeformuläret och muntligt vid inklusiion, då har jag valt den högsta scoren"
  ) %>%
  select(
    svamp_ID,
    `Clinical score (0-5)`,
    `Clinical score: hyphae (y/n)`,
    `Fungal culture: C. albicans (y/n)`,
    `Fungal culture: Non-albicans candida spp. (y/n)`,
    `Symptom score (0-5)`,
    `Recurring fungal infections > 2/year (y/n)`
  ) %>%
  filter(!(is.na(svamp_ID))) %>%
  left_join(., trx_mapping, by = "svamp_ID") %>%
  mutate(
    group_cross = case_when(
      `Fungal culture: C. albicans (y/n)` == "1" &
        `Symptom score (0-5)` >= 1 &
        `Recurring fungal infections > 2/year (y/n)` == "1" ~
        "RVVCpos",

      `Fungal culture: C. albicans (y/n)` == "0" &
        `Recurring fungal infections > 2/year (y/n)` == "1" ~
        "RVVCneg",

      `Fungal culture: C. albicans (y/n)` == "1" &
        `Symptom score (0-5)` == 0 &
        `Recurring fungal infections > 2/year (y/n)` == "0" ~
        "AS",

      `Fungal culture: C. albicans (y/n)` == "0" &
        `Recurring fungal infections > 2/year (y/n)` == "0" ~
        "Control",

      TRUE ~ NA_character_
    )
  ) %>%
  mutate(
    group = case_when(
      `Recurring fungal infections > 2/year (y/n)` == "1" ~
        "RVVC",
      grepl("_\\d\\w", svamp_ID) ~
        "RVVC",
      `Recurring fungal infections > 2/year (y/n)` == "0" ~
        "Control",
      `Fungal culture: C. albicans (y/n)` == "1" &
        `Recurring fungal infections > 2/year (y/n)` == "0" ~
        "Candidapos",

      TRUE ~ NA_character_
    )
  ) %>%
  mutate(
    pos = case_when(
      `Fungal culture: C. albicans (y/n)` == "1" |
        `Fungal culture: Non-albicans candida spp. (y/n)` == 1 ~
        "pos",
      TRUE ~ "neg"
    ),
    .after = "svamp_ID"
  ) %>%
  mutate(
    hyphae = case_when(
      `Clinical score: hyphae (y/n)` == "1" ~ paste0(group, "_H"),
      TRUE ~ group
    ),
    RVVC_AS = case_when(
      group == "RVVC" & `Clinical score (0-5)` == "0" ~ paste0(group, "_AS"),
      TRUE ~ group
    ),
    Follow_up = case_when(
      grepl("_\\d\\w", svamp_ID) ~ "Follow_up",
      TRUE ~ "Baseline"
    ),
    .after = "group"
  ) %>%
  mutate(
    Clin_gr = g[as.character(.$`Clinical score (0-5)`)],
    .after = "svamp_ID"
  ) %>%
  select(
    "svamp_ID",
    Clin_gr,
    group,
    hyphae,
    group_cross,
    RVVC_AS,
    Follow_up,
    everything()
  )

Check the distribution of samples per groups

vars <- c(
  "Clinical score (0-5)",
  "Clin_gr",
  "group",
  "hyphae",
  "RVVC_AS",
  "group_cross"
) %>%
  set_names()
gr_data_filt <- gr_data %>%
  filter(gr_data$svamp_ID %in% colnames(raw_matrix))
map(vars, ~ table(gr_data_filt[[.x]]))
$`Clinical score (0-5)`

 0  1  2  3  4  5 
50  8  9  5  6  3 

$Clin_gr

  0 1-3 4-5 
 50  22   9 

$group

Control    RVVC 
     38      39 

$hyphae

  Control Control_H      RVVC    RVVC_H 
       35         3        22        17 

$RVVC_AS

Control    RVVC RVVC_AS 
     38      27      12 

$group_cross

     AS Control RVVCneg RVVCpos 
      7      29       8      21 

Get count table

# remove quality controll samples
# assuming that the NA_1-6 samples are quality ctrls,
# we remove them before normalizing and filtering
countTable <- raw_matrix %>%
  select(-starts_with("NA")) # %>%
#select(-ends_with("_run2"))

# Create matrix countTable
countTable <- countTable %>%
  select(-c(1:6), -entrez) %>% # remove metadata columns
  filter(!is.na(symbol)) %>% # remove rows without a symbol
  filter(if_any(where(is.numeric), ~ . != 0)) %>% # remove rows that are all zeros
  summarise(
    across(where(is.numeric), \(x) sum(x, na.rm = TRUE)),
    .by = "symbol"
  )
# number of uniqe samples excluding "_run2" samples
samples <- select(countTable, where(is.numeric) & !(matches("_run2$"))) %>%
  colnames()
length(samples)
[1] 81

Step 2: Normalize counts with TMM

# get normalized matrix
# Create DGElist object to relate counts to groups
dge <- countTable %>%
  column_to_rownames(., var = "symbol") %>%
  DGEList(.)

# Normalize counts with TMM
dge_TMM <- calcNormFactors(dge, method = "TMM")

Step 3: Gene-filtering

Alternative 1:

We calculate the mean log2 CPM of each gene to determine what filtering cuttof to use. By looking at the histogram of the mean log2 CPM values you can determine what log2 CPM value would be suitable to use. You want to see a normal distribution of the mean log2 values. In this dataset vi see that there are a higer frequency of genes around log2CPM between 1 and 2 than what you would expect from the normal distribution.

# Identify low count cut-off
meanLog2CPM <- rowMeans(log2(cpm(countTable[, -1]) + 1))
# hist(meanLog2CPM)
ggplot() +
  geom_histogram(aes(meanLog2CPM), fill = "red", alpha = 0.7, bins = 50) +
  theme_minimal() +
  scale_y_continuous(labels = label_comma()) + # <-- commas
  geom_vline(xintercept = 1.5, color = "gray", linetype = "dashed") +
  ggtitle("Distribution of Mean log2 CPM")

Figure 1: Histogram of mean log2 CPM expression per gene. The vertical gray dashed line marks 1.5 log2 cpm

# number of genes that will be removed:
s <- sum(meanLog2CPM <= 1.5)

# knitr::kable(., digits = 1)
# Alternative 1.
genes_keep_1 <- countTable %>%
  filter(rowMeans(across(any_of(samples))) > 1.5) %>%
  pluck(., "symbol")

countTable %>%
  mutate(meanLog2CPM = rowMeans(across(any_of(samples))), .after = "symbol") %>%
  mutate(
    cumulative_sum = rowSums(across(any_of(samples))),
    .after = "symbol"
  ) %>%
  filter(meanLog2CPM < 1.5) %>%
  arrange(desc(cumulative_sum)) %>%
  select(1:3)
# A tibble: 11,882 × 3
   symbol     cumulative_sum meanLog2CPM
   <chr>               <dbl>       <dbl>
 1 PRKAR1AP1             121        1.49
 2 LINC02607             121        1.49
 3 IGKV1D-39             121        1.49
 4 LRRC66                121        1.49
 5 STK32A                121        1.49
 6 ZNF346-IT1            121        1.49
 7 H2AC15                121        1.49
 8 H1-5                  121        1.49
 9 H2BC17                121        1.49
10 RPS15AP40             121        1.49
# ℹ 11,872 more rows

A cuttof at 1.5 seem to be appropriate and would result in the removal of {r} s genes.

Alternative 2:

Another aproach is to filter genes by setting a minimum treshold of reads/counts per million in x number of samples e.g > 1 CPM in three samples > 5 CPM in three samples > 1 read in 5 samples

Keep genes where CPM > 1 in more than 3 samples

# Alternative 2.
# get counts per mullion (CPM) not logged, for filtering:
cpm_TMM <- cpm(dge_TMM, log = FALSE)

genes_keep_2 <- cpm_TMM %>%
  as_tibble(., rownames = "symbol") %>%
  filter(rowSums(across(all_of(samples), ~ .x >= 1)) >= 3) %>%
  pluck(., "symbol")

# look at top filtered genes
cpm_TMM %>%
  as_tibble(., rownames = "symbol") %>%
  filter(!(symbol %in% genes_keep_2)) %>%
  mutate(
    n_over_1cpm = rowSums(across(all_of(samples), ~ .x >= 1)),
    .after = "symbol"
  ) %>%
  mutate(sum_cpm = rowSums(across(all_of(samples))), .after = "symbol") %>%
  filter(n_over_1cpm < 3) %>%
  arrange(desc(n_over_1cpm)) %>%
  select(1:3)
# A tibble: 16,655 × 3
   symbol       sum_cpm n_over_1cpm
   <chr>          <dbl>       <dbl>
 1 MIR6859-1      20.3            2
 2 MIR200A        23.9            2
 3 CENPS          29.4            2
 4 PADI4           9.27           2
 5 SH2D5          18.6            2
 6 LOC101929609   29.8            2
 7 PLA2G12AP1     31.8            2
 8 LINC02784      29.1            2
 9 FYB2           20.9            2
10 SGIP1          14.5            2
# ℹ 16,645 more rows
length(genes_keep_1)
[1] 21468
length(genes_keep_2)
[1] 16695
length(intersect(genes_keep_1, genes_keep_2))
[1] 16695

Alternative one is less conservative presarving more lowly expressed genes. Alternative 2 is more sensitive. We go for alternative 2 as more than 16 000 genes is more than enough.

# Step 3. filter low abundant genes
# DEGlist object
dge_TMM_filt <- dge_TMM[genes_keep_2, ]

# count table
countTable_filt <- countTable %>%
  filter(symbol %in% genes_keep_2)

Step 4: Save filtered and normalized counts

########################
# SAVE NORMALIZED DATA #
########################
cpm_log_TMM <- cpm(dge_TMM_filt, log = T) # for heatmap

cpm_log_TMM %>%
  as_tibble(., rownames = "symbol") %>%
  # remove samples sequenced twice
  select(symbol, all_of(samples)) %>%
  write_csv(., "../Results/MixOmic/Transcriptomics_Normalized_filt.csv")
# norm <- read_csv("../Results/MixOmic/Transcriptomics_Normalized_filt.csv")

# cpm(dge_TMM, log = T) %>%
#   as_tibble(., rownames = "symbol") %>%
#   # remove samples sequenced twice
#   select(symbol, all_of(samples)) %>%
#   write_csv(., "../Results/MixOmic/Transcriptomics_Norm_NO_FILT.csv")
# norm <- read_csv("../Results/MixOmic/Transcriptomics_Normalized.csv")

Step 5: compare re-sequenced samples

#############################
# PLOT RE-SEQUENCED SAMPLES #
#############################
n <- names(select(countTable, contains("_run2")))
samp <- str_remove(n, "_run2")
n <- c(n, samp) # samples sequenced twice

# Create a tidy summary comparing all pairs
pair_summary <- map_dfr(samp, function(samp) {
  run1 <- cpm_log_TMM[, samp, drop = TRUE]
  run2 <- cpm_log_TMM[, paste0(samp, "_run2"), drop = TRUE]

  tibble(
    sample = samp,
    pearson = cor(run1, run2, method = "pearson"),
    spearman = cor(run1, run2, method = "spearman"),
    mean_diff = mean(run1 - run2),
    sd_diff = sd(run1 - run2)
  )
})

knitr::kable(pair_summary)
sample pearson spearman mean_diff sd_diff
S07 0.9422612 0.9706496 -0.2780622 0.979386
S20 0.9448052 0.9683841 -0.2521979 1.061346
S22 0.9403599 0.9663649 -0.2789114 1.091206
S63_BL 0.9408223 0.9697045 -0.2547495 1.074093
S69_BL 0.9288711 0.9621371 -0.3055548 1.130253

Figure 3: Comparison of technical replicates across sequencing runs. Scatterplots show log₂-transformed CPM values for the same biological samples sequenced in two independent sequencing runs. Each point represents one gene. The dashed red line indicates the 1:1 relationship expected for perfect agreement between runs. Overall, the replicate samples display high concordance, confirming that technical variation between sequencing runs is minimal relative to biological signal. Pearson and Spearman correlation coefficients were calculated for each sample pair and are summarized in the accompanying table.

# generate one plot per pair
air_plots <- map(samp, function(samp) {
  df <- tibble(
    gene = rownames(cpm_log_TMM),
    run1 = cpm_log_TMM[, samp],
    run2 = cpm_log_TMM[, paste0(samp, "_run2")]
  )

  ggplot(df, aes(x = run1, y = run2)) +
    geom_point(alpha = 0.3) +
    geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
    labs(
      title = paste("Comparison:", samp),
      x = "Run1 log2-CPM",
      y = "Run2 log2-CPM"
    ) +
    theme_minimal()
}) %>%
  set_names(samp)

cowplot::plot_grid(plotlist = air_plots, ncol = 2)

Figure 3: Comparison of technical replicates across sequencing runs. Scatterplots show log₂-transformed CPM values for the same biological samples sequenced in two independent sequencing runs. Each point represents one gene. The dashed red line indicates the 1:1 relationship expected for perfect agreement between runs. Overall, the replicate samples display high concordance, confirming that technical variation between sequencing runs is minimal relative to biological signal. Pearson and Spearman correlation coefficients were calculated for each sample pair and are summarized in the accompanying table.

# Convert counts to long format
long_counts <- countTable_filt %>%
  pivot_longer(
    cols = -symbol,
    names_to = "svamp_ID",
    values_to = "counts"
  ) %>%
  mutate(is_mito = grepl("^MT", symbol)) %>%
  left_join(trx_mapping %>% select(svamp_ID, batch), by = "svamp_ID")

qc_df <- long_counts %>%
  group_by(svamp_ID, batch) %>%
  summarise(
    total_counts = sum(counts),
    mito_counts = sum(counts[is_mito]),
    percent_mito = mito_counts / total_counts * 100,
    .groups = "drop"
  )

ggplot(qc_df, aes(x = batch, y = total_counts, fill = batch)) +
  geom_violin(trim = FALSE) +
  geom_boxplot(width = 0.1, outlier.shape = NA) +
  geom_jitter(width = 0.25, size = 1) +
  theme_minimal() +
  labs(
    title = "Total counts per sample by batch",
    x = "Batch",
    y = "Total counts"
  )

ggplot(qc_df, aes(x = batch, y = percent_mito, fill = batch)) +
  geom_violin(trim = FALSE) +
  geom_boxplot(width = 0.1, outlier.shape = NA) +
  geom_jitter(width = 0.25) +
  theme_minimal() +
  labs(
    title = "Mitochondrial percentage per sample by batch",
    x = "Batch",
    y = "Percent mitochondrial reads"
  )

Remove ctrl and second run samples

countTable_final <- countTable_filt %>%
  # remove samples sequenced twice
  select(-ends_with("_run2")) %>%
  # remove controll samples
  select(-starts_with("NA"))
batch_smpl <- countTable_filt[, c("symbol", n), drop = TRUE]
# Convert to long format
long_counts <- batch_smpl %>%
  pivot_longer(
    cols = -symbol,
    names_to = "sample",
    values_to = "counts"
  ) %>%
  mutate(
    run = ifelse(grepl("_run2$", sample), "run2", "run1"),
    bio_sample = gsub("_run2$", "", sample),
    is_mito = grepl("^MT", symbol)
  )

qc_df <- long_counts %>%
  group_by(bio_sample, run) %>%
  summarise(
    total_counts = sum(counts),
    mito_counts = sum(counts[is_mito]),
    percent_mito = mito_counts / total_counts * 100,
    .groups = "drop"
  )

ggplot(qc_df, aes(x = run, y = total_counts, group = bio_sample)) +
  geom_line(alpha = 0.6) +
  geom_point(size = 3) +
  theme_minimal() +
  labs(
    title = "Paired total counts: run1 vs run2",
    x = "Sequencing run",
    y = "Total counts"
  )

ggplot(qc_df, aes(x = run, y = percent_mito, group = bio_sample)) +
  geom_line(alpha = 0.6) +
  geom_point(size = 3) +
  theme_minimal() +
  labs(
    title = "Paired mitochondrial percentage: run1 vs run2",
    x = "Sequencing run",
    y = "Percent mitochondrial reads"
  )

Step 6: Batch correction

# Check consitency between group info and samples
samples <- colnames(countTable_final)
intersect(samples, gr_data$svamp_ID)
 [1] "S01"    "S02"    "S03"    "S04"    "S05"    "S06"    "S07"    "S08"   
 [9] "S09"    "S10"    "S11"    "S12"    "S13"    "S15"    "S17"    "S18"   
[17] "S19"    "S20"    "S22"    "S23"    "S25"    "S26"    "S27"    "S29"   
[25] "S30"    "S34"    "S35"    "S36"    "S37"    "S38"    "S39"    "S40"   
[33] "S41"    "S42"    "S43"    "S44"    "S45"    "S46"    "S47"    "S48"   
[41] "S49"    "S50"    "S51"    "S52"    "S53"    "S54"    "S55"    "S56"   
[49] "S57"    "S58"    "S59"    "S60"    "S61"    "S62"    "S63_BL" "S64_3M"
[57] "S64_BL" "S66_3M" "S66_BL" "S67_BL" "S68_BL" "S69_1M" "S69_BL" "S70_3M"
[65] "S70_BL" "S71_BL" "S72_3M" "S72_BL" "S73_3M" "S73_BL" "S74_3M" "S74_BL"
[73] "S75_3M" "S75_Bl" "S77_BL" "S78_BL" "S80_BL" "S81_BL" "S82_BL" "S83_BL"
[81] "S85_BL"
setdiff(samples, gr_data$svamp_ID)
[1] "symbol"
gr <- c("0", "1-3", "4-5")

gr_data <- gr_data %>%
  filter(svamp_ID %in% samples) %>%
  mutate(Clin_gr = factor(.$"Clin_gr", levels = gr)) %>%
  mutate(
    `Clinical score (0-5)` = factor(
      as.character(.$`Clinical score (0-5)`),
      levels = unique(.$`Clinical score (0-5)`)
    )
  )

# change missing value NA to "NA"
gr_data_ <- gr_data %>%
  mutate(RVVC_AS = tidyr::replace_na(RVVC_AS, "NA"))

design <- model.matrix(~RVVC_AS, data = gr_data_)
rownames(design) <- gr_data$svamp_ID
batch <- gr_data$batch

# remove batch effect
expr_corrected <- removeBatchEffect(
  cpm_log_TMM[, samples[-1]],
  batch = batch,
  design = design
)
plot_pca <- function(X, title) {
  pca <- prcomp(t(X), scale. = TRUE)

  pca_df <- data.frame(
    PC1 = pca$x[, 1],
    PC2 = pca$x[, 2],
    batch = gr_data[["batch"]],
    condition = gr_data[["RVVC_AS"]]
  )

  ggplot(pca_df, aes(x = PC1, y = PC2, color = batch, shape = condition)) +
    geom_point(size = 3) +
    theme_minimal() +
    labs(title = paste0("PCA ", title, " batch correction"))
}

plot_pca(expr_corrected, "after")
Warning: Removed 4 rows containing missing values or values outside the scale range
(`geom_point()`).

plot_pca(cpm_log_TMM[, samples[-1]], "before")
Warning: Removed 4 rows containing missing values or values outside the scale range
(`geom_point()`).

Save batch corrected and filtered and normalized expression values

########################
# SAVE NORMALIZED DATA #
########################
expr_corrected %>%
  as_tibble(., rownames = "symbol") %>%
  write_csv(., "../Results/MixOmic/Transcriptomics_batch_filt.csv")
# norm <- read_csv("../Results/MixOmic/Transcriptomics_Normalized_filt.csv")

Step 7: Differential Gene Expression analysis

##################
# DEG's FUNCTION #
##################
# df <- meta_trx
# sample_df <- gr_data
# group <- "Clin_gr"

edgR_dge.fun <- function(CountMatrix, gr, sample_df, confound = NULL) {
  #groups <- sample_df[, group]
  groups <- pull(sample_df, gr)
  n_gr <- length(na.omit(unique(groups)))

  if (any(is.na(groups))) {
    groups <- tidyr::replace_na(groups, "X")
  }

  if (!is.null(confound)) {
    c <- "_conf"
    design <- design
  } else {
    c <- "_no_conf"
    design <- model.matrix(~groups, data = sample_df) # without confounders
  }

  colnames(design) <- sub(gr, "", colnames(design))
  colnames(design) <- sub("\\(Intercept\\)", "Intercept", colnames(design))
  colnames(design) <- sub(" ", "_", colnames(design))

  y <- DGEList(counts = CountMatrix, remove.zeros = T)
  y <- calcNormFactors(y, method = "TMM")
  y <- estimateGLMCommonDisp(y, design)
  y <- estimateGLMTagwiseDisp(y, design)
  fit <- glmFit(y, design)

  lrt <- glmLRT(fit, coef = 2:n_gr) # with intercept
  top <- topTags(lrt, adjust.method = "BH", n = "all", sort.by = "p.value")[[1]]
  colnames(top) <- sub(gr, "", colnames(top))
  top <- cbind(LogFC.intercept = 0, top) # with intercept
  top <- rownames_to_column(top, var = "Genes")

  return(list(design = design, y = y, fit = fit, lrt = lrt, top = top, c = c))
}
# nesscary only if you want to add confounders
# design <- model.matrix(~ Clin_gr + `Age (years)`, data = gr_data)

dge <- column_to_rownames(countTable_final, var = "symbol") %>%
  edgR_dge.fun(., "RVVC_AS", gr_data, confound = NULL)
# get DEGs for clin score 4-5

fc_cols <- sort(grep("logFC.groups", colnames(dge$top), value = TRUE))

DEG_list <- map(
  set_names(fc_cols),
  ~ {
    fc_col <- .x # current column being processed

    dge$top %>%
      as_tibble() %>%
      mutate(
        Regulation = case_when(
          !!sym(fc_col) > 0 & FDR < 0.05 ~ "UP",
          !!sym(fc_col) < 0 & FDR < 0.05 ~ "DOWN",
          TRUE ~ "NOT SIG."
        )
      ) %>%
      filter(Regulation %in% c("UP", "DOWN")) %>%
      dplyr::select(
        Genes,
        {{ fc_col }},
        logCPM,
        LR,
        PValue,
        FDR,
        Regulation
      ) %>%
      group_by(Regulation) %>%
      # slice_max(order_by = abs(!!sym(fc_col)), n = 10) %>%
      ungroup() %>%
      arrange(!!sym(fc_col))
  }
)

D <- DEG_list %>%
  imap(., ~ group_by(.x, Regulation)) %>%
  imap(., ~ slice_max(.x, order_by = abs(!!sym(.y)), n = 10)) %>%
  imap(., ~ arrange(.x, !!sym(.y)))

names(D)
[1] "logFC.groups"     "logFC.groupsRVVC"
D[[2]] %>%
  knitr::kable(., digits = 1)
Genes logFC.groupsRVVC logCPM LR PValue FDR Regulation
CDSN -4.4 4.6 52.1 0 0 DOWN
IGHV3-74 -4.4 2.3 31.1 0 0 DOWN
LORICRIN -3.5 4.9 24.8 0 0 DOWN
LCE2B -3.4 -1.1 15.1 0 0 DOWN
C6orf15 -3.1 -0.5 21.7 0 0 DOWN
LCE2D -2.7 -1.9 10.9 0 0 DOWN
LIPM -2.6 -1.1 26.0 0 0 DOWN
KCNF1 -2.6 -1.2 30.9 0 0 DOWN
DEFB103B,DEFB103A -2.5 -2.4 10.5 0 0 DOWN
SERPINA12 -2.4 1.7 27.8 0 0 DOWN
CXCL1 4.1 5.5 33.0 0 0 UP
MRPS35-DT 4.2 -2.7 21.6 0 0 UP
CXCL8 4.3 3.8 30.0 0 0 UP
PROK2 4.3 -0.7 49.3 0 0 UP
CSF3 4.5 1.1 51.0 0 0 UP
MMP3 4.6 -1.3 57.2 0 0 UP
KRT24 4.7 5.2 59.3 0 0 UP
IGLV4-60 4.8 -1.8 26.0 0 0 UP
IL22 5.4 -0.5 45.2 0 0 UP
LTF 6.5 4.9 73.3 0 0 UP

Step 8: Exploratory Heatmap

###############
# GET MATRIX #
###############
# get DEGs for clin score 4-5

# get filter genes from normalized matrix
# logFC.groupsRVVC
deg_matrix <- cpm_log_TMM[DEG_list[[names(D)[2]]]$Genes, gr_data$svamp_ID]

# use batch corrected expression values for heatmap?
# deg_matrix <- expr_corrected[DEG_list[[names(D)[2]]]$Genes, gr_data$svamp_ID]
# z-scores
# to scale the expression of our genes, we have to
# first transpose our matrix, then scale, then transpose it back:
zscore <- t(apply(deg_matrix, 1, function(i) {
  scale(i, center = T, scale = T)
}))
colnames(zscore) <- colnames(deg_matrix)

# tidyverse way:
# zscore <- cpm_log_TMM %>%
#   as_tibble(rownames = "symbol") %>%
#   rowwise() %>%
#   mutate(across(-symbol, ~ as.numeric(scale(.x, center = TRUE, scale = TRUE)))) %>%
#   ungroup() %>%
#   column_to_rownames("symbol")
##############
# ANNOTATION #
##############
# colour pallets
blue <- brewer.pal(6, name = "Blues") %>% c(., "gray")
Red <- brewer.pal(6, name = "Reds") #%>% c(., "gray")
pal <- c("#f26386", "#fd814e", "#a4d984", "#fbbc52") # "#f588af", "#fd814e"
pal2 <- c("#2266ac", "#91c5de")

#Red <- brewer.pal(6, name = "Reds") #%>% c(., "gray")
pal1 <- c("#ffa998", "#f17730ff", "#902267") #
pal3 <- c("#9e8dec", "#cc93cf", "#a5c97b", "#de9548") #"#abcb4b" "#63d3b4" "#902267",


# group information
annot_col <- gr_data %>%
  # select(-`Age (years)`) %>%
  mutate(across(2:7, ~ factor(.x)))

# check samples number
dim(zscore)
[1] 3616   81
dim(annot_col)
[1] 81 15
# top annotation object:
top_annot <- columnAnnotation(
  bin_Clin = annot_col$Clin_gr, # binned clinical grade
  Clin = annot_col$`Clinical score (0-5)`, # raw clinical score
  Group_cross = annot_col$group_cross, # sample group
  RVVC = annot_col$group, # sample group
  hyphae = annot_col$hyphae, # sample group
  RVVC_AS = annot_col$RVVC_AS, # sample group

  #show_legend = FALSE,
  show_annotation_name = T,
  annotation_name_gp = gpar(fontsize = 8),
  annotation_legend_param = list(
    grid_height = unit(.1, "mm"),
    grid_width = unit(2, "mm"),
    title = "",
    labels_gp = gpar(fontsize = 7),
    title_gp = gpar(fontsize = 8)
  ),
  simple_anno_size = unit(.3, "cm"),
  #gap = unit(1, "cm"),
  col = list(
    Clin = set_names(Red, levels(annot_col$`Clinical score (0-5)`)),
    bin_Clin = set_names(pal1, levels(annot_col$Clin_gr)),
    Group_cross = set_names(pal3, levels(annot_col$group_cross)),
    RVVC = set_names(pal2, levels(annot_col$group)),
    RVVC_AV = set_names(pal1, levels(annot_col$RVVC_AS)),
    hyphae = set_names(pal, levels(annot_col$hyphae))
  )
)
################
# PLOT HEATMAP #
################
# colour mapping
breaks.fun <- function(min, max, n) {
  x <- seq(min, max, length.out = n)
  x[which(x == median(x))] <- 0
  round(x)
  return(x)
}

# to better know what values to use for the legend
# we plot the distribution of values
# Convert to a data frame for ggplot
df <- data.frame(value = as.vector(zscore))

# Basic ggplot histogram
p <- ggplot(df, aes(x = value)) +
  geom_histogram(bins = 30, fill = "skyblue", color = "white") +
  #scale_x_continuous(breaks = scales::breaks_pretty(n = 20)) +
  scale_y_continuous(labels = label_comma()) + # <-- commas
  labs(
    title = "Distribution of Matrix Values",
    x = "Matrix Values",
    y = "Count"
  ) +
  theme_minimal(base_size = 14) #+
#theme(axis.text.x = element_text(angle = 45, hjust = 1))

# plot to better see the outlier values
p2 <- p + ylim(0, 5000)
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
plot_grid(p, p2, ncol = 2)
Warning: Removed 11 rows containing missing values or values outside the scale range
(`geom_bar()`).

Figure 3: Histogram showing the distribution of z-score normalized matrix values

# set limits for colourpallet
RdBu = c(
  "#364B9A",
  "#4A7BB7",
  "#6EA6CD",
  "#98CAE1",
  "#C2E4EF",
  "white",
  "#FEDA8B",
  "#FDB366",
  "#F67E4B",
  "#DD3D2D",
  "#A50026"
)

# using min and max from the zscore matrix can work, but is usualy influenced too much
# from the min and max values which tend to be outliers
col <- colorRamp2(
  breaks.fun(min(zscore), max(zscore), 7),
  colorRampPalette(c(RdBu))(7)
)

# instead we use the historgram to determine the min and max values of the scale
# values larger than 5 are all mapped to dark red and values less than -5 are all mapped to dark blue.
col <- colorRamp2(breaks.fun(-5, 5, 7), colorRampPalette(c(RdBu))(7))

based on these histograms we can set the scale max and min breaks at 5

################
# PLOT HEATMAP #
################
# Heatmap global options:
ht_opt$COLUMN_ANNO_PADDING = unit(.05, "cm")
ht_opt$HEATMAP_LEGEND_PADDING = unit(-0, "cm") #unit(c(0, 0, 0, -1), "cm") #b,l,t,r
ht_opt$TITLE_PADDING = unit(.05, "cm")
ht_opt$DIMNAME_PADDING = unit(.05, "cm")

# average expression:
H <- Heatmap(
  zscore,
  col = col,
  cluster_columns = TRUE,
  show_row_dend = FALSE,
  name = "Gene  \nExpression (z)",
  show_row_names = FALSE,
  show_column_names = TRUE,
  column_names_gp = grid::gpar(fontsize = 8),

  # annotation
  # right_annotation = right_anno_row, left_annotation = left_anno_row,
  top_annotation = top_annot
)
`use_raster` is automatically set to TRUE for a matrix with more than
2000 rows. You can control `use_raster` argument by explicitly setting
TRUE/FALSE to it.

Set `ht_opt$message = FALSE` to turn off this message.
'magick' package is suggested to install to give better rasterization.

Set `ht_opt$message = FALSE` to turn off this message.
H

Figure 5: Hierarchial clustering of differentially expressed genes. Heatmap of all 7,983 DEGs (FDR-adjusted p value <0.05) between the Clinical score values 5 and 4 (n = 14) and Clinical score values 0 and 1 CTRL (n = 108) group. Each study participant is represented by a vertical column and each gene is represented by a horizontal row. The expression of each gene is standardized (z) to a mean of 0 and a standard deviation of 1.

pdf(file = "./Heatmap_gene_expression.pdf", width = 10, height = 7)
H
dev.off()
quartz_off_screen 
                2 

Step 9: UMAP on signifcant genes

#### UMAP analysis ####
set.seed(1)
umap <- uwot::umap(
  t(deg_matrix),
  n_neighbors = 15,
  init = "spectral",
  scale = T
)

umap_df <- tibble(
  "UMAP 1" = umap[, 1],
  "UMAP 2" = umap[, 2],
  "ID" = rownames(umap)
) %>%
  left_join(., annot_col, by = c("ID" = "svamp_ID"))

#### UMAP plotting by groups ####
col <- c('#FB5273', "#6B51A3", "#9D9AC8", '#4FCEEF', "#e68633")
col <- c('#FB5273', "#6B51A3", "#9D9AC8", '#4FCEEF', "#e68633", "#9e8dec")

pal3 <- c("#9e8dec", "#cc93cf", "#a5c97b", "#de9548")

txt_df <- umap_df %>%
  select(1:11) %>%
  mutate(
    txt_clin = ifelse(
      grepl("0", umap_df$`Clinical score (0-5)`),
      NA,
      as.character(.$`Clinical score (0-5)`)
    )
  ) %>%
  mutate(txt_id = ifelse(group == "RVVC", .$ID, NA))

# save plot to file
dev.new(height = 16.11, width = 16.11, units = "cm")
#ggsave(paste0("./Figures/09/", "Bulk_UMAP.png"), p)

UMAP_fun <- function(group_var, txt_var) {
  p <- ggplot(
    umap_df,
    aes(
      x = `UMAP 1`,
      y = `UMAP 2`,
      fill = .data[[group_var]],
      shape = Follow_up,
      color = .data[[group_var]]
    )
  ) +
    #geom_jitter( shape=21, size=3, color="white", width=.5, height=.5) +  # Tassos used jitter
    geom_point(size = 3, alpha = 1) +
    scale_colour_manual(
      values = pal1,
      name = "",
      aesthetics = c("colour", "fill")
    ) +
    scale_shape_manual(values = c(21, 5), name = "", ) + #specify individual shapes+
    geom_text(
      data = txt_df,
      aes(x = `UMAP 1`, y = `UMAP 2`, label = .data[[txt_var]]),
      size = 3,
      hjust = 0,
      nudge_x = 0.07,
      color = "gray51"
    ) +
    theme_bw(base_size = 14) +
    theme(
      line = element_blank(),
      axis.ticks = element_line(color = "black"),
      aspect.ratio = 1,
      axis.title = element_blank()
    )
  return(p)
}
pal1 <- c("#ffa998", "#902267", '#FB5273')
UMAP_fun("RVVC_AS", "txt_id")
Warning: Removed 42 rows containing missing values or values outside the scale range
(`geom_text()`).

UMAP_fun("RVVC_AS", "txt_clin")
Warning: Removed 50 rows containing missing values or values outside the scale range
(`geom_text()`).

pal1 <- c("#ffa998", "#902267", '#FB5273', '#4FCEEF')
UMAP_fun("hyphae", "txt_clin")
Warning: Removed 50 rows containing missing values or values outside the scale range
(`geom_text()`).

pal1 <- c("#f26386", "#fd814e", "#a4d984", "#fbbc52")
UMAP_fun("group_cross", "txt_clin")
Warning: Removed 50 rows containing missing values or values outside the scale range
(`geom_text()`).