# remove quality controll samples# assuming that the NA_1-6 samples are quality ctrls,# we remove them before normalizing and filteringcountTable <- raw_matrix %>%select(-starts_with("NA")) # %>%#select(-ends_with("_run2"))# Create matrix countTablecountTable <- countTable %>%select(-c(1:6), -entrez) %>%# remove metadata columnsfilter(!is.na(symbol)) %>%# remove rows without a symbolfilter(if_any(where(is.numeric), ~ . !=0)) %>%# remove rows that are all zerossummarise(across(where(is.numeric), \(x) sum(x, na.rm =TRUE)),.by ="symbol" )
# number of uniqe samples excluding "_run2" samplessamples <-select(countTable, where(is.numeric) &!(matches("_run2$"))) %>%colnames()length(samples)
[1] 81
Step 2: Normalize counts with TMM
# get normalized matrix# Create DGElist object to relate counts to groupsdge <- countTable %>%column_to_rownames(., var ="symbol") %>%DGEList(.)# Normalize counts with TMMdge_TMM <-calcNormFactors(dge, method ="TMM")
Step 3: Gene-filtering
Alternative 1:
We calculate the mean log2 CPM of each gene to determine what filtering cuttof to use. By looking at the histogram of the mean log2 CPM values you can determine what log2 CPM value would be suitable to use. You want to see a normal distribution of the mean log2 values. In this dataset vi see that there are a higer frequency of genes around log2CPM between 1 and 2 than what you would expect from the normal distribution.
# Identify low count cut-offmeanLog2CPM <-rowMeans(log2(cpm(countTable[, -1]) +1))# hist(meanLog2CPM)ggplot() +geom_histogram(aes(meanLog2CPM), fill ="red", alpha =0.7, bins =50) +theme_minimal() +scale_y_continuous(labels =label_comma()) +# <-- commasgeom_vline(xintercept =1.5, color ="gray", linetype ="dashed") +ggtitle("Distribution of Mean log2 CPM")
Figure 1: Histogram of mean log2 CPM expression per gene. The vertical gray dashed line marks 1.5 log2 cpm
# number of genes that will be removed:s <-sum(meanLog2CPM <=1.5)# knitr::kable(., digits = 1)# Alternative 1.genes_keep_1 <- countTable %>%filter(rowMeans(across(any_of(samples))) >1.5) %>%pluck(., "symbol")countTable %>%mutate(meanLog2CPM =rowMeans(across(any_of(samples))), .after ="symbol") %>%mutate(cumulative_sum =rowSums(across(any_of(samples))),.after ="symbol" ) %>%filter(meanLog2CPM <1.5) %>%arrange(desc(cumulative_sum)) %>%select(1:3)
A cuttof at 1.5 seem to be appropriate and would result in the removal of {r} s genes.
Alternative 2:
Another aproach is to filter genes by setting a minimum treshold of reads/counts per million in x number of samples e.g > 1 CPM in three samples > 5 CPM in three samples > 1 read in 5 samples
Keep genes where CPM > 1 in more than 3 samples
# Alternative 2.# get counts per mullion (CPM) not logged, for filtering:cpm_TMM <-cpm(dge_TMM, log =FALSE)genes_keep_2 <- cpm_TMM %>%as_tibble(., rownames ="symbol") %>%filter(rowSums(across(all_of(samples), ~ .x >=1)) >=3) %>%pluck(., "symbol")# look at top filtered genescpm_TMM %>%as_tibble(., rownames ="symbol") %>%filter(!(symbol %in% genes_keep_2)) %>%mutate(n_over_1cpm =rowSums(across(all_of(samples), ~ .x >=1)),.after ="symbol" ) %>%mutate(sum_cpm =rowSums(across(all_of(samples))), .after ="symbol") %>%filter(n_over_1cpm <3) %>%arrange(desc(n_over_1cpm)) %>%select(1:3)
Alternative one is less conservative presarving more lowly expressed genes. Alternative 2 is more sensitive. We go for alternative 2 as more than 16 000 genes is more than enough.
Figure 3: Comparison of technical replicates across sequencing runs. Scatterplots show log₂-transformed CPM values for the same biological samples sequenced in two independent sequencing runs. Each point represents one gene. The dashed red line indicates the 1:1 relationship expected for perfect agreement between runs. Overall, the replicate samples display high concordance, confirming that technical variation between sequencing runs is minimal relative to biological signal. Pearson and Spearman correlation coefficients were calculated for each sample pair and are summarized in the accompanying table.
Figure 3: Comparison of technical replicates across sequencing runs. Scatterplots show log₂-transformed CPM values for the same biological samples sequenced in two independent sequencing runs. Each point represents one gene. The dashed red line indicates the 1:1 relationship expected for perfect agreement between runs. Overall, the replicate samples display high concordance, confirming that technical variation between sequencing runs is minimal relative to biological signal. Pearson and Spearman correlation coefficients were calculated for each sample pair and are summarized in the accompanying table.
# Convert counts to long formatlong_counts <- countTable_filt %>%pivot_longer(cols =-symbol,names_to ="svamp_ID",values_to ="counts" ) %>%mutate(is_mito =grepl("^MT", symbol)) %>%left_join(trx_mapping %>%select(svamp_ID, batch), by ="svamp_ID")qc_df <- long_counts %>%group_by(svamp_ID, batch) %>%summarise(total_counts =sum(counts),mito_counts =sum(counts[is_mito]),percent_mito = mito_counts / total_counts *100,.groups ="drop" )ggplot(qc_df, aes(x = batch, y = total_counts, fill = batch)) +geom_violin(trim =FALSE) +geom_boxplot(width =0.1, outlier.shape =NA) +geom_jitter(width =0.25, size =1) +theme_minimal() +labs(title ="Total counts per sample by batch",x ="Batch",y ="Total counts" )
ggplot(qc_df, aes(x = batch, y = percent_mito, fill = batch)) +geom_violin(trim =FALSE) +geom_boxplot(width =0.1, outlier.shape =NA) +geom_jitter(width =0.25) +theme_minimal() +labs(title ="Mitochondrial percentage per sample by batch",x ="Batch",y ="Percent mitochondrial reads" )
Warning: Removed 4 rows containing missing values or values outside the scale range
(`geom_point()`).
plot_pca(cpm_log_TMM[, samples[-1]], "before")
Warning: Removed 4 rows containing missing values or values outside the scale range
(`geom_point()`).
Save batch corrected and filtered and normalized expression values
######################### SAVE NORMALIZED DATA #########################expr_corrected %>%as_tibble(., rownames ="symbol") %>%write_csv(., "../Results/MixOmic/Transcriptomics_batch_filt.csv")# norm <- read_csv("../Results/MixOmic/Transcriptomics_Normalized_filt.csv")
Step 7: Differential Gene Expression analysis
################### DEG's FUNCTION #################### df <- meta_trx# sample_df <- gr_data# group <- "Clin_gr"edgR_dge.fun <-function(CountMatrix, gr, sample_df, confound =NULL) {#groups <- sample_df[, group] groups <-pull(sample_df, gr) n_gr <-length(na.omit(unique(groups)))if (any(is.na(groups))) { groups <- tidyr::replace_na(groups, "X") }if (!is.null(confound)) { c <-"_conf" design <- design } else { c <-"_no_conf" design <-model.matrix(~groups, data = sample_df) # without confounders }colnames(design) <-sub(gr, "", colnames(design))colnames(design) <-sub("\\(Intercept\\)", "Intercept", colnames(design))colnames(design) <-sub(" ", "_", colnames(design)) y <-DGEList(counts = CountMatrix, remove.zeros = T) y <-calcNormFactors(y, method ="TMM") y <-estimateGLMCommonDisp(y, design) y <-estimateGLMTagwiseDisp(y, design) fit <-glmFit(y, design) lrt <-glmLRT(fit, coef =2:n_gr) # with intercept top <-topTags(lrt, adjust.method ="BH", n ="all", sort.by ="p.value")[[1]]colnames(top) <-sub(gr, "", colnames(top)) top <-cbind(LogFC.intercept =0, top) # with intercept top <-rownames_to_column(top, var ="Genes")return(list(design = design, y = y, fit = fit, lrt = lrt, top = top, c = c))}
# nesscary only if you want to add confounders# design <- model.matrix(~ Clin_gr + `Age (years)`, data = gr_data)dge <-column_to_rownames(countTable_final, var ="symbol") %>%edgR_dge.fun(., "RVVC_AS", gr_data, confound =NULL)
################ GET MATRIX ################# get DEGs for clin score 4-5# get filter genes from normalized matrix# logFC.groupsRVVCdeg_matrix <- cpm_log_TMM[DEG_list[[names(D)[2]]]$Genes, gr_data$svamp_ID]# use batch corrected expression values for heatmap?# deg_matrix <- expr_corrected[DEG_list[[names(D)[2]]]$Genes, gr_data$svamp_ID]
# z-scores# to scale the expression of our genes, we have to# first transpose our matrix, then scale, then transpose it back:zscore <-t(apply(deg_matrix, 1, function(i) {scale(i, center = T, scale = T)}))colnames(zscore) <-colnames(deg_matrix)# tidyverse way:# zscore <- cpm_log_TMM %>%# as_tibble(rownames = "symbol") %>%# rowwise() %>%# mutate(across(-symbol, ~ as.numeric(scale(.x, center = TRUE, scale = TRUE)))) %>%# ungroup() %>%# column_to_rownames("symbol")
################# PLOT HEATMAP ################## colour mappingbreaks.fun <-function(min, max, n) { x <-seq(min, max, length.out = n) x[which(x ==median(x))] <-0round(x)return(x)}# to better know what values to use for the legend# we plot the distribution of values# Convert to a data frame for ggplotdf <-data.frame(value =as.vector(zscore))# Basic ggplot histogramp <-ggplot(df, aes(x = value)) +geom_histogram(bins =30, fill ="skyblue", color ="white") +#scale_x_continuous(breaks = scales::breaks_pretty(n = 20)) +scale_y_continuous(labels =label_comma()) +# <-- commaslabs(title ="Distribution of Matrix Values",x ="Matrix Values",y ="Count" ) +theme_minimal(base_size =14) #+#theme(axis.text.x = element_text(angle = 45, hjust = 1))# plot to better see the outlier valuesp2 <- p +ylim(0, 5000)
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
plot_grid(p, p2, ncol =2)
Warning: Removed 11 rows containing missing values or values outside the scale range
(`geom_bar()`).
Figure 3: Histogram showing the distribution of z-score normalized matrix values
# set limits for colourpalletRdBu =c("#364B9A","#4A7BB7","#6EA6CD","#98CAE1","#C2E4EF","white","#FEDA8B","#FDB366","#F67E4B","#DD3D2D","#A50026")# using min and max from the zscore matrix can work, but is usualy influenced too much# from the min and max values which tend to be outlierscol <-colorRamp2(breaks.fun(min(zscore), max(zscore), 7),colorRampPalette(c(RdBu))(7))# instead we use the historgram to determine the min and max values of the scale# values larger than 5 are all mapped to dark red and values less than -5 are all mapped to dark blue.col <-colorRamp2(breaks.fun(-5, 5, 7), colorRampPalette(c(RdBu))(7))
based on these histograms we can set the scale max and min breaks at 5
`use_raster` is automatically set to TRUE for a matrix with more than
2000 rows. You can control `use_raster` argument by explicitly setting
TRUE/FALSE to it.
Set `ht_opt$message = FALSE` to turn off this message.
'magick' package is suggested to install to give better rasterization.
Set `ht_opt$message = FALSE` to turn off this message.
H
Figure 5: Hierarchial clustering of differentially expressed genes. Heatmap of all 7,983 DEGs (FDR-adjusted p value <0.05) between the Clinical score values 5 and 4 (n = 14) and Clinical score values 0 and 1 CTRL (n = 108) group. Each study participant is represented by a vertical column and each gene is represented by a horizontal row. The expression of each gene is standardized (z) to a mean of 0 and a standard deviation of 1.
Warning: Removed 50 rows containing missing values or values outside the scale range
(`geom_text()`).
Source Code
---title: "Transcriptomics Normalization and Exploration"date: todaydate-format: "D MMM YYYY"format: html: embed-resources: true toc: true toc-depth: 2 highlight-style: github code-tools: source: true toggle: true caption: Code---## Overview: #### Quality Control and Normalization0. **Sample Information** 1. **Metadata and group assignment** 2. **Normalize counts with TMM**3. **Gene-filtering** Identify treshold for low abundant genes to filter4. **Compare re-sequenced samples**5. **Differential Gene Expression analysis**6. **Exploratory Heatmap**7. **UMAP on signifcant genes**### Load Libraries```{r Libraries}#| code-fold: true#| output: false################### LOAD LIBRARIES ###################library(tidyverse)# BiocManager::install("limma")# BiocManager::install("edgeR")# BiocManager::install("ComplexHeatmap")library(limma)library(edgeR)library(scales)library(ComplexHeatmap)library(circlize)library(mixOmics)library(readxl)library(cowplot)library(RColorBrewer)select <- dplyr::selectmap <- purrr::map# setwd("/Users/vilkal/work/Brolidens_work/Projects/Candida-omics/src")```## Sample InformationThere are only two biopsies collected in the longitudinal study — at **visit 02** and **visit 04**, corresponding to **baseline** and the **3-month follow-up**.#### Longitudinal Study Setup#### Visits:- **01** – Inclusion - **02** – Secretion + biopsy (**baseline**) - **03** – Secretion only (**1 week post-treatment**) - **04** – Secretion + biopsy (**3 months post-treatment**) - **05** – Secretion only (**6 months post-treatment**)#### Sequencing RunsThe samples have been run in two sequencing batches:- **Run 1:** `BEA23P074_23`- **Run 2:** `BEA24P003_28`#### Sample SummaryBased on the files delivered from BEA, I believe we have sequenced: **81 unique samples**, of which **73 are baseline** and **8 are 3-month follow-up**.NB! The baseline sample of **(S69)** has `12:01` (DS_ID) listed as baseline instead of `12:02`.This was a mistake, but we decided to keep it. #### Potential Control SamplesThere are six samples that might be **controls**, but this is uncertain. I could not find any documentation explaining their naming:FF060322243, FF060322244, 231025:1, 230918., 231108:1, 231108:1```{r Load-data}#| code-fold: true#| output: false############## LOAD DATA ############### meta_path <- "/Users/vilkal/Downloads/Metadata_svamp.xlsx"# meta_data <- read_xlsx(meta_path, sheet = "Metadata", skip = 1)meta_path <-"/Users/vilkal/work/Brolidens_work/Projects/Candida-omics/data/Metadata_svamp_FINAL.xlsx"meta_data <-read_xlsx(meta_path, sheet ="Metadata", na ="na")trx_path <-"../Tidy_data/Transcriptomics/Counts_all_trx_samples.csv"raw_matrix <-read_csv(trx_path)mapping_path <-"/Users/vilkal/work/Brolidens_work/Projects/Candida-omics/Tidy_data/Transcriptomics/Mapping_all_trx_samples.csv"trx_mapping <-read_csv(mapping_path)```## Step 1: Metadata and group assignment```{r group-assignment}g <-c("0"="0","1"="1-3","2"="1-3","3"="1-3","4"="4-5","5"="4-5")trx_mapping <- trx_mapping %>%select(svamp_ID, batch = BEA_ID) %>%mutate(batch =str_replace(batch, "_.+", ""))gr_data <- meta_data %>%rename(`Symptom score (0-5)`="Symptom score (0-5) - ibland har de svarat olika skriftligt i frågeformuläret och muntligt vid inklusiion, då har jag valt den högsta scoren" ) %>%select( svamp_ID,`Clinical score (0-5)`,`Clinical score: hyphae (y/n)`,`Fungal culture: C. albicans (y/n)`,`Fungal culture: Non-albicans candida spp. (y/n)`,`Symptom score (0-5)`,`Recurring fungal infections > 2/year (y/n)` ) %>%filter(!(is.na(svamp_ID))) %>%left_join(., trx_mapping, by ="svamp_ID") %>%mutate(group_cross =case_when(`Fungal culture: C. albicans (y/n)`=="1"&`Symptom score (0-5)`>=1&`Recurring fungal infections > 2/year (y/n)`=="1"~"RVVCpos",`Fungal culture: C. albicans (y/n)`=="0"&`Recurring fungal infections > 2/year (y/n)`=="1"~"RVVCneg",`Fungal culture: C. albicans (y/n)`=="1"&`Symptom score (0-5)`==0&`Recurring fungal infections > 2/year (y/n)`=="0"~"AS",`Fungal culture: C. albicans (y/n)`=="0"&`Recurring fungal infections > 2/year (y/n)`=="0"~"Control",TRUE~NA_character_ ) ) %>%mutate(group =case_when(`Recurring fungal infections > 2/year (y/n)`=="1"~"RVVC",grepl("_\\d\\w", svamp_ID) ~"RVVC",`Recurring fungal infections > 2/year (y/n)`=="0"~"Control",`Fungal culture: C. albicans (y/n)`=="1"&`Recurring fungal infections > 2/year (y/n)`=="0"~"Candidapos",TRUE~NA_character_ ) ) %>%mutate(pos =case_when(`Fungal culture: C. albicans (y/n)`=="1"|`Fungal culture: Non-albicans candida spp. (y/n)`==1~"pos",TRUE~"neg" ),.after ="svamp_ID" ) %>%mutate(hyphae =case_when(`Clinical score: hyphae (y/n)`=="1"~paste0(group, "_H"),TRUE~ group ),RVVC_AS =case_when( group =="RVVC"&`Clinical score (0-5)`=="0"~paste0(group, "_AS"),TRUE~ group ),Follow_up =case_when(grepl("_\\d\\w", svamp_ID) ~"Follow_up",TRUE~"Baseline" ),.after ="group" ) %>%mutate(Clin_gr = g[as.character(.$`Clinical score (0-5)`)],.after ="svamp_ID" ) %>%select("svamp_ID", Clin_gr, group, hyphae, group_cross, RVVC_AS, Follow_up,everything() )```### Check the distribution of samples per groups```{r group-distribution}vars <-c("Clinical score (0-5)","Clin_gr","group","hyphae","RVVC_AS","group_cross") %>%set_names()gr_data_filt <- gr_data %>%filter(gr_data$svamp_ID %in%colnames(raw_matrix))map(vars, ~table(gr_data_filt[[.x]]))```### Get count table```{r get-count-table}# remove quality controll samples# assuming that the NA_1-6 samples are quality ctrls,# we remove them before normalizing and filteringcountTable <- raw_matrix %>%select(-starts_with("NA")) # %>%#select(-ends_with("_run2"))# Create matrix countTablecountTable <- countTable %>%select(-c(1:6), -entrez) %>%# remove metadata columnsfilter(!is.na(symbol)) %>%# remove rows without a symbolfilter(if_any(where(is.numeric), ~ . !=0)) %>%# remove rows that are all zerossummarise(across(where(is.numeric), \(x) sum(x, na.rm =TRUE)),.by ="symbol" )``````{r number-of-samples}# number of uniqe samples excluding "_run2" samplessamples <-select(countTable, where(is.numeric) &!(matches("_run2$"))) %>%colnames()length(samples)```## Step 2: Normalize counts with TMM ```{r Normalized-data}#| eval: true# get normalized matrix# Create DGElist object to relate counts to groupsdge <- countTable %>%column_to_rownames(., var ="symbol") %>%DGEList(.)# Normalize counts with TMMdge_TMM <-calcNormFactors(dge, method ="TMM")```## Step 3: Gene-filtering### Alternative 1:We calculate the mean log2 CPM of each gene to determine what filtering cuttof to use. By looking at the histogram of the mean log2 CPM values you can determine what log2 CPM value would be suitable to use. You want to see a normal distribution of the mean log2 values. In this dataset vi see that there are a higer frequency of genes around log2CPM between 1 and 2 than what you would expect from the normal distribution. ```{r count-distribution-plot}#| fig-cap: "Figure 1: Histogram of mean log2 CPM expression per gene. The vertical gray dashed line marks 1.5 log2 cpm"# Identify low count cut-offmeanLog2CPM <-rowMeans(log2(cpm(countTable[, -1]) +1))# hist(meanLog2CPM)ggplot() +geom_histogram(aes(meanLog2CPM), fill ="red", alpha =0.7, bins =50) +theme_minimal() +scale_y_continuous(labels =label_comma()) +# <-- commasgeom_vline(xintercept =1.5, color ="gray", linetype ="dashed") +ggtitle("Distribution of Mean log2 CPM")# number of genes that will be removed:s <-sum(meanLog2CPM <=1.5)# knitr::kable(., digits = 1)# Alternative 1.genes_keep_1 <- countTable %>%filter(rowMeans(across(any_of(samples))) >1.5) %>%pluck(., "symbol")countTable %>%mutate(meanLog2CPM =rowMeans(across(any_of(samples))), .after ="symbol") %>%mutate(cumulative_sum =rowSums(across(any_of(samples))),.after ="symbol" ) %>%filter(meanLog2CPM <1.5) %>%arrange(desc(cumulative_sum)) %>%select(1:3)```A cuttof at 1.5 seem to be appropriate and would result in the removal of `{r} s` genes. ### Alternative 2:Another aproach is to filter genes by setting a minimum treshold of reads/counts per million in x number of samples e.g > 1 CPM in three samples > 5 CPM in three samples > 1 read in 5 samplesKeep genes where CPM > 1 in more than 3 samples```{r identify-genes-to-remove}# Alternative 2.# get counts per mullion (CPM) not logged, for filtering:cpm_TMM <-cpm(dge_TMM, log =FALSE)genes_keep_2 <- cpm_TMM %>%as_tibble(., rownames ="symbol") %>%filter(rowSums(across(all_of(samples), ~ .x >=1)) >=3) %>%pluck(., "symbol")# look at top filtered genescpm_TMM %>%as_tibble(., rownames ="symbol") %>%filter(!(symbol %in% genes_keep_2)) %>%mutate(n_over_1cpm =rowSums(across(all_of(samples), ~ .x >=1)),.after ="symbol" ) %>%mutate(sum_cpm =rowSums(across(all_of(samples))), .after ="symbol") %>%filter(n_over_1cpm <3) %>%arrange(desc(n_over_1cpm)) %>%select(1:3)length(genes_keep_1)length(genes_keep_2)length(intersect(genes_keep_1, genes_keep_2))```Alternative one is less conservative presarving more lowly expressed genes. Alternative 2 is more sensitive. We go for alternative 2 as more than 16 000 genes is more than enough.```{r filter-genes}# Step 3. filter low abundant genes# DEGlist objectdge_TMM_filt <- dge_TMM[genes_keep_2, ]# count tablecountTable_filt <- countTable %>%filter(symbol %in% genes_keep_2)```## Step 4: Save filtered and normalized counts```{r save-mormalized-data}######################### SAVE NORMALIZED DATA #########################cpm_log_TMM <-cpm(dge_TMM_filt, log = T) # for heatmapcpm_log_TMM %>%as_tibble(., rownames ="symbol") %>%# remove samples sequenced twiceselect(symbol, all_of(samples)) %>%write_csv(., "../Results/MixOmic/Transcriptomics_Normalized_filt.csv")# norm <- read_csv("../Results/MixOmic/Transcriptomics_Normalized_filt.csv")# cpm(dge_TMM, log = T) %>%# as_tibble(., rownames = "symbol") %>%# # remove samples sequenced twice# select(symbol, all_of(samples)) %>%# write_csv(., "../Results/MixOmic/Transcriptomics_Norm_NO_FILT.csv")# norm <- read_csv("../Results/MixOmic/Transcriptomics_Normalized.csv")```## Step 5: compare re-sequenced samples```{r compare-re-sequenced-samples}#| fig-width: 10#| fig-height: 8#| fig-cap: "Figure 3: Comparison of technical replicates across sequencing runs. Scatterplots show log₂-transformed CPM values for the same biological samples sequenced in two independent sequencing runs. Each point represents one gene. The dashed red line indicates the 1:1 relationship expected for perfect agreement between runs. Overall, the replicate samples display high concordance, confirming that technical variation between sequencing runs is minimal relative to biological signal. Pearson and Spearman correlation coefficients were calculated for each sample pair and are summarized in the accompanying table."#| code-fold: false#| eval: true############################## PLOT RE-SEQUENCED SAMPLES ##############################n <-names(select(countTable, contains("_run2")))samp <-str_remove(n, "_run2")n <-c(n, samp) # samples sequenced twice# Create a tidy summary comparing all pairspair_summary <-map_dfr(samp, function(samp) { run1 <- cpm_log_TMM[, samp, drop =TRUE] run2 <- cpm_log_TMM[, paste0(samp, "_run2"), drop =TRUE]tibble(sample = samp,pearson =cor(run1, run2, method ="pearson"),spearman =cor(run1, run2, method ="spearman"),mean_diff =mean(run1 - run2),sd_diff =sd(run1 - run2) )})knitr::kable(pair_summary)# generate one plot per pairair_plots <-map(samp, function(samp) { df <-tibble(gene =rownames(cpm_log_TMM),run1 = cpm_log_TMM[, samp],run2 = cpm_log_TMM[, paste0(samp, "_run2")] )ggplot(df, aes(x = run1, y = run2)) +geom_point(alpha =0.3) +geom_abline(slope =1, intercept =0, linetype ="dashed", color ="red") +labs(title =paste("Comparison:", samp),x ="Run1 log2-CPM",y ="Run2 log2-CPM" ) +theme_minimal()}) %>%set_names(samp)cowplot::plot_grid(plotlist = air_plots, ncol =2)``````{r}# Convert counts to long formatlong_counts <- countTable_filt %>%pivot_longer(cols =-symbol,names_to ="svamp_ID",values_to ="counts" ) %>%mutate(is_mito =grepl("^MT", symbol)) %>%left_join(trx_mapping %>%select(svamp_ID, batch), by ="svamp_ID")qc_df <- long_counts %>%group_by(svamp_ID, batch) %>%summarise(total_counts =sum(counts),mito_counts =sum(counts[is_mito]),percent_mito = mito_counts / total_counts *100,.groups ="drop" )ggplot(qc_df, aes(x = batch, y = total_counts, fill = batch)) +geom_violin(trim =FALSE) +geom_boxplot(width =0.1, outlier.shape =NA) +geom_jitter(width =0.25, size =1) +theme_minimal() +labs(title ="Total counts per sample by batch",x ="Batch",y ="Total counts" )ggplot(qc_df, aes(x = batch, y = percent_mito, fill = batch)) +geom_violin(trim =FALSE) +geom_boxplot(width =0.1, outlier.shape =NA) +geom_jitter(width =0.25) +theme_minimal() +labs(title ="Mitochondrial percentage per sample by batch",x ="Batch",y ="Percent mitochondrial reads" )```### Remove ctrl and second run samples```{r filter-samples}countTable_final <- countTable_filt %>%# remove samples sequenced twiceselect(-ends_with("_run2")) %>%# remove controll samplesselect(-starts_with("NA"))``````{r batch-QC}batch_smpl <- countTable_filt[, c("symbol", n), drop =TRUE]# Convert to long formatlong_counts <- batch_smpl %>%pivot_longer(cols =-symbol,names_to ="sample",values_to ="counts" ) %>%mutate(run =ifelse(grepl("_run2$", sample), "run2", "run1"),bio_sample =gsub("_run2$", "", sample),is_mito =grepl("^MT", symbol) )qc_df <- long_counts %>%group_by(bio_sample, run) %>%summarise(total_counts =sum(counts),mito_counts =sum(counts[is_mito]),percent_mito = mito_counts / total_counts *100,.groups ="drop" )ggplot(qc_df, aes(x = run, y = total_counts, group = bio_sample)) +geom_line(alpha =0.6) +geom_point(size =3) +theme_minimal() +labs(title ="Paired total counts: run1 vs run2",x ="Sequencing run",y ="Total counts" )ggplot(qc_df, aes(x = run, y = percent_mito, group = bio_sample)) +geom_line(alpha =0.6) +geom_point(size =3) +theme_minimal() +labs(title ="Paired mitochondrial percentage: run1 vs run2",x ="Sequencing run",y ="Percent mitochondrial reads" )```## Step 6: Batch correction```{r performe-batch-correction}# Check consitency between group info and samplessamples <-colnames(countTable_final)intersect(samples, gr_data$svamp_ID)setdiff(samples, gr_data$svamp_ID)gr <-c("0", "1-3", "4-5")gr_data <- gr_data %>%filter(svamp_ID %in% samples) %>%mutate(Clin_gr =factor(.$"Clin_gr", levels = gr)) %>%mutate(`Clinical score (0-5)`=factor(as.character(.$`Clinical score (0-5)`),levels =unique(.$`Clinical score (0-5)`) ) )# change missing value NA to "NA"gr_data_ <- gr_data %>%mutate(RVVC_AS = tidyr::replace_na(RVVC_AS, "NA"))design <-model.matrix(~RVVC_AS, data = gr_data_)rownames(design) <- gr_data$svamp_IDbatch <- gr_data$batch# remove batch effectexpr_corrected <-removeBatchEffect( cpm_log_TMM[, samples[-1]],batch = batch,design = design)``````{r plot-batch-correction}plot_pca <-function(X, title) { pca <-prcomp(t(X), scale. =TRUE) pca_df <-data.frame(PC1 = pca$x[, 1],PC2 = pca$x[, 2],batch = gr_data[["batch"]],condition = gr_data[["RVVC_AS"]] )ggplot(pca_df, aes(x = PC1, y = PC2, color = batch, shape = condition)) +geom_point(size =3) +theme_minimal() +labs(title =paste0("PCA ", title, " batch correction"))}plot_pca(expr_corrected, "after")plot_pca(cpm_log_TMM[, samples[-1]], "before")```### Save batch corrected and filtered and normalized expression values```{r save-mormalized-batch-corrected-data}######################### SAVE NORMALIZED DATA #########################expr_corrected %>%as_tibble(., rownames ="symbol") %>%write_csv(., "../Results/MixOmic/Transcriptomics_batch_filt.csv")# norm <- read_csv("../Results/MixOmic/Transcriptomics_Normalized_filt.csv")```## Step 7: Differential Gene Expression analysis```{r functions}################### DEG's FUNCTION #################### df <- meta_trx# sample_df <- gr_data# group <- "Clin_gr"edgR_dge.fun <-function(CountMatrix, gr, sample_df, confound =NULL) {#groups <- sample_df[, group] groups <-pull(sample_df, gr) n_gr <-length(na.omit(unique(groups)))if (any(is.na(groups))) { groups <- tidyr::replace_na(groups, "X") }if (!is.null(confound)) { c <-"_conf" design <- design } else { c <-"_no_conf" design <-model.matrix(~groups, data = sample_df) # without confounders }colnames(design) <-sub(gr, "", colnames(design))colnames(design) <-sub("\\(Intercept\\)", "Intercept", colnames(design))colnames(design) <-sub(" ", "_", colnames(design)) y <-DGEList(counts = CountMatrix, remove.zeros = T) y <-calcNormFactors(y, method ="TMM") y <-estimateGLMCommonDisp(y, design) y <-estimateGLMTagwiseDisp(y, design) fit <-glmFit(y, design) lrt <-glmLRT(fit, coef =2:n_gr) # with intercept top <-topTags(lrt, adjust.method ="BH", n ="all", sort.by ="p.value")[[1]]colnames(top) <-sub(gr, "", colnames(top)) top <-cbind(LogFC.intercept =0, top) # with intercept top <-rownames_to_column(top, var ="Genes")return(list(design = design, y = y, fit = fit, lrt = lrt, top = top, c = c))}``````{r dge-analysis}# nesscary only if you want to add confounders# design <- model.matrix(~ Clin_gr + `Age (years)`, data = gr_data)dge <-column_to_rownames(countTable_final, var ="symbol") %>%edgR_dge.fun(., "RVVC_AS", gr_data, confound =NULL)``````{r top-DEG}# get DEGs for clin score 4-5fc_cols <-sort(grep("logFC.groups", colnames(dge$top), value =TRUE))DEG_list <-map(set_names(fc_cols),~ { fc_col <- .x # current column being processed dge$top %>%as_tibble() %>%mutate(Regulation =case_when(!!sym(fc_col) >0& FDR <0.05~"UP",!!sym(fc_col) <0& FDR <0.05~"DOWN",TRUE~"NOT SIG." ) ) %>%filter(Regulation %in%c("UP", "DOWN")) %>% dplyr::select( Genes, {{ fc_col }}, logCPM, LR, PValue, FDR, Regulation ) %>%group_by(Regulation) %>%# slice_max(order_by = abs(!!sym(fc_col)), n = 10) %>%ungroup() %>%arrange(!!sym(fc_col)) })D <- DEG_list %>%imap(., ~group_by(.x, Regulation)) %>%imap(., ~slice_max(.x, order_by =abs(!!sym(.y)), n =10)) %>%imap(., ~arrange(.x, !!sym(.y)))names(D)D[[2]] %>% knitr::kable(., digits =1)```## Step 8: Exploratory Heatmap```{r get-matrix}################ GET MATRIX ################# get DEGs for clin score 4-5# get filter genes from normalized matrix# logFC.groupsRVVCdeg_matrix <- cpm_log_TMM[DEG_list[[names(D)[2]]]$Genes, gr_data$svamp_ID]# use batch corrected expression values for heatmap?# deg_matrix <- expr_corrected[DEG_list[[names(D)[2]]]$Genes, gr_data$svamp_ID]``````{r z-scores}# z-scores# to scale the expression of our genes, we have to# first transpose our matrix, then scale, then transpose it back:zscore <-t(apply(deg_matrix, 1, function(i) {scale(i, center = T, scale = T)}))colnames(zscore) <-colnames(deg_matrix)# tidyverse way:# zscore <- cpm_log_TMM %>%# as_tibble(rownames = "symbol") %>%# rowwise() %>%# mutate(across(-symbol, ~ as.numeric(scale(.x, center = TRUE, scale = TRUE)))) %>%# ungroup() %>%# column_to_rownames("symbol")``````{r heatmap-annotation}############### ANNOTATION ################ colour palletsblue <-brewer.pal(6, name ="Blues") %>%c(., "gray")Red <-brewer.pal(6, name ="Reds") #%>% c(., "gray")pal <-c("#f26386", "#fd814e", "#a4d984", "#fbbc52") # "#f588af", "#fd814e"pal2 <-c("#2266ac", "#91c5de")#Red <- brewer.pal(6, name = "Reds") #%>% c(., "gray")pal1 <-c("#ffa998", "#f17730ff", "#902267") #pal3 <-c("#9e8dec", "#cc93cf", "#a5c97b", "#de9548") #"#abcb4b" "#63d3b4" "#902267",# group informationannot_col <- gr_data %>%# select(-`Age (years)`) %>%mutate(across(2:7, ~factor(.x)))# check samples numberdim(zscore)dim(annot_col)# top annotation object:top_annot <-columnAnnotation(bin_Clin = annot_col$Clin_gr, # binned clinical gradeClin = annot_col$`Clinical score (0-5)`, # raw clinical scoreGroup_cross = annot_col$group_cross, # sample groupRVVC = annot_col$group, # sample grouphyphae = annot_col$hyphae, # sample groupRVVC_AS = annot_col$RVVC_AS, # sample group#show_legend = FALSE,show_annotation_name = T,annotation_name_gp =gpar(fontsize =8),annotation_legend_param =list(grid_height =unit(.1, "mm"),grid_width =unit(2, "mm"),title ="",labels_gp =gpar(fontsize =7),title_gp =gpar(fontsize =8) ),simple_anno_size =unit(.3, "cm"),#gap = unit(1, "cm"),col =list(Clin =set_names(Red, levels(annot_col$`Clinical score (0-5)`)),bin_Clin =set_names(pal1, levels(annot_col$Clin_gr)),Group_cross =set_names(pal3, levels(annot_col$group_cross)),RVVC =set_names(pal2, levels(annot_col$group)),RVVC_AV =set_names(pal1, levels(annot_col$RVVC_AS)),hyphae =set_names(pal, levels(annot_col$hyphae)) ))``````{r heatmap-color-scale}#| fig-cap: "Figure 3: Histogram showing the distribution of z-score normalized matrix values"#| fig-width: 10#| fig-height: 5################# PLOT HEATMAP ################## colour mappingbreaks.fun <-function(min, max, n) { x <-seq(min, max, length.out = n) x[which(x ==median(x))] <-0round(x)return(x)}# to better know what values to use for the legend# we plot the distribution of values# Convert to a data frame for ggplotdf <-data.frame(value =as.vector(zscore))# Basic ggplot histogramp <-ggplot(df, aes(x = value)) +geom_histogram(bins =30, fill ="skyblue", color ="white") +#scale_x_continuous(breaks = scales::breaks_pretty(n = 20)) +scale_y_continuous(labels =label_comma()) +# <-- commaslabs(title ="Distribution of Matrix Values",x ="Matrix Values",y ="Count" ) +theme_minimal(base_size =14) #+#theme(axis.text.x = element_text(angle = 45, hjust = 1))# plot to better see the outlier valuesp2 <- p +ylim(0, 5000)plot_grid(p, p2, ncol =2)# set limits for colourpalletRdBu =c("#364B9A","#4A7BB7","#6EA6CD","#98CAE1","#C2E4EF","white","#FEDA8B","#FDB366","#F67E4B","#DD3D2D","#A50026")# using min and max from the zscore matrix can work, but is usualy influenced too much# from the min and max values which tend to be outlierscol <-colorRamp2(breaks.fun(min(zscore), max(zscore), 7),colorRampPalette(c(RdBu))(7))# instead we use the historgram to determine the min and max values of the scale# values larger than 5 are all mapped to dark red and values less than -5 are all mapped to dark blue.col <-colorRamp2(breaks.fun(-5, 5, 7), colorRampPalette(c(RdBu))(7))```based on these histograms we can set the scale max and min breaks at 5```{r plot-heatmap}#| fig-width: 10#| fig-height: 7#| fig-cap: "Figure 5: Hierarchial clustering of differentially expressed genes. Heatmap of all 7,983 DEGs (FDR-adjusted p value <0.05) between the Clinical score values 5 and 4 (n = 14) and Clinical score values 0 and 1 CTRL (n = 108) group. Each study participant is represented by a vertical column and each gene is represented by a horizontal row. The expression of each gene is standardized (z) to a mean of 0 and a standard deviation of 1."################# PLOT HEATMAP ################## Heatmap global options:ht_opt$COLUMN_ANNO_PADDING =unit(.05, "cm")ht_opt$HEATMAP_LEGEND_PADDING =unit(-0, "cm") #unit(c(0, 0, 0, -1), "cm") #b,l,t,rht_opt$TITLE_PADDING =unit(.05, "cm")ht_opt$DIMNAME_PADDING =unit(.05, "cm")# average expression:H <-Heatmap( zscore,col = col,cluster_columns =TRUE,show_row_dend =FALSE,name ="Gene \nExpression (z)",show_row_names =FALSE,show_column_names =TRUE,column_names_gp = grid::gpar(fontsize =8),# annotation# right_annotation = right_anno_row, left_annotation = left_anno_row,top_annotation = top_annot)Hpdf(file ="./Heatmap_gene_expression.pdf", width =10, height =7)Hdev.off()```## Step 9: UMAP on signifcant genes```{r UMAP}#### UMAP analysis ####set.seed(1)umap <- uwot::umap(t(deg_matrix),n_neighbors =15,init ="spectral",scale = T)umap_df <-tibble("UMAP 1"= umap[, 1],"UMAP 2"= umap[, 2],"ID"=rownames(umap)) %>%left_join(., annot_col, by =c("ID"="svamp_ID"))#### UMAP plotting by groups ####col <-c('#FB5273', "#6B51A3", "#9D9AC8", '#4FCEEF', "#e68633")col <-c('#FB5273', "#6B51A3", "#9D9AC8", '#4FCEEF', "#e68633", "#9e8dec")pal3 <-c("#9e8dec", "#cc93cf", "#a5c97b", "#de9548")txt_df <- umap_df %>%select(1:11) %>%mutate(txt_clin =ifelse(grepl("0", umap_df$`Clinical score (0-5)`),NA,as.character(.$`Clinical score (0-5)`) ) ) %>%mutate(txt_id =ifelse(group =="RVVC", .$ID, NA))# save plot to filedev.new(height =16.11, width =16.11, units ="cm")#ggsave(paste0("./Figures/09/", "Bulk_UMAP.png"), p)UMAP_fun <-function(group_var, txt_var) { p <-ggplot( umap_df,aes(x =`UMAP 1`,y =`UMAP 2`,fill = .data[[group_var]],shape = Follow_up,color = .data[[group_var]] ) ) +#geom_jitter( shape=21, size=3, color="white", width=.5, height=.5) + # Tassos used jittergeom_point(size =3, alpha =1) +scale_colour_manual(values = pal1,name ="",aesthetics =c("colour", "fill") ) +scale_shape_manual(values =c(21, 5), name ="", ) +#specify individual shapes+geom_text(data = txt_df,aes(x =`UMAP 1`, y =`UMAP 2`, label = .data[[txt_var]]),size =3,hjust =0,nudge_x =0.07,color ="gray51" ) +theme_bw(base_size =14) +theme(line =element_blank(),axis.ticks =element_line(color ="black"),aspect.ratio =1,axis.title =element_blank() )return(p)}``````{r plot-UMAP}#| fig-width: 10#| fig-height: 7pal1 <-c("#ffa998", "#902267", '#FB5273')UMAP_fun("RVVC_AS", "txt_id")UMAP_fun("RVVC_AS", "txt_clin")pal1 <-c("#ffa998", "#902267", '#FB5273', '#4FCEEF')UMAP_fun("hyphae", "txt_clin")pal1 <-c("#f26386", "#fd814e", "#a4d984", "#fbbc52")UMAP_fun("group_cross", "txt_clin")```