UMCUGenetics · mraves2 · Feb 9, 2026 · Feb 9, 2026 · Feb 9, 2026 · Feb 9, 2026
diff --git a/DIMS/MakeInit.R b/DIMS/MakeInit.R
diff --git a/DIMS/MakeInit.nf b/DIMS/MakeInit.nf
diff --git a/DIMS/ParseSamplesheet.R b/DIMS/ParseSamplesheet.R
@@ -0,0 +1,19 @@
+# define parameters: <sample sheet (tab-separated)> <directory with preprocessing function scripts>
+args <- commandArgs(trailingOnly = TRUE)
+stopifnot("expected 2 arguments: sample sheet, preprocessing scripts dir" = length(args) >= 2)
+sample_sheet <- read.csv(args[1], sep = "\t")
+preprocessing_scripts_dir <- args[2]
+
+# load in function script (file.path works with or without a trailing slash on the directory)
+source(file.path(preprocessing_scripts_dir, "parse_samplesheet_functions.R"))
+
+# generate the replication pattern
+repl_pattern <- generate_repl_pattern(sample_sheet)
+
+# write the replication pattern to text file for troubleshooting purposes
+sink("replication_pattern.txt")
+print(repl_pattern)
+sink()
+
+# save replication pattern to file
+save(repl_pattern, file = "init.RData")
diff --git a/DIMS/ParseSamplesheet.nf b/DIMS/ParseSamplesheet.nf
@@ -0,0 +1,17 @@
+process ParseSamplesheet {
+    tag "DIMS ParseSamplesheet"
+    label 'ParseSamplesheet'
+    container = 'docker://umcugenbioinf/dims:1.3'
+    // Runs ParseSamplesheet.R on the sample sheet to derive the replication pattern.
+    input:
+       path(samplesheet) 
+
+    output:
+       path('init.RData'), emit: rdata_file  // repl_pattern object saved by ParseSamplesheet.R
+       path('replication_pattern.txt')  // printed copy of repl_pattern, for troubleshooting
+
+    script:
+        """
+        Rscript ${baseDir}/CustomModules/DIMS/ParseSamplesheet.R $samplesheet $params.preprocessing_scripts_dir
+        """
+}
diff --git a/DIMS/preprocessing/parse_samplesheet_functions.R b/DIMS/preprocessing/parse_samplesheet_functions.R
@@ -0,0 +1,30 @@
+# function for parse_samplesheet
+
+#' Generate replication pattern list based on information in sample_sheet
+#'
+#' @param sample_sheet: data frame with one file name and one sample name column
+#'
+#' @return repl_pattern: named list with for each cleaned sample name (sorted) the
+#'   corresponding file names (technical replicates)
+generate_repl_pattern <- function(sample_sheet) {
+  # get the file name and sample name columns from the samplesheet; "." is accepted as
+  # separator too, because read.csv() turns a header like "File Name" into "File.Name"
+  file_name_col <- grep("File[_ .]Name", colnames(sample_sheet), ignore.case = TRUE)
+  sample_name_col <- grep("Sample[_ .]Name", colnames(sample_sheet), ignore.case = TRUE)
+  # [[ ]] needs exactly one index: none gives an obscure error, several select the wrong data
+  if (length(file_name_col) != 1 || length(sample_name_col) != 1) {
+    stop("sample sheet must have exactly one file name and one sample name column", call. = FALSE)
+  }
+
+  # trim whitespace from the sample names and replace all characters which are not
+  # letters, numbers, hyphens and periods by an underscore
+  sample_names <- gsub("[^-.[:alnum:]]", "_", trimws(sample_sheet[[sample_name_col]]))
+
+  # create replication pattern (which technical replicates belong to which sample);
+  # grouping on the cleaned names, so the list names are the cleaned names as well
+  # (names that become identical after cleaning end up in the same group)
+  repl_pattern <- split(sample_sheet[[file_name_col]], sample_names)
+
+  return(repl_pattern)
+}
+
diff --git a/DIMS/tests/testthat/parse_samplesheet_functions.R b/DIMS/tests/testthat/parse_samplesheet_functions.R
@@ -0,0 +1,24 @@
+# unit tests for ParseSamplesheet
+# function: generate_repl_pattern
+
+# source all functions for ParseSamplesheet
+source("../../preprocessing/parse_samplesheet_functions.R")
+
+# test generate_repl_pattern
+testthat::test_that("replication pattern is correctly generated", {
+  # create sample sheet to test on: 6 files, 2 technical replicates for each of 3 samples
+  test_file_names <- paste0("RES_20260101_", sprintf("%03d", 1:6))
+  test_sample_names <- rep(c("C1", "P2", "P3"), each = 2)
+  test_sample_sheet <- data.frame(File_Name = test_file_names, Sample_Name = test_sample_names)
+
+  # test that a list of length 3 is generated
+  testthat::expect_length(generate_repl_pattern(test_sample_sheet), 3)
+  # test list names
+  testthat::expect_equal(names(generate_repl_pattern(test_sample_sheet)), unique(test_sample_names))
+
+  # test what happens if any sample name is used twice
+  test_sample_names <- gsub("P3", "P2", test_sample_names)
+  test_sample_sheet <- data.frame(File_Name = test_file_names, Sample_Name = test_sample_names)
+  testthat::expect_length(generate_repl_pattern(test_sample_sheet), 2)
+  testthat::expect_length(generate_repl_pattern(test_sample_sheet)$P2, 4)
+})