@@ -5,15 +5,17 @@ if (!doDiet && length(seuratObjects) > 20 && !disableAutoDietSeurat) {
55 doDiet <- TRUE
66}
77
8- mergeBatch <- function (dat ) {
8+ filesToDelete <- c()
9+
10+ mergeBatchInMemory <- function (datasetIdToFilePath , saveFile ) {
911 toMerge <- list ()
10- for (datasetId in names(dat )) {
12+ for (datasetId in names(datasetIdToFilePath )) {
1113 print(paste0(' Loading: ' , datasetId ))
1214 if (doDiet ) {
13- toMerge [[datasetId ]] <- Seurat :: DietSeurat(readSeuratRDS(dat [[datasetId ]]))
15+ toMerge [[datasetId ]] <- Seurat :: DietSeurat(readSeuratRDS(datasetIdToFilePath [[datasetId ]]))
1416 gc()
1517 } else {
16- toMerge [[datasetId ]] <- readSeuratRDS(dat [[datasetId ]])
18+ toMerge [[datasetId ]] <- readSeuratRDS(datasetIdToFilePath [[datasetId ]])
1719 }
1820
1921 if (ncol(toMerge [[datasetId ]]) == 1 ) {
@@ -38,59 +40,78 @@ mergeBatch <- function(dat) {
3840 }
3941
4042 seuratObj <- CellMembrane :: MergeSeuratObjs(toMerge , projectName = projectName , doGC = doDiet , errorOnBarcodeSuffix = errorOnBarcodeSuffix )
41- return (seuratObj )
43+ saveRDS(seuratObj , file = saveFile )
44+ filesToDelete <<- c(filesToDelete , saveFile )
45+
46+ return (fn )
4247}
4348
44- if (length(seuratObjects ) == 1 ) {
45- print(' There is only one seurat object, no need to merge' )
46- datasetId <- names(seuratObjects )[[1 ]]
47- saveData(readSeuratRDS(seuratObjects [[datasetId ]]), datasetId )
48- } else {
49- batchSize <- 20
50- numBatches <- ceiling(length(seuratObjects ) / batchSize )
# Merge a named collection of Seurat RDS files down to a single RDS file.
#
# The inputs are grouped into "inner" batches, where a batch is closed once it
# holds maxBatchSize files, or once its cumulative on-disk size reaches
# maxInputFileSizeMb (and it holds more than one file). Each batch is handed to
# mergeBatchInMemory(), which writes 'merge.<outerBatchIdx>.<i>.rds'. If more
# than one merged file results, the function recurses on those files with
# outerBatchIdx + 1 until a single file remains.
#
# Args:
#   seuratObjects: named list (or named character vector) mapping datasetId to
#     the path of an RDS file.
#   outerBatchIdx: counter for the current recursion level; used in log
#     messages and in the names of the intermediate files.
#   maxBatchSize: maximum number of files per inner batch (must be >= 2).
#   maxInputFileSizeMb: cumulative input size, in MB, at which a batch is
#     closed. The default, maxAllowableInputFileSizeMb, is defined elsewhere
#     in this script.
#
# Returns:
#   A length-one collection holding the path of the merged file. When only a
#   single input is supplied, the input is returned unchanged.
mergeBatch <- function(seuratObjects, outerBatchIdx, maxBatchSize = 20, maxInputFileSizeMb = maxAllowableInputFileSizeMb) {
  logger::log_info(paste0('Beginning outer batch: ', outerBatchIdx, ' with total files: ', length(seuratObjects)))

  if (length(seuratObjects) == 1) {
    print('Single file, nothing to do')
    return(seuratObjects)
  }

  # With batches of one file, each round would emit as many files as it read
  # and the recursion below would never terminate.
  if (maxBatchSize < 2) {
    stop('Error: maxBatchSize must be at least 2')
  }

  # Batches are keyed by datasetId, so unnamed input cannot be processed.
  datasetIds <- names(seuratObjects)
  if (length(seuratObjects) > 0 && (is.null(datasetIds) || any(is.na(datasetIds)) || any(datasetIds == ''))) {
    stop('Error: seuratObjects must be named by datasetId')
  }

  # Phase 1: group into batches:
  batchList <- list()
  activeBatch <- list()
  sizeOfBatch <- 0
  for (datasetId in datasetIds) {
    filePath <- seuratObjects[[datasetId]]
    sizeInMb <- file.size(filePath) / 1024^2
    if (is.na(sizeInMb)) {
      # file.size() yields NA for a missing/unreadable file, which would
      # otherwise surface as an opaque error in the if() below.
      stop(paste0('Error: unable to determine size of file: ', filePath))
    }

    # Assign by name so the datasetId travels with the path.
    activeBatch[[datasetId]] <- filePath
    sizeOfBatch <- sizeOfBatch + sizeInMb

    if (length(activeBatch) >= maxBatchSize || (sizeOfBatch >= maxInputFileSizeMb && length(activeBatch) > 1)) {
      logger::log_info(paste0('adding to batch with ', length(activeBatch), ' files and ', sizeOfBatch, ' MB'))
      # Use [[ ]]: single-bracket assignment would keep only the first file.
      batchList[[length(batchList) + 1]] <- activeBatch
      activeBatch <- list()
      sizeOfBatch <- 0
    }
  }

  # Account for final files:
  if (length(activeBatch) > 0) {
    logger::log_info(paste0('finalizing batch with ', length(activeBatch), ' files and ', sizeOfBatch, ' MB'))
    batchList[[length(batchList) + 1]] <- activeBatch
  }

  if (length(batchList) == 0) {
    stop('Error: zero length batchList')
  }

  # Phase 2: merge each batch, recording the file each merge was written to.
  mergedObjectFiles <- list()
  for (i in seq_along(batchList)) {
    activeBatch <- batchList[[i]]
    logger::log_info(paste0('Merging inner batch ', i, ' of ', length(batchList), ' with ', length(activeBatch), ' files'))

    batchName <- paste0('merge.', outerBatchIdx, '.', i)
    saveFile <- paste0(batchName, '.rds')
    mergeBatchInMemory(activeBatch, saveFile = saveFile)

    # Name each entry so the recursive call can iterate names(seuratObjects).
    # NOTE(review): these synthetic names are passed on as dataset IDs when the
    # intermediate files are merged again; confirm the downstream merge
    # tolerates them (e.g. does not re-prefix barcodes).
    mergedObjectFiles[[batchName]] <- saveFile

    logger::log_info(paste0('mem used: ', R.utils::hsize(as.numeric(pryr::mem_used()))))
    gc()
    logger::log_info(paste0('after gc: ', R.utils::hsize(as.numeric(pryr::mem_used()))))
  }
  logger::log_info('Done with inner batch')

  # More than one merged file remains: merge those in another round.
  if (length(mergedObjectFiles) > 1) {
    return(mergeBatch(mergedObjectFiles, outerBatchIdx = (outerBatchIdx + 1), maxInputFileSizeMb = maxInputFileSizeMb, maxBatchSize = maxBatchSize))
  }

  mergedObjectFiles
}
85107
86- logger :: log_info(paste0(' mem used: ' , R.utils :: hsize(as.numeric(pryr :: mem_used()))))
87- logger :: log_info(paste0(' seurat object: ' , R.utils :: hsize(as.numeric(utils :: object.size(seuratObj )))))
88- gc()
89- logger :: log_info(paste0(' after gc: ' , R.utils :: hsize(as.numeric(pryr :: mem_used()))))
90- }
91- }
# Run the batched merge; the result holds the path of the single final file.
mergedObjectFiles <- mergeBatch(seuratObjects, outerBatchIdx = 1)
if (length(mergedObjectFiles) != 1) {
  stop(paste0('Error: expected a single merged file, found: ', length(mergedObjectFiles)))
}

print('Overall merge complete')
gc()

# Load the merged object from disk before saving it.
finalFile <- mergedObjectFiles[[1]]
if (length(seuratObjects) == 1) {
  # A single input is passed through untouched by mergeBatch, so read it with
  # the same reader used for the other raw inputs.
  seuratObjMerged <- readSeuratRDS(finalFile)
} else {
  # Intermediate merge files are written with saveRDS().
  seuratObjMerged <- readRDS(finalFile)
}
saveData(seuratObjMerged, projectName)

# Cleanup: remove the intermediate merge files recorded in filesToDelete.
for (fn in filesToDelete) {
  unlink(fn)
}
0 commit comments