Skip to content

Commit d8a8d9a

Browse files
committed
Rework MergeSeurat to handle large data better
1 parent ec32afa commit d8a8d9a

File tree

2 files changed

+68
-46
lines changed

2 files changed

+68
-46
lines changed

singlecell/resources/chunks/MergeSeurat.R

Lines changed: 66 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,17 @@ if (!doDiet && length(seuratObjects) > 20 && !disableAutoDietSeurat) {
55
doDiet <- TRUE
66
}
77

8-
mergeBatch <- function(dat) {
8+
filesToDelete <- c()
9+
10+
mergeBatchInMemory <- function(datasetIdToFilePath, saveFile) {
911
toMerge <- list()
10-
for (datasetId in names(dat)) {
12+
for (datasetId in names(datasetIdToFilePath)) {
1113
print(paste0('Loading: ', datasetId))
1214
if (doDiet) {
13-
toMerge[[datasetId]] <- Seurat::DietSeurat(readSeuratRDS(dat[[datasetId]]))
15+
toMerge[[datasetId]] <- Seurat::DietSeurat(readSeuratRDS(datasetIdToFilePath[[datasetId]]))
1416
gc()
1517
} else {
16-
toMerge[[datasetId]] <- readSeuratRDS(dat[[datasetId]])
18+
toMerge[[datasetId]] <- readSeuratRDS(datasetIdToFilePath[[datasetId]])
1719
}
1820

1921
if (ncol(toMerge[[datasetId]]) == 1) {
@@ -38,59 +40,78 @@ mergeBatch <- function(dat) {
3840
}
3941

4042
seuratObj <- CellMembrane::MergeSeuratObjs(toMerge, projectName = projectName, doGC = doDiet, errorOnBarcodeSuffix = errorOnBarcodeSuffix)
41-
return(seuratObj)
43+
saveRDS(seuratObj, file = saveFile)
44+
filesToDelete <<- c(filesToDelete, saveFile)
45+
46+
return(fn)
4247
}
4348

44-
if (length(seuratObjects) == 1) {
45-
print('There is only one seurat object, no need to merge')
46-
datasetId <- names(seuratObjects)[[1]]
47-
saveData(readSeuratRDS(seuratObjects[[datasetId]]), datasetId)
48-
} else {
49-
batchSize <- 20
50-
numBatches <- ceiling(length(seuratObjects) / batchSize)
49+
# Recursively merges a set of seurat RDS files until a single file remains.
#
# The inputs are grouped into batches, where a batch is closed once it holds
# maxBatchSize files, or once the summed on-disk size of its files reaches
# maxInputFileSizeMb (provided it holds more than one file). Each batch is
# merged via mergeBatchInMemory(), which writes the merged object to an RDS
# file. If more than one merged file results, this function calls itself on
# those files with outerBatchIdx incremented.
#
# seuratObjects: a named list (or named character vector) mapping datasetId to
#   the path of an RDS file. Names are required, since they drive the iteration.
# outerBatchIdx: integer recursion depth, used for logging and output file names.
# maxBatchSize: the maximum number of files per batch. Must be at least 2.
# maxInputFileSizeMb: the summed file size (in MB) at which a batch is closed.
#
# Returns: the input, unchanged, if it has exactly one element; otherwise a
#   named list with the path of the single merged RDS file.
mergeBatch <- function(seuratObjects, outerBatchIdx, maxBatchSize = 20, maxInputFileSizeMb = maxAllowableInputFileSizeMb) {
  logger::log_info(paste0('Beginning outer batch: ', outerBatchIdx, ' with total files: ', length(seuratObjects)))

  if (length(seuratObjects) == 1) {
    print('Single file, nothing to do')
    return(seuratObjects)
  }

  # A batch size below two can never reduce the number of files, so the
  # recursion below would never terminate.
  if (maxBatchSize < 2) {
    stop('Error: maxBatchSize must be at least 2')
  }

  # Phase 1: group into batches:
  batchList <- list()
  activeBatch <- c()
  sizeOfBatch <- 0
  for (datasetId in names(seuratObjects)) {
    # Single-bracket subsetting keeps the datasetId as the element name.
    # mergeBatchInMemory() iterates over names(), so the names must survive.
    activeBatch <- c(activeBatch, seuratObjects[datasetId])
    sizeInMb <- (file.size(seuratObjects[[datasetId]]) / 1024^2)
    sizeOfBatch <- sizeOfBatch + sizeInMb

    if (length(activeBatch) >= maxBatchSize || (sizeOfBatch >= maxInputFileSizeMb && length(activeBatch) > 1)) {
      logger::log_info(paste0('adding to batch with ', length(activeBatch), ' files and ', sizeOfBatch, 'MB'))
      # Use [[ ]] so the whole batch is stored as one list element
      batchList[[length(batchList) + 1]] <- activeBatch
      activeBatch <- c()
      sizeOfBatch <- 0
    }
  }

  # Account for final files:
  if (length(activeBatch) > 0) {
    logger::log_info(paste0('finalizing batch with ', length(activeBatch), ' files and ', sizeOfBatch, 'MB'))
    batchList[[length(batchList) + 1]] <- activeBatch
  }

  if (length(batchList) == 0) {
    stop('Error: zero length batchList')
  }

  # Phase 2: merge each batch, writing one RDS file per batch:
  mergedObjectFiles <- list()
  for (i in seq_along(batchList)) {
    activeBatch <- batchList[[i]]
    logger::log_info(paste0('Merging inner batch ', i, ' of ', length(batchList), ' with ', length(activeBatch), ' files'))

    batchName <- paste0('merge.', outerBatchIdx, '.', i)
    saveFile <- paste0(batchName, '.rds')
    mergeBatchInMemory(activeBatch, saveFile = saveFile)

    # Record the path that was passed in rather than the helper's return value,
    # and name the entry so the recursive call below has names to iterate over.
    # NOTE(review): these names are handed to mergeBatchInMemory() as dataset ids
    # on the next pass. Confirm the downstream merge does not re-prefix barcodes
    # based on them.
    mergedObjectFiles[[batchName]] <- saveFile

    logger::log_info(paste0('mem used: ', R.utils::hsize(as.numeric(pryr::mem_used()))))
    gc()
    logger::log_info(paste0('after gc: ', R.utils::hsize(as.numeric(pryr::mem_used()))))
  }
  logger::log_info('Done with inner batch')

  if (length(mergedObjectFiles) > 1) {
    return(mergeBatch(mergedObjectFiles, outerBatchIdx = (outerBatchIdx + 1), maxInputFileSizeMb = maxInputFileSizeMb, maxBatchSize = maxBatchSize))
  }

  return(mergedObjectFiles)
}
85107

86-
logger::log_info(paste0('mem used: ', R.utils::hsize(as.numeric(pryr::mem_used()))))
87-
logger::log_info(paste0('seurat object: ', R.utils::hsize(as.numeric(utils::object.size(seuratObj)))))
88-
gc()
89-
logger::log_info(paste0('after gc: ', R.utils::hsize(as.numeric(pryr::mem_used()))))
90-
}
91-
}
108+
# Merge all inputs down to a single RDS file, save the result, then remove
# the intermediate files that were recorded in filesToDelete.
mergedObjectFiles <- mergeBatch(seuratObjects, outerBatchIdx = 1)
if (length(mergedObjectFiles) != 1) {
  stop(paste0('Expected a single merged file, found: ', length(mergedObjectFiles)))
}

print('Overall merge complete')
gc()

# mergeBatch() returns file path(s), not an object, so the merged object must
# be loaded before it is saved.
# NOTE(review): readSeuratRDS is used because the batch loader reads its inputs
# with it. Confirm it is also appropriate for files written by saveRDS().
seuratObjMerged <- readSeuratRDS(mergedObjectFiles[[1]])
saveData(seuratObjMerged, projectName)

# Cleanup:
unlink(filesToDelete)

singlecell/src/org/labkey/singlecell/pipeline/singlecell/MergeSeurat.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,8 @@ public Provider()
3535
put("height", 150);
3636
put("delimiter", ",");
3737
put("stripCharsRe", "/(^['\"]+)|(['\"]+$)/g");
38-
}}, "RNA.orig").delimiter(",")
38+
}}, "RNA.orig").delimiter(","),
39+
SeuratToolParameter.create("maxAllowableInputFileSizeMb", "Max Allowable Batch Size (MB)", "The largest allowable amount of data (in MB), measured as the size of the RDS files, to be allowed in one unit of data to merge in memory.", "ldk-integerfield", null, 200, "maxAllowableInputFileSizeMb", true, false)
3940
), null, null);
4041
}
4142

0 commit comments

Comments
 (0)