3 | 3 | import au.com.bytecode.opencsv.CSVReader;
4 | 4 | import au.com.bytecode.opencsv.CSVWriter;
5 | 5 | import org.apache.commons.io.FileUtils;
| 6 | +import org.apache.commons.lang3.StringUtils;
| 7 | +import org.labkey.api.data.Container;
| 8 | +import org.labkey.api.data.SimpleFilter;
| 9 | +import org.labkey.api.data.Sort;
| 10 | +import org.labkey.api.data.TableSelector;
| 11 | +import org.labkey.api.pipeline.PipelineJob;
6 | 12 | import org.labkey.api.pipeline.PipelineJobException;
| 13 | +import org.labkey.api.query.FieldKey;
| 14 | +import org.labkey.api.query.QueryService;
7 | 15 | import org.labkey.api.reader.Readers;
8 | 16 | import org.labkey.api.sequenceanalysis.SequenceOutputFile;
9 | 17 | import org.labkey.api.sequenceanalysis.pipeline.AbstractPipelineStepProvider;
10 | 18 | import org.labkey.api.sequenceanalysis.pipeline.PipelineContext;
11 | 19 | import org.labkey.api.sequenceanalysis.pipeline.SequenceOutputHandler;
12 | 20 | import org.labkey.api.singlecell.pipeline.SingleCellStep;
| 21 | +import org.labkey.api.util.PageFlowUtil;
13 | 22 | import org.labkey.api.writer.PrintWriters;
| 23 | +import org.labkey.singlecell.SingleCellSchema;
14 | 24 |
15 | 25 | import java.io.File; |
16 | 26 | import java.io.IOException; |
17 | 27 | import java.util.ArrayList; |
18 | 28 | import java.util.Arrays; |
| 29 | +import java.util.HashMap; |
| 30 | +import java.util.HashSet; |
19 | 31 | import java.util.List; |
| 32 | +import java.util.Map; |
| 33 | +import java.util.Set; |
20 | 34 |
21 | 35 | import static org.labkey.singlecell.analysis.ProcessSingleCellHandler.LOUPE_TYPE; |
22 | 36 |
@@ -48,20 +62,72 @@ public void init(SequenceOutputHandler.JobContext ctx, List<SequenceOutputFile> |
48 | 62 | { |
49 | 63 | try (CSVWriter writer = new CSVWriter(PrintWriters.getPrintWriter(getMolInfoTable(ctx)))) |
50 | 64 | { |
| 65 | + Map<Integer, SequenceOutputFile> loupeOutputs = new HashMap<>(); |
51 | 66 | for (SequenceOutputFile so : inputFiles) |
52 | 67 | { |
53 | 68 | if (!LOUPE_TYPE.isType(so.getFile())) |
54 | 69 | { |
55 | | - throw new PipelineJobException("All input files must be loupe files to use sequence saturation"); |
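| | + // Non-loupe inputs are resolved back to their source loupe outputs via the accompanying .cellBarcodes.csv metadata file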
| 70 | + File meta = new File(so.getFile().getPath().replaceAll("\\.rds$", ".cellBarcodes.csv"));
| 71 | + if (!meta.exists()) |
| 72 | + { |
| 73 | + throw new PipelineJobException("Cannot find expected metadata file: " + meta.getPath()); |
| 74 | + } |
| 75 | + |
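| | + // Each barcode line is "<loupe output rowId>_<cell barcode>"; collect the distinct rowIds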
| 76 | + Set<Integer> uniqueIds = new HashSet<>(); |
| 77 | + try (CSVReader reader = new CSVReader(Readers.getReader(meta), '_')) |
| 78 | + { |
| 79 | + String[] line; |
| 80 | + while ((line = reader.readNext()) != null) |
| 81 | + { |
| 82 | + if (line.length != 2) |
| 83 | + { |
| 84 | + throw new PipelineJobException("Unexpected barcode line: " + StringUtils.join(line, "_")); |
| 85 | + } |
| 86 | + |
| 87 | + try |
| 88 | + { |
| 89 | + uniqueIds.add(Integer.parseInt(line[0])); |
| 90 | + } |
| 91 | + catch (NumberFormatException e) |
| 92 | + { |
| 93 | + throw new PipelineJobException("Non-numeric barcode prefix: " + StringUtils.join(line, "_")); |
| 94 | + } |
| 95 | + } |
| 96 | + } |
| 97 | + |
| 98 | + for (Integer rowId : uniqueIds) |
| 99 | + { |
| 100 | + SequenceOutputFile loupeObj = SequenceOutputFile.getForId(rowId); |
| 101 | + if (loupeObj == null) |
| 102 | + { |
| 103 | + throw new PipelineJobException("Unable to find loupe output file with ID: " + rowId); |
| 104 | + } |
| 105 | + loupeOutputs.put(rowId, loupeObj); |
| 106 | + } |
56 | 107 | } |
| 108 | + else |
| 109 | + { |
| 110 | + loupeOutputs.put(so.getRowid(), so); |
| 111 | + } |
| 112 | + } |
57 | 113 |
58 | | - File molInfo = new File(so.getFile().getParentFile(), "molecule_info.h5"); |
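| | + // Write one row per loupe output: rowId, path to its molecule_info.h5, and the assay type (RNA)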
| 114 | + for (Integer rowId : loupeOutputs.keySet()) |
| 115 | + { |
| 116 | + SequenceOutputFile loupeFile = loupeOutputs.get(rowId); |
| 117 | + File molInfo = new File(loupeFile.getFile().getParentFile(), "molecule_info.h5"); |
59 | 118 | if (!molInfo.exists()) |
60 | 119 | { |
61 | 120 | throw new PipelineJobException("Cannot find file: " + molInfo.getPath()); |
62 | 121 | } |
63 | 122 |
64 | | - writer.writeNext(new String[]{String.valueOf(so.getRowid()), molInfo.getPath()}); |
| 123 | + if (loupeFile.getReadset() == null) |
| 124 | + { |
| 125 | + throw new PipelineJobException("Loupe file lacks a readset: " + rowId); |
| 126 | + } |
| 127 | + |
| 128 | + writer.writeNext(new String[]{String.valueOf(loupeFile.getRowid()), molInfo.getPath(), "RNA"}); |
| 129 | + |
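| | + // Also emit rows for any cell hashing (HTO) or CITE-seq (ADT) libraries linked to this GEX readset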
| 130 | + findAdditionalData(loupeFile, writer, ctx.getJob()); |
65 | 131 | } |
66 | 132 | } |
67 | 133 | catch (IOException e) |
@@ -93,14 +159,15 @@ protected List<Chunk> getChunks(SequenceOutputHandler.JobContext ctx) throws Pip |
93 | 159 | while ((line = reader.readNext()) != null) |
94 | 160 | { |
95 | 161 | File source = new File(line[1]); |
96 | | - File dest = new File(ctx.getWorkingDirectory(), line[0] + ".molInfo.h5"); |
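| | + // The third column carries the assay name, keeping copied files and generated entries unique per assay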
| 162 | + String assay = line[2]; |
| 163 | + File dest = new File(ctx.getWorkingDirectory(), line[0] + "." + assay + ".molInfo.h5"); |
97 | 164 | if (dest.exists()) |
98 | 165 | { |
99 | 166 | dest.delete(); |
100 | 167 | } |
101 | 168 | FileUtils.copyFile(source, dest); |
102 | 169 |
103 | | - lines.add("\t'" + line[0] + "' = '" + dest.getName() + "',"); |
| 170 | + lines.add("\t'" + line[0] + "-" + assay + "' = '" + dest.getName() + "',"); |
104 | 171 | ctx.getFileManager().addIntermediateFile(dest); |
105 | 172 | } |
106 | 173 | } |
@@ -128,4 +195,65 @@ public String getFileSuffix() |
128 | 195 | { |
129 | 196 | return "saturation"; |
130 | 197 | } |
| 198 | + |
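| | + // Queries the cDNA table for this loupe file's readset and writes extra rows for any linked hashing or CITE-seq readsets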
| 199 | + private void findAdditionalData(SequenceOutputFile loupeFile, CSVWriter writer, PipelineJob job) throws IOException, PipelineJobException |
| 200 | + { |
| 201 | + Set<Integer> hashingReadsets = new HashSet<>(); |
| 202 | + Set<Integer> citeReadsets = new HashSet<>(); |
| 203 | + Container targetContainer = job.getContainer().isWorkbook() ? job.getContainer().getParent() : job.getContainer(); |
| 204 | + TableSelector ts = new TableSelector(QueryService.get().getUserSchema(job.getUser(), targetContainer, SingleCellSchema.NAME).getTable(SingleCellSchema.TABLE_CDNAS), PageFlowUtil.set("hashingReadsetId", "citeseqReadsetId"), new SimpleFilter(FieldKey.fromString("readsetId"), loupeFile.getReadset()), null); |
| 205 | + ts.forEachResults(rs -> { |
| 206 | + if (rs.getObject(FieldKey.fromString("hashingReadsetId")) != null) |
| 207 | + { |
| 208 | + hashingReadsets.add(rs.getInt(FieldKey.fromString("hashingReadsetId"))); |
| 209 | + } |
| 210 | + |
| 211 | + if (rs.getObject(FieldKey.fromString("citeseqReadsetId")) != null) |
| 212 | + { |
| 213 | + citeReadsets.add(rs.getInt(FieldKey.fromString("citeseqReadsetId"))); |
| 214 | + } |
| 215 | + }); |
| 216 | + |
| 217 | + if (hashingReadsets.size() > 1) |
| 218 | + { |
| 219 | + throw new PipelineJobException("More than one hashing readset associated with GEX readset: " + loupeFile.getReadset()); |
| 220 | + } |
| 221 | + else if (hashingReadsets.size() == 1) |
| 222 | + { |
| 223 | + writeExtraData(loupeFile.getRowid(), hashingReadsets.iterator().next(), job, "Cell Hashing Counts", writer, "HTO"); |
| 224 | + } |
| 225 | + |
| 226 | + if (citeReadsets.size() > 1) |
| 227 | + { |
| 228 | + throw new PipelineJobException("More than one CITE-seq readset associated with GEX readset: " + loupeFile.getReadset()); |
| 229 | + } |
| 230 | + else if (citeReadsets.size() == 1) |
| 231 | + { |
| 232 | + writeExtraData(loupeFile.getRowid(), citeReadsets.iterator().next(), job, "CITE-seq Counts", writer, "ADT"); |
| 233 | + } |
| 234 | + } |
| 235 | + |
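| | + // Finds the most recent output of the given category for the readset and writes its molecule_info.h5 under the supplied assay name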
| 236 | + private void writeExtraData(int datasetId, int readsetId, PipelineJob job, String category, CSVWriter writer, String assayName) throws PipelineJobException |
| 237 | + { |
| 238 | + Container targetContainer = job.getContainer().isWorkbook() ? job.getContainer().getParent() : job.getContainer(); |
| 239 | + SimpleFilter filter = new SimpleFilter(FieldKey.fromString("readsetId"), readsetId); |
| 240 | + filter.addCondition(FieldKey.fromString("category"), category); |
| 241 | + |
| 242 | + List<Integer> rowIds = new TableSelector(QueryService.get().getUserSchema(job.getUser(), targetContainer, SingleCellSchema.SEQUENCE_SCHEMA_NAME).getTable("outputfiles"), PageFlowUtil.set("rowid"), filter, new Sort("-rowid")).getArrayList(Integer.class); |
| 243 | + if (!rowIds.isEmpty()) |
| 244 | + { |
| 245 | + if (rowIds.size() > 1) |
| 246 | + { |
| 247 | + job.getLogger().info("More than one " + assayName + " output found for " + readsetId + ", using the most recent: " + rowIds.get(0)); |
| 248 | + } |
| 249 | + |
| 250 | + File molInfo = new File(SequenceOutputFile.getForId(rowIds.get(0)).getFile().getParentFile().getParentFile(), "molecule_info.h5"); |
| 251 | + if (!molInfo.exists()) |
| 252 | + { |
| 253 | + throw new PipelineJobException("Cannot find file: " + molInfo.getPath()); |
| 254 | + } |
| 255 | + |
| 256 | + writer.writeNext(new String[]{String.valueOf(datasetId), molInfo.getPath(), assayName}); |
| 257 | + } |
| 258 | + } |
131 | 259 | } |