Skip to content

Commit 6039efb

Browse files
committed
Allow nimble step to use cached barcodes
1 parent 604ee34 commit 6039efb

File tree

2 files changed

+206
-12
lines changed

2 files changed

+206
-12
lines changed

singlecell/src/org/labkey/singlecell/run/NimbleAlignmentStep.java

Lines changed: 125 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,18 @@
33
import org.apache.commons.io.FileUtils;
44
import org.jetbrains.annotations.Nullable;
55
import org.json.JSONObject;
6+
import org.labkey.api.data.Container;
7+
import org.labkey.api.data.SimpleFilter;
8+
import org.labkey.api.data.Sort;
9+
import org.labkey.api.data.TableInfo;
10+
import org.labkey.api.data.TableSelector;
11+
import org.labkey.api.exp.api.ExpData;
12+
import org.labkey.api.exp.api.ExperimentService;
13+
import org.labkey.api.pipeline.PipelineJob;
614
import org.labkey.api.pipeline.PipelineJobException;
15+
import org.labkey.api.query.FieldKey;
16+
import org.labkey.api.query.QueryService;
17+
import org.labkey.api.query.UserSchema;
718
import org.labkey.api.sequenceanalysis.model.Readset;
819
import org.labkey.api.sequenceanalysis.pipeline.AbstractAlignmentStepProvider;
920
import org.labkey.api.sequenceanalysis.pipeline.AlignmentOutputImpl;
@@ -14,19 +25,22 @@
1425
import org.labkey.api.sequenceanalysis.pipeline.SequenceAnalysisJobSupport;
1526
import org.labkey.api.sequenceanalysis.pipeline.ToolParameterDescriptor;
1627
import org.labkey.api.util.PageFlowUtil;
28+
import org.labkey.singlecell.SingleCellSchema;
1729

1830
import java.io.File;
1931
import java.io.IOException;
2032
import java.util.Arrays;
33+
import java.util.HashMap;
2134
import java.util.LinkedHashSet;
2235
import java.util.List;
36+
import java.util.Map;
2337

2438
public class NimbleAlignmentStep extends AbstractCellRangerDependentStep
2539
{
2640
public static final String REF_GENOMES = "refGenomes";
2741
public static final String MAX_HITS_TO_REPORT = "maxHitsToReport";
28-
public static final String ALIGN_OUTPUT = "alignmentOutput";
2942
public static final String STRANDEDNESS = "strandedness";
43+
public static final String REQUIRE_CACHED_BARCODES = "requireCachedBarcodes";
3044

3145
public NimbleAlignmentStep(AlignmentStepProvider<?> provider, PipelineContext ctx, CellRangerWrapper wrapper)
3246
{
@@ -59,7 +73,10 @@ public static List<ToolParameterDescriptor> getToolParameters()
5973
}}, null),
6074
ToolParameterDescriptor.create(MAX_HITS_TO_REPORT, "Max Hits To Report", "If a given hit has more than this number of references, it is discarded", "ldk-integerfield", new JSONObject(){{
6175
put("minValue", 0);
62-
}}, 4)
76+
}}, 4),
77+
ToolParameterDescriptor.create(REQUIRE_CACHED_BARCODES, "Fail Unless Cached Barcodes Present", "If checked, the pipeline will expect a previously computed map of cellbarcodes and UMIs to be computed. Under default conditions, if this is missing, cellranger will be re-run. This flag can be helpful to avoid that computation if you expect the barcode file to exist.", "checkbox", new JSONObject(){{
78+
79+
}}, false)
6380
);
6481
}
6582

@@ -68,6 +85,96 @@ public AlignmentOutput performAlignment(Readset rs, List<File> inputFastqs1, @Nu
6885
{
6986
AlignmentOutputImpl output = new AlignmentOutputImpl();
7087

88+
boolean throwIfNotFound = getProvider().getParameterByName(REQUIRE_CACHED_BARCODES).extractValue(getPipelineCtx().getJob(), getProvider(), getStepIdx(), Boolean.class, false);
89+
File cachedBarcodes = getCachedBarcodeFile(rs, throwIfNotFound);
90+
91+
File localBam;
92+
if (cachedBarcodes == null)
93+
{
94+
localBam = performCellRangerAlignment(output, rs, inputFastqs1, inputFastqs2, outputDirectory, referenceGenome, basename, readGroupId, platformUnit);
95+
}
96+
else
97+
{
98+
localBam = createNimbleBam(output, rs, inputFastqs1, inputFastqs2);
99+
}
100+
101+
102+
// Now run nimble itself:
103+
NimbleHelper helper = new NimbleHelper(getPipelineCtx(), getProvider(), getStepIdx());
104+
helper.doNimbleAlign(localBam, output, rs, basename);
105+
output.setBAM(localBam);
106+
107+
return output;
108+
}
109+
110+
private File createNimbleBam(AlignmentOutputImpl output, Readset rs, List<File> inputFastqs1, List<File> inputFastqs2) throws PipelineJobException
111+
{
112+
File cellbarcodes = getCachedBarcodeFile(rs, true);
113+
File umiMapping = getUmiMapping(cellbarcodes);
114+
115+
return NimbleHelper.runFastqToBam(output, getPipelineCtx(), rs, inputFastqs1, inputFastqs2, cellbarcodes, umiMapping);
116+
}
117+
118+
private File getCachedBarcodeFile(Readset rs, boolean throwIfNotFound) throws PipelineJobException
119+
{
120+
Map<Integer, Integer> map = getPipelineCtx().getSequenceSupport().getCachedObject(CACHE_KEY, PipelineJob.createObjectMapper().getTypeFactory().constructParametricType(Map.class, Integer.class, Integer.class));
121+
Integer dataId = map.get(rs.getReadsetId());
122+
if (dataId == null)
123+
{
124+
if (throwIfNotFound)
125+
{
126+
throw new PipelineJobException("No cached data found for readset: " + rs.getReadsetId());
127+
}
128+
129+
return null;
130+
}
131+
132+
File ret = getPipelineCtx().getSequenceSupport().getCachedData(dataId);
133+
if (ret == null || ! ret.exists())
134+
{
135+
throw new PipelineJobException("Missing cached cellbarcode file: " + dataId);
136+
}
137+
138+
return ret;
139+
}
140+
141+
private File getUmiMapping(File cellbarcodeFile) throws PipelineJobException
142+
{
143+
File ret = new File(cellbarcodeFile.getPath().replaceAll(".cb.txt.gz", ".umi.txt.gz"));
144+
if (ret == null || ! ret.exists())
145+
{
146+
throw new PipelineJobException("Missing cached UMI file: " + ret.getPath());
147+
}
148+
149+
return ret;
150+
}
151+
152+
private File findCellBarcodeFiles(Readset rs) throws PipelineJobException
153+
{
154+
Container targetContainer = getPipelineCtx().getJob().getContainer().isWorkbookOrTab() ? getPipelineCtx().getJob().getContainer().getParent() : getPipelineCtx().getJob().getContainer();
155+
UserSchema us = QueryService.get().getUserSchema(getPipelineCtx().getJob().getUser(), targetContainer, SingleCellSchema.SEQUENCE_SCHEMA_NAME);
156+
TableInfo ti = us.getTable("outputfiles");
157+
158+
SimpleFilter sf = new SimpleFilter(FieldKey.fromString("readset"), rs.getRowId());
159+
sf.addCondition(FieldKey.fromString("category"), NimbleHelper.CATEGORY_CB);
160+
List<Integer> cbs = new TableSelector(ti, PageFlowUtil.set("dataid"), sf, new Sort("-rowid")).getArrayList(Integer.class);
161+
if (!cbs.isEmpty())
162+
{
163+
int dataId = cbs.get(0);
164+
ExpData d = ExperimentService.get().getExpData(dataId);
165+
if (d == null || d.getFile() == null)
166+
{
167+
throw new PipelineJobException("Output lacks a file: " + dataId);
168+
}
169+
170+
return d.getFile();
171+
}
172+
173+
return null;
174+
}
175+
176+
private File performCellRangerAlignment(AlignmentOutputImpl output, Readset rs, List<File> inputFastqs1, @Nullable List<File> inputFastqs2, File outputDirectory, ReferenceGenome referenceGenome, String basename, String readGroupId, @Nullable String platformUnit) throws PipelineJobException
177+
{
71178
// We need to ensure we keep the BAM for post-processing:
72179
setAlwaysRetainBam(true);
73180

@@ -89,12 +196,7 @@ public AlignmentOutput performAlignment(Readset rs, List<File> inputFastqs1, @Nu
89196

90197
NimbleHelper.write10xBarcodes(localBam, getWrapper().getLogger(), rs, referenceGenome, output);
91198

92-
// Now run nimble itself:
93-
NimbleHelper helper = new NimbleHelper(getPipelineCtx(), getProvider(), getStepIdx());
94-
helper.doNimbleAlign(localBam, output, rs, basename);
95-
output.setBAM(localBam);
96-
97-
return output;
199+
return localBam;
98200
}
99201

100202
@Override
@@ -109,5 +211,20 @@ public void init(SequenceAnalysisJobSupport support) throws PipelineJobException
109211
{
110212
helper.prepareGenome(id);
111213
}
214+
215+
// Try to find 10x barcodes:
216+
HashMap<Integer, File> readsetToBarcodes = new HashMap<>();
217+
for (Readset rs : support.getCachedReadsets())
218+
{
219+
File f = findCellBarcodeFiles(rs);
220+
if (f != null)
221+
{
222+
readsetToBarcodes.put(rs.getReadsetId(), f);
223+
}
224+
}
225+
226+
support.cacheObject(CACHE_KEY, readsetToBarcodes);
112227
}
228+
229+
private static final String CACHE_KEY = "nimble.cb";
113230
}

singlecell/src/org/labkey/singlecell/run/NimbleHelper.java

Lines changed: 81 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,11 @@
2727
import org.labkey.api.sequenceanalysis.pipeline.PipelineStepOutput;
2828
import org.labkey.api.sequenceanalysis.pipeline.PipelineStepProvider;
2929
import org.labkey.api.sequenceanalysis.pipeline.ReferenceGenome;
30+
import org.labkey.api.sequenceanalysis.pipeline.SamtoolsRunner;
3031
import org.labkey.api.sequenceanalysis.pipeline.SequencePipelineService;
3132
import org.labkey.api.sequenceanalysis.run.DISCVRSeqRunner;
3233
import org.labkey.api.sequenceanalysis.run.DockerWrapper;
34+
import org.labkey.api.util.FileUtil;
3335
import org.labkey.api.util.PageFlowUtil;
3436
import org.labkey.api.writer.PrintWriters;
3537

@@ -496,6 +498,9 @@ private Map<NimbleGenome, File> doAlignment(List<NimbleGenome> genomes, List<Fil
496498
return resultMap;
497499
}
498500

501+
public static final String CATEGORY_CB = "10x CellBarcode Map";
502+
public static final String CATEGORY_UB = "10x UMI Map";
503+
499504
public static void write10xBarcodes(File bam, Logger log, Readset rs, ReferenceGenome referenceGenome, PipelineStepOutput output) throws PipelineJobException
500505
{
501506
// Write barcodes:
@@ -504,18 +509,18 @@ public static void write10xBarcodes(File bam, Logger log, Readset rs, ReferenceG
504509
barcodeArgs.add("-I");
505510
barcodeArgs.add(bam.getPath());
506511

507-
File cbOutput = new File(bam.getParentFile(), SequenceAnalysisService.get().getUnzippedBaseName(bam.getName()) + "cb.txt.gz");
512+
File cbOutput = new File(bam.getParentFile(), SequenceAnalysisService.get().getUnzippedBaseName(bam.getName()) + ".cb.txt.gz");
508513
barcodeArgs.add("--cbOutput");
509514
barcodeArgs.add(cbOutput.getPath());
510515

511-
File umiOutput = new File(bam.getParentFile(), SequenceAnalysisService.get().getUnzippedBaseName(bam.getName()) + "umi.txt.gz");
516+
File umiOutput = new File(bam.getParentFile(), SequenceAnalysisService.get().getUnzippedBaseName(bam.getName()) + ".umi.txt.gz");
512517
barcodeArgs.add("--umiOutput");
513518
barcodeArgs.add(umiOutput.getPath());
514519

515520
runner.execute(barcodeArgs);
516521

517-
output.addSequenceOutput(cbOutput, "10x CellBarcode Map: " + rs.getName(), "10x CellBarcode Map", rs.getReadsetId(), null, referenceGenome.getGenomeId(), null);
518-
output.addSequenceOutput(umiOutput, "10x UMI Map: " + rs.getName(), "10x UMI Map", rs.getReadsetId(), null, referenceGenome.getGenomeId(), null);
522+
output.addSequenceOutput(cbOutput, "10x CellBarcode Map: " + rs.getName(), CATEGORY_CB, rs.getReadsetId(), null, referenceGenome.getGenomeId(), null);
523+
output.addSequenceOutput(umiOutput, "10x UMI Map: " + rs.getName(), CATEGORY_UB, rs.getReadsetId(), null, referenceGenome.getGenomeId(), null);
519524
}
520525

521526
public static File runNimbleReport(File alignResultsGz, int genomeId, PipelineStepOutput output, PipelineContext ctx) throws PipelineJobException
@@ -595,6 +600,78 @@ private static File getNimbleDoneFile(File parentDir, String resumeString)
595600
return new File(parentDir, "nimble." + resumeString + ".done");
596601
}
597602

603+
public static File runFastqToBam(PipelineStepOutput output, PipelineContext ctx, Readset rs, List<File> inputFastqs1, List<File> inputFastqs2, File cellBarcodes, File umiMapping) throws PipelineJobException
604+
{
605+
List<File> outputBams = new ArrayList<>();
606+
int bamIdx = 0;
607+
while (bamIdx < inputFastqs1.size())
608+
{
609+
File outputBam = new File(ctx.getWorkingDirectory(), FileUtil.makeLegalName(rs.getName()) + ".unmapped." + bamIdx + ".bam");
610+
611+
List<String> args = new ArrayList<>();
612+
args.add("python3");
613+
args.add("-m");
614+
args.add("nimble");
615+
616+
args.add("fastq-to-bam");
617+
618+
Integer maxThreads = SequencePipelineService.get().getMaxThreads(ctx.getLogger());
619+
if (maxThreads != null)
620+
{
621+
args.add("-c");
622+
args.add(maxThreads.toString());
623+
}
624+
625+
args.add("--r1-fastq");
626+
args.add(inputFastqs1.get(bamIdx).getPath());
627+
if (bamIdx > inputFastqs2.size())
628+
{
629+
throw new PipelineJobException("Unequal lengths for first/second pair FASTQs");
630+
}
631+
632+
args.add("--r2-fastq");
633+
args.add(inputFastqs2.get(bamIdx).getPath());
634+
635+
args.add("--cell-barcodes");
636+
args.add(cellBarcodes.getPath());
637+
638+
args.add("--umi-mapping");
639+
args.add(umiMapping.getPath());
640+
641+
args.add("--output");
642+
args.add(outputBam.getPath());
643+
644+
runUsingDocker(args, output, "nimble.fastq-to-bam." + bamIdx, ctx);
645+
outputBams.add(outputBam);
646+
bamIdx++;
647+
}
648+
649+
File outputBam;
650+
if (outputBams.size() > 1)
651+
{
652+
outputBam = new File(ctx.getWorkingDirectory(), FileUtil.makeLegalName(rs.getName()) + ".unmapped.bam");
653+
outputBams.forEach(output::addIntermediateFile);
654+
655+
SamtoolsRunner st = new SamtoolsRunner(ctx.getLogger());
656+
List<String> args = new ArrayList<>(Arrays.asList(st.getSamtoolsPath().getPath(), "merge", "-o", outputBam.getPath(), "-f"));
657+
Integer maxThreads = SequencePipelineService.get().getMaxThreads(ctx.getLogger());
658+
if (maxThreads != null)
659+
{
660+
args.add("-@");
661+
args.add(maxThreads.toString());
662+
}
663+
664+
outputBams.forEach(bam -> args.add(bam.getPath()));
665+
st.execute(args);
666+
}
667+
else
668+
{
669+
outputBam = outputBams.get(0);
670+
}
671+
672+
return outputBam;
673+
}
674+
598675
public static String DOCKER_CONTAINER_NAME = "ghcr.io/bimberlab/nimble:latest";
599676

600677
private boolean runUsingDocker(List<String> nimbleArgs, PipelineStepOutput output, @Nullable String resumeString) throws PipelineJobException

0 commit comments

Comments
 (0)