Skip to content

Commit 4093754

Browse files
committed
Initial start toward making GenotypeGVCFs step automatically merge and retain merged product(s)
1 parent 716e467 commit 4093754

File tree

2 files changed

+111
-56
lines changed

2 files changed

+111
-56
lines changed

SequenceAnalysis/src/org/labkey/sequenceanalysis/analysis/GenotypeGVCFHandler.java

Lines changed: 90 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
import org.labkey.sequenceanalysis.SequenceAnalysisModule;
2929
import org.labkey.sequenceanalysis.pipeline.JobContextImpl;
3030
import org.labkey.sequenceanalysis.pipeline.ProcessVariantsHandler;
31+
import org.labkey.sequenceanalysis.run.util.CombineGVCFsWrapper;
3132
import org.labkey.sequenceanalysis.run.util.GenomicsDBImportHandler;
3233
import org.labkey.sequenceanalysis.run.util.GenotypeGVCFsWrapper;
3334

@@ -268,29 +269,62 @@ public void processFilesOnWebserver(PipelineJob job, SequenceAnalysisJobSupport
268269

269270
}
270271

272+
private String getBasename(JobContext ctx)
273+
{
274+
String basename = ctx.getParams().get("variantCalling.GenotypeGVCFs.fileBaseName") != null ? ctx.getParams().getString("variantCalling.GenotypeGVCFs.fileBaseName") : "CombinedGenotypes";
275+
basename = basename.replaceAll(".vcf.gz$", "");
276+
basename = basename.replaceAll(".vcf$", "");
277+
278+
return basename;
279+
}
280+
271281
private File runGenotypeGVCFs(PipelineJob job, JobContext ctx, ProcessVariantsHandler.Resumer resumer, List<File> inputFiles, int genomeId) throws PipelineJobException
272282
{
273283
RecordedAction action = new RecordedAction(getName());
274284
action.setStartTime(new Date());
275285

286+
File outDir = ctx.getOutputDir();
287+
String basename = getBasename(ctx);
288+
289+
File outputVcf = new File(outDir, basename + ".vcf.gz");
290+
276291
for (File f : inputFiles)
277292
{
278293
action.addInput(f, "Input Variants");
279294
}
280295

296+
boolean doCopyLocal = ctx.getParams().optBoolean("variantCalling.GenotypeGVCFs.doCopyInputs", false);
297+
298+
Set<File> toDelete = new HashSet<>();
299+
List<File> filesToProcess = new ArrayList<>();
300+
if (doCopyLocal)
301+
{
302+
ctx.getLogger().info("making local copies of gVCF/GenomicsDB files prior to genotyping");
303+
filesToProcess.addAll(GenotypeGVCFsWrapper.copyVcfsLocally(inputFiles, toDelete, null, ctx.getLogger(), outputVcf.exists()));
304+
}
305+
else
306+
{
307+
filesToProcess.addAll(inputFiles);
308+
}
309+
310+
//Allow CombineGVCFs to run on interval(s)
311+
File inputVcf;
312+
if (filesToProcess.size() > 1)
313+
{
314+
inputVcf = combineInputs(ctx, filesToProcess, genomeId);
315+
}
316+
else
317+
{
318+
inputVcf = filesToProcess.get(0);
319+
}
320+
281321
GenotypeGVCFsWrapper wrapper = new GenotypeGVCFsWrapper(job.getLogger());
282322
ReferenceGenome genome = ctx.getSequenceSupport().getCachedGenome(genomeId);
283323
if (genome == null)
284324
{
285325
throw new PipelineJobException("Unable to find cached genome for Id: " + genomeId);
286326
}
287327

288-
File outDir = ctx.getOutputDir();
289-
String basename = ctx.getParams().get("variantCalling.GenotypeGVCFs.fileBaseName") != null ? ctx.getParams().getString("variantCalling.GenotypeGVCFs.fileBaseName") : "CombinedGenotypes";
290-
basename = basename.replaceAll(".vcf.gz$", "");
291-
basename = basename.replaceAll(".vcf$", "");
292-
293-
File outputVcf = new File(outDir, basename + ".vcf.gz");
294328
List<String> toolParams = new ArrayList<>();
295329
if (ctx.getParams().get("variantCalling.GenotypeGVCFs.stand_call_conf") != null)
296330
{
@@ -330,14 +364,62 @@ private File runGenotypeGVCFs(PipelineJob job, JobContext ctx, ProcessVariantsHa
330364
});
331365
}
332366

333-
boolean doCopyInputs = ctx.getParams().optBoolean("variantCalling.GenotypeGVCFs.doCopyInputs", false);
367+
wrapper.execute(genome.getSourceFastaFile(), outputVcf, toolParams, inputVcf);
334368

335-
wrapper.execute(genome.getSourceFastaFile(), outputVcf, toolParams, doCopyInputs, inputFiles.toArray(new File[inputFiles.size()]));
336369
action.addOutput(outputVcf, "VCF", outputVcf.exists(), true);
337370
action.setEndTime(new Date());
338371
resumer.setGenotypeGVCFsComplete(action, outputVcf);
339372

373+
if (!toDelete.isEmpty())
374+
{
375+
ctx.getLogger().info("deleting locally copied inputs");
376+
for (File f : toDelete)
377+
{
378+
if (f.exists())
379+
{
380+
f.delete();
381+
}
382+
}
383+
}
384+
340385
return outputVcf;
341386
}
387+
388+
private File combineInputs(JobContext ctx, List<File> inputFiles, int genomeId) throws PipelineJobException
389+
{
390+
// TODO: this should ultimately be expanded to include smarter merge with GenomicsDB
391+
// Also consider allowing the input to be a folder with per-contig gVCFs
392+
393+
String basename = getBasename(ctx);
394+
File combined = new File(ctx.getOutputDir(), basename + ".combined.gvcf.gz");
395+
396+
File idx = new File(combined.getPath() + ".tbi");
397+
if (idx.exists())
398+
{
399+
ctx.getLogger().info("Index exists, resuming combine with existing file: " + combined.getPath());
400+
return combined;
401+
}
402+
403+
ReferenceGenome genome = ctx.getSequenceSupport().getCachedGenome(genomeId);
404+
if (genome == null)
405+
{
406+
throw new PipelineJobException("Unable to find cached genome for Id: " + genomeId);
407+
}
408+
409+
List<String> toolParams = new ArrayList<>();
410+
List<Interval> intervals = ProcessVariantsHandler.getIntervals(ctx);
411+
if (intervals != null)
412+
{
413+
intervals.forEach(interval -> {
414+
toolParams.add("-L");
415+
toolParams.add(interval.getContig() + ":" + interval.getStart() + "-" + interval.getEnd());
416+
});
417+
}
418+
419+
CombineGVCFsWrapper wrapper = new CombineGVCFsWrapper(ctx.getLogger());
420+
wrapper.execute(genome.getWorkingFastaFile(), combined, toolParams, inputFiles.toArray(new File[inputFiles.size()]));
421+
422+
return combined;
423+
}
342424
}
343425
}

SequenceAnalysis/src/org/labkey/sequenceanalysis/run/util/GenotypeGVCFsWrapper.java

Lines changed: 21 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -29,46 +29,31 @@ public GenotypeGVCFsWrapper(Logger log)
2929
super(log);
3030
}
3131

32-
public void execute(File referenceFasta, File outputFile, @Nullable List<String> options, boolean doCopyLocal, File... inputGVCFs) throws PipelineJobException
32+
public void execute(File referenceFasta, File outputFile, @Nullable List<String> options, File inputFile) throws PipelineJobException
3333
{
3434
getLogger().info("Running GATK 4 GenotypeGVCFs");
3535

3636
ensureDictionary(referenceFasta);
3737

3838
//ensure indexes
39-
this.ensureVCFIndexes(inputGVCFs);
40-
41-
Set<File> toDelete = new HashSet<>();
42-
List<File> filesToProcess = new ArrayList<>();
43-
if (doCopyLocal)
44-
{
45-
getLogger().info("making local copies of gVCF/GenomicsDB files prior to genotyping");
46-
filesToProcess.addAll(copyVcfsLocally(Arrays.asList(inputGVCFs), toDelete, null, getLogger(), false));
47-
}
48-
else
49-
{
50-
filesToProcess.addAll(Arrays.asList(inputGVCFs));
51-
}
39+
this.ensureVCFIndexes(inputFile);
5240

5341
List<String> args = new ArrayList<>(getBaseArgs());
5442
args.add("GenotypeGVCFs");
5543
args.add("-R");
5644
args.add(referenceFasta.getPath());
57-
for (File f : filesToProcess)
45+
args.add("--variant");
46+
if (GVCF.isType(inputFile))
5847
{
59-
args.add("--variant");
60-
if (GVCF.isType(f))
61-
{
62-
args.add(f.getPath());
63-
}
64-
else if (f.isDirectory())
65-
{
66-
args.add("gendb://" + f.getPath());
67-
}
68-
else
69-
{
70-
throw new IllegalArgumentException("Unknown input: " + f.getPath());
71-
}
48+
args.add(inputFile.getPath());
49+
}
50+
else if (inputFile.isDirectory())
51+
{
52+
args.add("gendb://" + inputFile.getPath());
53+
}
54+
else
55+
{
56+
throw new IllegalArgumentException("Unknown input: " + inputFile.getPath());
7257
}
7358

7459
args.add("-O");
@@ -86,15 +71,6 @@ else if (f.isDirectory())
8671
{
8772
throw new PipelineJobException("Expected output not found: " + outputFile.getPath());
8873
}
89-
90-
if (!toDelete.isEmpty())
91-
{
92-
getLogger().info("deleting locally copied inputs");
93-
for (File f : toDelete)
94-
{
95-
f.delete();
96-
}
97-
}
9874
}
9975

10076
private static FileType GVCF = new FileType(".g.vcf", FileType.gzSupportLevel.SUPPORT_GZ);
@@ -179,21 +155,18 @@ public static List<File> copyVcfsLocally(Collection<File> inputGVCFs, Collection
179155
return vcfsToProcess;
180156
}
181157

182-
private void ensureVCFIndexes(File[] inputGVCFs) throws PipelineJobException
158+
private void ensureVCFIndexes(File inputGVCF) throws PipelineJobException
183159
{
184-
for (File gvcf : inputGVCFs)
160+
try
185161
{
186-
try
162+
if (GVCF.isType(inputGVCF))
187163
{
188-
if (GVCF.isType(gvcf))
189-
{
190-
SequenceAnalysisService.get().ensureVcfIndex(gvcf, getLogger());
191-
}
192-
}
193-
catch (IOException e)
194-
{
195-
throw new PipelineJobException(e);
164+
SequenceAnalysisService.get().ensureVcfIndex(inputGVCF, getLogger());
196165
}
197166
}
167+
catch (IOException e)
168+
{
169+
throw new PipelineJobException(e);
170+
}
198171
}
199172
}

0 commit comments

Comments
 (0)