Skip to content

Commit f4044d1

Browse files
committed
Add tool to split VCF into batches by sample
1 parent ac282c7 commit f4044d1

File tree

5 files changed

+103
-21
lines changed

5 files changed

+103
-21
lines changed

SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/run/AbstractGatk4Wrapper.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ public String getVersionString() throws PipelineJobException
9797
return StringUtils.trimToNull(executeWithOutput(args));
9898
}
9999

100-
protected List<String> getBaseArgs()
100+
public List<String> getBaseArgs()
101101
{
102102
List<String> args = new ArrayList<>();
103103
args.add(SequencePipelineService.get().getJava8FilePath());

SequenceAnalysis/src/org/labkey/sequenceanalysis/SequenceAnalysisModule.java

Lines changed: 2 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -125,25 +125,7 @@
125125
import org.labkey.sequenceanalysis.run.util.FastqcRunner;
126126
import org.labkey.sequenceanalysis.run.util.GenomicsDBAppendHandler;
127127
import org.labkey.sequenceanalysis.run.util.GenomicsDBImportHandler;
128-
import org.labkey.sequenceanalysis.run.variant.DepthOfCoverageHandler;
129-
import org.labkey.sequenceanalysis.run.variant.GenotypeConcordanceStep;
130-
import org.labkey.sequenceanalysis.run.variant.GenotypeFiltrationStep;
131-
import org.labkey.sequenceanalysis.run.variant.MendelianViolationReportStep;
132-
import org.labkey.sequenceanalysis.run.variant.MergeVcfsAndGenotypesHandler;
133-
import org.labkey.sequenceanalysis.run.variant.MultiAllelicPositionsHandler;
134-
import org.labkey.sequenceanalysis.run.variant.PlinkPcaStep;
135-
import org.labkey.sequenceanalysis.run.variant.SNPEffStep;
136-
import org.labkey.sequenceanalysis.run.variant.SampleRenameStep;
137-
import org.labkey.sequenceanalysis.run.variant.SelectSNVsStep;
138-
import org.labkey.sequenceanalysis.run.variant.SelectSamplesStep;
139-
import org.labkey.sequenceanalysis.run.variant.SelectVariantsStep;
140-
import org.labkey.sequenceanalysis.run.variant.SummarizeGenotypeQualityStep;
141-
import org.labkey.sequenceanalysis.run.variant.VariantAnnotatorStep;
142-
import org.labkey.sequenceanalysis.run.variant.VariantEvalBySampleStep;
143-
import org.labkey.sequenceanalysis.run.variant.VariantEvalStep;
144-
import org.labkey.sequenceanalysis.run.variant.VariantFiltrationStep;
145-
import org.labkey.sequenceanalysis.run.variant.VariantQCStep;
146-
import org.labkey.sequenceanalysis.run.variant.VariantsToTableStep;
128+
import org.labkey.sequenceanalysis.run.variant.*;
147129
import org.labkey.sequenceanalysis.util.Barcoder;
148130
import org.labkey.sequenceanalysis.util.ChainFileValidator;
149131
import org.labkey.sequenceanalysis.util.ScatterGatherUtils;
@@ -314,6 +296,7 @@ public static void registerPipelineSteps()
314296
SequencePipelineService.get().registerPipelineStep(new VariantFiltrationStep.Provider());
315297
SequencePipelineService.get().registerPipelineStep(new GenotypeConcordanceStep.Provider());
316298
SequencePipelineService.get().registerPipelineStep(new SampleRenameStep.Provider());
299+
SequencePipelineService.get().registerPipelineStep(new SplitVcfBySamplesStep.Provider());
317300

318301
SequencePipelineService.get().registerPipelineStep(new VariantEvalStep.Provider());
319302
SequencePipelineService.get().registerPipelineStep(new VariantEvalBySampleStep.Provider());

SequenceAnalysis/src/org/labkey/sequenceanalysis/run/util/GenotypeGVCFsWrapper.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,8 +147,15 @@ public static List<File> copyVcfsLocally(SequenceOutputHandler.JobContext ctx, C
147147

148148
List<File> vcfsToProcess = new ArrayList<>();
149149
int totalExisting = 0;
150+
int idx = 0;
150151
for (File f : inputGVCFs)
151152
{
153+
idx++;
154+
if (idx % 100 == 0)
155+
{
156+
ctx.getLogger().info("Inspected file " + idx + " of " + inputGVCFs.size());
157+
}
158+
152159
f = convertInput(f);
153160
File destFile = new File(inputToDest.get(f));
154161

SequenceAnalysis/src/org/labkey/sequenceanalysis/run/variant/SplitVcfBySamplesStep.java

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
package org.labkey.sequenceanalysis.run.variant;
2+
3+
import htsjdk.samtools.util.Interval;
4+
import org.apache.logging.log4j.Logger;
5+
import org.jetbrains.annotations.Nullable;
6+
import org.labkey.api.pipeline.PipelineJobException;
7+
import org.labkey.api.sequenceanalysis.SequenceAnalysisService;
8+
import org.labkey.api.sequenceanalysis.pipeline.AbstractVariantProcessingStepProvider;
9+
import org.labkey.api.sequenceanalysis.pipeline.CommandLineParam;
10+
import org.labkey.api.sequenceanalysis.pipeline.PipelineContext;
11+
import org.labkey.api.sequenceanalysis.pipeline.PipelineStep;
12+
import org.labkey.api.sequenceanalysis.pipeline.PipelineStepProvider;
13+
import org.labkey.api.sequenceanalysis.pipeline.ReferenceGenome;
14+
import org.labkey.api.sequenceanalysis.pipeline.ToolParameterDescriptor;
15+
import org.labkey.api.sequenceanalysis.pipeline.VariantProcessingStep;
16+
import org.labkey.api.sequenceanalysis.pipeline.VariantProcessingStepOutputImpl;
17+
import org.labkey.api.sequenceanalysis.run.AbstractCommandPipelineStep;
18+
import org.labkey.api.sequenceanalysis.run.AbstractDiscvrSeqWrapper;
19+
import org.labkey.sequenceanalysis.util.SequenceUtil;
20+
21+
import java.io.File;
22+
import java.util.ArrayList;
23+
import java.util.Arrays;
24+
import java.util.List;
25+
26+
public class SplitVcfBySamplesStep extends AbstractCommandPipelineStep<SplitVcfBySamplesStep.Wrapper> implements VariantProcessingStep
27+
{
28+
public SplitVcfBySamplesStep(PipelineStepProvider<?> provider, PipelineContext ctx)
29+
{
30+
super(provider, ctx, new Wrapper(ctx.getLogger()));
31+
}
32+
33+
public static class Provider extends AbstractVariantProcessingStepProvider<SelectSamplesStep>
34+
{
35+
public Provider()
36+
{
37+
super("SplitVcfBySamples", "Split VCF By Sample", "DISCVRseq", "A VCF will be generated containing only the samples specified below.", Arrays.asList(
38+
ToolParameterDescriptor.createCommandLineParam(CommandLineParam.create("--samplesPerVcf"), "samplesPerVcf", "Samples Per VCF", "The max number of samples to write per VCF", "ldk-integerfield", null, null),
39+
ToolParameterDescriptor.createCommandLineParam(CommandLineParam.create("--minAllowableInFinalVcf"), "minAllowableInFinalVcf", "Min Allowable in Final VCF", "If the final VCF in the split has fewer than this number of samples, it will be merged with the second to last VCF", "ldk-integerfield", null, null)
40+
), null, null);
41+
}
42+
43+
@Override
44+
public PipelineStep create(PipelineContext context)
45+
{
46+
return new SplitVcfBySamplesStep(this, context);
47+
}
48+
}
49+
50+
@Override
51+
public Output processVariants(File inputVCF, File outputDirectory, ReferenceGenome genome, @Nullable List<Interval> intervals) throws PipelineJobException
52+
{
53+
VariantProcessingStepOutputImpl output = new VariantProcessingStepOutputImpl();
54+
getPipelineCtx().getLogger().info("Running SplitVcfBySamples");
55+
56+
List<String> args = new ArrayList<>(getWrapper().getBaseArgs());
57+
args.add("SplitVcfBySamples");
58+
args.add("-V");
59+
args.add(inputVCF.getPath());
60+
args.add("-O");
61+
args.add(outputDirectory.getPath());
62+
63+
args.addAll(getClientCommandArgs());
64+
65+
getWrapper().execute(args);
66+
67+
output.addInput(inputVCF, "Input VCF");
68+
69+
String basename = SequenceAnalysisService.get().getUnzippedBaseName(inputVCF.getName());
70+
for (File f : outputDirectory.listFiles())
71+
{
72+
if (!f.getName().equals(inputVCF.getName()) && f.getName().startsWith(basename) && SequenceUtil.FILETYPE.vcf.getFileType().isType(f))
73+
{
74+
output.addOutput(f, "Subset VCF");
75+
output.addSequenceOutput(f, "Subset VCF: " + f.getName(), "VCF", null, null, genome.getGenomeId(), null);
76+
}
77+
}
78+
79+
return output;
80+
}
81+
82+
83+
84+
public static class Wrapper extends AbstractDiscvrSeqWrapper
85+
{
86+
public Wrapper(Logger log)
87+
{
88+
super(log);
89+
}
90+
}
91+
}

SequenceAnalysis/src/org/labkey/sequenceanalysis/run/variant/VariantQCStep.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
*/
2828
public class VariantQCStep extends AbstractPipelineStep implements VariantProcessingStep
2929
{
30-
public VariantQCStep(PipelineStepProvider provider, PipelineContext ctx)
30+
public VariantQCStep(PipelineStepProvider<?> provider, PipelineContext ctx)
3131
{
3232
super(provider, ctx);
3333
}
@@ -43,6 +43,7 @@ public Provider()
4343
), null, "https://bimberlab.github.io/DISCVRSeq/");
4444
}
4545

46+
@Override
4647
public VariantQCStep create(PipelineContext ctx)
4748
{
4849
return new VariantQCStep(this, ctx);

0 commit comments

Comments
 (0)