3 | 3 | import htsjdk.samtools.util.Interval; |
4 | 4 | import org.apache.logging.log4j.Logger; |
5 | 5 | import org.jetbrains.annotations.Nullable; |
| 6 | +import org.labkey.api.pipeline.PipelineJob; |
6 | 7 | import org.labkey.api.pipeline.PipelineJobException; |
7 | 8 | import org.labkey.api.sequenceanalysis.SequenceAnalysisService; |
| 9 | +import org.labkey.api.sequenceanalysis.SequenceOutputFile; |
8 | 10 | import org.labkey.api.sequenceanalysis.pipeline.AbstractVariantProcessingStepProvider; |
9 | 11 | import org.labkey.api.sequenceanalysis.pipeline.CommandLineParam; |
10 | 12 | import org.labkey.api.sequenceanalysis.pipeline.PipelineContext; |
11 | 13 | import org.labkey.api.sequenceanalysis.pipeline.PipelineStep; |
12 | 14 | import org.labkey.api.sequenceanalysis.pipeline.PipelineStepProvider; |
13 | 15 | import org.labkey.api.sequenceanalysis.pipeline.ReferenceGenome; |
| 16 | +import org.labkey.api.sequenceanalysis.pipeline.SequenceOutputHandler; |
| 17 | +import org.labkey.api.sequenceanalysis.pipeline.TaskFileManager; |
14 | 18 | import org.labkey.api.sequenceanalysis.pipeline.ToolParameterDescriptor; |
15 | 19 | import org.labkey.api.sequenceanalysis.pipeline.VariantProcessingStep; |
16 | 20 | import org.labkey.api.sequenceanalysis.pipeline.VariantProcessingStepOutputImpl; |
17 | 21 | import org.labkey.api.sequenceanalysis.run.AbstractCommandPipelineStep; |
18 | 22 | import org.labkey.api.sequenceanalysis.run.AbstractDiscvrSeqWrapper; |
| 23 | +import org.labkey.sequenceanalysis.pipeline.SequenceJob; |
19 | 24 | import org.labkey.sequenceanalysis.util.SequenceUtil; |
20 | 25 |
21 | 26 | import java.io.File; |
22 | 27 | import java.util.ArrayList; |
23 | 28 | import java.util.Arrays; |
24 | 29 | import java.util.List; |
25 | 30 |
26 | | -public class SplitVcfBySamplesStep extends AbstractCommandPipelineStep<SplitVcfBySamplesStep.Wrapper> implements VariantProcessingStep |
| 31 | +public class SplitVcfBySamplesStep extends AbstractCommandPipelineStep<SplitVcfBySamplesStep.Wrapper> implements VariantProcessingStep, VariantProcessingStep.SupportsScatterGather |
27 | 32 | { |
28 | 33 | public SplitVcfBySamplesStep(PipelineStepProvider<?> provider, PipelineContext ctx) |
29 | 34 | { |
30 | 35 | super(provider, ctx, new Wrapper(ctx.getLogger())); |
31 | 36 | } |
32 | 37 |
33 | | - public static class Provider extends AbstractVariantProcessingStepProvider<SelectSamplesStep> |
| 38 | + public static class Provider extends AbstractVariantProcessingStepProvider<SelectSamplesStep> implements SupportsScatterGather |
34 | 39 | { |
35 | 40 | public Provider() |
36 | 41 | { |
@@ -67,20 +72,65 @@ public Output processVariants(File inputVCF, File outputDirectory, ReferenceGeno |
67 | 72 |
68 | 73 | output.addInput(inputVCF, "Input VCF"); |
69 | 74 |
| 75 | + return output; |
| 76 | + } |
| 77 | + |
| 78 | + private List<File> findProducedVcfs(File inputVCF, File outputDirectory) |
| 79 | + { |
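| | + // Collect any VCFs in the output directory that share the input's base name but are not the input VCF itself |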
| 80 | + List<File> ret = new ArrayList<>(); |
70 | 81 | String basename = SequenceAnalysisService.get().getUnzippedBaseName(inputVCF.getName()); |
71 | 82 | for (File f : outputDirectory.listFiles()) |
72 | 83 | { |
73 | 84 | if (!f.getName().equals(inputVCF.getName()) && f.getName().startsWith(basename) && SequenceUtil.FILETYPE.vcf.getFileType().isType(f)) |
74 | 85 | { |
75 | | - output.addOutput(f, "Subset VCF"); |
76 | | - output.addSequenceOutput(f, "Subset VCF: " + f.getName(), "VCF File", null, null, genome.getGenomeId(), null); |
| 86 | + ret.add(f); |
77 | 87 | } |
78 | 88 | } |
79 | 89 |
80 | | - return output; |
| 90 | + return ret; |
81 | 91 | } |
82 | 92 |
| 93 | + @Override |
| 94 | + public void performAdditionalMergeTasks(SequenceOutputHandler.JobContext ctx, PipelineJob job, TaskFileManager manager, ReferenceGenome genome, List<File> orderedScatterOutputs, List<String> orderedJobDirs) throws PipelineJobException |
| 95 | + { |
| 96 | + job.getLogger().info("Merging subset VCFs from scatter jobs"); |
| 97 | + File inputVCF = ((SequenceJob)getPipelineCtx().getJob()).getInputFiles().get(0); |
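| | + // Each scatter job writes the same set of subset VCFs into its own job directory; use the first job's directory to discover the file names to merge |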
| 98 | + List<File> firstJobOutputs = findProducedVcfs(inputVCF, new File(ctx.getWorkingDirectory(), orderedJobDirs.get(0))); |
| 99 | + for (File fn : firstJobOutputs) |
| 100 | + { |
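| | + // Gather the matching VCF from every scatter job, in job order, and mark each piece (plus its tabix index) as an intermediate file |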
| 101 | + List<File> toConcat = orderedJobDirs.stream().map(jobDir -> { |
| 102 | + File f = new File(new File(getPipelineCtx().getWorkingDirectory(), jobDir), fn.getName()); |
| 103 | + if (!f.exists()) |
| 104 | + { |
| 105 | + throw new IllegalStateException("Missing file: " + f.getPath()); |
| 106 | + } |
| 107 | + |
| 108 | + ctx.getFileManager().addIntermediateFile(f); |
| 109 | + ctx.getFileManager().addIntermediateFile(new File(f.getPath() + ".tbi")); |
83 | 110 |
| 111 | + return f; |
| 112 | + }).toList(); |
| 113 | + |
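| | + // Concatenate the per-job pieces into a single VCF in the source directory, skipping the merge if an index already exists |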
| 114 | + String basename = SequenceAnalysisService.get().getUnzippedBaseName(toConcat.get(0).getName()); |
| 115 | + File combined = new File(ctx.getSourceDirectory(), basename + ".vcf.gz"); |
| 116 | + File combinedIdx = new File(combined.getPath() + ".tbi"); |
| 117 | + if (combinedIdx.exists()) |
| 118 | + { |
| 119 | + job.getLogger().info("VCF exists, will not recreate: " + combined.getPath()); |
| 120 | + } |
| 121 | + else |
| 122 | + { |
| 123 | + combined = SequenceAnalysisService.get().combineVcfs(toConcat, combined, genome, job.getLogger(), true, null); |
| 124 | + } |
| 125 | + |
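| | + // Register the merged VCF as a SequenceOutputFile so it is tracked as an output of the job |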
| 126 | + SequenceOutputFile so = new SequenceOutputFile(); |
| 127 | + so.setName("Subset VCF: " + fn.getName()); |
| 128 | + so.setFile(combined); |
| 129 | + so.setCategory("VCF File"); |
| 130 | + so.setLibrary_id(genome.getGenomeId()); |
| 131 | + manager.addSequenceOutput(so); |
| 132 | + } |
| 133 | + } |
84 | 134 |
85 | 135 | public static class Wrapper extends AbstractDiscvrSeqWrapper |
86 | 136 | { |