Skip to content

Commit 191f2a0

Browse files
committed
Fix error in GenotypeGVCF intervals for scatter job and ensure resulting VCFs respect intervals during merge
1 parent a1d5a2c commit 191f2a0

File tree

1 file changed

+76
-1
lines changed

1 file changed

+76
-1
lines changed

SequenceAnalysis/src/org/labkey/sequenceanalysis/pipeline/VariantProcessingRemoteMergeTask.java

Lines changed: 76 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import htsjdk.samtools.util.Interval;
44
import org.apache.commons.lang3.StringUtils;
5+
import org.apache.logging.log4j.Logger;
56
import org.jetbrains.annotations.NotNull;
67
import org.labkey.api.pipeline.AbstractTaskFactory;
78
import org.labkey.api.pipeline.AbstractTaskFactorySettings;
@@ -14,10 +15,13 @@
1415
import org.labkey.api.sequenceanalysis.SequenceOutputFile;
1516
import org.labkey.api.sequenceanalysis.pipeline.ReferenceGenome;
1617
import org.labkey.api.sequenceanalysis.pipeline.SequenceOutputHandler;
18+
import org.labkey.api.sequenceanalysis.run.AbstractDiscvrSeqWrapper;
1719
import org.labkey.api.util.FileType;
20+
import org.labkey.api.writer.PrintWriters;
1821

1922
import java.io.File;
2023
import java.io.IOException;
24+
import java.io.PrintWriter;
2125
import java.util.ArrayList;
2226
import java.util.Collections;
2327
import java.util.HashSet;
@@ -170,7 +174,49 @@ private File runDefaultVariantMerge(TaskFileManagerImpl manager, RecordedAction
170174
missing.add(vcf);
171175
}
172176

173-
toConcat.add(vcf);
177+
// NOTE: this was added to fix a one-time issue where -L was dropped from some upstream GenotypeGVCFs.
178+
// Under normal conditions this would never be necessary.
179+
boolean ensureOutputsWithinIntervals = getPipelineJob().getParameterJson().optBoolean("variantCalling.GenotypeGVCFs.ensureOutputsWithinIntervalsOnMerge", false);
180+
if (ensureOutputsWithinIntervals)
181+
{
182+
getJob().getLogger().debug("Ensuring ensure scatter outputs respect intervals");
183+
List<Interval> expectedIntervals = jobToIntervalMap.get(name);
184+
185+
File intervalFile = new File(vcf.getParentFile(), "scatterIntervals.list");
186+
File subsetVcf = new File(vcf.getParentFile(), SequenceAnalysisService.get().getUnzippedBaseName(vcf.getName()) + ".subset.vcf.gz");
187+
File subsetVcfIdx = new File(subsetVcf.getPath() + ".tbi");
188+
manager.addIntermediateFile(intervalFile);
189+
manager.addIntermediateFile(subsetVcf);
190+
manager.addIntermediateFile(subsetVcfIdx);
191+
192+
if (subsetVcfIdx.exists())
193+
{
194+
getJob().getLogger().debug("Index exists, will not re-submit the VCF: " + subsetVcf.getName());
195+
}
196+
else
197+
{
198+
try (PrintWriter writer = PrintWriters.getPrintWriter(intervalFile))
199+
{
200+
expectedIntervals.forEach(interval -> {
201+
writer.println(interval.getContig() + ":" + interval.getStart() + "-" + interval.getEnd());
202+
});
203+
}
204+
catch (IOException e)
205+
{
206+
throw new PipelineJobException(e);
207+
}
208+
209+
Wrapper wrapper = new Wrapper(getJob().getLogger());
210+
wrapper.execute(vcf, subsetVcf, intervalFile);
211+
}
212+
213+
toConcat.add(subsetVcf);
214+
}
215+
else
216+
{
217+
toConcat.add(vcf);
218+
}
219+
174220
manager.addInput(action, "Input VCF", vcf);
175221
manager.addIntermediateFile(vcf);
176222
manager.addIntermediateFile(new File(vcf.getPath() + ".tbi"));
@@ -204,4 +250,33 @@ private File runDefaultVariantMerge(TaskFileManagerImpl manager, RecordedAction
204250

205251
return combined;
206252
}
253+
254+
public static class Wrapper extends AbstractDiscvrSeqWrapper
255+
{
256+
public Wrapper(Logger log)
257+
{
258+
super(log);
259+
}
260+
261+
public void execute(File inputVcf, File outputVcf, File intervalFile) throws PipelineJobException
262+
{
263+
List<String> args = new ArrayList<>(getBaseArgs());
264+
args.add("OutputVariantsStartingInIntervals");
265+
266+
args.add("-V");
267+
args.add(inputVcf.getPath());
268+
269+
args.add("-O");
270+
args.add(outputVcf.getPath());
271+
272+
args.add("-L");
273+
args.add(intervalFile.getPath());
274+
275+
execute(args);
276+
if (!outputVcf.exists())
277+
{
278+
throw new PipelineJobException("Missing file: " + outputVcf.getPath());
279+
}
280+
}
281+
}
207282
}

0 commit comments

Comments
 (0)